1 /*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
111
112 static void mptcp_subflow_abort(struct mptsub *, int);
113
114 static void mptcp_send_dfin(struct socket *so);
115 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
116 static int mptcp_freeq(struct mptcb *mp_tp);
117
118 /*
119 * Possible return values for subflow event handlers. Note that success
120 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
121 * indicate errors or actions which require immediate attention; they will
122 * prevent the rest of the handlers from processing their respective events
123 * until the next round of events processing.
124 */
125 typedef enum {
126 MPTS_EVRET_DELETE = 1, /* delete this subflow */
127 MPTS_EVRET_OK = 2, /* OK */
128 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
129 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
130 } ev_ret_t;
131
132 static void mptcp_do_sha1(mptcp_key_t *, char *);
133 static void mptcp_do_sha256(mptcp_key_t *, char *);
134
135 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
136
137 static ZONE_DEFINE_TYPE(mptsub_zone, "mptsub", struct mptsub, ZC_ZFREE_CLEARMEM);
138 static ZONE_DEFINE_TYPE(mptopt_zone, "mptopt", struct mptopt, ZC_ZFREE_CLEARMEM);
139 static ZONE_DEFINE(mpt_subauth_zone, "mptauth",
140 sizeof(struct mptcp_subf_auth_entry), ZC_NONE);
141
142 struct mppcbinfo mtcbinfo;
143
144 SYSCTL_DECL(_net_inet);
145
146 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
147
148 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
149 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
150
151
152 static int mptcp_alternate_port = 0;
153 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
154 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
155
156 static struct protosw mptcp_subflow_protosw;
157 static struct pr_usrreqs mptcp_subflow_usrreqs;
158 static struct ip6protosw mptcp_subflow_protosw6;
159 static struct pr_usrreqs mptcp_subflow_usrreqs6;
160
161 static uint8_t mptcp_create_subflows_scheduled;
162
163 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
164 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
165 static uint32_t mptcp_kern_skt_inuse = 0;
166 static uint32_t mptcp_kern_skt_unit;
167 static symptoms_advisory_t mptcp_advisory;
168
169 uint32_t mptcp_cellicon_refcount = 0;
170
171 os_log_t mptcp_log_handle;
172
173 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)174 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
175 {
176 int i, index = -1;
177
178 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
179 if (create && stats[i].ifindex == IFSCOPE_NONE) {
180 if (index < 0) {
181 index = i;
182 }
183 continue;
184 }
185
186 if (stats[i].ifindex == ifindex) {
187 index = i;
188 return index;
189 }
190 }
191
192 if (index != -1) {
193 stats[index].ifindex = ifindex;
194 }
195
196 return index;
197 }
198
199 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)200 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
201 {
202 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
203 int index;
204
205 if (ifp == NULL) {
206 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
207 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
208 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
209 return -1;
210 }
211
212 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
213
214 if (index != -1) {
215 if (stats[index].is_expensive == 0) {
216 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
217 }
218 }
219
220 return index;
221 }
222
223 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)224 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
225 {
226 int index;
227
228 tcpstat.tcps_mp_switches++;
229 mpte->mpte_subflow_switches++;
230
231 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
232
233 if (index != -1) {
234 mpte->mpte_itfstats[index].switches++;
235 }
236 }
237
238 /*
239 * Flushes all recorded socket options from an MP socket.
240 */
241 static void
mptcp_flush_sopts(struct mptses * mpte)242 mptcp_flush_sopts(struct mptses *mpte)
243 {
244 struct mptopt *mpo, *tmpo;
245
246 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
247 mptcp_sopt_remove(mpte, mpo);
248 mptcp_sopt_free(mpo);
249 }
250 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
251 }
252
253 /*
254 * Create an MPTCP session, called as a result of opening a MPTCP socket.
255 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/*
	 * The session (mpte) and the MPTCP PCB (mp_tp) live in the same
	 * memory block as the Multipath PCB (layout: struct mpp_mtp), so
	 * they are obtained by casting rather than by allocation.
	 */
	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	/* Cross-link PCB and session. */
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start out with the statically embedded interface-info array. */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/*
	 * Apply the sysctl-configured alternate port, if valid;
	 * stored in network byte order.
	 */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
302
303 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)304 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
305 {
306 if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
307 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
308 }
309
310 if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
311 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
312 }
313
314 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
315 * meaning we prefer IPv6 over IPv4.
316 */
317 if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
318 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
319 }
320
321 if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
322 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
323 }
324
325 /* We don't yet have a unicast IP */
326 return NULL;
327 }
328
329 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)330 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
331 uint64_t *cellbytes, uint64_t *allbytes)
332 {
333 int64_t mycellbytes = 0;
334 uint64_t myallbytes = 0;
335 int i;
336
337 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
338 if (mpte->mpte_itfstats[i].is_expensive) {
339 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
340 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
341 }
342
343 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
344 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
345 }
346
347 if (initial_cell) {
348 mycellbytes -= mpte->mpte_init_txbytes;
349 mycellbytes -= mpte->mpte_init_rxbytes;
350 }
351
352 if (mycellbytes < 0) {
353 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
354 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
355 *cellbytes = 0;
356 *allbytes = 0;
357 } else {
358 *cellbytes = mycellbytes;
359 *allbytes = myallbytes;
360 }
361 }
362
/*
 * Record end-of-session statistics in tcpstat, keyed by the session's
 * service type (handover / interactive / aggregate) and whether the
 * connection was first-party. Called from session destruction.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial subflow was established over cellular. */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell, later also used WiFi. */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi, later also used cell. */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				/* Started on WiFi but had to use cell. */
				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	/* Session started on cell and eventually moved (back) to WiFi. */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
482
483 /*
484 * Destroy an MPTCP session.
485 */
/*
 * Destroy an MPTCP session. All subflows must already be gone
 * (verified below); releases per-session resources in order:
 * stats wrap-up, cellicon refcounts, recorded socket options,
 * the (possibly heap-allocated) interface-info array, the MPTCP
 * reassembly queue and the reinject queue.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	mptcpstats_session_wrapup(mpte);
	/* Drop any cellicon increments this session still holds. */
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/* The itfinfo array is only heap-allocated once it outgrew the
	 * embedded one (size > MPTE_ITFINFO_SIZE). */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data(mpte->mpte_itfinfo,
		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;

	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
510
511 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)512 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
513 {
514 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
515 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
516 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
517 }
518
519 static int
mptcp_synthesize_nat64(struct in6_addr * addr,uint32_t len,const struct in_addr * addrv4)520 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
521 const struct in_addr *addrv4)
522 {
523 static const struct in6_addr well_known_prefix = {
524 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
525 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
526 0x00, 0x00, 0x00, 0x00},
527 };
528 const char *ptrv4 = (const char *)addrv4;
529 char *ptr = (char *)addr;
530
531 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
532 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
533 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
534 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
535 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
536 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
537 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
538 return -1;
539 }
540
541 /* Check for the well-known prefix */
542 if (len == NAT64_PREFIX_LEN_96 &&
543 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
544 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
545 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
546 return -1;
547 }
548 }
549
550 switch (len) {
551 case NAT64_PREFIX_LEN_96:
552 memcpy(ptr + 12, ptrv4, 4);
553 break;
554 case NAT64_PREFIX_LEN_64:
555 memcpy(ptr + 9, ptrv4, 4);
556 break;
557 case NAT64_PREFIX_LEN_56:
558 memcpy(ptr + 7, ptrv4, 1);
559 memcpy(ptr + 9, ptrv4 + 1, 3);
560 break;
561 case NAT64_PREFIX_LEN_48:
562 memcpy(ptr + 6, ptrv4, 2);
563 memcpy(ptr + 9, ptrv4 + 2, 2);
564 break;
565 case NAT64_PREFIX_LEN_40:
566 memcpy(ptr + 5, ptrv4, 3);
567 memcpy(ptr + 9, ptrv4 + 3, 1);
568 break;
569 case NAT64_PREFIX_LEN_32:
570 memcpy(ptr + 4, ptrv4, 4);
571 break;
572 default:
573 panic("NAT64-prefix len is wrong: %u", len);
574 }
575
576 return 0;
577 }
578
579 static void
mptcp_trigger_cell_bringup(struct mptses * mpte)580 mptcp_trigger_cell_bringup(struct mptses *mpte)
581 {
582 struct socket *mp_so = mptetoso(mpte);
583
584 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
585 uuid_string_t uuidstr;
586 int err;
587
588 socket_unlock(mp_so, 0);
589 err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
590 TRUE);
591 socket_lock(mp_so, 0);
592
593 if (err == 0) {
594 mpte->mpte_triggered_cell = 1;
595 }
596
597 uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
598 os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
599 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
600 } else {
601 os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
602 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
603 }
604 }
605
606 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)607 mptcp_subflow_disconnecting(struct mptsub *mpts)
608 {
609 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
610 return true;
611 }
612
613 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
614 return true;
615 }
616
617 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
618 return true;
619 }
620
621 return false;
622 }
623
624 /*
625 * In Handover mode, only create cell subflow if
626 * - Symptoms marked WiFi as weak:
627 * Here, if we are sending data, then we can check the RTO-state. That is a
628 * stronger signal of WiFi quality than the Symptoms indicator.
629 * If however we are not sending any data, the only thing we can do is guess
630 * and thus bring up Cell.
631 *
632 * - Symptoms marked WiFi as unknown:
633 * In this state we don't know what the situation is and thus remain
634 * conservative, only bringing up cell if there are retransmissions going on.
635 */
636 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)637 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
638 {
639 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
640
641 if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
642 /* WiFi is good - don't use cell */
643 return false;
644 }
645
646 if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
647 /*
648 * We are in unknown state, only use Cell if we have confirmed
649 * that WiFi is bad.
650 */
651 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
652 return true;
653 } else {
654 return false;
655 }
656 }
657
658 if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
659 /*
660 * WiFi is confirmed to be bad from Symptoms-Framework.
661 * If we are sending data, check the RTOs.
662 * Otherwise, be pessimistic and use Cell.
663 */
664 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
665 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
666 return true;
667 } else {
668 return false;
669 }
670 } else {
671 return true;
672 }
673 }
674
675 return false;
676 }
677
/*
 * Walk the session's known interfaces and create missing subflows,
 * honoring the session's service type (handover / pure-handover /
 * target-based) and Symptoms' WiFi-quality verdict. May trigger
 * cellular bring-up when cell would be wanted but no cellular
 * interface is currently viable.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve the ifindex to an ifnet under the head lock. */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* In (pure) handover mode, skip cell entirely while
			 * WiFi quality is good. */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check the existing subflows before creating one on ifp. */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* WiFi subflow suffices while before the target
				 * time or while WiFi quality is good. */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Third-party apps without explicit access need Symptoms'
		 * permission before a cell subflow may be brought up. */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* v4-only destination on a v6-only (NAT64) interface:
		 * synthesize an IPv6 destination per RFC 6052. */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Pick the first prefix that is actually set. */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Skip when the interface cannot carry the chosen family. */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
891
892 static void
mptcp_remove_cell_subflows(struct mptses * mpte)893 mptcp_remove_cell_subflows(struct mptses *mpte)
894 {
895 struct mptsub *mpts, *tmpts;
896
897 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
898 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
899
900 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
901 continue;
902 }
903
904 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
905 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
906
907 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
908 }
909
910 return;
911 }
912
913 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)914 mptcp_remove_wifi_subflows(struct mptses *mpte)
915 {
916 struct mptsub *mpts, *tmpts;
917
918 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
919 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
920
921 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
922 continue;
923 }
924
925 os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
926 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
927
928 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
929 }
930
931 return;
932 }
933
/*
 * Pure-handover mode: keep traffic on exactly one kind of interface.
 * Classify the established subflows, then remove the redundant side:
 * cell subflows when a working WiFi subflow exists (or WiFi quality is
 * good), WiFi subflows when only cell is working and WiFi quality is
 * not good.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working WiFi subflow (and WiFi quality is
	 * not good), only remove the WiFi subflows - and only when a working
	 * cellular subflow exists (see the checks after the loop).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-disconnecting subflows count. */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* WiFi subflow only counts as working if handover
			 * logic would not prefer cellular over it. */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	/* WiFi is usable (or reported good) - drop the cellular subflows. */
	mptcp_remove_cell_subflows(mpte);
}
996
/*
 * Handover mode: once at least one established WiFi subflow is working
 * well enough that cellular is not needed, remove all cellular
 * subflows. If no such subflow exists, keep everything as-is.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* Only non-cellular subflows are candidates. */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		/* Working means the handover logic would not pick cellular. */
		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1044
1045 static void
mptcp_targetbased_subflows_remove(struct mptses * mpte)1046 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1047 {
1048 uint64_t time_now = mach_continuous_time();
1049 struct mptsub *mpts;
1050
1051 if (mpte->mpte_time_target != 0 &&
1052 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1053 mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
1054 /* WiFi is bad and we are below the target - don't remove any subflows */
1055 return;
1056 }
1057
1058 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1059 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1060
1061 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1062 continue;
1063 }
1064
1065 /* We have a functioning subflow on WiFi. No need for cell! */
1066 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1067 !mptcp_subflow_disconnecting(mpts)) {
1068 mptcp_remove_cell_subflows(mpte);
1069 break;
1070 }
1071 }
1072 }
1073
1074 /*
1075 * Based on the MPTCP Service-type and the state of the subflows, we
1076 * will destroy subflows here.
1077 */
1078 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1079 mptcp_check_subflows_and_remove(struct mptses *mpte)
1080 {
1081 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1082 return;
1083 }
1084
1085 socket_lock_assert_owned(mptetoso(mpte));
1086
1087 if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1088 mptcp_pure_handover_subflows_remove(mpte);
1089 }
1090
1091 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1092 mptcp_handover_subflows_remove(mpte);
1093 }
1094
1095 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1096 mptcp_targetbased_subflows_remove(mpte);
1097 }
1098 }
1099
1100 static void
mptcp_remove_subflows(struct mptses * mpte)1101 mptcp_remove_subflows(struct mptses *mpte)
1102 {
1103 struct mptsub *mpts, *tmpts;
1104
1105 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1106 return;
1107 }
1108
1109 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1110 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1111 boolean_t found = false;
1112 uint32_t ifindex;
1113 uint32_t i;
1114
1115 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1116 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1117
1118 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1119 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1120 ifp ? ifp->if_index : -1);
1121 soevent(mpts->mpts_socket,
1122 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1123
1124 continue;
1125 }
1126
1127 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1128 continue;
1129 }
1130
1131 if (ifp) {
1132 ifindex = ifp->if_index;
1133 } else {
1134 ifindex = mpts->mpts_ifscope;
1135 }
1136
1137 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1138 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1139 continue;
1140 }
1141
1142 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1143 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1144 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1145 found = true;
1146 break;
1147 }
1148
1149 if (mpts->mpts_dst.sa_family == AF_INET &&
1150 mpte->mpte_itfinfo[i].has_v4_conn) {
1151 found = true;
1152 break;
1153 }
1154 }
1155 }
1156
1157 if (!found) {
1158 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1159 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1160 ifindex, mpts->mpts_flags);
1161
1162 soevent(mpts->mpts_socket,
1163 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1164 }
1165 }
1166 }
1167
/*
 * Deferred worker (armed via timeout() in mptcp_sched_create_subflows)
 * that walks every MPTCP PCB and re-evaluates which subflows should be
 * added or removed for sessions flagged with MPP_CREATE_SUBFLOWS.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Only sessions flagged by mptcp_sched_create_subflows need work. */
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
			continue;
		}

		socket_lock(mp_so, 1);
		/* mptcp_sched_create_subflows took a usecount on our behalf */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1207
1208 /*
1209 * We need this because we are coming from an NECP-event. This event gets posted
1210 * while holding NECP-locks. The creation of the subflow however leads us back
1211 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1212 * So, we would deadlock there as we already hold the NECP-lock.
1213 *
1214 * So, let's schedule this separately. It also gives NECP the chance to make
1215 * progress, without having to wait for MPTCP to finish its subflow creation.
1216 */
1217 void
mptcp_sched_create_subflows(struct mptses * mpte)1218 mptcp_sched_create_subflows(struct mptses *mpte)
1219 {
1220 struct mppcb *mpp = mpte->mpte_mppcb;
1221 struct mptcb *mp_tp = mpte->mpte_mptcb;
1222 struct socket *mp_so = mpp->mpp_socket;
1223
1224 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1225 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1226 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1227 return;
1228 }
1229
1230 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1231 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1232 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1233 }
1234
1235 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1236 return;
1237 }
1238
1239 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1240 timeout(mptcp_create_subflows, NULL, hz / 10);
1241 }
1242
1243 /*
1244 * Allocate an MPTCP socket option structure.
1245 */
1246 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1247 mptcp_sopt_alloc(zalloc_flags_t how)
1248 {
1249 return zalloc_flags(mptopt_zone, how | Z_ZERO);
1250 }
1251
1252 /*
1253 * Free an MPTCP socket option structure.
1254 */
1255 void
mptcp_sopt_free(struct mptopt * mpo)1256 mptcp_sopt_free(struct mptopt *mpo)
1257 {
1258 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1259
1260 zfree(mptopt_zone, mpo);
1261 }
1262
1263 /*
1264 * Add a socket option to the MPTCP socket option list.
1265 */
1266 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1267 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1268 {
1269 socket_lock_assert_owned(mptetoso(mpte));
1270 mpo->mpo_flags |= MPOF_ATTACHED;
1271 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1272 }
1273
1274 /*
1275 * Remove a socket option from the MPTCP socket option list.
1276 */
1277 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1278 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1279 {
1280 socket_lock_assert_owned(mptetoso(mpte));
1281 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1282 mpo->mpo_flags &= ~MPOF_ATTACHED;
1283 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1284 }
1285
1286 /*
1287 * Search for an existing <sopt_level,sopt_name> socket option.
1288 */
1289 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1290 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1291 {
1292 struct mptopt *mpo;
1293
1294 socket_lock_assert_owned(mptetoso(mpte));
1295
1296 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1297 if (mpo->mpo_level == sopt->sopt_level &&
1298 mpo->mpo_name == sopt->sopt_name) {
1299 break;
1300 }
1301 }
1302 return mpo;
1303 }
1304
1305 /*
1306 * Allocate a MPTCP subflow structure.
1307 */
1308 static struct mptsub *
mptcp_subflow_alloc(void)1309 mptcp_subflow_alloc(void)
1310 {
1311 return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1312 }
1313
1314 /*
1315 * Deallocate a subflow structure, called when all of the references held
1316 * on it have been released. This implies that the subflow has been deleted.
1317 */
1318 static void
mptcp_subflow_free(struct mptsub * mpts)1319 mptcp_subflow_free(struct mptsub *mpts)
1320 {
1321 VERIFY(mpts->mpts_refcnt == 0);
1322 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1323 VERIFY(mpts->mpts_mpte == NULL);
1324 VERIFY(mpts->mpts_socket == NULL);
1325
1326 free_sockaddr(mpts->mpts_src);
1327
1328 zfree(mptsub_zone, mpts);
1329 }
1330
/*
 * Take an additional reference on a subflow structure; panics on
 * reference count wraparound.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1339
1340 static void
mptcp_subflow_remref(struct mptsub * mpts)1341 mptcp_subflow_remref(struct mptsub *mpts)
1342 {
1343 if (mpts->mpts_refcnt == 0) {
1344 panic("%s: mpts %p negative refcnt", __func__, mpts);
1345 /* NOTREACHED */
1346 }
1347 if (--mpts->mpts_refcnt > 0) {
1348 return;
1349 }
1350
1351 /* callee will unlock and destroy lock */
1352 mptcp_subflow_free(mpts);
1353 }
1354
/*
 * Link a newly created subflow socket 'so' into the MPTCP session:
 * cross-wire the TCP PCB, the subflow entry and the session, and take
 * the references representing those links.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;  /* hold a usecount on the MPTCP socket for this subflow */

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts); /* for subflow socket */
}
1384
/*
 * NECP callback installed on a subflow's inpcb. When the flow becomes
 * non-viable (a low-power interface is treated as non-viable too), mark
 * the subflow for closure and schedule the subflow-maintenance worker.
 * For handover-style service types we report the connection as still
 * viable, since another subflow can take over the traffic.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* Treat a low-power interface like a non-viable flow. */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	/* Only non-viability is of interest here. */
	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* mptcp_remove_subflows() will act on this flag and kill the subflow. */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1439
1440 /*
1441 * Create an MPTCP subflow socket.
1442 */
1443 static int
mptcp_subflow_socreate(struct mptses * mpte,struct mptsub * mpts,int dom,struct socket ** so)1444 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1445 struct socket **so)
1446 {
1447 lck_mtx_t *subflow_mtx;
1448 struct mptopt smpo, *mpo, *tmpo;
1449 struct proc *p;
1450 struct socket *mp_so;
1451 struct mppcb *mpp;
1452 int error;
1453
1454 *so = NULL;
1455
1456 mp_so = mptetoso(mpte);
1457 mpp = mpsotomppcb(mp_so);
1458
1459 p = proc_find(mp_so->last_pid);
1460 if (p == PROC_NULL) {
1461 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1462 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1463
1464 mptcp_subflow_free(mpts);
1465 return ESRCH;
1466 }
1467
1468 /*
1469 * Create the subflow socket (multipath subflow, non-blocking.)
1470 *
1471 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1472 * socket; it will be cleared when the socket is peeled off or closed.
1473 * It also indicates to the underlying TCP to handle MPTCP options.
1474 * A multipath subflow socket implies SS_NOFDREF state.
1475 */
1476
1477 /*
1478 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1479 * the ipi-lock. We cannot hold the socket-lock at that point.
1480 */
1481 socket_unlock(mp_so, 0);
1482 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1483 SOCF_MPTCP, PROC_NULL);
1484 socket_lock(mp_so, 0);
1485 if (error) {
1486 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1487 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1488
1489 proc_rele(p);
1490
1491 mptcp_subflow_free(mpts);
1492 return error;
1493 }
1494
1495 /*
1496 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1497 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1498 * Which is why we also need to get the lock with pr_getlock, as after
1499 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1500 */
1501 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1502 lck_mtx_lock(subflow_mtx);
1503
1504 /*
1505 * Must be the first thing we do, to make sure all pointers for this
1506 * subflow are set.
1507 */
1508 mptcp_subflow_attach(mpte, mpts, *so);
1509
1510 /*
1511 * A multipath subflow socket is used internally in the kernel,
1512 * therefore it does not have a file desciptor associated by
1513 * default.
1514 */
1515 (*so)->so_state |= SS_NOFDREF;
1516
1517 lck_mtx_unlock(subflow_mtx);
1518
1519 /* prevent the socket buffers from being compressed */
1520 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1521 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1522
1523 /* Inherit preconnect and TFO data flags */
1524 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1525 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1526 }
1527 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1528 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1529 }
1530 if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1531 (*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1532 }
1533
1534 /* Inherit uuid and create the related flow. */
1535 if (!uuid_is_null(mpp->necp_client_uuid)) {
1536 struct mptcb *mp_tp = mpte->mpte_mptcb;
1537
1538 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1539
1540 /*
1541 * A note on the unlock: With MPTCP, we do multiple times a
1542 * necp_client_register_socket_flow. This is problematic,
1543 * because now the lock-ordering guarantee (first necp-locks,
1544 * then socket-locks) is no more respected. So, we need to
1545 * unlock here.
1546 */
1547 socket_unlock(mp_so, 0);
1548 error = necp_client_register_socket_flow(mp_so->last_pid,
1549 mpp->necp_client_uuid, sotoinpcb(*so));
1550 socket_lock(mp_so, 0);
1551
1552 if (error) {
1553 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1554 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1555
1556 goto out_err;
1557 }
1558
1559 /* Possible state-change during the unlock above */
1560 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1561 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1562 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1563 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1564 mp_tp->mpt_state, mp_tp->mpt_flags);
1565
1566 error = EINVAL;
1567 goto out_err;
1568 }
1569
1570 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
1571 }
1572
1573 if (mpp->inp_necp_attributes.inp_domain != NULL) {
1574 size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
1575 sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1576
1577 if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
1578 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
1579 }
1580 }
1581 if (mpp->inp_necp_attributes.inp_account != NULL) {
1582 size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
1583 sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1584
1585 if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
1586 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
1587 }
1588 }
1589
1590 if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
1591 size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
1592 sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1593
1594 if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
1595 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
1596 }
1597 }
1598
1599 if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
1600 size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
1601 sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1602
1603 if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
1604 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
1605 }
1606 }
1607
1608 /* Needs to happen prior to the delegation! */
1609 (*so)->last_pid = mp_so->last_pid;
1610
1611 if (mp_so->so_flags & SOF_DELEGATED) {
1612 if (mpte->mpte_epid) {
1613 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1614 if (error) {
1615 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1616 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1617 goto out_err;
1618 }
1619 }
1620 if (!uuid_is_null(mpte->mpte_euuid)) {
1621 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1622 if (error) {
1623 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1624 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1625 goto out_err;
1626 }
1627 }
1628 }
1629
1630 /* inherit the other socket options */
1631 bzero(&smpo, sizeof(smpo));
1632 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1633 smpo.mpo_level = SOL_SOCKET;
1634 smpo.mpo_intval = 1;
1635
1636 /* disable SIGPIPE */
1637 smpo.mpo_name = SO_NOSIGPIPE;
1638 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1639 goto out_err;
1640 }
1641
1642 /* find out if the subflow's source address goes away */
1643 smpo.mpo_name = SO_NOADDRERR;
1644 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1645 goto out_err;
1646 }
1647
1648 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1649 /*
1650 * On secondary subflows we might need to set the cell-fallback
1651 * flag (see conditions in mptcp_subflow_sosetopt).
1652 */
1653 smpo.mpo_level = SOL_SOCKET;
1654 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1655 smpo.mpo_intval = 1;
1656 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1657 goto out_err;
1658 }
1659 }
1660
1661 /* replay setsockopt(2) on the subflow sockets for eligible options */
1662 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1663 int interim;
1664
1665 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1666 continue;
1667 }
1668
1669 /*
1670 * Skip those that are handled internally; these options
1671 * should not have been recorded and marked with the
1672 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1673 */
1674 if (mpo->mpo_level == SOL_SOCKET &&
1675 (mpo->mpo_name == SO_NOSIGPIPE ||
1676 mpo->mpo_name == SO_NOADDRERR ||
1677 mpo->mpo_name == SO_KEEPALIVE)) {
1678 continue;
1679 }
1680
1681 interim = (mpo->mpo_flags & MPOF_INTERIM);
1682 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1683 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1684 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1685 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1686 mpo->mpo_intval);
1687 mptcp_sopt_remove(mpte, mpo);
1688 mptcp_sopt_free(mpo);
1689 continue;
1690 }
1691 }
1692
1693 /*
1694 * We need to receive everything that the subflow socket has,
1695 * so use a customized socket receive function. We will undo
1696 * this when the socket is peeled off or closed.
1697 */
1698 switch (dom) {
1699 case PF_INET:
1700 (*so)->so_proto = &mptcp_subflow_protosw;
1701 break;
1702 case PF_INET6:
1703 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1704 break;
1705 default:
1706 VERIFY(0);
1707 /* NOTREACHED */
1708 }
1709
1710 proc_rele(p);
1711
1712 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1713 int, dom, int, error);
1714
1715 return 0;
1716
1717 out_err:
1718 mptcp_subflow_abort(mpts, error);
1719
1720 proc_rele(p);
1721
1722 return error;
1723 }
1724
1725 /*
1726 * Close an MPTCP subflow socket.
1727 *
1728 * Note that this may be called on an embryonic subflow, and the only
1729 * thing that is guaranteed valid is the protocol-user request.
1730 */
1731 static void
mptcp_subflow_soclose(struct mptsub * mpts)1732 mptcp_subflow_soclose(struct mptsub *mpts)
1733 {
1734 struct socket *so = mpts->mpts_socket;
1735
1736 if (mpts->mpts_flags & MPTSF_CLOSED) {
1737 return;
1738 }
1739
1740 VERIFY(so != NULL);
1741 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1742 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1743
1744 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1745 struct socket *, so,
1746 struct sockbuf *, &so->so_rcv,
1747 struct sockbuf *, &so->so_snd,
1748 struct mptses *, mpts->mpts_mpte);
1749
1750 mpts->mpts_flags |= MPTSF_CLOSED;
1751
1752 if (so->so_retaincnt == 0) {
1753 soclose_locked(so);
1754
1755 return;
1756 } else {
1757 VERIFY(so->so_usecount > 0);
1758 so->so_usecount--;
1759 }
1760
1761 return;
1762 }
1763
1764 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1765 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1766 {
1767 struct tcpcb *tp = sototcpcb(so);
1768 struct mptcp_subf_auth_entry *sauth_entry;
1769
1770 /*
1771 * The address ID of the first flow is implicitly 0.
1772 */
1773 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1774 tp->t_local_aid = 0;
1775 } else {
1776 tp->t_local_aid = addr_id;
1777 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1778 so->so_flags |= SOF_MP_SEC_SUBFLOW;
1779 }
1780 sauth_entry = zalloc(mpt_subauth_zone);
1781 sauth_entry->msae_laddr_id = tp->t_local_aid;
1782 sauth_entry->msae_raddr_id = 0;
1783 sauth_entry->msae_raddr_rand = 0;
1784 try_again:
1785 sauth_entry->msae_laddr_rand = RandomULong();
1786 if (sauth_entry->msae_laddr_rand == 0) {
1787 goto try_again;
1788 }
1789 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1790 }
1791
1792 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1793 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1794 {
1795 struct mptcp_subf_auth_entry *sauth_entry;
1796 struct tcpcb *tp = NULL;
1797 int found = 0;
1798
1799 tp = sototcpcb(so);
1800 if (tp == NULL) {
1801 return;
1802 }
1803
1804 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1805 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1806 found = 1;
1807 break;
1808 }
1809 }
1810 if (found) {
1811 LIST_REMOVE(sauth_entry, msae_next);
1812 }
1813
1814 if (found) {
1815 zfree(mpt_subauth_zone, sauth_entry);
1816 }
1817 }
1818
1819 /*
1820 * Connect an MPTCP subflow socket.
1821 *
1822 * Note that in the pending connect case, the subflow socket may have been
1823 * bound to an interface and/or a source IP address which may no longer be
1824 * around by the time this routine is called; in that case the connect attempt
1825 * will most likely fail.
1826 */
1827 static int
mptcp_subflow_soconnectx(struct mptses * mpte,struct mptsub * mpts)1828 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1829 {
1830 char dbuf[MAX_IPv6_STR_LEN];
1831 struct socket *mp_so, *so;
1832 struct mptcb *mp_tp;
1833 struct sockaddr *dst;
1834 struct proc *p;
1835 int af, error, dport;
1836
1837 mp_so = mptetoso(mpte);
1838 mp_tp = mpte->mpte_mptcb;
1839 so = mpts->mpts_socket;
1840 af = mpts->mpts_dst.sa_family;
1841 dst = &mpts->mpts_dst;
1842
1843 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1844 VERIFY(mpts->mpts_socket != NULL);
1845 VERIFY(af == AF_INET || af == AF_INET6);
1846
1847 if (af == AF_INET) {
1848 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1849 dport = ntohs(SIN(dst)->sin_port);
1850 } else {
1851 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1852 dport = ntohs(SIN6(dst)->sin6_port);
1853 }
1854
1855 os_log(mptcp_log_handle,
1856 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1857 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1858
1859 p = proc_find(mp_so->last_pid);
1860 if (p == PROC_NULL) {
1861 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1862 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1863
1864 return ESRCH;
1865 }
1866
1867 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1868
1869 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1870
1871 /* connect the subflow socket */
1872 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1873 p, mpts->mpts_ifscope,
1874 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1875
1876 mpts->mpts_iss = sototcpcb(so)->iss;
1877
1878 /* See tcp_connect_complete */
1879 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1880 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1881 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1882 }
1883
1884 /* Allocate a unique address id per subflow */
1885 mpte->mpte_addrid_last++;
1886 if (mpte->mpte_addrid_last == 0) {
1887 mpte->mpte_addrid_last++;
1888 }
1889
1890 proc_rele(p);
1891
1892 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1893 struct mptsub *, mpts, int, error);
1894 if (error) {
1895 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1896 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1897 }
1898
1899 return error;
1900 }
1901
/*
 * Stamp an mbuf received on a subflow with its MPTCP receive mapping.
 * 'dsn'/'rseq'/'dlen'/'dfin' describe the current DSS mapping, and 'off'
 * is this mbuf's offset within that mapping. If the mbuf extends past the
 * mapping's right edge it is split in place inside the receive buffer.
 * Returns 0 on success; returns -1 and requests a subflow reset
 * (SO_FILT_HINT_MUSTRST) when a contradictory second mapping is seen or
 * the split fails.
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Nothing to map onto an empty packet. */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	/* Mapping lives in the pkthdr; without one there is nothing to do. */
	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * Already stamped. In the middle of a mapping (off != 0) the
		 * new values must agree with what was recorded before -
		 * otherwise the peer sent an inconsistent second mapping.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-link the tail into the receive buffer's accounting. */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp the (possibly trimmed) mbuf with its offset-adjusted mapping. */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* Data carrying a valid mapping: the subflow is fully established. */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1976
1977 /*
1978 * Update the pid, upid, uuid of the subflow so, based on parent so
1979 */
1980 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1981 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1982 {
1983 if (so->last_pid != mp_so->last_pid ||
1984 so->last_upid != mp_so->last_upid) {
1985 so->last_upid = mp_so->last_upid;
1986 so->last_pid = mp_so->last_pid;
1987 uuid_copy(so->last_uuid, mp_so->last_uuid);
1988 }
1989 so_update_policy(so);
1990 }
1991
1992 /*
1993 * MPTCP subflow socket receive routine, derived from soreceive().
1994 */
1995 static int
mptcp_subflow_soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)1996 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1997 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1998 {
1999 #pragma unused(uio)
2000 struct socket *mp_so;
2001 struct mptses *mpte;
2002 struct mptcb *mp_tp;
2003 int flags, error = 0;
2004 struct mbuf *m, **mp = mp0;
2005
2006 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
2007 mp_so = mptetoso(mpte);
2008 mp_tp = mpte->mpte_mptcb;
2009
2010 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2011
2012 #ifdef MORE_LOCKING_DEBUG
2013 if (so->so_usecount == 1) {
2014 panic("%s: so=%x no other reference on socket", __func__, so);
2015 /* NOTREACHED */
2016 }
2017 #endif
2018 /*
2019 * We return all that is there in the subflow's socket receive buffer
2020 * to the MPTCP layer, so we require that the caller passes in the
2021 * expected parameters.
2022 */
2023 if (mp == NULL || controlp != NULL) {
2024 return EINVAL;
2025 }
2026
2027 *mp = NULL;
2028 if (psa != NULL) {
2029 *psa = NULL;
2030 }
2031 if (flagsp != NULL) {
2032 flags = *flagsp & ~MSG_EOR;
2033 } else {
2034 flags = 0;
2035 }
2036
2037 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2038 return EOPNOTSUPP;
2039 }
2040
2041 flags |= (MSG_DONTWAIT | MSG_NBIO);
2042
2043 /*
2044 * If a recv attempt is made on a previously-accepted socket
2045 * that has been marked as inactive (disconnected), reject
2046 * the request.
2047 */
2048 if (so->so_flags & SOF_DEFUNCT) {
2049 struct sockbuf *sb = &so->so_rcv;
2050
2051 error = ENOTCONN;
2052 /*
2053 * This socket should have been disconnected and flushed
2054 * prior to being returned from sodefunct(); there should
2055 * be no data on its receive list, so panic otherwise.
2056 */
2057 if (so->so_state & SS_DEFUNCT) {
2058 sb_empty_assert(sb, __func__);
2059 }
2060 return error;
2061 }
2062
2063 /*
2064 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2065 * and if so just return to the caller. This could happen when
2066 * soreceive() is called by a socket upcall function during the
2067 * time the socket is freed. The socket buffer would have been
2068 * locked across the upcall, therefore we cannot put this thread
2069 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2070 * we may livelock), because the lock on the socket buffer will
2071 * only be released when the upcall routine returns to its caller.
2072 * Because the socket has been officially closed, there can be
2073 * no further read on it.
2074 *
2075 * A multipath subflow socket would have its SS_NOFDREF set by
2076 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2077 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2078 */
2079 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2080 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2081 return 0;
2082 }
2083
2084 /*
2085 * For consistency with soreceive() semantics, we need to obey
2086 * SB_LOCK in case some other code path has locked the buffer.
2087 */
2088 error = sblock(&so->so_rcv, 0);
2089 if (error != 0) {
2090 return error;
2091 }
2092
2093 m = so->so_rcv.sb_mb;
2094 if (m == NULL) {
2095 /*
2096 * Panic if we notice inconsistencies in the socket's
2097 * receive list; both sb_mb and sb_cc should correctly
2098 * reflect the contents of the list, otherwise we may
2099 * end up with false positives during select() or poll()
2100 * which could put the application in a bad state.
2101 */
2102 SB_MB_CHECK(&so->so_rcv);
2103
2104 if (so->so_error != 0) {
2105 error = so->so_error;
2106 so->so_error = 0;
2107 goto release;
2108 }
2109
2110 if (so->so_state & SS_CANTRCVMORE) {
2111 goto release;
2112 }
2113
2114 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2115 error = ENOTCONN;
2116 goto release;
2117 }
2118
2119 /*
2120 * MSG_DONTWAIT is implicitly defined and this routine will
2121 * never block, so return EWOULDBLOCK when there is nothing.
2122 */
2123 error = EWOULDBLOCK;
2124 goto release;
2125 }
2126
2127 mptcp_update_last_owner(so, mp_so);
2128
2129 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2130 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2131
2132 while (m != NULL) {
2133 int dlen = 0, error_out = 0, off = 0;
2134 uint8_t dfin = 0;
2135 struct mbuf *start = m;
2136 uint64_t dsn;
2137 uint32_t sseq;
2138 uint16_t orig_dlen;
2139 uint16_t csum;
2140
2141 VERIFY(m->m_nextpkt == NULL);
2142
2143 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2144 fallback:
2145 /* Just move mbuf to MPTCP-level */
2146
2147 sbfree(&so->so_rcv, m);
2148
2149 if (mp != NULL) {
2150 *mp = m;
2151 mp = &m->m_next;
2152 so->so_rcv.sb_mb = m = m->m_next;
2153 *mp = NULL;
2154 }
2155
2156 if (m != NULL) {
2157 so->so_rcv.sb_lastrecord = m;
2158 } else {
2159 SB_EMPTY_FIXUP(&so->so_rcv);
2160 }
2161
2162 continue;
2163 } else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2164 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
2165 boolean_t found_mapping = false;
2166 int parsed_length = 0;
2167 struct mbuf *m_iter;
2168
2169 /*
2170 * No MPTCP-option in the header. Either fallback or
2171 * wait for additional mappings.
2172 */
2173 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
2174 /* data arrived without a DSS option mapping */
2175
2176 /* initial subflow can fallback right after SYN handshake */
2177 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
2178 mptcp_notify_mpfail(so);
2179
2180 goto fallback;
2181 } else {
2182 os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
2183 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2184 mpts->mpts_connid);
2185 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2186
2187 error = EIO;
2188 *mp0 = NULL;
2189 goto release;
2190 }
2191 }
2192
2193 /* Thus, let's look for an mbuf with the mapping */
2194 m_iter = m->m_next;
2195 parsed_length = m->m_len;
2196 while (m_iter != NULL && parsed_length < UINT16_MAX) {
2197 if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2198 parsed_length += m_iter->m_len;
2199 m_iter = m_iter->m_next;
2200 continue;
2201 }
2202
2203 found_mapping = true;
2204
2205 /* Found an mbuf with a DSS-mapping */
2206 orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
2207 dsn = m_iter->m_pkthdr.mp_dsn;
2208 sseq = m_iter->m_pkthdr.mp_rseq;
2209 csum = m_iter->m_pkthdr.mp_csum;
2210
2211 if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2212 dfin = 1;
2213 dlen--;
2214 }
2215
2216 break;
2217 }
2218
2219 if (!found_mapping && parsed_length < UINT16_MAX) {
2220 /* Mapping not yet present, we can wait! */
2221 if (*mp0 == NULL) {
2222 error = EWOULDBLOCK;
2223 }
2224 goto release;
2225 } else if (!found_mapping && parsed_length >= UINT16_MAX) {
2226 os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
2227 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2228 mpts->mpts_connid);
2229 /* Received 64KB without DSS-mapping. We should kill the subflow */
2230 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2231
2232 error = EIO;
2233 *mp0 = NULL;
2234 goto release;
2235 }
2236 } else {
2237 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2238 dsn = m->m_pkthdr.mp_dsn;
2239 sseq = m->m_pkthdr.mp_rseq;
2240 csum = m->m_pkthdr.mp_csum;
2241
2242 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2243 dfin = 1;
2244 dlen--;
2245 }
2246 }
2247
2248 /*
2249 * Check if the full mapping is now present
2250 */
2251 if ((int)so->so_rcv.sb_cc < dlen) {
2252 if (*mp0 == NULL) {
2253 error = EWOULDBLOCK;
2254 }
2255 goto release;
2256 }
2257
2258 /* Now, get the full mapping */
2259 off = 0;
2260 while (dlen > 0) {
2261 if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
2262 error_out = 1;
2263 error = EIO;
2264 dlen = 0;
2265 *mp0 = NULL;
2266 break;
2267 }
2268
2269 dlen -= m->m_len;
2270 off += m->m_len;
2271 sbfree(&so->so_rcv, m);
2272
2273 if (mp != NULL) {
2274 *mp = m;
2275 mp = &m->m_next;
2276 so->so_rcv.sb_mb = m = m->m_next;
2277 *mp = NULL;
2278 }
2279
2280 ASSERT(dlen == 0 || m);
2281 if (dlen != 0 && m == NULL) {
2282 /* "try" to gracefully recover on customer builds */
2283 error_out = 1;
2284 error = EIO;
2285 dlen = 0;
2286
2287 *mp0 = NULL;
2288
2289 SB_EMPTY_FIXUP(&so->so_rcv);
2290 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2291
2292 break;
2293 }
2294 }
2295
2296 VERIFY(dlen == 0);
2297
2298 if (m != NULL) {
2299 so->so_rcv.sb_lastrecord = m;
2300 } else {
2301 SB_EMPTY_FIXUP(&so->so_rcv);
2302 }
2303
2304 if (error_out) {
2305 goto release;
2306 }
2307
2308 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2309 error = EIO;
2310 *mp0 = NULL;
2311 goto release;
2312 }
2313
2314 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2315 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2316 }
2317
2318 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2319 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2320
2321 if (flagsp != NULL) {
2322 *flagsp |= flags;
2323 }
2324
2325 release:
2326 sbunlock(&so->so_rcv, TRUE);
2327
2328 return error;
2329 }
2330
2331 /*
2332 * MPTCP subflow socket send routine, derived from sosend().
2333 */
2334 static int
mptcp_subflow_sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)2335 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2336 struct mbuf *top, struct mbuf *control, int flags)
2337 {
2338 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2339 boolean_t en_tracing = FALSE, proc_held = FALSE;
2340 struct proc *p = current_proc();
2341 int en_tracing_val;
2342 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2343 int error;
2344
2345 VERIFY(control == NULL);
2346 VERIFY(addr == NULL);
2347 VERIFY(uio == NULL);
2348 VERIFY(flags == 0);
2349 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2350
2351 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2352 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2353
2354 /*
2355 * trace if tracing & network (vs. unix) sockets & and
2356 * non-loopback
2357 */
2358 if (ENTR_SHOULDTRACE &&
2359 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2360 struct inpcb *inp = sotoinpcb(so);
2361 if (inp->inp_last_outifp != NULL &&
2362 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2363 en_tracing = TRUE;
2364 en_tracing_val = top->m_pkthdr.len;
2365 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2366 (unsigned long)VM_KERNEL_ADDRPERM(so),
2367 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2368 (int64_t)en_tracing_val);
2369 }
2370 }
2371
2372 mptcp_update_last_owner(so, mp_so);
2373
2374 if (mp_so->last_pid != proc_pid(p)) {
2375 p = proc_find(mp_so->last_pid);
2376 if (p == PROC_NULL) {
2377 p = current_proc();
2378 } else {
2379 proc_held = TRUE;
2380 }
2381 }
2382
2383 #if NECP
2384 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2385 #endif /* NECP */
2386
2387 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
2388 if (error) {
2389 goto out;
2390 }
2391
2392 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2393 top = NULL;
2394
2395 out:
2396 if (top != NULL) {
2397 m_freem(top);
2398 }
2399
2400 if (proc_held) {
2401 proc_rele(p);
2402 }
2403
2404 soclearfastopen(so);
2405
2406 if (en_tracing) {
2407 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2408 (unsigned long)VM_KERNEL_ADDRPERM(so),
2409 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2410 (int64_t)en_tracing_val);
2411 }
2412
2413 return error;
2414 }
2415
2416 /*
2417 * Subflow socket write upcall.
2418 *
2419 * Called when the associated subflow socket posted a read event.
2420 */
2421 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2422 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2423 {
2424 #pragma unused(so, waitf)
2425 struct mptsub *mpts = arg;
2426 struct mptses *mpte = mpts->mpts_mpte;
2427
2428 VERIFY(mpte != NULL);
2429
2430 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2431 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2432 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2433 }
2434 return;
2435 }
2436
2437 mptcp_output(mpte);
2438 }
2439
2440 /*
2441 * Subflow socket control event upcall.
2442 */
2443 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2444 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2445 {
2446 #pragma unused(so)
2447 struct mptsub *mpts = arg;
2448 struct mptses *mpte = mpts->mpts_mpte;
2449
2450 socket_lock_assert_owned(mptetoso(mpte));
2451
2452 if ((mpts->mpts_evctl & events) == events) {
2453 return;
2454 }
2455
2456 mpts->mpts_evctl |= events;
2457
2458 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2459 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2460 return;
2461 }
2462
2463 mptcp_subflow_workloop(mpte);
2464 }
2465
2466 /*
2467 * Establish an initial MPTCP connection (if first subflow and not yet
2468 * connected), or add a subflow to an existing MPTCP connection.
2469 */
2470 int
mptcp_subflow_add(struct mptses * mpte,struct sockaddr * src,struct sockaddr * dst,uint32_t ifscope,sae_connid_t * pcid)2471 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2472 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2473 {
2474 struct socket *mp_so, *so = NULL;
2475 struct mptcb *mp_tp;
2476 struct mptsub *mpts = NULL;
2477 int af, error = 0;
2478
2479 mp_so = mptetoso(mpte);
2480 mp_tp = mpte->mpte_mptcb;
2481
2482 socket_lock_assert_owned(mp_so);
2483
2484 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2485 /* If the remote end sends Data FIN, refuse subflow adds */
2486 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2487 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2488 error = ENOTCONN;
2489 goto out_err;
2490 }
2491
2492 if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2493 error = EOVERFLOW;
2494 goto out_err;
2495 }
2496
2497 mpts = mptcp_subflow_alloc();
2498 if (mpts == NULL) {
2499 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2500 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2501 error = ENOMEM;
2502 goto out_err;
2503 }
2504
2505 if (src) {
2506 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2507 error = EAFNOSUPPORT;
2508 goto out_err;
2509 }
2510
2511 if (src->sa_family == AF_INET &&
2512 src->sa_len != sizeof(struct sockaddr_in)) {
2513 error = EINVAL;
2514 goto out_err;
2515 }
2516
2517 if (src->sa_family == AF_INET6 &&
2518 src->sa_len != sizeof(struct sockaddr_in6)) {
2519 error = EINVAL;
2520 goto out_err;
2521 }
2522
2523 mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
2524 Z_WAITOK | Z_NOFAIL);
2525
2526 bcopy(src, mpts->mpts_src, src->sa_len);
2527 }
2528
2529 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2530 error = EAFNOSUPPORT;
2531 goto out_err;
2532 }
2533
2534 if (dst->sa_family == AF_INET &&
2535 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2536 error = EINVAL;
2537 goto out_err;
2538 }
2539
2540 if (dst->sa_family == AF_INET6 &&
2541 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2542 error = EINVAL;
2543 goto out_err;
2544 }
2545
2546 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
2547
2548 af = mpts->mpts_dst.sa_family;
2549
2550 ifnet_head_lock_shared();
2551 if ((ifscope > (unsigned)if_index)) {
2552 ifnet_head_done();
2553 error = ENXIO;
2554 goto out_err;
2555 }
2556 ifnet_head_done();
2557
2558 mpts->mpts_ifscope = ifscope;
2559
2560 /* create the subflow socket */
2561 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2562 /*
2563 * Returning (error) and not cleaning up, because up to here
2564 * all we did is creating mpts.
2565 *
2566 * And the contract is that the call to mptcp_subflow_socreate,
2567 * moves ownership of mpts to mptcp_subflow_socreate.
2568 */
2569 return error;
2570 }
2571
2572 /*
2573 * We may be called from within the kernel. Still need to account this
2574 * one to the real app.
2575 */
2576 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2577
2578 /*
2579 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2580 * -1 (SAE_CONNID_ALL).
2581 */
2582 mpte->mpte_connid_last++;
2583 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2584 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2585 mpte->mpte_connid_last++;
2586 }
2587
2588 mpts->mpts_connid = mpte->mpte_connid_last;
2589
2590 mpts->mpts_rel_seq = 1;
2591
2592 /* Allocate a unique address id per subflow */
2593 mpte->mpte_addrid_last++;
2594 if (mpte->mpte_addrid_last == 0) {
2595 mpte->mpte_addrid_last++;
2596 }
2597
2598 /* register for subflow socket read/write events */
2599 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2600
2601 /* Register for subflow socket control events */
2602 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2603 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2604 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2605 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2606 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2607 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2608 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2609 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2610
2611 /* sanity check */
2612 VERIFY(!(mpts->mpts_flags &
2613 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2614
2615 /*
2616 * Indicate to the TCP subflow whether or not it should establish
2617 * the initial MPTCP connection, or join an existing one. Fill
2618 * in the connection request structure with additional info needed
2619 * by the underlying TCP (to be used in the TCP options, etc.)
2620 */
2621 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2622 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2623
2624 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2625 mptcp_init_local_parms(mpte, dst);
2626 }
2627 soisconnecting(mp_so);
2628
2629 /* If fastopen is requested, set state in mpts */
2630 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2631 mpts->mpts_flags |= MPTSF_TFO_REQD;
2632 }
2633 } else {
2634 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2635 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2636 }
2637 }
2638
2639 mpts->mpts_flags |= MPTSF_CONNECTING;
2640
2641 /* connect right away if first attempt, or if join can be done now */
2642 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2643 error = mptcp_subflow_soconnectx(mpte, mpts);
2644 }
2645
2646 if (error) {
2647 goto out_err_close;
2648 }
2649
2650 if (pcid) {
2651 *pcid = mpts->mpts_connid;
2652 }
2653
2654 return 0;
2655
2656 out_err_close:
2657 mptcp_subflow_abort(mpts, error);
2658
2659 return error;
2660
2661 out_err:
2662 if (mpts) {
2663 mptcp_subflow_free(mpts);
2664 }
2665
2666 return error;
2667 }
2668
2669 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2670 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2671 {
2672 int index = mptcpstats_get_index(stats, mpts);
2673
2674 if (index != -1) {
2675 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2676
2677 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2678 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2679
2680 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2681 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2682
2683 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2684 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2685
2686 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2687 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2688 }
2689 }
2690
2691 /*
2692 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2693 * will no longer be accessible after a subflow is deleted, thus this
2694 * should occur only after the subflow socket has been disconnected.
2695 */
2696 void
mptcp_subflow_del(struct mptses * mpte,struct mptsub * mpts)2697 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2698 {
2699 struct socket *mp_so = mptetoso(mpte);
2700 struct socket *so = mpts->mpts_socket;
2701 struct tcpcb *tp = sototcpcb(so);
2702
2703 socket_lock_assert_owned(mp_so);
2704 VERIFY(mpts->mpts_mpte == mpte);
2705 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2706 VERIFY(mpte->mpte_numflows != 0);
2707 VERIFY(mp_so->so_usecount > 0);
2708
2709 mptcpstats_update(mpte->mpte_itfstats, mpts);
2710
2711 mptcp_unset_cellicon(mpte, mpts, 1);
2712
2713 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2714 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2715
2716 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2717 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2718 mpte->mpte_numflows--;
2719 if (mpte->mpte_active_sub == mpts) {
2720 mpte->mpte_active_sub = NULL;
2721 }
2722
2723 /*
2724 * Drop references held by this subflow socket; there
2725 * will be no further upcalls made from this point.
2726 */
2727 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2728 sock_catchevents_locked(so, NULL, NULL, 0);
2729
2730 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2731
2732 mp_so->so_usecount--; /* for subflow socket */
2733 mpts->mpts_mpte = NULL;
2734 mpts->mpts_socket = NULL;
2735
2736 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2737 mptcp_subflow_remref(mpts); /* for subflow socket */
2738
2739 so->so_flags &= ~SOF_MP_SUBFLOW;
2740 tp->t_mptcb = NULL;
2741 tp->t_mpsub = NULL;
2742 }
2743
2744 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2745 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2746 {
2747 struct socket *so = mpts->mpts_socket;
2748 struct mptcb *mp_tp = mpte->mpte_mptcb;
2749 int send_dfin = 0;
2750
2751 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2752 send_dfin = 1;
2753 }
2754
2755 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2756 (so->so_state & SS_ISCONNECTED)) {
2757 if (send_dfin) {
2758 mptcp_send_dfin(so);
2759 }
2760 soshutdownlock(so, SHUT_WR);
2761 }
2762 }
2763
2764 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2765 mptcp_subflow_abort(struct mptsub *mpts, int error)
2766 {
2767 struct socket *so = mpts->mpts_socket;
2768 struct tcpcb *tp = sototcpcb(so);
2769
2770 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2771 return;
2772 }
2773
2774 if (tp->t_state != TCPS_CLOSED) {
2775 tcp_drop(tp, error);
2776 }
2777
2778 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2779 }
2780
2781 /*
2782 * Disconnect a subflow socket.
2783 */
2784 void
mptcp_subflow_disconnect(struct mptses * mpte,struct mptsub * mpts)2785 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2786 {
2787 struct socket *so, *mp_so;
2788 struct mptcb *mp_tp;
2789 int send_dfin = 0;
2790
2791 so = mpts->mpts_socket;
2792 mp_tp = mpte->mpte_mptcb;
2793 mp_so = mptetoso(mpte);
2794
2795 socket_lock_assert_owned(mp_so);
2796
2797 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2798 return;
2799 }
2800
2801 mptcp_unset_cellicon(mpte, mpts, 1);
2802
2803 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2804
2805 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2806 send_dfin = 1;
2807 }
2808
2809 if (mp_so->so_flags & SOF_DEFUNCT) {
2810 errno_t ret;
2811
2812 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2813 if (ret == 0) {
2814 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2815
2816 if (ret != 0) {
2817 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2818 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2819 }
2820 } else {
2821 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2822 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2823 }
2824 }
2825
2826 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2827 (so->so_state & SS_ISCONNECTED)) {
2828 if (send_dfin) {
2829 mptcp_send_dfin(so);
2830 }
2831
2832 (void) soshutdownlock(so, SHUT_RD);
2833 (void) soshutdownlock(so, SHUT_WR);
2834 (void) sodisconnectlocked(so);
2835 }
2836
2837 /*
2838 * Generate a disconnect event for this subflow socket, in case
2839 * the lower layer doesn't do it; this is needed because the
2840 * subflow socket deletion relies on it.
2841 */
2842 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2843 }
2844
2845 /*
2846 * Subflow socket input.
2847 */
2848 static void
mptcp_subflow_input(struct mptses * mpte,struct mptsub * mpts)2849 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2850 {
2851 struct socket *mp_so = mptetoso(mpte);
2852 struct mbuf *m = NULL;
2853 struct socket *so;
2854 int error, wakeup = 0;
2855
2856 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2857 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2858
2859 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2860 struct mptsub *, mpts);
2861
2862 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2863 goto out;
2864 }
2865
2866 so = mpts->mpts_socket;
2867
2868 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2869 if (error != 0 && error != EWOULDBLOCK) {
2870 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2871 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2872 if (error == ENODATA) {
2873 /*
2874 * Don't ignore ENODATA so as to discover
2875 * nasty middleboxes.
2876 */
2877 mp_so->so_error = ENODATA;
2878
2879 wakeup = 1;
2880 goto out;
2881 }
2882 }
2883
2884 /* In fallback, make sure to accept data on all but one subflow */
2885 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2886 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2887 m_freem(m);
2888 goto out;
2889 }
2890
2891 if (m != NULL) {
2892 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2893 mptcp_set_cellicon(mpte, mpts);
2894
2895 mpte->mpte_used_cell = 1;
2896 } else {
2897 /*
2898 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2899 * explicitly set the cellicon, then we unset it again.
2900 */
2901 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2902 mptcp_unset_cellicon(mpte, NULL, 1);
2903 }
2904
2905 mpte->mpte_used_wifi = 1;
2906 }
2907
2908 mptcp_input(mpte, m);
2909 }
2910
2911 out:
2912 if (wakeup) {
2913 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2914 }
2915
2916 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2917 }
2918
/*
 * Entry point for subflow read events: iterates over all subflows of the
 * session and runs mptcp_subflow_input() on each, holding an extra
 * subflow ref and so_usecount across the call so the subflow cannot
 * vanish mid-iteration.  Defers to a later wakeup when upcalls are
 * currently suppressed.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Only subflow sockets are of interest here */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		/* Don't re-arm the wakeup while input handling is in progress */
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Pin the subflow and its socket for the duration of the call */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts); /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2961
2962 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)2963 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2964 {
2965 struct mbuf *so_m = so->so_snd.sb_mb;
2966 uint64_t dsn = m->m_pkthdr.mp_dsn;
2967
2968 while (so_m) {
2969 VERIFY(so_m->m_flags & M_PKTHDR);
2970 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2971
2972 /* Part of the segment is covered, don't reinject here */
2973 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2974 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2975 return TRUE;
2976 }
2977
2978 so_m = so_m->m_next;
2979 }
2980
2981 return FALSE;
2982 }
2983
2984 /*
2985 * Subflow socket output.
2986 *
2987 * Called for sending data from MPTCP to the underlying subflow socket.
2988 */
2989 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)2990 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2991 {
2992 struct mptcb *mp_tp = mpte->mpte_mptcb;
2993 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
2994 struct socket *mp_so, *so;
2995 struct tcpcb *tp;
2996 uint64_t mpt_dsn = 0, off = 0;
2997 int sb_cc = 0, error = 0, wakeup = 0;
2998 uint16_t dss_csum;
2999 uint16_t tot_sent = 0;
3000 boolean_t reinjected = FALSE;
3001
3002 mp_so = mptetoso(mpte);
3003 so = mpts->mpts_socket;
3004 tp = sototcpcb(so);
3005
3006 socket_lock_assert_owned(mp_so);
3007
3008 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3009 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3010
3011 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3012 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3013 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3014 (mpts->mpts_flags & MPTSF_TFO_REQD));
3015 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3016
3017 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3018 struct mptsub *, mpts);
3019
3020 /* Remove Addr Option is not sent reliably as per I-D */
3021 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3022 tp->t_rem_aid = mpte->mpte_lost_aid;
3023 tp->t_mpflags |= TMPF_SND_REM_ADDR;
3024 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3025 }
3026
3027 /*
3028 * The mbuf chains containing the metadata (as well as pointing to
3029 * the user data sitting at the MPTCP output queue) would then be
3030 * sent down to the subflow socket.
3031 *
3032 * Some notes on data sequencing:
3033 *
3034 * a. Each mbuf must be a M_PKTHDR.
3035 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3036 * in the mbuf pkthdr structure.
3037 * c. Each mbuf containing the MPTCP metadata must have its
3038 * pkt_flags marked with the PKTF_MPTCP flag.
3039 */
3040
3041 if (mpte->mpte_reinjectq) {
3042 sb_mb = mpte->mpte_reinjectq;
3043 } else {
3044 sb_mb = mp_so->so_snd.sb_mb;
3045 }
3046
3047 if (sb_mb == NULL) {
3048 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3049 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3050 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3051 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3052
3053 /* Fix it to prevent looping */
3054 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3055 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3056 }
3057 goto out;
3058 }
3059
3060 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3061
3062 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3063 !(so->so_state & SS_ISCONNECTED) &&
3064 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3065 tp->t_mpflags |= TMPF_TFO_REQUEST;
3066
3067 /* Opting to call pru_send as no mbuf at subflow level */
3068 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3069 NULL, current_proc());
3070
3071 goto done_sending;
3072 }
3073
3074 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3075
3076 /* First, drop acknowledged data */
3077 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3078 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3079 "dsn %u suna %u reinject? %u\n",
3080 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3081 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3082 if (mpte->mpte_reinjectq) {
3083 mptcp_clean_reinjectq(mpte);
3084 } else {
3085 uint64_t len = 0;
3086 len = mp_tp->mpt_snduna - mpt_dsn;
3087 sbdrop(&mp_so->so_snd, (int)len);
3088 wakeup = 1;
3089 }
3090 }
3091
3092 /* Check again because of above sbdrop */
3093 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3094 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3095 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3096 goto out;
3097 }
3098
3099 /*
3100 * In degraded mode, we don't receive data acks, so force free
3101 * mbufs less than snd_nxt
3102 */
3103 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3104 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3105 mp_so->so_snd.sb_mb) {
3106 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3107 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3108 uint64_t len = 0;
3109 len = mp_tp->mpt_snduna - mpt_dsn;
3110 sbdrop(&mp_so->so_snd, (int)len);
3111 wakeup = 1;
3112
3113 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3114 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3115 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3116 }
3117 }
3118
3119 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3120 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3121 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3122 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3123 }
3124
3125 /*
3126 * Adjust the top level notion of next byte used for retransmissions
3127 * and sending FINs.
3128 */
3129 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3130 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3131 }
3132
3133 /* Now determine the offset from which to start transmitting data */
3134 if (mpte->mpte_reinjectq) {
3135 sb_mb = mpte->mpte_reinjectq;
3136 } else {
3137 dont_reinject:
3138 sb_mb = mp_so->so_snd.sb_mb;
3139 }
3140 if (sb_mb == NULL) {
3141 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3142 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3143 goto out;
3144 }
3145
3146 if (sb_mb == mpte->mpte_reinjectq) {
3147 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3148 off = 0;
3149
3150 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3151 if (mptcp_can_send_more(mp_tp, TRUE)) {
3152 goto dont_reinject;
3153 }
3154
3155 error = ECANCELED;
3156 goto out;
3157 }
3158
3159 reinjected = TRUE;
3160 } else if (flags & MPTCP_SUBOUT_PROBING) {
3161 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3162 off = 0;
3163 } else {
3164 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3165
3166 /*
3167 * With TFO, there might be no data at all, thus still go into this
3168 * code-path here.
3169 */
3170 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3171 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3172 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3173 sb_cc -= off;
3174 } else {
3175 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3176 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3177 (uint32_t)mp_tp->mpt_sndmax);
3178
3179 goto out;
3180 }
3181 }
3182
3183 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3184 if (sb_cc <= 0) {
3185 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3186 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3187 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3188 mptcp_subflow_cwnd_space(so));
3189 }
3190
3191 sb_cc = min(sb_cc, UINT16_MAX);
3192
3193 /*
3194 * Create a DSN mapping for the data we are about to send. It all
3195 * has the same mapping.
3196 */
3197 if (reinjected) {
3198 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3199 } else {
3200 mpt_dsn = mp_tp->mpt_snduna + off;
3201 }
3202
3203 mpt_mbuf = sb_mb;
3204 while (mpt_mbuf && reinjected == FALSE &&
3205 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3206 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3207 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3208 mpt_mbuf = mpt_mbuf->m_next;
3209 }
3210 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3211
3212 head = tail = NULL;
3213
3214 while (tot_sent < sb_cc) {
3215 int32_t mlen;
3216
3217 mlen = mpt_mbuf->m_len;
3218 mlen -= off;
3219 mlen = MIN(mlen, sb_cc - tot_sent);
3220
3221 if (mlen < 0) {
3222 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3223 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3224 (uint32_t)off, sb_cc, tot_sent);
3225 goto out;
3226 }
3227
3228 if (mlen == 0) {
3229 goto next;
3230 }
3231
3232 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3233 M_COPYM_MUST_COPY_HDR);
3234 if (m == NULL) {
3235 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3236 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3237 error = ENOBUFS;
3238 break;
3239 }
3240
3241 /* Create a DSN mapping for the data (m_copym does it) */
3242 VERIFY(m->m_flags & M_PKTHDR);
3243 VERIFY(m->m_next == NULL);
3244
3245 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3246 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3247 m->m_pkthdr.mp_dsn = mpt_dsn;
3248 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3249 m->m_pkthdr.len = mlen;
3250
3251 if (head == NULL) {
3252 head = tail = m;
3253 } else {
3254 tail->m_next = m;
3255 tail = m;
3256 }
3257
3258 tot_sent += mlen;
3259 off = 0;
3260 next:
3261 mpt_mbuf = mpt_mbuf->m_next;
3262 }
3263
3264 if (reinjected) {
3265 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3266 struct mbuf *n = sb_mb;
3267
3268 while (n) {
3269 n->m_pkthdr.mp_dsn += sb_cc;
3270 n->m_pkthdr.mp_rlen -= sb_cc;
3271 n = n->m_next;
3272 }
3273 m_adj(sb_mb, sb_cc);
3274 } else {
3275 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3276 m_freem(sb_mb);
3277 }
3278 }
3279
3280 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3281 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3282 tot_sent);
3283 }
3284
3285 /* Now, let's update rel-seq and the data-level length */
3286 mpts->mpts_rel_seq += tot_sent;
3287 m = head;
3288 while (m) {
3289 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3290 m->m_pkthdr.mp_csum = dss_csum;
3291 }
3292 m->m_pkthdr.mp_rlen = tot_sent;
3293 m = m->m_next;
3294 }
3295
3296 if (head != NULL) {
3297 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3298 (tp->t_tfo_stats == 0)) {
3299 tp->t_mpflags |= TMPF_TFO_REQUEST;
3300 }
3301
3302 error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3303 head = NULL;
3304 }
3305
3306 done_sending:
3307 if (error == 0 ||
3308 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3309 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3310
3311 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3312 tcpstat.tcps_mp_num_probes++;
3313 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3314 mpts->mpts_probecnt += 1;
3315 } else {
3316 mpts->mpts_probecnt +=
3317 tot_sent / mpts->mpts_maxseg;
3318 }
3319 }
3320
3321 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3322 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3323 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3324 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3325 }
3326 mp_tp->mpt_sndnxt = new_sndnxt;
3327 }
3328
3329 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3330
3331 /* Must be here as mptcp_can_send_more() checks for this */
3332 soclearfastopen(mp_so);
3333
3334 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3335 mptcp_set_cellicon(mpte, mpts);
3336
3337 mpte->mpte_used_cell = 1;
3338 } else {
3339 /*
3340 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3341 * explicitly set the cellicon, then we unset it again.
3342 */
3343 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3344 mptcp_unset_cellicon(mpte, NULL, 1);
3345 }
3346
3347 mpte->mpte_used_wifi = 1;
3348 }
3349
3350 /*
3351 * Don't propagate EWOULDBLOCK - it's already taken care of
3352 * in mptcp_usr_send for TFO.
3353 */
3354 error = 0;
3355 } else {
3356 /* We need to revert our change to mpts_rel_seq */
3357 mpts->mpts_rel_seq -= tot_sent;
3358
3359 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3360 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3361 }
3362 out:
3363
3364 if (head != NULL) {
3365 m_freem(head);
3366 }
3367
3368 if (wakeup) {
3369 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3370 }
3371
3372 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3373 return error;
3374 }
3375
/*
 * Insert segment m into the session's reinject queue, keeping the queue
 * sorted by data-sequence number and free of fully-overlapping entries.
 * Takes ownership of m: it is either linked into the queue or freed.
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
			break;
		}

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);

			/* Unlink n from the queue and continue the scan at its successor. */
			m->m_nextpkt = NULL;
			if (prev == NULL) {
				mpte->mpte_reinjectq = tmp;
			} else {
				prev->m_nextpkt = tmp;
			}

			m_freem(n);
			n = tmp;
		}
	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/* NOTE(review): the right-hand side uses m_pkthdr.len while every
		 * other range computation in this function uses mp_rlen — confirm
		 * this asymmetry is intentional. */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
			goto dont_queue;
		}
	}

	/* Link m in between prev and n (either of which may be NULL at the edges). */
	if (prev == NULL) {
		mpte->mpte_reinjectq = m;
	} else {
		prev->m_nextpkt = m;
	}

	m->m_nextpkt = n;

	return;

dont_queue:
	m_freem(m);
	return;
}
3453
3454 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3455 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3456 {
3457 struct socket *mp_so = mptetoso(mpte);
3458 struct mbuf *m;
3459
3460 m = mp_so->so_snd.sb_mb;
3461
3462 while (m) {
3463 /* If this segment covers what we are looking for, return it. */
3464 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3465 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3466 break;
3467 }
3468
3469
3470 /* Segment is no more in the queue */
3471 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3472 return NULL;
3473 }
3474
3475 m = m->m_next;
3476 }
3477
3478 return m;
3479 }
3480
3481 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3482 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3483 {
3484 struct mbuf *top = NULL, *tail = NULL;
3485 uint64_t dsn;
3486 uint32_t dlen, rseq;
3487
3488 dsn = m->m_pkthdr.mp_dsn;
3489 dlen = m->m_pkthdr.mp_rlen;
3490 rseq = m->m_pkthdr.mp_rseq;
3491
3492 while (len > 0) {
3493 struct mbuf *n;
3494
3495 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3496
3497 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3498 if (n == NULL) {
3499 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3500 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3501 goto err;
3502 }
3503
3504 VERIFY(n->m_flags & M_PKTHDR);
3505 VERIFY(n->m_next == NULL);
3506 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3507 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3508 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3509 VERIFY(n->m_len == m->m_len);
3510
3511 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3512
3513 if (top == NULL) {
3514 top = n;
3515 }
3516
3517 if (tail != NULL) {
3518 tail->m_next = n;
3519 }
3520
3521 tail = n;
3522
3523 len -= m->m_len;
3524 m = m->m_next;
3525 }
3526
3527 return top;
3528
3529 err:
3530 if (top) {
3531 m_freem(top);
3532 }
3533
3534 return NULL;
3535 }
3536
/*
 * Walk the subflow's send buffer and copy every segment that is not yet
 * acknowledged at the data level into the MPTCP session's reinject queue,
 * so it can be retransmitted over another subflow.  Segments already
 * queued for reinjection (PKTF_MPTCP_REINJ) are skipped.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* orig remembers the send-buffer mbuf; m may be redirected below. */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection on a previous pass? */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure - give up for now. */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			/* A different DSN starts the next data-level segment. */
			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark the whole segment as reinjected, not just its head. */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3607
3608 void
mptcp_clean_reinjectq(struct mptses * mpte)3609 mptcp_clean_reinjectq(struct mptses *mpte)
3610 {
3611 struct mptcb *mp_tp = mpte->mpte_mptcb;
3612
3613 socket_lock_assert_owned(mptetoso(mpte));
3614
3615 while (mpte->mpte_reinjectq) {
3616 struct mbuf *m = mpte->mpte_reinjectq;
3617
3618 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3619 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3620 break;
3621 }
3622
3623 mpte->mpte_reinjectq = m->m_nextpkt;
3624 m->m_nextpkt = NULL;
3625 m_freem(m);
3626 }
3627 }
3628
3629 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3630 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3631 uint32_t *p_mpsofilt_hint, uint32_t event)
3632 {
3633 struct socket *mp_so, *so;
3634 struct mptcb *mp_tp;
3635
3636 mp_so = mptetoso(mpte);
3637 mp_tp = mpte->mpte_mptcb;
3638 so = mpts->mpts_socket;
3639
3640 /*
3641 * We got an event for this subflow that might need to be propagated,
3642 * based on the state of the MPTCP connection.
3643 */
3644 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3645 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3646 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3647 mp_so->so_error = so->so_error;
3648 *p_mpsofilt_hint |= event;
3649 }
3650
3651 return MPTS_EVRET_OK;
3652 }
3653
3654 /*
3655 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3656 */
3657 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3658 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3659 uint32_t *p_mpsofilt_hint, uint32_t event)
3660 {
3661 struct socket *mp_so;
3662 struct tcpcb *tp;
3663
3664 mp_so = mptetoso(mpte);
3665 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3666
3667 /*
3668 * This overwrites any previous mpte_lost_aid to avoid storing
3669 * too much state when the typical case has only two subflows.
3670 */
3671 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3672 mpte->mpte_lost_aid = tp->t_local_aid;
3673
3674 /*
3675 * The subflow connection has lost its source address.
3676 */
3677 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3678
3679 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3680 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3681 }
3682
3683 return MPTS_EVRET_DELETE;
3684 }
3685
3686 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3687 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3688 uint32_t *p_mpsofilt_hint, uint32_t event)
3689 {
3690 #pragma unused(event, p_mpsofilt_hint)
3691 struct socket *so, *mp_so;
3692
3693 so = mpts->mpts_socket;
3694
3695 if (so->so_error != ENODATA) {
3696 return MPTS_EVRET_OK;
3697 }
3698
3699
3700 mp_so = mptetoso(mpte);
3701
3702 mp_so->so_error = ENODATA;
3703
3704 sorwakeup(mp_so);
3705 sowwakeup(mp_so);
3706
3707 return MPTS_EVRET_OK;
3708 }
3709
3710
3711 /*
3712 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3713 * indicates that the remote side sent a Data FIN
3714 */
3715 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3716 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3717 uint32_t *p_mpsofilt_hint, uint32_t event)
3718 {
3719 #pragma unused(event, mpts)
3720 struct mptcb *mp_tp = mpte->mpte_mptcb;
3721
3722 /*
3723 * We got a Data FIN for the MPTCP connection.
3724 * The FIN may arrive with data. The data is handed up to the
3725 * mptcp socket and the user is notified so that it may close
3726 * the socket if needed.
3727 */
3728 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3729 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3730 }
3731
3732 return MPTS_EVRET_OK; /* keep the subflow socket around */
3733 }
3734
3735 /*
3736 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3737 */
3738 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3739 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3740 uint32_t *p_mpsofilt_hint, uint32_t event)
3741 {
3742 #pragma unused(event, p_mpsofilt_hint)
3743 struct mptsub *mpts_alt = NULL;
3744 struct socket *alt_so = NULL;
3745 struct socket *mp_so;
3746 int altpath_exists = 0;
3747
3748 mp_so = mptetoso(mpte);
3749 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3750
3751 mptcp_reinject_mbufs(mpts->mpts_socket);
3752
3753 mpts_alt = mptcp_get_subflow(mpte, NULL);
3754
3755 /* If there is no alternate eligible subflow, ignore the failover hint. */
3756 if (mpts_alt == NULL || mpts_alt == mpts) {
3757 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3758 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3759
3760 goto done;
3761 }
3762
3763 altpath_exists = 1;
3764 alt_so = mpts_alt->mpts_socket;
3765 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3766 /* All data acknowledged and no RTT spike */
3767 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3768 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3769 } else {
3770 /* no alternate path available */
3771 altpath_exists = 0;
3772 }
3773 }
3774
3775 if (altpath_exists) {
3776 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3777
3778 mpte->mpte_active_sub = mpts_alt;
3779 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3780 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3781
3782 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3783 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3784
3785 mptcpstats_inc_switch(mpte, mpts);
3786
3787 sowwakeup(alt_so);
3788 } else {
3789 done:
3790 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3791 }
3792
3793 return MPTS_EVRET_OK;
3794 }
3795
3796 /*
3797 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3798 */
3799 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3800 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3801 uint32_t *p_mpsofilt_hint, uint32_t event)
3802 {
3803 /*
3804 * The subflow connection cannot use the outgoing interface, let's
3805 * close this subflow.
3806 */
3807 mptcp_subflow_abort(mpts, EPERM);
3808
3809 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3810
3811 return MPTS_EVRET_DELETE;
3812 }
3813
3814 /*
3815 * https://tools.ietf.org/html/rfc6052#section-2
3816 * https://tools.ietf.org/html/rfc6147#section-5.2
3817 */
3818 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr,const struct ipv6_prefix * prefix,struct in_addr * addrv4)3819 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
3820 const struct ipv6_prefix *prefix,
3821 struct in_addr *addrv4)
3822 {
3823 char buf[MAX_IPv4_STR_LEN];
3824 char *ptrv4 = (char *)addrv4;
3825 const char *ptr = (const char *)addr;
3826
3827 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3828 return false;
3829 }
3830
3831 switch (prefix->prefix_len) {
3832 case NAT64_PREFIX_LEN_96:
3833 memcpy(ptrv4, ptr + 12, 4);
3834 break;
3835 case NAT64_PREFIX_LEN_64:
3836 memcpy(ptrv4, ptr + 9, 4);
3837 break;
3838 case NAT64_PREFIX_LEN_56:
3839 memcpy(ptrv4, ptr + 7, 1);
3840 memcpy(ptrv4 + 1, ptr + 9, 3);
3841 break;
3842 case NAT64_PREFIX_LEN_48:
3843 memcpy(ptrv4, ptr + 6, 2);
3844 memcpy(ptrv4 + 2, ptr + 9, 2);
3845 break;
3846 case NAT64_PREFIX_LEN_40:
3847 memcpy(ptrv4, ptr + 5, 3);
3848 memcpy(ptrv4 + 3, ptr + 9, 1);
3849 break;
3850 case NAT64_PREFIX_LEN_32:
3851 memcpy(ptrv4, ptr + 4, 4);
3852 break;
3853 default:
3854 panic("NAT64-prefix len is wrong: %u",
3855 prefix->prefix_len);
3856 }
3857
3858 os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
3859 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3860 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3861
3862 return true;
3863 }
3864
3865 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3866 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3867 {
3868 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3869 struct socket *so = mpts->mpts_socket;
3870 struct ifnet *ifp;
3871 int j;
3872
3873 /* Subflow IPs will be steered directly by the server - no need to
3874 * desynthesize.
3875 */
3876 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3877 return;
3878 }
3879
3880 ifp = sotoinpcb(so)->inp_last_outifp;
3881
3882 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3883 return;
3884 }
3885
3886 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3887 int success;
3888
3889 if (nat64prefixes[j].prefix_len == 0) {
3890 continue;
3891 }
3892
3893 success = mptcp_desynthesize_ipv6_addr(mpte,
3894 &mpte->__mpte_dst_v6.sin6_addr,
3895 &nat64prefixes[j],
3896 &mpte->mpte_sub_dst_v4.sin_addr);
3897 if (success) {
3898 mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3899 mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3900 mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3901
3902 /*
3903 * We connected to a NAT64'ed address. Let's remove it
3904 * from the potential IPs to use. Whenever we are back on
3905 * that network and need to connect, we can synthesize again.
3906 *
3907 * Otherwise, on different IPv6 networks we will attempt
3908 * to connect to that NAT64 address...
3909 */
3910 memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3911 break;
3912 }
3913 }
3914 }
3915
3916 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)3917 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3918 {
3919 struct inpcb *inp;
3920
3921 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3922 return;
3923 }
3924
3925 inp = sotoinpcb(mpts->mpts_socket);
3926 if (inp == NULL) {
3927 return;
3928 }
3929
3930 /* Should we try the alternate port? */
3931 if (mpte->mpte_alternate_port &&
3932 inp->inp_fport != mpte->mpte_alternate_port) {
3933 union sockaddr_in_4_6 dst;
3934 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3935
3936 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3937
3938 dst_in->sin_port = mpte->mpte_alternate_port;
3939
3940 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3941 mpts->mpts_ifscope, NULL);
3942 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3943 unsigned int i;
3944
3945 if (inp->inp_last_outifp == NULL) {
3946 return;
3947 }
3948
3949 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3950 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3951
3952 if (inp->inp_last_outifp->if_index == info->ifindex) {
3953 info->no_mptcp_support = 1;
3954 break;
3955 }
3956 }
3957 }
3958 }
3959
/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes the MPTCP level still considers in flight. */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/* Bytes the subflow got acknowledged with the SYN (the -1
		 * accounts for the SYN occupying one sequence number). */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			/* Only the TCP-acked portion is dropped from the send buffer. */
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acknowledged bytes from the MPTCP send buffer. */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
3999
4000 /*
4001 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4002 */
4003 static ev_ret_t
mptcp_subflow_connected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4004 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
4005 uint32_t *p_mpsofilt_hint, uint32_t event)
4006 {
4007 #pragma unused(event, p_mpsofilt_hint)
4008 struct socket *mp_so, *so;
4009 struct inpcb *inp;
4010 struct tcpcb *tp;
4011 struct mptcb *mp_tp;
4012 int af;
4013 boolean_t mpok = FALSE;
4014
4015 mp_so = mptetoso(mpte);
4016 mp_tp = mpte->mpte_mptcb;
4017 so = mpts->mpts_socket;
4018 tp = sototcpcb(so);
4019 af = mpts->mpts_dst.sa_family;
4020
4021 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4022 return MPTS_EVRET_OK;
4023 }
4024
4025 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4026 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4027 return MPTS_EVRET_OK;
4028 }
4029
4030 /*
4031 * The subflow connection has been connected. Find out whether it
4032 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4033 *
4034 * a. If MPTCP connection is not yet established, then this must be
4035 * the first subflow connection. If MPTCP failed to negotiate,
4036 * fallback to regular TCP by degrading this subflow.
4037 *
4038 * b. If MPTCP connection has been established, then this must be
4039 * one of the subsequent subflow connections. If MPTCP failed
4040 * to negotiate, disconnect the connection.
4041 *
4042 * Right now, we simply unblock any waiters at the MPTCP socket layer
4043 * if the MPTCP connection has not been established.
4044 */
4045
4046 if (so->so_state & SS_ISDISCONNECTED) {
4047 /*
4048 * With MPTCP joins, a connection is connected at the subflow
4049 * level, but the 4th ACK from the server elevates the MPTCP
4050 * subflow to connected state. So there is a small window
4051 * where the subflow could get disconnected before the
4052 * connected event is processed.
4053 */
4054 return MPTS_EVRET_OK;
4055 }
4056
4057 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4058 mptcp_drop_tfo_data(mpte, mpts);
4059 }
4060
4061 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4062 mpts->mpts_flags |= MPTSF_CONNECTED;
4063
4064 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4065 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4066 }
4067
4068 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4069
4070 /* get/verify the outbound interface */
4071 inp = sotoinpcb(so);
4072
4073 mpts->mpts_maxseg = tp->t_maxseg;
4074
4075 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4076
4077 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4078 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4079 mpte->mpte_associd = mpts->mpts_connid;
4080 DTRACE_MPTCP2(state__change,
4081 struct mptcb *, mp_tp,
4082 uint32_t, 0 /* event */);
4083
4084 if (SOCK_DOM(so) == AF_INET) {
4085 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4086 } else {
4087 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4088 }
4089
4090 mpts->mpts_flags |= MPTSF_ACTIVE;
4091
4092 /* case (a) above */
4093 if (!mpok) {
4094 tcpstat.tcps_mpcap_fallback++;
4095
4096 tp->t_mpflags |= TMPF_INFIN_SENT;
4097 mptcp_notify_mpfail(so);
4098 } else {
4099 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4100 mptcp_subflows_need_backup_flag(mpte)) {
4101 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4102 } else {
4103 mpts->mpts_flags |= MPTSF_PREFERRED;
4104 }
4105 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4106 mpte->mpte_nummpcapflows++;
4107
4108 if (SOCK_DOM(so) == AF_INET6) {
4109 mptcp_handle_ipv6_connection(mpte, mpts);
4110 }
4111
4112 mptcp_check_subflows_and_add(mpte);
4113
4114 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4115 mpte->mpte_initial_cell = 1;
4116 }
4117
4118 mpte->mpte_handshake_success = 1;
4119 }
4120
4121 mp_tp->mpt_sndwnd = tp->snd_wnd;
4122 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4123 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4124 soisconnected(mp_so);
4125 } else if (mpok) {
4126 /*
4127 * case (b) above
4128 * In case of additional flows, the MPTCP socket is not
4129 * MPTSF_MP_CAPABLE until an ACK is received from server
4130 * for 3-way handshake. TCP would have guaranteed that this
4131 * is an MPTCP subflow.
4132 */
4133 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4134 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4135 mptcp_subflows_need_backup_flag(mpte)) {
4136 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4137 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4138 } else {
4139 mpts->mpts_flags |= MPTSF_PREFERRED;
4140 }
4141
4142 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4143 mpte->mpte_nummpcapflows++;
4144
4145 mpts->mpts_rel_seq = 1;
4146
4147 mptcp_check_subflows_and_remove(mpte);
4148 } else {
4149 mptcp_try_alternate_port(mpte, mpts);
4150
4151 tcpstat.tcps_join_fallback++;
4152 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4153 tcpstat.tcps_mptcp_cell_proxy++;
4154 } else {
4155 tcpstat.tcps_mptcp_wifi_proxy++;
4156 }
4157
4158 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4159
4160 return MPTS_EVRET_OK;
4161 }
4162
4163 /* This call, just to "book" an entry in the stats-table for this ifindex */
4164 mptcpstats_get_index(mpte->mpte_itfstats, mpts);
4165
4166 mptcp_output(mpte);
4167
4168 return MPTS_EVRET_OK; /* keep the subflow socket around */
4169 }
4170
4171 /*
4172 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4173 */
4174 static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4175 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4176 uint32_t *p_mpsofilt_hint, uint32_t event)
4177 {
4178 #pragma unused(event, p_mpsofilt_hint)
4179 struct socket *mp_so, *so;
4180 struct mptcb *mp_tp;
4181
4182 mp_so = mptetoso(mpte);
4183 mp_tp = mpte->mpte_mptcb;
4184 so = mpts->mpts_socket;
4185
4186 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4187 return MPTS_EVRET_DELETE;
4188 }
4189
4190 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4191
4192 /* The subflow connection has been disconnected. */
4193
4194 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4195 mpte->mpte_nummpcapflows--;
4196 if (mpte->mpte_active_sub == mpts) {
4197 mpte->mpte_active_sub = NULL;
4198 }
4199 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4200 } else {
4201 if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
4202 !(mpts->mpts_flags & MPTSF_CONNECTED)) {
4203 mptcp_try_alternate_port(mpte, mpts);
4204 }
4205 }
4206
4207 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4208 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4209 mptcp_drop(mpte, mp_tp, so->so_error);
4210 }
4211
4212 /*
4213 * Clear flags that are used by getconninfo to return state.
4214 * Retain like MPTSF_DELETEOK for internal purposes.
4215 */
4216 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4217 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4218 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4219
4220 return MPTS_EVRET_DELETE;
4221 }
4222
4223 /*
4224 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4225 */
4226 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4227 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4228 uint32_t *p_mpsofilt_hint, uint32_t event)
4229 {
4230 #pragma unused(event, p_mpsofilt_hint)
4231 ev_ret_t ret = MPTS_EVRET_OK;
4232 struct socket *mp_so, *so;
4233 struct mptcb *mp_tp;
4234
4235 mp_so = mptetoso(mpte);
4236 mp_tp = mpte->mpte_mptcb;
4237 so = mpts->mpts_socket;
4238 struct inpcb *inp = sotoinpcb(so);
4239 struct tcpcb *tp = intotcpcb(inp);
4240
4241 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4242 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4243 } else {
4244 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4245 }
4246
4247 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4248 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4249 goto done;
4250 }
4251 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4252 } else {
4253 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4254 }
4255
4256 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4257 mpts->mpts_flags |= MPTSF_MP_READY;
4258 } else {
4259 mpts->mpts_flags &= ~MPTSF_MP_READY;
4260 }
4261
4262 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4263 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4264 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4265 tcp_cache_update_mptcp_version(tp, FALSE);
4266 }
4267
4268 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4269 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4270
4271 m_freem_list(mpte->mpte_reinjectq);
4272 mpte->mpte_reinjectq = NULL;
4273 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4274 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4275 ret = MPTS_EVRET_CONNECT_PENDING;
4276 }
4277
4278 done:
4279 return ret;
4280 }
4281
4282 /*
4283 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4284 */
4285 static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4286 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4287 uint32_t *p_mpsofilt_hint, uint32_t event)
4288 {
4289 #pragma unused(event)
4290 struct socket *mp_so, *so;
4291 struct mptcb *mp_tp;
4292 boolean_t is_fastclose;
4293
4294 mp_so = mptetoso(mpte);
4295 mp_tp = mpte->mpte_mptcb;
4296 so = mpts->mpts_socket;
4297
4298 /* We got an invalid option or a fast close */
4299 struct inpcb *inp = sotoinpcb(so);
4300 struct tcpcb *tp = NULL;
4301
4302 tp = intotcpcb(inp);
4303 so->so_error = ECONNABORTED;
4304
4305 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4306
4307 tp->t_mpflags |= TMPF_RESET;
4308
4309 if (tp->t_state != TCPS_CLOSED) {
4310 struct tcptemp *t_template = tcp_maketemplate(tp);
4311
4312 if (t_template) {
4313 struct tcp_respond_args tra;
4314
4315 bzero(&tra, sizeof(tra));
4316 if (inp->inp_flags & INP_BOUND_IF) {
4317 tra.ifscope = inp->inp_boundifp->if_index;
4318 } else {
4319 tra.ifscope = IFSCOPE_NONE;
4320 }
4321 tra.awdl_unrestricted = 1;
4322
4323 tcp_respond(tp, t_template->tt_ipgen,
4324 &t_template->tt_t, (struct mbuf *)NULL,
4325 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4326 (void) m_free(dtom(t_template));
4327 }
4328 }
4329
4330 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4331 struct mptsub *iter, *tmp;
4332
4333 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4334
4335 mp_so->so_error = ECONNRESET;
4336
4337 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4338 if (iter == mpts) {
4339 continue;
4340 }
4341 mptcp_subflow_abort(iter, ECONNABORTED);
4342 }
4343
4344 /*
4345 * mptcp_drop is being called after processing the events, to fully
4346 * close the MPTCP connection
4347 */
4348 mptcp_drop(mpte, mp_tp, mp_so->so_error);
4349 }
4350
4351 mptcp_subflow_abort(mpts, ECONNABORTED);
4352
4353 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4354 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4355 }
4356
4357 return MPTS_EVRET_DELETE;
4358 }
4359
4360 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4361 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4362 uint32_t *p_mpsofilt_hint, uint32_t event)
4363 {
4364 #pragma unused(event)
4365 bool found_active = false;
4366
4367 mpts->mpts_flags |= MPTSF_READ_STALL;
4368
4369 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4370 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4371
4372 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4373 TCPS_HAVERCVDFIN2(tp->t_state)) {
4374 continue;
4375 }
4376
4377 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4378 found_active = true;
4379 break;
4380 }
4381 }
4382
4383 if (!found_active) {
4384 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4385 }
4386
4387 return MPTS_EVRET_OK;
4388 }
4389
4390 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4391 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4392 uint32_t *p_mpsofilt_hint, uint32_t event)
4393 {
4394 #pragma unused(event)
4395 bool found_active = false;
4396
4397 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4398
4399 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4400 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4401
4402 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4403 tp->t_state > TCPS_CLOSE_WAIT) {
4404 continue;
4405 }
4406
4407 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4408 found_active = true;
4409 break;
4410 }
4411 }
4412
4413 if (!found_active) {
4414 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4415 }
4416
4417 return MPTS_EVRET_OK;
4418 }
4419
4420 /*
4421 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4422 * caller must ensure that the option can be issued on subflow sockets, via
4423 * MPOF_SUBFLOW_OK flag.
4424 */
4425 int
mptcp_subflow_sosetopt(struct mptses * mpte,struct mptsub * mpts,struct mptopt * mpo)4426 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4427 {
4428 struct socket *mp_so, *so;
4429 struct sockopt sopt;
4430 int error;
4431
4432 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4433
4434 mp_so = mptetoso(mpte);
4435 so = mpts->mpts_socket;
4436
4437 socket_lock_assert_owned(mp_so);
4438
4439 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4440 mpo->mpo_level == SOL_SOCKET &&
4441 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4442 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4443
4444 /*
4445 * When we open a new subflow, mark it as cell fallback, if
4446 * this subflow goes over cell.
4447 *
4448 * (except for first-party apps)
4449 */
4450
4451 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4452 return 0;
4453 }
4454
4455 if (sotoinpcb(so)->inp_last_outifp &&
4456 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4457 return 0;
4458 }
4459
4460 /*
4461 * This here is an OR, because if the app is not binding to the
4462 * interface, then it definitely is not a cell-fallback
4463 * connection.
4464 */
4465 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4466 !IFNET_IS_CELLULAR(ifp)) {
4467 return 0;
4468 }
4469 }
4470
4471 mpo->mpo_flags &= ~MPOF_INTERIM;
4472
4473 bzero(&sopt, sizeof(sopt));
4474 sopt.sopt_dir = SOPT_SET;
4475 sopt.sopt_level = mpo->mpo_level;
4476 sopt.sopt_name = mpo->mpo_name;
4477 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4478 sopt.sopt_valsize = sizeof(int);
4479 sopt.sopt_p = kernproc;
4480
4481 error = sosetoptlock(so, &sopt, 0);
4482 if (error) {
4483 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4484 "val %d set error %d\n", __func__,
4485 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4486 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4487 mpo->mpo_intval, error);
4488 }
4489 return error;
4490 }
4491
4492 /*
4493 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4494 * caller must ensure that the option can be issued on subflow sockets, via
4495 * MPOF_SUBFLOW_OK flag.
4496 */
4497 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4498 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4499 struct mptopt *mpo)
4500 {
4501 struct socket *mp_so;
4502 struct sockopt sopt;
4503 int error;
4504
4505 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4506 mp_so = mptetoso(mpte);
4507
4508 socket_lock_assert_owned(mp_so);
4509
4510 bzero(&sopt, sizeof(sopt));
4511 sopt.sopt_dir = SOPT_GET;
4512 sopt.sopt_level = mpo->mpo_level;
4513 sopt.sopt_name = mpo->mpo_name;
4514 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4515 sopt.sopt_valsize = sizeof(int);
4516 sopt.sopt_p = kernproc;
4517
4518 error = sogetoptlock(so, &sopt, 0); /* already locked */
4519 if (error) {
4520 os_log_error(mptcp_log_handle,
4521 "%s - %lx: sopt %s get error %d\n",
4522 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4523 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4524 }
4525 return error;
4526 }
4527
4528
4529 /*
4530 * MPTCP garbage collector.
4531 *
4532 * This routine is called by the MP domain on-demand, periodic callout,
4533 * which is triggered when a MPTCP socket is closed. The callout will
4534 * repeat as long as this routine returns a non-zero value.
4535 */
4536 static uint32_t
mptcp_gc(struct mppcbinfo * mppi)4537 mptcp_gc(struct mppcbinfo *mppi)
4538 {
4539 struct mppcb *mpp, *tmpp;
4540 uint32_t active = 0;
4541
4542 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4543
4544 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4545 struct socket *mp_so;
4546 struct mptses *mpte;
4547 struct mptcb *mp_tp;
4548
4549 mp_so = mpp->mpp_socket;
4550 mpte = mptompte(mpp);
4551 mp_tp = mpte->mpte_mptcb;
4552
4553 if (!mpp_try_lock(mpp)) {
4554 active++;
4555 continue;
4556 }
4557
4558 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4559
4560 /* check again under the lock */
4561 if (mp_so->so_usecount > 0) {
4562 boolean_t wakeup = FALSE;
4563 struct mptsub *mpts, *tmpts;
4564
4565 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4566 if (mp_tp->mpt_gc_ticks > 0) {
4567 mp_tp->mpt_gc_ticks--;
4568 }
4569 if (mp_tp->mpt_gc_ticks == 0) {
4570 wakeup = TRUE;
4571 }
4572 }
4573 if (wakeup) {
4574 TAILQ_FOREACH_SAFE(mpts,
4575 &mpte->mpte_subflows, mpts_entry, tmpts) {
4576 mptcp_subflow_eupcall1(mpts->mpts_socket,
4577 mpts, SO_FILT_HINT_DISCONNECTED);
4578 }
4579 }
4580 socket_unlock(mp_so, 0);
4581 active++;
4582 continue;
4583 }
4584
4585 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4586 panic("%s - %lx: skipped state "
4587 "[u=%d,r=%d,s=%d]\n", __func__,
4588 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4589 mp_so->so_usecount, mp_so->so_retaincnt,
4590 mpp->mpp_state);
4591 }
4592
4593 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4594 mptcp_close(mpte, mp_tp);
4595 }
4596
4597 mptcp_session_destroy(mpte);
4598
4599 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4600 struct sockbuf *, &mp_so->so_rcv,
4601 struct sockbuf *, &mp_so->so_snd,
4602 struct mppcb *, mpp);
4603
4604 mptcp_pcbdispose(mpp);
4605 sodealloc(mp_so);
4606 }
4607
4608 return active;
4609 }
4610
4611 /*
4612 * Drop a MPTCP connection, reporting the specified error.
4613 */
4614 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4615 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4616 {
4617 struct socket *mp_so = mptetoso(mpte);
4618
4619 VERIFY(mpte->mpte_mptcb == mp_tp);
4620
4621 socket_lock_assert_owned(mp_so);
4622
4623 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4624 uint32_t, 0 /* event */);
4625
4626 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4627 errno = mp_tp->mpt_softerror;
4628 }
4629 mp_so->so_error = errno;
4630
4631 return mptcp_close(mpte, mp_tp);
4632 }
4633
4634 /*
4635 * Close a MPTCP control block.
4636 */
4637 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4638 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4639 {
4640 struct mptsub *mpts = NULL, *tmpts = NULL;
4641 struct socket *mp_so = mptetoso(mpte);
4642
4643 socket_lock_assert_owned(mp_so);
4644 VERIFY(mpte->mpte_mptcb == mp_tp);
4645
4646 mp_tp->mpt_state = MPTCPS_TERMINATE;
4647
4648 mptcp_freeq(mp_tp);
4649
4650 soisdisconnected(mp_so);
4651
4652 /* Clean up all subflows */
4653 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4654 mptcp_subflow_disconnect(mpte, mpts);
4655 }
4656
4657 return NULL;
4658 }
4659
4660 void
mptcp_notify_close(struct socket * so)4661 mptcp_notify_close(struct socket *so)
4662 {
4663 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4664 }
4665
/*
 * Maps one SO_FILT_HINT_* event bit to the handler invoked for it by
 * mptcp_subflow_events().  Handlers return an ev_ret_t that tells the
 * workloop what to do with the subflow afterwards.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t sofilt_hint_mask;          /* SO_FILT_HINT_* bit handled */
	ev_ret_t (*sofilt_hint_ev_hdlr)(    /* handler for that event */
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;
4674
4675 /*
4676 * XXX The order of the event handlers below is really
4677 * really important. Think twice before changing it.
4678 */
4679 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
4680 {
4681 .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
4682 .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
4683 },
4684 {
4685 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
4686 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
4687 },
4688 {
4689 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
4690 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
4691 },
4692 {
4693 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
4694 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
4695 },
4696 {
4697 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
4698 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
4699 },
4700 {
4701 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
4702 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
4703 },
4704 {
4705 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
4706 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
4707 },
4708 {
4709 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
4710 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
4711 },
4712 {
4713 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
4714 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
4715 },
4716 {
4717 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
4718 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
4719 },
4720 {
4721 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
4722 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
4723 },
4724 {
4725 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
4726 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
4727 },
4728 {
4729 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
4730 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
4731 },
4732 {
4733 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
4734 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
4735 },
4736 };
4737
4738 /*
4739 * Subflow socket control events.
4740 *
4741 * Called for handling events related to the underlying subflow socket.
4742 */
4743 static ev_ret_t
mptcp_subflow_events(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint)4744 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
4745 uint32_t *p_mpsofilt_hint)
4746 {
4747 ev_ret_t ret = MPTS_EVRET_OK;
4748 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
4749 sizeof(mpsub_ev_entry_tbl[0]);
4750
4751 /* bail if there's nothing to process */
4752 if (!mpts->mpts_evctl) {
4753 return ret;
4754 }
4755
4756 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
4757 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
4758 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
4759 SO_FILT_HINT_DISCONNECTED)) {
4760 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
4761 }
4762
4763 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
4764 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
4765
4766 /*
4767 * Process all the socket filter hints and reset the hint
4768 * once it is handled
4769 */
4770 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
4771 /*
4772 * Always execute the DISCONNECTED event, because it will wakeup
4773 * the app.
4774 */
4775 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
4776 (ret >= MPTS_EVRET_OK ||
4777 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
4778 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
4779 ev_ret_t error =
4780 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
4781 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
4782 }
4783 }
4784
4785 return ret;
4786 }
4787
4788 /*
4789 * MPTCP workloop.
4790 */
4791 void
mptcp_subflow_workloop(struct mptses * mpte)4792 mptcp_subflow_workloop(struct mptses *mpte)
4793 {
4794 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4795 uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4796 struct mptsub *mpts, *tmpts;
4797 struct socket *mp_so;
4798
4799 mp_so = mptetoso(mpte);
4800
4801 socket_lock_assert_owned(mp_so);
4802
4803 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4804 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4805 return;
4806 }
4807 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4808
4809 relaunch:
4810 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4811
4812 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4813 ev_ret_t ret;
4814
4815 if (mpts->mpts_socket->so_usecount == 0) {
4816 /* Will be removed soon by tcp_garbage_collect */
4817 continue;
4818 }
4819
4820 mptcp_subflow_addref(mpts);
4821 mpts->mpts_socket->so_usecount++;
4822
4823 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4824
4825 /*
4826 * If MPTCP socket is closed, disconnect all subflows.
4827 * This will generate a disconnect event which will
4828 * be handled during the next iteration, causing a
4829 * non-zero error to be returned above.
4830 */
4831 if (mp_so->so_flags & SOF_PCBCLEARING) {
4832 mptcp_subflow_disconnect(mpte, mpts);
4833 }
4834
4835 switch (ret) {
4836 case MPTS_EVRET_OK:
4837 /* nothing to do */
4838 break;
4839 case MPTS_EVRET_DELETE:
4840 mptcp_subflow_soclose(mpts);
4841 break;
4842 case MPTS_EVRET_CONNECT_PENDING:
4843 connect_pending = TRUE;
4844 break;
4845 case MPTS_EVRET_DISCONNECT_FALLBACK:
4846 disconnect_fallback = TRUE;
4847 break;
4848 default:
4849 break;
4850 }
4851 mptcp_subflow_remref(mpts); /* ours */
4852
4853 VERIFY(mpts->mpts_socket->so_usecount != 0);
4854 mpts->mpts_socket->so_usecount--;
4855 }
4856
4857 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4858 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4859
4860 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4861 mp_so->so_state |= SS_CANTRCVMORE;
4862 sorwakeup(mp_so);
4863 }
4864
4865 soevent(mp_so, mpsofilt_hint_mask);
4866 }
4867
4868 if (!connect_pending && !disconnect_fallback) {
4869 goto exit;
4870 }
4871
4872 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4873 if (disconnect_fallback) {
4874 struct socket *so = NULL;
4875 struct inpcb *inp = NULL;
4876 struct tcpcb *tp = NULL;
4877
4878 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4879 continue;
4880 }
4881
4882 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4883
4884 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4885 MPTSF_DISCONNECTED)) {
4886 continue;
4887 }
4888
4889 so = mpts->mpts_socket;
4890
4891 /*
4892 * The MPTCP connection has degraded to a fallback
4893 * mode, so there is no point in keeping this subflow
4894 * regardless of its MPTCP-readiness state, unless it
4895 * is the primary one which we use for fallback. This
4896 * assumes that the subflow used for fallback is the
4897 * ACTIVE one.
4898 */
4899
4900 inp = sotoinpcb(so);
4901 tp = intotcpcb(inp);
4902 tp->t_mpflags &=
4903 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4904 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4905
4906 soevent(so, SO_FILT_HINT_MUSTRST);
4907 } else if (connect_pending) {
4908 /*
4909 * The MPTCP connection has progressed to a state
4910 * where it supports full multipath semantics; allow
4911 * additional joins to be attempted for all subflows
4912 * that are in the PENDING state.
4913 */
4914 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4915 int error = mptcp_subflow_soconnectx(mpte, mpts);
4916
4917 if (error) {
4918 mptcp_subflow_abort(mpts, error);
4919 }
4920 }
4921 }
4922 }
4923
4924 exit:
4925 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4926 goto relaunch;
4927 }
4928
4929 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4930 }
4931
4932 /*
4933 * Protocol pr_lock callback.
4934 */
4935 int
mptcp_lock(struct socket * mp_so,int refcount,void * lr)4936 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4937 {
4938 struct mppcb *mpp = mpsotomppcb(mp_so);
4939 void *lr_saved;
4940
4941 if (lr == NULL) {
4942 lr_saved = __builtin_return_address(0);
4943 } else {
4944 lr_saved = lr;
4945 }
4946
4947 if (mpp == NULL) {
4948 panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4949 mp_so, lr_saved, solockhistory_nr(mp_so));
4950 /* NOTREACHED */
4951 }
4952 mpp_lock(mpp);
4953
4954 if (mp_so->so_usecount < 0) {
4955 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4956 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4957 solockhistory_nr(mp_so));
4958 /* NOTREACHED */
4959 }
4960 if (refcount != 0) {
4961 mp_so->so_usecount++;
4962 mpp->mpp_inside++;
4963 }
4964 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4965 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4966
4967 return 0;
4968 }
4969
4970 /*
4971 * Protocol pr_unlock callback.
4972 */
4973 int
mptcp_unlock(struct socket * mp_so,int refcount,void * lr)4974 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4975 {
4976 struct mppcb *mpp = mpsotomppcb(mp_so);
4977 void *lr_saved;
4978
4979 if (lr == NULL) {
4980 lr_saved = __builtin_return_address(0);
4981 } else {
4982 lr_saved = lr;
4983 }
4984
4985 if (mpp == NULL) {
4986 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
4987 mp_so, mp_so->so_usecount, lr_saved,
4988 solockhistory_nr(mp_so));
4989 /* NOTREACHED */
4990 }
4991 socket_lock_assert_owned(mp_so);
4992
4993 if (refcount != 0) {
4994 mp_so->so_usecount--;
4995 mpp->mpp_inside--;
4996 }
4997
4998 if (mp_so->so_usecount < 0) {
4999 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5000 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5001 /* NOTREACHED */
5002 }
5003 if (mpp->mpp_inside < 0) {
5004 panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5005 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5006 /* NOTREACHED */
5007 }
5008 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5009 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5010 mpp_unlock(mpp);
5011
5012 return 0;
5013 }
5014
5015 /*
5016 * Protocol pr_getlock callback.
5017 */
5018 lck_mtx_t *
mptcp_getlock(struct socket * mp_so,int flags)5019 mptcp_getlock(struct socket *mp_so, int flags)
5020 {
5021 struct mppcb *mpp = mpsotomppcb(mp_so);
5022
5023 if (mpp == NULL) {
5024 panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
5025 solockhistory_nr(mp_so));
5026 /* NOTREACHED */
5027 }
5028 if (mp_so->so_usecount < 0) {
5029 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5030 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5031 /* NOTREACHED */
5032 }
5033 return mpp_getlock(mpp, flags);
5034 }
5035
5036 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5037 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5038 u_int32_t *rrand)
5039 {
5040 struct mptcp_subf_auth_entry *sauth_entry;
5041
5042 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5043 if (sauth_entry->msae_laddr_id == addr_id) {
5044 if (lrand) {
5045 *lrand = sauth_entry->msae_laddr_rand;
5046 }
5047 if (rrand) {
5048 *rrand = sauth_entry->msae_raddr_rand;
5049 }
5050 break;
5051 }
5052 }
5053 }
5054
/*
 * Record the remote address id and remote random number on the subflow
 * authentication entry matching the given local address id.  Conflicting
 * values (an already-set, different raddr id or rand — e.g. a duplicate
 * SYN/ACK) are logged and ignored.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* Reject a conflicting remote address id */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_id, sauth_entry->msae_raddr_id);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* Reject a conflicting remote random (dup SYN/ACK) */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				os_log_error(mptcp_log_handle, "%s - %lx: "
				    "dup SYN_ACK %d %d \n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_rand, sauth_entry->msae_raddr_rand);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
5084
5085 /*
5086 * SHA-256 support for MPTCP
5087 */
5088
5089 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5090 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5091 {
5092 const unsigned char *sha2_base;
5093 int sha2_size;
5094
5095 sha2_base = (const unsigned char *) key;
5096 sha2_size = sizeof(mptcp_key_t);
5097
5098 SHA256_CTX sha_ctx;
5099 SHA256_Init(&sha_ctx);
5100 SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5101 SHA256_Final(sha_digest, &sha_ctx);
5102 }
5103
/*
 * HMAC-SHA256 over msg, keyed with the concatenation of key1 and key2
 * (RFC 2104 construction).  digest must hold SHA256_DIGEST_LENGTH bytes.
 */
void
mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
    u_char *msg, uint16_t msg_len, u_char *digest)
{
	SHA256_CTX sha_ctx;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	int i;

	bzero(digest, SHA256_DIGEST_LENGTH);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA256(Key XOR opad, SHA256(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA256_Update(&sha_ctx, msg, msg_len);
	SHA256_Final(digest, &sha_ctx);

	/* Perform outer SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
	SHA256_Final(digest, &sha_ctx);
}
5143
5144 /*
5145 * SHA1 support for MPTCP
5146 */
5147
5148 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5149 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5150 {
5151 SHA1_CTX sha1ctxt;
5152 const unsigned char *sha1_base;
5153 int sha1_size;
5154
5155 sha1_base = (const unsigned char *) key;
5156 sha1_size = sizeof(mptcp_key_t);
5157 SHA1Init(&sha1ctxt);
5158 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5159 SHA1Final(sha_digest, &sha1ctxt);
5160 }
5161
/*
 * HMAC-SHA1 keyed with the concatenation of key1 and key2, over the
 * message formed by the two 32-bit randoms rand1||rand2 (RFC 2104
 * construction).  digest must hold SHA1_RESULTLEN bytes.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
5206
5207 /*
5208 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5209 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5210 */
5211 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest,uint8_t digest_len)5212 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5213 {
5214 uint32_t lrand, rrand;
5215
5216 lrand = rrand = 0;
5217 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5218
5219 u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5220 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5221 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5222 } else {
5223 uint32_t data[2];
5224 data[0] = lrand;
5225 data[1] = rrand;
5226 mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5227 }
5228 bcopy(full_digest, digest, digest_len);
5229 }
5230
5231 /*
5232 * Authentication data generation
5233 */
5234 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5235 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5236 int token_len)
5237 {
5238 VERIFY(token_len == sizeof(u_int32_t));
5239 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5240 sha_digest_len == SHA256_DIGEST_LENGTH);
5241
5242 /* Most significant 32 bits of the SHA1/SHA256 hash */
5243 bcopy(sha_digest, token, sizeof(u_int32_t));
5244 return;
5245 }
5246
5247 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5248 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5249 int idsn_len, uint8_t mp_version)
5250 {
5251 VERIFY(idsn_len == sizeof(u_int64_t));
5252 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5253 sha_digest_len == SHA256_DIGEST_LENGTH);
5254 VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5255
5256 /*
5257 * Least significant 64 bits of the hash
5258 */
5259
5260 if (mp_version == MPTCP_VERSION_0) {
5261 idsn[7] = sha_digest[12];
5262 idsn[6] = sha_digest[13];
5263 idsn[5] = sha_digest[14];
5264 idsn[4] = sha_digest[15];
5265 idsn[3] = sha_digest[16];
5266 idsn[2] = sha_digest[17];
5267 idsn[1] = sha_digest[18];
5268 idsn[0] = sha_digest[19];
5269 } else {
5270 idsn[7] = sha_digest[24];
5271 idsn[6] = sha_digest[25];
5272 idsn[5] = sha_digest[26];
5273 idsn[4] = sha_digest[27];
5274 idsn[3] = sha_digest[28];
5275 idsn[2] = sha_digest[29];
5276 idsn[1] = sha_digest[30];
5277 idsn[0] = sha_digest[31];
5278 }
5279 return;
5280 }
5281
/*
 * Initialize connection-level properties of a new MPTCP session:
 * DSS-checksum usage, the receive window and the GC tick budget.
 */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* Set DSS checksum flag (mptcp_dss_csum is a global toggle) */
	if (mptcp_dss_csum) {
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
	}

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
5296
/*
 * Initialize the local side of a new MPTCP session: pick the protocol
 * version, generate a random local key and derive the local token and
 * initial data sequence number (IDSN) from its hash.
 *
 * dst is only consulted for the per-destination version-cache lookup.
 */
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/*
	 * Version selection: explicit force-flags win first; MPTE_FORCE_V1
	 * only takes effect while v1 is globally enabled; otherwise fall
	 * back to the cached per-destination answer.
	 */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/* Random local key; token and IDSN are derived from its hash */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5333
5334 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5335 mptcp_init_remote_parms(struct mptcb *mp_tp)
5336 {
5337 /* Setup local and remote tokens and Initial DSNs */
5338 char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5339 uint16_t digest_len;
5340
5341 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5342 digest_len = SHA1_RESULTLEN;
5343 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5344 } else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5345 digest_len = SHA256_DIGEST_LENGTH;
5346 mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5347 } else {
5348 return -1;
5349 }
5350
5351 mptcp_generate_token(remote_digest, digest_len,
5352 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5353 mptcp_generate_idsn(remote_digest, digest_len,
5354 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5355 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5356 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5357 return 0;
5358 }
5359
5360 static void
mptcp_send_dfin(struct socket * so)5361 mptcp_send_dfin(struct socket *so)
5362 {
5363 struct tcpcb *tp = NULL;
5364 struct inpcb *inp = NULL;
5365
5366 inp = sotoinpcb(so);
5367 if (!inp) {
5368 return;
5369 }
5370
5371 tp = intotcpcb(inp);
5372 if (!tp) {
5373 return;
5374 }
5375
5376 if (!(tp->t_mpflags & TMPF_RESET)) {
5377 tp->t_mpflags |= TMPF_SEND_DFIN;
5378 }
5379 }
5380
5381 /*
5382 * Data Sequence Mapping routines
5383 */
5384 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5385 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5386 {
5387 struct mptcb *mp_tp;
5388
5389 if (m == NULL) {
5390 return;
5391 }
5392
5393 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5394
5395 while (m) {
5396 VERIFY(m->m_flags & M_PKTHDR);
5397 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5398 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5399 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5400 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5401 mp_tp->mpt_sndmax += m_pktlen(m);
5402 m = m->m_next;
5403 }
5404 }
5405
/*
 * After a fallback to plain TCP, translate 'len' bytes being dropped
 * from the subflow send buffer into an MPTCP-level DATA_ACK, and feed
 * it to mptcp_data_ack_rcvd() if it falls inside the valid window.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Optimistically assume every traversed mapping is fully acked;
	 * the two fix-ups after the loop correct that when it is not.
	 * Note: the loop runs at least once, so data_ack/dsn are always set.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5463
/*
 * Adjust DSN mappings of mbufs about to be dropped from a send buffer
 * so the remaining data keeps a consistent mapping.
 *
 * The walk normally only applies to non-subflow (MPTCP-level) sockets;
 * a subflow buffer is walked only while rewinding after TFO
 * (SOF1_TFO_REWIND), in which case DSN/relative-seq are deliberately
 * left untouched and only the mapping lengths are consumed.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* Whole mapping is dropped; slide DSN past it */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5516
5517 /* Obtain the DSN mapping stored in the mbuf */
5518 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5519 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5520 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5521 {
5522 u_int64_t dsn64;
5523
5524 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5525 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5526 }
5527
/*
 * Obtain the 64-bit DSN mapping covering byte offset 'off' of the
 * subflow send buffer: walk the mbuf chain by subflow-sequence space
 * (contiguous, unlike DSN space) and return the stored mapping of the
 * mbuf that contains 'off'.  A defunct socket with an empty buffer
 * yields an all-zero mapping.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;

	VERIFY(off >= 0);

	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	/* NOTE(review): 'm' is assumed non-NULL here, i.e. callers must
	 * pass an 'off' that lies within the send buffer — verify. */
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;
}
5572
5573 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5574 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5575 {
5576 uint64_t dsn;
5577 uint32_t relseq;
5578
5579 mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5580 }
5581
5582 /*
5583 * Note that this is called only from tcp_input() via mptcp_input_preproc()
5584 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5585 * When it trims data tcp_input calls m_adj() which does not remove the
5586 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5587 * The dsn map insertion cannot be delayed after trim, because data can be in
5588 * the reassembly queue for a while and the DSN option info in tp will be
5589 * overwritten for every new packet received.
5590 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5591 * with mptcp_adj_rmap()
5592 */
/*
 * Attach the DSN mapping parsed into tp->t_rcv_map to an inbound mbuf
 * (when option processing flagged TMPF_EMBED_DSN).  After fallback,
 * only the DATA_FIN marker is mirrored from the TCP FIN flag.
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* Copy the DSS-option mapping into the packet header */
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* Mapping consumed; make sure a DATA_ACK goes out */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5618
5619 /*
5620 * Following routines help with failure detection and failover of data
5621 * transfer from one subflow to another.
5622 */
5623 void
mptcp_act_on_txfail(struct socket * so)5624 mptcp_act_on_txfail(struct socket *so)
5625 {
5626 struct tcpcb *tp = NULL;
5627 struct inpcb *inp = sotoinpcb(so);
5628
5629 if (inp == NULL) {
5630 return;
5631 }
5632
5633 tp = intotcpcb(inp);
5634 if (tp == NULL) {
5635 return;
5636 }
5637
5638 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5639 return;
5640 }
5641
5642 so->so_flags |= SOF_MP_TRYFAILOVER;
5643 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5644 }
5645
5646 /*
5647 * Support for MP_FAIL option
5648 */
5649 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5650 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5651 {
5652 struct mbuf *m = so->so_snd.sb_mb;
5653 uint16_t datalen;
5654 uint64_t dsn;
5655 int off = 0;
5656
5657 if (m == NULL) {
5658 return -1;
5659 }
5660
5661 while (m != NULL) {
5662 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5663 VERIFY(m->m_flags & M_PKTHDR);
5664 dsn = m->m_pkthdr.mp_dsn;
5665 datalen = m->m_pkthdr.mp_rlen;
5666 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5667 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5668 off = (int)(dsn_fail - dsn);
5669 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5670 return 0;
5671 }
5672
5673 m = m->m_next;
5674 }
5675
5676 /*
5677 * If there was no mbuf data and a fallback to TCP occurred, there's
5678 * not much else to do.
5679 */
5680
5681 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5682 return -1;
5683 }
5684
5685 /*
5686 * Support for sending contiguous MPTCP bytes in subflow
5687 * Also for preventing sending data with ACK in 3-way handshake
5688 */
/*
 * Return how many bytes of the DSN mapping covering send-buffer offset
 * 'off' are still left to send from that offset on.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* Defunct socket with an empty send buffer: nothing to send */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	/* Look up the mapping that covers 'off' in the send buffer */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Remaining bytes of this mapping starting at 'off' */
	return mdss_data_len - (off - mdss_subflow_off);
}
5725
5726 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5727 mptcp_get_maxseg(struct mptses *mpte)
5728 {
5729 struct mptsub *mpts;
5730 uint32_t maxseg = 0;
5731
5732 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5733 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5734
5735 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5736 TCPS_HAVERCVDFIN2(tp->t_state)) {
5737 continue;
5738 }
5739
5740 if (tp->t_maxseg > maxseg) {
5741 maxseg = tp->t_maxseg;
5742 }
5743 }
5744
5745 return maxseg;
5746 }
5747
5748 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5749 mptcp_get_rcvscale(struct mptses *mpte)
5750 {
5751 struct mptsub *mpts;
5752 uint8_t rcvscale = UINT8_MAX;
5753
5754 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5755 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5756
5757 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5758 TCPS_HAVERCVDFIN2(tp->t_state)) {
5759 continue;
5760 }
5761
5762 if (tp->rcv_scale < rcvscale) {
5763 rcvscale = tp->rcv_scale;
5764 }
5765 }
5766
5767 return rcvscale;
5768 }
5769
/* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	/* The most constraining (smallest) window scale among subflows */
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* UINT8_MAX: no established subflow, cannot size the buffer */
	if (rcvscale == UINT8_MAX) {
		return;
	}

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5803
/*
 * Auto-grow the MPTCP-level receive socket buffer, sized by summing the
 * subflows' receive-buffer sizes (a conservative heuristic; see below).
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5853
5854 /*
5855 * Determine if we can grow the recieve socket buffer to avoid sending
5856 * a zero window update to the peer. We allow even socket buffers that
5857 * have fixed size (set by the application) to grow if the resource
5858 * constraints are met. They will also be trimmed after the application
5859 * reads data.
5860 *
5861 * Similar to tcp_sbrcv_grow_rwin
5862 */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow increment: 16x the largest subflow MSS */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background (throttled) receivers do not get to grow the buffer */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/* Only grow when near-full, under the global max, and not idling */
	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
5885
/* Similar to tcp_sbspace: free space in the MPTCP-level receive buffer */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer to avoid a zero-window update */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Space is bounded by both the byte count and mbuf accounting */
	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5920
5921 /*
5922 * Support Fallback to Regular TCP
5923 */
5924 void
mptcp_notify_mpready(struct socket * so)5925 mptcp_notify_mpready(struct socket *so)
5926 {
5927 struct tcpcb *tp = NULL;
5928
5929 if (so == NULL) {
5930 return;
5931 }
5932
5933 tp = intotcpcb(sotoinpcb(so));
5934
5935 if (tp == NULL) {
5936 return;
5937 }
5938
5939 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5940 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5941 struct tcpcb *, tp);
5942
5943 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5944 return;
5945 }
5946
5947 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5948 return;
5949 }
5950
5951 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5952 tp->t_mpflags |= TMPF_MPTCP_READY;
5953
5954 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5955 }
5956
5957 void
mptcp_notify_mpfail(struct socket * so)5958 mptcp_notify_mpfail(struct socket *so)
5959 {
5960 struct tcpcb *tp = NULL;
5961
5962 if (so == NULL) {
5963 return;
5964 }
5965
5966 tp = intotcpcb(sotoinpcb(so));
5967
5968 if (tp == NULL) {
5969 return;
5970 }
5971
5972 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5973 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5974 struct tcpcb *, tp);
5975
5976 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5977 return;
5978 }
5979
5980 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5981 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5982
5983 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5984 }
5985
5986 /*
5987 * Keepalive helper function
5988 */
5989 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)5990 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5991 {
5992 boolean_t ret = 1;
5993
5994 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5995
5996 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5997 ret = 0;
5998 }
5999 return ret;
6000 }
6001
6002 /*
6003 * MPTCP t_maxseg adjustment function
6004 */
6005 int
mptcp_adj_mss(struct tcpcb * tp,boolean_t mtudisc)6006 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
6007 {
6008 int mss_lower = 0;
6009 struct mptcb *mp_tp = tptomptp(tp);
6010
6011 #define MPTCP_COMPUTE_LEN { \
6012 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
6013 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
6014 mss_lower += 2; \
6015 else \
6016 /* adjust to 32-bit boundary + EOL */ \
6017 mss_lower += 2; \
6018 }
6019 if (mp_tp == NULL) {
6020 return 0;
6021 }
6022
6023 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6024
6025 /*
6026 * For the first subflow and subsequent subflows, adjust mss for
6027 * most common MPTCP option size, for case where tcp_mss is called
6028 * during option processing and MTU discovery.
6029 */
6030 if (!mtudisc) {
6031 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
6032 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
6033 MPTCP_COMPUTE_LEN;
6034 }
6035
6036 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
6037 tp->t_mpflags & TMPF_SENT_JOIN) {
6038 MPTCP_COMPUTE_LEN;
6039 }
6040 } else {
6041 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
6042 MPTCP_COMPUTE_LEN;
6043 }
6044 }
6045
6046 return mss_lower;
6047 }
6048
/*
 * Fill one mptcp_flow_t record for the pcblist sysctl from a subflow
 * socket: connection info, src/dst endpoints and per-subflow state.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	/* Endpoint addresses depend on the subflow's address family */
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6083
/*
 * sysctl handler exporting every MPTCP connection as a conninfo_mptcp_t
 * followed by one mptcp_flow_t per subflow.  Read-only: writes (newptr
 * set) are rejected with EPERM.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: estimate with headroom, assuming 4 flows/pcb */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8) * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot connection-level state under the socket lock */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/* One flow record is embedded in the struct itself */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		/* Fill and emit the per-subflow records */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6180
/* sysctl net.inet.mptcp.pcblist: dump all MPTCP connections + subflows */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6184
6185 /*
6186 * Set notsent lowat mark on the MPTCB
6187 */
6188 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6189 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6190 {
6191 struct mptcb *mp_tp = NULL;
6192 int error = 0;
6193
6194 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6195 mp_tp = mpte->mpte_mptcb;
6196 }
6197
6198 if (mp_tp) {
6199 mp_tp->mpt_notsent_lowat = optval;
6200 } else {
6201 error = EINVAL;
6202 }
6203
6204 return error;
6205 }
6206
6207 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6208 mptcp_get_notsent_lowat(struct mptses *mpte)
6209 {
6210 struct mptcb *mp_tp = NULL;
6211
6212 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6213 mp_tp = mpte->mpte_mptcb;
6214 }
6215
6216 if (mp_tp) {
6217 return mp_tp->mpt_notsent_lowat;
6218 } else {
6219 return 0;
6220 }
6221 }
6222
/*
 * Return 1 if the MPTCP socket is writeable according to the
 * NOTSENT_LOWAT criterion, 0 otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/* Writeable if nothing is queued, or unsent data is under lowat */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Decision is based solely on the first active subflow */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			return retval;
		}
	}
	return 0;
}
6272
/*
 * Kernel-control connect handler for the Symptoms agent.  Records the
 * client's unit so advisories can be sent back; only a single client
 * is expected at a time.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* A second concurrent open is unexpected, but tolerated */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
	}

	mptcp_kern_skt_unit = sac->sc_unit;

	return 0;
}
6287
/*
 * Symptoms granted network access to the app identified by 'uuid':
 * re-evaluate the subflows of every MPTCP connection owned by (or
 * delegated to) that app.  An rssi above
 * MPTCP_TARGET_BASED_RSSI_THRESHOLD prohibits cell subflows while
 * re-evaluating.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Match the effective UUID (delegated) or the last UUID */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		/* The flags above steer what gets added/removed here */
		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6331
/*
 * Symptoms reported a change of the Wi-Fi status (see
 * mptcp_symptoms_ctl_send()). Re-evaluate the subflows of every MPTCP
 * connection whose service type is driven by that status.
 */
static void
mptcp_wifi_status_changed(void)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Lock order: mppi_lock is held across each per-socket lock */
		socket_lock(mp_so, 1);

		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
			goto next;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_check_subflows_and_remove(mpte);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6363
/*
 * State shared between the proc-iteration filter and callout used to
 * resolve a process from its executable UUID (see mptcp_find_proc()).
 */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;		/* executable UUID being searched for */
	proc_t found_proc;		/* matching proc, or PROC_NULL */
	boolean_t is_proc_found;	/* lets the filter stop claiming after first match */
};
6369
6370 static int
mptcp_find_proc_filter(proc_t p,void * arg)6371 mptcp_find_proc_filter(proc_t p, void *arg)
6372 {
6373 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6374 int found;
6375
6376 if (info->is_proc_found) {
6377 return 0;
6378 }
6379
6380 /*
6381 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6382 * expects != 0 for a matching filter.
6383 */
6384 found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6385 if (found) {
6386 info->is_proc_found = true;
6387 }
6388
6389 return found;
6390 }
6391
6392 static int
mptcp_find_proc_callout(proc_t p,void * arg)6393 mptcp_find_proc_callout(proc_t p, void * arg)
6394 {
6395 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6396
6397 if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6398 info->found_proc = p;
6399 return PROC_CLAIMED_DONE;
6400 }
6401
6402 return PROC_RETURNED;
6403 }
6404
6405 static proc_t
mptcp_find_proc(const uuid_t uuid)6406 mptcp_find_proc(const uuid_t uuid)
6407 {
6408 struct mptcp_uuid_search_info info;
6409
6410 uuid_copy(info.target_uuid, uuid);
6411 info.found_proc = PROC_NULL;
6412 info.is_proc_found = false;
6413
6414 proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6415 mptcp_find_proc_filter, &info);
6416
6417 return info.found_proc;
6418 }
6419
/*
 * Send an MPTCP_SYMPTOMS_ASK_UUID request for this session's owning app
 * over the Symptoms kernel-control socket, including the app's UUID and
 * task priority. Symptoms' answer comes back through
 * mptcp_symptoms_ctl_send() and ends up in mptcp_allow_uuid().
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* Symptoms never connected its control socket - nobody to ask */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		/* Delegated socket: the effective pid/UUID identify the app */
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		/* No (or stale) epid - fall back to a UUID-based proc lookup */
		if (p == PROC_NULL) {
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			/* Cache the resolved pid for next time */
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		/* Non-delegated socket: use the last known pid/UUID */
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task's role onto the coarse priority Symptoms understands */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	/* Balance the proc_find()/mptcp_find_proc() reference taken above */
	proc_rele(p);
}
6501
6502 static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)6503 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6504 void *unitinfo)
6505 {
6506 #pragma unused(kctlref, kcunit, unitinfo)
6507
6508 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6509
6510 return 0;
6511 }
6512
6513 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6514 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6515 mbuf_t m, int flags)
6516 {
6517 #pragma unused(kctlref, unitinfo, flags)
6518 symptoms_advisory_t *sa = NULL;
6519
6520 if (kcunit != mptcp_kern_skt_unit) {
6521 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6522 __func__, kcunit, mptcp_kern_skt_unit);
6523 }
6524
6525 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6526 mbuf_freem(m);
6527 return EINVAL;
6528 }
6529
6530 if (mbuf_len(m) < sizeof(*sa)) {
6531 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6532 __func__, mbuf_len(m), sizeof(*sa));
6533 mbuf_freem(m);
6534 return EINVAL;
6535 }
6536
6537 sa = mbuf_data(m);
6538
6539 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6540 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6541 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6542 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6543
6544 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6545 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6546 mptcp_wifi_status_changed();
6547 }
6548 } else {
6549 struct mptcp_symptoms_answer answer;
6550 errno_t err;
6551
6552 /* We temporarily allow different sizes for ease of submission */
6553 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6554 mbuf_len(m) != sizeof(answer)) {
6555 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6556 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6557 sizeof(answer));
6558 mbuf_free(m);
6559 return EINVAL;
6560 }
6561
6562 memset(&answer, 0, sizeof(answer));
6563
6564 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6565 if (err) {
6566 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6567 mbuf_free(m);
6568 return err;
6569 }
6570
6571 mptcp_allow_uuid(answer.uuid, answer.rssi);
6572 }
6573
6574 mbuf_freem(m);
6575 return 0;
6576 }
6577
6578 void
mptcp_control_register(void)6579 mptcp_control_register(void)
6580 {
6581 /* Set up the advisory control socket */
6582 struct kern_ctl_reg mptcp_kern_ctl;
6583
6584 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6585 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6586 sizeof(mptcp_kern_ctl.ctl_name));
6587 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6588 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6589 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6590 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6591
6592 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6593 }
6594
6595 mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses * mpte)6596 mptcp_wifi_quality_for_session(struct mptses *mpte)
6597 {
6598 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6599 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6600 mptcp_advisory.sa_wifi_status) {
6601 return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
6602 }
6603
6604 /*
6605 * If it's a first-party app and we don't have any info
6606 * about the Wi-Fi state, let's be pessimistic.
6607 */
6608 return MPTCP_WIFI_QUALITY_UNSURE;
6609 } else {
6610 if (symptoms_is_wifi_lossy()) {
6611 return MPTCP_WIFI_QUALITY_BAD;
6612 }
6613
6614 /*
6615 * If we are target-based (meaning, we allow to be more lax on
6616 * the when wifi is considered bad), we only *know* about the state once
6617 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6618 *
6619 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6620 * be set.
6621 *
6622 * In any other case (while in target-mode), consider WiFi bad
6623 * and we are going to ask for allowance from Symptoms anyway.
6624 */
6625 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6626 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6627 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6628 return MPTCP_WIFI_QUALITY_GOOD;
6629 }
6630
6631 return MPTCP_WIFI_QUALITY_BAD;
6632 }
6633
6634 return MPTCP_WIFI_QUALITY_GOOD;
6635 }
6636 }
6637
6638 boolean_t
symptoms_is_wifi_lossy(void)6639 symptoms_is_wifi_lossy(void)
6640 {
6641 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6642 }
6643
6644 int
mptcp_freeq(struct mptcb * mp_tp)6645 mptcp_freeq(struct mptcb *mp_tp)
6646 {
6647 struct tseg_qent *q;
6648 int rv = 0;
6649 int count = 0;
6650
6651 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6652 LIST_REMOVE(q, tqe_q);
6653 m_freem(q->tqe_m);
6654 zfree(tcp_reass_zone, q);
6655 count++;
6656 rv = 1;
6657 }
6658 mp_tp->mpt_reassqlen = 0;
6659
6660 if (count > 0) {
6661 OSAddAtomic(-count, &mptcp_reass_total_qlen);
6662 }
6663
6664 return rv;
6665 }
6666
6667 static int
mptcp_post_event(u_int32_t event_code,int value)6668 mptcp_post_event(u_int32_t event_code, int value)
6669 {
6670 struct kev_mptcp_data event_data;
6671 struct kev_msg ev_msg;
6672
6673 memset(&ev_msg, 0, sizeof(ev_msg));
6674
6675 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6676 ev_msg.kev_class = KEV_NETWORK_CLASS;
6677 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6678 ev_msg.event_code = event_code;
6679
6680 event_data.value = value;
6681
6682 ev_msg.dv[0].data_ptr = &event_data;
6683 ev_msg.dv[0].data_length = sizeof(event_data);
6684
6685 return kev_post_msg(&ev_msg);
6686 }
6687
/*
 * Mark this subflow as using cell and, if needed, turn on the global
 * cell-in-use icon. A global refcount (mptcp_cellicon_refcount) tracks how
 * many subflows keep the icon alive; per-session mpte_cellicon_increments
 * remembers this session's share so it can be released on teardown.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Arm the toggle-rate timer on the subflow's TCP connection */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	/* OSIncrementAtomic returns the previous value */
	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	/* Refcount went 0 -> 1: actually flip the icon to cell */
	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6757
6758 void
mptcp_clear_cellicon(void)6759 mptcp_clear_cellicon(void)
6760 {
6761 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6762
6763 if (error) {
6764 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6765 __func__, error);
6766 } else {
6767 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6768 __func__);
6769 }
6770 }
6771
6772 /*
6773 * Returns true if the icon has been flipped to WiFi.
6774 */
6775 static boolean_t
__mptcp_unset_cellicon(uint32_t val)6776 __mptcp_unset_cellicon(uint32_t val)
6777 {
6778 VERIFY(val < INT32_MAX);
6779 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
6780 return false;
6781 }
6782
6783 mptcp_clear_cellicon();
6784
6785 return true;
6786 }
6787
/*
 * Release 'val' of this session's cellicon references, optionally tied to
 * a specific subflow 'mpts' (may be NULL when releasing a session's
 * remaining increments at teardown). Clears the icon when the global
 * refcount drops to zero.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	/* Session holds increments but the global count is already zero */
	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Subflow-scoped release: only if this subflow set the icon */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never release more than this session actually holds */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	/* true only when the global refcount just reached zero */
	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6834
6835 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6836 mptcp_reset_rexmit_state(struct tcpcb *tp)
6837 {
6838 struct mptsub *mpts;
6839 struct inpcb *inp;
6840 struct socket *so;
6841
6842 inp = tp->t_inpcb;
6843 if (inp == NULL) {
6844 return;
6845 }
6846
6847 so = inp->inp_socket;
6848 if (so == NULL) {
6849 return;
6850 }
6851
6852 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6853 return;
6854 }
6855
6856 mpts = tp->t_mpsub;
6857
6858 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6859 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6860 }
6861
6862 void
mptcp_reset_keepalive(struct tcpcb * tp)6863 mptcp_reset_keepalive(struct tcpcb *tp)
6864 {
6865 struct mptsub *mpts = tp->t_mpsub;
6866
6867 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6868 }
6869
/*
 * Protocol pr_init callback.
 *
 * One-time MPTCP setup: clones the PF_INET/PF_INET6 TCP protosw entries
 * into subflow-specific copies (with overridden usrreqs), and initializes
 * the MPTCP PCB info (zone, lock, GC/timer hooks) before attaching it to
 * the MP domain.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	/* Until Symptoms says otherwise, assume Wi-Fi is fine */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	/* Override receive/send/rcvoob for subflow sockets */
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same setup, for the IPv6 side */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* MPTCP PCB info: allocation zone, global lock, GC/timer callbacks */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_zone = zone_create("mptc", sizeof(struct mpp_mtp), ZC_NONE);

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
6955