1 /*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
111
/* Forward declarations for helpers defined later in this file. */
static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
static int mptcp_freeq(struct mptcb *mp_tp);

/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
 * indicate errors or actions which require immediate attention; they will
 * prevent the rest of the handlers from processing their respective events
 * until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE = 1,          /* delete this subflow */
	MPTS_EVRET_OK = 2,              /* OK */
	MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
} ev_ret_t;

/* Key-hash helpers used by the MPTCP handshake (HMAC token generation). */
static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_do_sha256(mptcp_key_t *, char *);

static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);

/* Typed kalloc zones for subflow, socket-option and subflow-auth entries. */
static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
    NET_KT_DEFAULT);

/* Global Multipath PCB info: tracks all MPTCP PCBs (see file header). */
struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");


/* Optional alternate destination port for MPTCP subflows (0 = disabled). */
static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;

static uint8_t mptcp_create_subflows_scheduled;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

/* Number of MPTCP sessions currently asserting the cellular status icon. */
uint32_t mptcp_cellicon_refcount = 0;

os_log_t mptcp_log_handle;
172
173 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)174 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
175 {
176 int i, index = -1;
177
178 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
179 if (create && stats[i].ifindex == IFSCOPE_NONE) {
180 if (index < 0) {
181 index = i;
182 }
183 continue;
184 }
185
186 if (stats[i].ifindex == ifindex) {
187 index = i;
188 return index;
189 }
190 }
191
192 if (index != -1) {
193 stats[index].ifindex = ifindex;
194 }
195
196 return index;
197 }
198
199 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)200 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
201 {
202 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
203 int index;
204
205 if (ifp == NULL) {
206 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
207 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
208 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
209 return -1;
210 }
211
212 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
213
214 if (index != -1) {
215 if (stats[index].is_expensive == 0) {
216 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
217 }
218 }
219
220 return index;
221 }
222
223 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)224 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
225 {
226 int index;
227
228 tcpstat.tcps_mp_switches++;
229 mpte->mpte_subflow_switches++;
230
231 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
232
233 if (index != -1) {
234 mpte->mpte_itfstats[index].switches++;
235 }
236 }
237
238 /*
239 * Flushes all recorded socket options from an MP socket.
240 */
241 static void
mptcp_flush_sopts(struct mptses * mpte)242 mptcp_flush_sopts(struct mptses *mpte)
243 {
244 struct mptopt *mpo, *tmpo;
245
246 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
247 mptcp_sopt_remove(mpte, mpo);
248 mptcp_sopt_free(mpo);
249 }
250 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
251 }
252
253 /*
254 * Create an MPTCP session, called as a result of opening a MPTCP socket.
255 */
256 int
mptcp_session_create(struct mppcb * mpp)257 mptcp_session_create(struct mppcb *mpp)
258 {
259 struct mpp_mtp *mtp;
260 struct mppcbinfo *mppi;
261 struct mptses *mpte;
262 struct mptcb *mp_tp;
263
264 VERIFY(mpp != NULL);
265 mppi = mpp->mpp_pcbinfo;
266 VERIFY(mppi != NULL);
267
268 mtp = __container_of(mpp, struct mpp_mtp, mpp);
269 mpte = &mtp->mpp_ses;
270 mp_tp = &mtp->mtcb;
271
272 /* MPTCP Multipath PCB Extension */
273 bzero(mpte, sizeof(*mpte));
274 VERIFY(mpp->mpp_pcbe == NULL);
275 mpp->mpp_pcbe = mpte;
276 mpte->mpte_mppcb = mpp;
277 mpte->mpte_mptcb = mp_tp;
278
279 TAILQ_INIT(&mpte->mpte_sopts);
280 TAILQ_INIT(&mpte->mpte_subflows);
281 mpte->mpte_associd = SAE_ASSOCID_ANY;
282 mpte->mpte_connid_last = SAE_CONNID_ANY;
283
284 mptcp_init_urgency_timer(mpte);
285
286 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
287 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
288
289 if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
290 mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
291 }
292
293 mpte->mpte_last_cellicon_set = tcp_now;
294
295 /* MPTCP Protocol Control Block */
296 bzero(mp_tp, sizeof(*mp_tp));
297 mp_tp->mpt_mpte = mpte;
298 mp_tp->mpt_state = MPTCPS_CLOSED;
299
300 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
301
302 return 0;
303 }
304
305 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)306 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
307 {
308 if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
309 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
310 }
311
312 if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
313 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
314 }
315
316 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
317 * meaning we prefer IPv6 over IPv4.
318 */
319 if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
320 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
321 }
322
323 if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
324 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
325 }
326
327 /* We don't yet have a unicast IP */
328 return NULL;
329 }
330
331 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)332 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
333 uint64_t *cellbytes, uint64_t *allbytes)
334 {
335 int64_t mycellbytes = 0;
336 uint64_t myallbytes = 0;
337 int i;
338
339 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
340 if (mpte->mpte_itfstats[i].is_expensive) {
341 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
342 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
343 }
344
345 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
346 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
347 }
348
349 if (initial_cell) {
350 mycellbytes -= mpte->mpte_init_txbytes;
351 mycellbytes -= mpte->mpte_init_rxbytes;
352 }
353
354 if (mycellbytes < 0) {
355 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
356 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
357 *cellbytes = 0;
358 *allbytes = 0;
359 } else {
360 *cellbytes = mycellbytes;
361 *allbytes = myallbytes;
362 }
363 }
364
/*
 * Record end-of-session statistics, keyed by the session's service type.
 *
 * For each service type we count attempts/successes (split into first-party
 * and regular apps), cell<->wifi transitions, and - on successful
 * handshakes - the byte totals gathered by mptcpstats_get_bytes().
 * NOTE(review): the switch has no case for other service types (e.g.
 * pure-handover/target-based, if defined in this build) - those sessions
 * only contribute to the two unconditional counters at the bottom; confirm
 * this is intended.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial (first) subflow was on cellular. */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell but later used WiFi. */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi but later used cell. */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				/* Started on WiFi, had to fall back to cell. */
				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	/* Independent of service type: count cell->WiFi recoveries ... */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	/* ... and sessions that asked for cellular to be brought up. */
	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
484
485 /*
486 * Destroy an MPTCP session.
487 */
488 static void
mptcp_session_destroy(struct mptses * mpte)489 mptcp_session_destroy(struct mptses *mpte)
490 {
491 struct mptcb *mp_tp = mpte->mpte_mptcb;
492
493 VERIFY(mp_tp != NULL);
494 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
495
496 mptcpstats_session_wrapup(mpte);
497 mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
498 mptcp_flush_sopts(mpte);
499
500 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
501 kfree_data(mpte->mpte_itfinfo,
502 sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
503 }
504 mpte->mpte_itfinfo = NULL;
505
506 mptcp_freeq(mp_tp);
507 m_freem_list(mpte->mpte_reinjectq);
508
509 os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
510 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
511 }
512
513 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)514 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
515 {
516 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
517 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
518 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
519 }
520
/*
 * Synthesize an IPv6 address from a NAT64 prefix and an IPv4 address.
 *
 * The IPv4 bytes are embedded into @addr at the position dictated by the
 * prefix length, following the RFC 6052 layout (note the split around
 * byte 8, the "u" octet, which must stay zero for the /40..../64 cases).
 * Returns 0 on success, -1 when @addrv4 is in a range that must not be
 * translated (special-purpose / non-global IPv4 space).
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00},
	};
	const char *ptrv4 = (const char *)addrv4;
	char *ptr = (char *)addr;

	/* Reject IPv4 ranges that must never be NAT64-translated. */
	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix (64:ff9b::/96, RFC 6052 section 3):
	 * it additionally forbids embedding private/shared address space. */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	/* Embed the 4 IPv4 bytes at the RFC 6052 position for this prefix
	 * length; byte 8 is skipped where the address straddles it. */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u", len);
	}

	return 0;
}
580
/*
 * Ask NECP's baseband radio manager to bring up the cellular interface
 * for this session's NECP client. Records success in mpte_triggered_cell
 * so the session-wrapup stats can account for it.
 */
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		/* Drop the MPTCP socket lock across the NECP call; the
		 * session may change state while unlocked. */
		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		if (err == 0) {
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
607
608 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)609 mptcp_subflow_disconnecting(struct mptsub *mpts)
610 {
611 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
612 return true;
613 }
614
615 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
616 return true;
617 }
618
619 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
620 return true;
621 }
622
623 return false;
624 }
625
626 /*
627 * In Handover mode, only create cell subflow if
628 * - Symptoms marked WiFi as weak:
629 * Here, if we are sending data, then we can check the RTO-state. That is a
630 * stronger signal of WiFi quality than the Symptoms indicator.
631 * If however we are not sending any data, the only thing we can do is guess
632 * and thus bring up Cell.
633 *
634 * - Symptoms marked WiFi as unknown:
635 * In this state we don't know what the situation is and thus remain
636 * conservative, only bringing up cell if there are retransmissions going on.
637 */
638 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)639 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
640 {
641 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
642
643 if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
644 /* WiFi is good - don't use cell */
645 return false;
646 }
647
648 if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
649 /*
650 * We are in unknown state, only use Cell if we have confirmed
651 * that WiFi is bad.
652 */
653 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
654 return true;
655 } else {
656 return false;
657 }
658 }
659
660 if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
661 /*
662 * WiFi is confirmed to be bad from Symptoms-Framework.
663 * If we are sending data, check the RTOs.
664 * Otherwise, be pessimistic and use Cell.
665 */
666 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
667 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
668 return true;
669 } else {
670 return false;
671 }
672 } else {
673 return true;
674 }
675 }
676
677 return false;
678 }
679
/*
 * Walk the session's interface-info list and create missing subflows.
 *
 * For each usable interface (not flagged as MPTCP-unsupported and still
 * present in the system) we first check the existing subflows: depending
 * on the service type a healthy WiFi subflow suppresses cell usage, and
 * an existing subflow on the same interface suppresses creation. When no
 * cellular interface is viable but one is wanted, we ask for cellular to
 * be brought up. IPv4 destinations on v6-only interfaces are translated
 * through the interface's NAT64 prefix.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Map the ifindex back to an ifnet; it may be gone by now. */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* In handover modes, don't even consider cell while
			 * Symptoms reports WiFi as good. */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* Keep WiFi-only while before the target time
				 * or while WiFi is still reported good. */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Cell bring-up needs Symptoms' blessing unless the app is
		 * first-party, pre-granted, or developer mode is on. */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		/* Non-NULL: the earlier mptcp_get_session_dst(false, false)
		 * check guarantees at least one address family is set. */
		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Use the first prefix with a non-zero length. */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Skip when the interface can't actually carry this family. */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
893
894 static void
mptcp_remove_cell_subflows(struct mptses * mpte)895 mptcp_remove_cell_subflows(struct mptses *mpte)
896 {
897 struct mptsub *mpts, *tmpts;
898
899 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
900 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
901
902 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
903 continue;
904 }
905
906 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
907 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
908
909 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
910 }
911
912 return;
913 }
914
915 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)916 mptcp_remove_wifi_subflows(struct mptses *mpte)
917 {
918 struct mptsub *mpts, *tmpts;
919
920 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
921 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
922
923 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
924 continue;
925 }
926
927 os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
928 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
929
930 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
931 }
932
933 return;
934 }
935
/*
 * Pure-handover policy: keep exactly one "side" of the connection alive.
 *
 * Scan all subflows for established, non-disconnecting ones and classify
 * them as working-WiFi (only if the handover heuristic says cell is not
 * needed) or working-cell. Then: with a working WiFi subflow (or WiFi
 * reported good) drop all cellular subflows; otherwise, if cell is
 * carrying the connection, drop the WiFi subflows instead.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no connected subflow
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully-established, non-disconnecting subflows count. */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* WiFi only counts as working if the handover
			 * heuristic would not switch to cell. */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		/* Cell is carrying the connection - retire WiFi instead. */
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
998
/*
 * Handover policy: tear down cellular subflows once at least one
 * established non-cellular subflow is working well enough (per the
 * mptcp_handover_use_cellular() heuristic). If no such subflow exists,
 * the cellular subflows are left untouched.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully-established subflows qualify. */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1046
1047 static void
mptcp_targetbased_subflows_remove(struct mptses * mpte)1048 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1049 {
1050 uint64_t time_now = mach_continuous_time();
1051 struct mptsub *mpts;
1052
1053 if (mpte->mpte_time_target != 0 &&
1054 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1055 mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
1056 /* WiFi is bad and we are below the target - don't remove any subflows */
1057 return;
1058 }
1059
1060 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1061 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1062
1063 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1064 continue;
1065 }
1066
1067 /* We have a functioning subflow on WiFi. No need for cell! */
1068 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1069 !mptcp_subflow_disconnecting(mpts)) {
1070 mptcp_remove_cell_subflows(mpte);
1071 break;
1072 }
1073 }
1074 }
1075
1076 /*
1077 * Based on the MPTCP Service-type and the state of the subflows, we
1078 * will destroy subflows here.
1079 */
1080 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1081 mptcp_check_subflows_and_remove(struct mptses *mpte)
1082 {
1083 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1084 return;
1085 }
1086
1087 socket_lock_assert_owned(mptetoso(mpte));
1088
1089 if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1090 mptcp_pure_handover_subflows_remove(mpte);
1091 }
1092
1093 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1094 mptcp_handover_subflows_remove(mpte);
1095 }
1096
1097 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1098 mptcp_targetbased_subflows_remove(mpte);
1099 }
1100 }
1101
1102 static void
mptcp_remove_subflows(struct mptses * mpte)1103 mptcp_remove_subflows(struct mptses *mpte)
1104 {
1105 struct mptsub *mpts, *tmpts;
1106
1107 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1108 return;
1109 }
1110
1111 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1112 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1113 boolean_t found = false;
1114 uint32_t ifindex;
1115 uint32_t i;
1116
1117 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1118 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1119
1120 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1121 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1122 ifp ? ifp->if_index : -1);
1123 soevent(mpts->mpts_socket,
1124 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1125
1126 continue;
1127 }
1128
1129 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1130 continue;
1131 }
1132
1133 if (ifp) {
1134 ifindex = ifp->if_index;
1135 } else {
1136 ifindex = mpts->mpts_ifscope;
1137 }
1138
1139 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1140 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1141 continue;
1142 }
1143
1144 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1145 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1146 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1147 found = true;
1148 break;
1149 }
1150
1151 if (mpts->mpts_dst.sa_family == AF_INET &&
1152 mpte->mpte_itfinfo[i].has_v4_conn) {
1153 found = true;
1154 break;
1155 }
1156 }
1157 }
1158
1159 if (!found) {
1160 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1161 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1162 ifindex, mpts->mpts_flags);
1163
1164 soevent(mpts->mpts_socket,
1165 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1166 }
1167 }
1168 }
1169
1170 static void
mptcp_create_subflows(__unused void * arg)1171 mptcp_create_subflows(__unused void *arg)
1172 {
1173 struct mppcb *mpp;
1174
1175 /*
1176 * Start with clearing, because we might be processing connections
1177 * while a new event comes in.
1178 */
1179 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
1180 os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
1181 }
1182
1183 /* Iterate over all MPTCP connections */
1184
1185 lck_mtx_lock(&mtcbinfo.mppi_lock);
1186
1187 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1188 struct socket *mp_so = mpp->mpp_socket;
1189 struct mptses *mpte = mpp->mpp_pcbe;
1190
1191 socket_lock(mp_so, 1);
1192 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1193 socket_unlock(mp_so, 1);
1194 continue;
1195 }
1196
1197 VERIFY(mp_so->so_usecount > 0);
1198
1199 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1200
1201 mptcp_check_subflows_and_add(mpte);
1202 mptcp_remove_subflows(mpte);
1203
1204 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1205 socket_unlock(mp_so, 1);
1206 }
1207
1208 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1209 }
1210
1211 /*
1212 * We need this because we are coming from an NECP-event. This event gets posted
1213 * while holding NECP-locks. The creation of the subflow however leads us back
1214 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1215 * So, we would deadlock there as we already hold the NECP-lock.
1216 *
1217 * So, let's schedule this separately. It also gives NECP the chance to make
1218 * progress, without having to wait for MPTCP to finish its subflow creation.
1219 */
1220 void
mptcp_sched_create_subflows(struct mptses * mpte)1221 mptcp_sched_create_subflows(struct mptses *mpte)
1222 {
1223 struct mppcb *mpp = mpte->mpte_mppcb;
1224 struct mptcb *mp_tp = mpte->mpte_mptcb;
1225 struct socket *mp_so = mpp->mpp_socket;
1226
1227 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1228 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1229 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1230 return;
1231 }
1232
1233 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1234 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1235 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1236 }
1237
1238 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1239 return;
1240 }
1241
1242 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1243 timeout(mptcp_create_subflows, NULL, hz / 10);
1244 }
1245
1246 /*
1247 * Allocate an MPTCP socket option structure.
1248 */
1249 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1250 mptcp_sopt_alloc(zalloc_flags_t how)
1251 {
1252 return zalloc_flags(mptopt_zone, how | Z_ZERO);
1253 }
1254
1255 /*
1256 * Free an MPTCP socket option structure.
1257 */
1258 void
mptcp_sopt_free(struct mptopt * mpo)1259 mptcp_sopt_free(struct mptopt *mpo)
1260 {
1261 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1262
1263 zfree(mptopt_zone, mpo);
1264 }
1265
1266 /*
1267 * Add a socket option to the MPTCP socket option list.
1268 */
1269 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1270 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1271 {
1272 socket_lock_assert_owned(mptetoso(mpte));
1273 mpo->mpo_flags |= MPOF_ATTACHED;
1274 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1275 }
1276
1277 /*
1278 * Remove a socket option from the MPTCP socket option list.
1279 */
1280 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1281 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1282 {
1283 socket_lock_assert_owned(mptetoso(mpte));
1284 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1285 mpo->mpo_flags &= ~MPOF_ATTACHED;
1286 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1287 }
1288
1289 /*
1290 * Search for an existing <sopt_level,sopt_name> socket option.
1291 */
1292 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1293 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1294 {
1295 struct mptopt *mpo;
1296
1297 socket_lock_assert_owned(mptetoso(mpte));
1298
1299 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1300 if (mpo->mpo_level == sopt->sopt_level &&
1301 mpo->mpo_name == sopt->sopt_name) {
1302 break;
1303 }
1304 }
1305 return mpo;
1306 }
1307
1308 /*
1309 * Allocate a MPTCP subflow structure.
1310 */
1311 static struct mptsub *
mptcp_subflow_alloc(void)1312 mptcp_subflow_alloc(void)
1313 {
1314 return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1315 }
1316
1317 /*
1318 * Deallocate a subflow structure, called when all of the references held
1319 * on it have been released. This implies that the subflow has been deleted.
1320 */
1321 static void
mptcp_subflow_free(struct mptsub * mpts)1322 mptcp_subflow_free(struct mptsub *mpts)
1323 {
1324 VERIFY(mpts->mpts_refcnt == 0);
1325 VERIFY(mpts->mpts_mpte == NULL);
1326 VERIFY(mpts->mpts_socket == NULL);
1327
1328 free_sockaddr(mpts->mpts_src);
1329
1330 zfree(mptsub_zone, mpts);
1331 }
1332
/*
 * Take an additional reference on a subflow structure.
 *
 * The counter must never wrap back to zero; if it does, the reference
 * bookkeeping is broken and we panic.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1341
1342 static void
mptcp_subflow_remref(struct mptsub * mpts)1343 mptcp_subflow_remref(struct mptsub *mpts)
1344 {
1345 if (mpts->mpts_refcnt == 0) {
1346 panic("%s: mpts %p negative refcnt", __func__, mpts);
1347 /* NOTREACHED */
1348 }
1349 if (--mpts->mpts_refcnt > 0) {
1350 return;
1351 }
1352
1353 /* callee will unlock and destroy lock */
1354 mptcp_subflow_free(mpts);
1355 }
1356
/*
 * Link a newly created subflow socket "so" into the MPTCP session "mpte".
 *
 * Wires up the TCP PCB back-pointers, adds the subflow to the session's
 * subflow list, and takes the references that represent list membership
 * and the subflow socket.  The MPTCP socket's use count is bumped so it
 * stays around for as long as the subflow exists.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++; /* subflow pins the MPTCP socket */

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);     /* for subflow socket */
}
1385
/*
 * NECP callback for subflow sockets.
 *
 * Invoked by NECP when the viability of a subflow's path changes.  Only
 * NECP_CLIENT_CBACTION_NONVIABLE is acted upon (a low-power interface is
 * treated the same way): the subflow is flagged for closure and the
 * subflow-creation worker is scheduled so a replacement can be set up.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* A low-power interface is handled like a non-viable one */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Ask the event path to close this subflow */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	/* Kick off creation of a replacement subflow */
	mptcp_sched_create_subflows(mpte);

	/*
	 * For handover/target-based sessions, report the flow as still
	 * viable to NECP; MPTCP manages the transition itself.
	 */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1440
/*
 * Create an MPTCP subflow socket.
 *
 * Creates a TCP socket in domain "dom" (PF_INET or PF_INET6) for the
 * subflow "mpts", attaches it to the session "mpte", and inherits the
 * relevant flags, NECP attributes and socket options from the MPTCP
 * socket.  Returns 0 with *so pointing to the new subflow socket, or an
 * errno; on failure the subflow is freed (early) or aborted (late).
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	struct mppcb *mpp;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);
	mpp = mpsotomppcb(mp_so);

	/* Credentials/delegation are derived from the owning process */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		mptcp_subflow_free(mpts);
		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	}
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
	}
	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
	}

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpp->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		socket_unlock(mp_so, 0);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpp->necp_client_uuid, sotoinpcb(*so));
		socket_lock(mp_so, 0);

		if (error) {
			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

			goto out_err;
		}

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_tp->mpt_state, mp_tp->mpt_flags);

			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
	}

	/* Duplicate the NECP attribute strings onto the subflow's inpcb */
	if (mpp->inp_necp_attributes.inp_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_account != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
		}
	}

	/* Needs to happen prior to the delegation! */
	(*so)->last_pid = mp_so->last_pid;

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid) {
			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
		if (!uuid_is_null(mpte->mpte_euuid)) {
			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof(smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
			goto out_err;
		}
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
			continue;
		}

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE)) {
			continue;
		}

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return 0;

out_err:
	/* Subflow was already attached; abort tears it down again */
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	return error;
}
1725
1726 /*
1727 * Close an MPTCP subflow socket.
1728 *
1729 * Note that this may be called on an embryonic subflow, and the only
1730 * thing that is guaranteed valid is the protocol-user request.
1731 */
1732 static void
mptcp_subflow_soclose(struct mptsub * mpts)1733 mptcp_subflow_soclose(struct mptsub *mpts)
1734 {
1735 struct socket *so = mpts->mpts_socket;
1736
1737 if (mpts->mpts_flags & MPTSF_CLOSED) {
1738 return;
1739 }
1740
1741 VERIFY(so != NULL);
1742 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1743 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1744
1745 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1746 struct socket *, so,
1747 struct sockbuf *, &so->so_rcv,
1748 struct sockbuf *, &so->so_snd,
1749 struct mptses *, mpts->mpts_mpte);
1750
1751 mpts->mpts_flags |= MPTSF_CLOSED;
1752
1753 if (so->so_retaincnt == 0) {
1754 soclose_locked(so);
1755
1756 return;
1757 } else {
1758 VERIFY(so->so_usecount > 0);
1759 so->so_usecount--;
1760 }
1761
1762 return;
1763 }
1764
1765 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1766 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1767 {
1768 struct tcpcb *tp = sototcpcb(so);
1769 struct mptcp_subf_auth_entry *sauth_entry;
1770
1771 /*
1772 * The address ID of the first flow is implicitly 0.
1773 */
1774 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1775 tp->t_local_aid = 0;
1776 } else {
1777 tp->t_local_aid = addr_id;
1778 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1779 so->so_flags |= SOF_MP_SEC_SUBFLOW;
1780 }
1781 sauth_entry = zalloc(mpt_subauth_zone);
1782 sauth_entry->msae_laddr_id = tp->t_local_aid;
1783 sauth_entry->msae_raddr_id = 0;
1784 sauth_entry->msae_raddr_rand = 0;
1785 try_again:
1786 sauth_entry->msae_laddr_rand = RandomULong();
1787 if (sauth_entry->msae_laddr_rand == 0) {
1788 goto try_again;
1789 }
1790 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1791 }
1792
1793 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1794 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1795 {
1796 struct mptcp_subf_auth_entry *sauth_entry;
1797 struct tcpcb *tp = NULL;
1798 int found = 0;
1799
1800 tp = sototcpcb(so);
1801 if (tp == NULL) {
1802 return;
1803 }
1804
1805 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1806 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1807 found = 1;
1808 break;
1809 }
1810 }
1811 if (found) {
1812 LIST_REMOVE(sauth_entry, msae_next);
1813 }
1814
1815 if (found) {
1816 zfree(mpt_subauth_zone, sauth_entry);
1817 }
1818 }
1819
/*
 * Connect an MPTCP subflow socket.
 *
 * Attaches the subflow to the MPTCP control block (assigning its address
 * ID), then initiates the TCP connection via soconnectxlocked().  Returns
 * 0 on success or an errno (ESRCH when the owning process is gone).
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Render the destination address/port for logging */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	/* Assign address ID and create the auth entry for this subflow */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
1902
/*
 * Validate and apply the MPTCP receive mapping (DSN / subflow-relative
 * sequence / length / DATA_FIN) to an mbuf at offset "off" within the
 * mapping.  Splits the mbuf when it extends past the mapping's right
 * edge so each mbuf carries exactly one mapping.
 *
 * Returns 0 on success; -1 when the mapping is inconsistent or the split
 * fails, in which case the subflow is reset via soevent(MUSTRST).
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Nothing to map on an empty packet */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * The mbuf already carries a mapping; at a non-zero offset
		 * it must agree with the one we were handed, otherwise the
		 * peer sent inconsistent mappings and we reset the subflow.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Keep the remainder chained in the receive buffer */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp the (possibly trimmed) mbuf with its mapping, shifted by off */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1977
1978 /*
1979 * Update the pid, upid, uuid of the subflow so, based on parent so
1980 */
1981 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1982 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1983 {
1984 if (so->last_pid != mp_so->last_pid ||
1985 so->last_upid != mp_so->last_upid) {
1986 so->last_upid = mp_so->last_upid;
1987 so->last_pid = mp_so->last_pid;
1988 uuid_copy(so->last_uuid, mp_so->last_uuid);
1989 }
1990 so_update_policy(so);
1991 }
1992
1993 /*
1994 * MPTCP subflow socket receive routine, derived from soreceive().
1995 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;
	struct tcpcb *tp = sototcpcb(so);

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	/* This routine never blocks; see the EWOULDBLOCK handling below. */
	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/*
	 * Main consume loop: each iteration hands one complete DSS mapping
	 * (or, in fallback mode, one raw mbuf) over to the MPTCP layer via
	 * *mp0, unlinking it from the subflow's receive buffer.
	 */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					/* A secondary subflow must carry DSS mappings: reset it */
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					/* The DATA_FIN occupies one unit of the mapping's length */
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* Leading mbuf carries the DSS mapping itself */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/*
		 * Now, see if we need to remove previous packets.
		 * sseq is the mapping's relative subflow sequence; adding
		 * tp->irs converts it into the subflow's absolute sequence
		 * space, for comparison against what is still unread
		 * (rcv_nxt minus the bytes sitting in sb_cc).
		 */
		if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
			/* Ok, there is data in there that we don't need - let's throw it away! */
			int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);

			sbdrop(&so->so_rcv, totrim);

			m = so->so_rcv.sb_mb;
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			ASSERT(dlen == 0 || m);
			if (dlen != 0 && m == NULL) {
				/* "try" to gracefully recover on customer builds */
				error_out = 1;
				error = EIO;
				dlen = 0;

				*mp0 = NULL;

				SB_EMPTY_FIXUP(&so->so_rcv);
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				break;
			}
		}

		VERIFY(dlen == 0);

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the mapping we just consumed */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2342
2343 /*
2344 * MPTCP subflow socket send routine, derived from sosend().
2345 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	/*
	 * MPTCP only hands a pre-built mbuf chain (top) down to the subflow:
	 * no address, no uio, no control data, no flags.  The chain must be
	 * a single PKTF_MPTCP-tagged packet of 1..UINT16_MAX bytes.
	 */
	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/*
	 * Attribute the send to the process that owns the MPTCP socket; take
	 * a proc reference if it differs from the current process, released
	 * on the way out.
	 */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/*
	 * pru_send consumes top on success; clear our reference so the
	 * error path below does not double-free it.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2427
/*
 * Subflow socket write upcall.
 *
 * Called when the associated subflow socket posted a write event
 * (i.e. space became available in its send buffer).
 */
2433 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2434 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2435 {
2436 #pragma unused(so, waitf)
2437 struct mptsub *mpts = arg;
2438 struct mptses *mpte = mpts->mpts_mpte;
2439
2440 VERIFY(mpte != NULL);
2441
2442 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2443 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2444 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2445 }
2446 return;
2447 }
2448
2449 mptcp_output(mpte);
2450 }
2451
2452 /*
2453 * Subflow socket control event upcall.
2454 */
2455 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2456 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2457 {
2458 #pragma unused(so)
2459 struct mptsub *mpts = arg;
2460 struct mptses *mpte = mpts->mpts_mpte;
2461
2462 socket_lock_assert_owned(mptetoso(mpte));
2463
2464 if ((mpts->mpts_evctl & events) == events) {
2465 return;
2466 }
2467
2468 mpts->mpts_evctl |= events;
2469
2470 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2471 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2472 return;
2473 }
2474
2475 mptcp_subflow_workloop(mpte);
2476 }
2477
2478 /*
2479 * Establish an initial MPTCP connection (if first subflow and not yet
2480 * connected), or add a subflow to an existing MPTCP connection.
2481 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* src is optional; when present it must be a well-formed v4/v6 sockaddr */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		/* Z_NOFAIL: allocation cannot fail, no NULL-check needed */
		mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
		    Z_WAITOK | Z_NOFAIL);

		bcopy(src, mpts->mpts_src, src->sa_len);
	}

	/* dst is mandatory and validated the same way */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Validate the interface scope against the current interface table */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow, skipping 0 */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one. Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		/* Joins must wait until the peer is ready to accept them */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	/* Subflow socket exists at this point; abort tears it down */
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	/* Only mpts was allocated so far; free it directly */
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2680
2681 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2682 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2683 {
2684 int index = mptcpstats_get_index(stats, mpts);
2685
2686 if (index != -1) {
2687 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2688
2689 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2690 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2691
2692 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2693 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2694
2695 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2696 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2697
2698 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2699 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2700 }
2701 }
2702
2703 /*
2704 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2705 * will no longer be accessible after a subflow is deleted, thus this
2706 * should occur only after the subflow socket has been disconnected.
2707 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Fold the subflow's byte counters into the MPTCP-level stats */
	mptcpstats_update(mpte->mpte_itfstats, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	/* Snapshot the subflow's byte counts before the socket goes away */
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--; /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	/* Two references to drop: one held by the subflow list, one by the socket */
	mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts); /* for subflow socket */

	/* Sever the remaining back-pointers from the TCP side */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2753
2754 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2755 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2756 {
2757 struct socket *so = mpts->mpts_socket;
2758 struct mptcb *mp_tp = mpte->mpte_mptcb;
2759 int send_dfin = 0;
2760
2761 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2762 send_dfin = 1;
2763 }
2764
2765 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2766 (so->so_state & SS_ISCONNECTED)) {
2767 if (send_dfin) {
2768 mptcp_send_dfin(so);
2769 }
2770 soshutdownlock(so, SHUT_WR);
2771 }
2772 }
2773
2774 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2775 mptcp_subflow_abort(struct mptsub *mpts, int error)
2776 {
2777 struct socket *so = mpts->mpts_socket;
2778 struct tcpcb *tp = sototcpcb(so);
2779
2780 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2781 return;
2782 }
2783
2784 if (tp->t_state != TCPS_CLOSED) {
2785 tcp_drop(tp, error);
2786 }
2787
2788 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2789 }
2790
2791 /*
2792 * Disconnect a subflow socket.
2793 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Idempotent: nothing to do if disconnect already started or finished */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/* Past CLOSE_WAIT at the connection level, a DATA_FIN must go out */
	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		send_dfin = 1;
	}

	/* If the MPTCP socket was defuncted, defunct the subflow as well */
	if (mp_so->so_flags & SOF_DEFUNCT) {
		errno_t ret;

		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
		if (ret == 0) {
			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);

			if (ret != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
			}
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
		}
	}

	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		if (send_dfin) {
			mptcp_send_dfin(so);
		}

		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}

	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2854
2855 /*
2856 * Subflow socket input.
2857 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Guard against re-entry; cleared via the deferred-upcall handler below */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	/* Pull everything out of the subflow's receive buffer */
	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track cellular vs. wifi usage for the status-bar cell icon */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		/* Hand the received chain up to the MPTCP layer */
		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2928
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Only act on sockets that are (still) MPTCP subflows */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	/* If upcalls must be deferred, just note that a read wakeup is owed */
	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/*
		 * Hold both the subflow and its socket across the input call
		 * so neither can be torn down underneath us.
		 */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts); /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2971
2972 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)2973 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2974 {
2975 struct mbuf *so_m = so->so_snd.sb_mb;
2976 uint64_t dsn = m->m_pkthdr.mp_dsn;
2977
2978 while (so_m) {
2979 VERIFY(so_m->m_flags & M_PKTHDR);
2980 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2981
2982 /* Part of the segment is covered, don't reinject here */
2983 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2984 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2985 return TRUE;
2986 }
2987
2988 so_m = so_m->m_next;
2989 }
2990
2991 return FALSE;
2992 }
2993
2994 /*
2995 * Subflow socket output.
2996 *
2997 * Called for sending data from MPTCP to the underlying subflow socket.
2998 */
2999 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3000 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3001 {
3002 struct mptcb *mp_tp = mpte->mpte_mptcb;
3003 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3004 struct socket *mp_so, *so;
3005 struct tcpcb *tp;
3006 uint64_t mpt_dsn = 0, off = 0;
3007 int sb_cc = 0, error = 0, wakeup = 0;
3008 uint16_t dss_csum;
3009 uint16_t tot_sent = 0;
3010 boolean_t reinjected = FALSE;
3011
3012 mp_so = mptetoso(mpte);
3013 so = mpts->mpts_socket;
3014 tp = sototcpcb(so);
3015
3016 socket_lock_assert_owned(mp_so);
3017
3018 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3019 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3020
3021 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3022 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3023 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3024 (mpts->mpts_flags & MPTSF_TFO_REQD));
3025 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3026
3027 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3028 struct mptsub *, mpts);
3029
3030 /* Remove Addr Option is not sent reliably as per I-D */
3031 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3032 tp->t_rem_aid = mpte->mpte_lost_aid;
3033 tp->t_mpflags |= TMPF_SND_REM_ADDR;
3034 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3035 }
3036
3037 /*
3038 * The mbuf chains containing the metadata (as well as pointing to
3039 * the user data sitting at the MPTCP output queue) would then be
3040 * sent down to the subflow socket.
3041 *
3042 * Some notes on data sequencing:
3043 *
3044 * a. Each mbuf must be a M_PKTHDR.
3045 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3046 * in the mbuf pkthdr structure.
3047 * c. Each mbuf containing the MPTCP metadata must have its
3048 * pkt_flags marked with the PKTF_MPTCP flag.
3049 */
3050
3051 if (mpte->mpte_reinjectq) {
3052 sb_mb = mpte->mpte_reinjectq;
3053 } else {
3054 sb_mb = mp_so->so_snd.sb_mb;
3055 }
3056
3057 if (sb_mb == NULL) {
3058 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3059 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3060 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3061 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3062
3063 /* Fix it to prevent looping */
3064 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3065 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3066 }
3067 goto out;
3068 }
3069
3070 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3071
3072 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3073 !(so->so_state & SS_ISCONNECTED) &&
3074 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3075 tp->t_mpflags |= TMPF_TFO_REQUEST;
3076
3077 /* Opting to call pru_send as no mbuf at subflow level */
3078 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3079 NULL, current_proc());
3080
3081 goto done_sending;
3082 }
3083
3084 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3085
3086 /* First, drop acknowledged data */
3087 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3088 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3089 "dsn %u suna %u reinject? %u\n",
3090 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3091 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3092 if (mpte->mpte_reinjectq) {
3093 mptcp_clean_reinjectq(mpte);
3094 } else {
3095 uint64_t len = 0;
3096 len = mp_tp->mpt_snduna - mpt_dsn;
3097 sbdrop(&mp_so->so_snd, (int)len);
3098 wakeup = 1;
3099 }
3100 }
3101
3102 /* Check again because of above sbdrop */
3103 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3104 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3105 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3106 goto out;
3107 }
3108
3109 /*
3110 * In degraded mode, we don't receive data acks, so force free
3111 * mbufs less than snd_nxt
3112 */
3113 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3114 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3115 mp_so->so_snd.sb_mb) {
3116 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3117 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3118 uint64_t len = 0;
3119 len = mp_tp->mpt_snduna - mpt_dsn;
3120 sbdrop(&mp_so->so_snd, (int)len);
3121 wakeup = 1;
3122
3123 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3124 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3125 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3126 }
3127 }
3128
3129 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3130 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3131 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3132 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3133 }
3134
3135 /*
3136 * Adjust the top level notion of next byte used for retransmissions
3137 * and sending FINs.
3138 */
3139 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3140 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3141 }
3142
3143 /* Now determine the offset from which to start transmitting data */
3144 if (mpte->mpte_reinjectq) {
3145 sb_mb = mpte->mpte_reinjectq;
3146 } else {
3147 dont_reinject:
3148 sb_mb = mp_so->so_snd.sb_mb;
3149 }
3150 if (sb_mb == NULL) {
3151 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3152 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3153 goto out;
3154 }
3155
3156 if (sb_mb == mpte->mpte_reinjectq) {
3157 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3158 off = 0;
3159
3160 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3161 if (mptcp_can_send_more(mp_tp, TRUE)) {
3162 goto dont_reinject;
3163 }
3164
3165 error = ECANCELED;
3166 goto out;
3167 }
3168
3169 reinjected = TRUE;
3170 } else if (flags & MPTCP_SUBOUT_PROBING) {
3171 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3172 off = 0;
3173 } else {
3174 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3175
3176 /*
3177 * With TFO, there might be no data at all, thus still go into this
3178 * code-path here.
3179 */
3180 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3181 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3182 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3183 sb_cc -= off;
3184 } else {
3185 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3186 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3187 (uint32_t)mp_tp->mpt_sndmax);
3188
3189 goto out;
3190 }
3191 }
3192
3193 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3194 if (sb_cc <= 0) {
3195 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3196 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3197 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3198 mptcp_subflow_cwnd_space(so));
3199 }
3200
3201 sb_cc = min(sb_cc, UINT16_MAX);
3202
3203 /*
3204 * Create a DSN mapping for the data we are about to send. It all
3205 * has the same mapping.
3206 */
3207 if (reinjected) {
3208 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3209 } else {
3210 mpt_dsn = mp_tp->mpt_snduna + off;
3211 }
3212
3213 mpt_mbuf = sb_mb;
3214 while (mpt_mbuf && reinjected == FALSE &&
3215 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3216 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3217 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3218 mpt_mbuf = mpt_mbuf->m_next;
3219 }
3220 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3221
3222 head = tail = NULL;
3223
3224 while (tot_sent < sb_cc) {
3225 int32_t mlen;
3226
3227 mlen = mpt_mbuf->m_len;
3228 mlen -= off;
3229 mlen = MIN(mlen, sb_cc - tot_sent);
3230
3231 if (mlen < 0) {
3232 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3233 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3234 (uint32_t)off, sb_cc, tot_sent);
3235 goto out;
3236 }
3237
3238 if (mlen == 0) {
3239 goto next;
3240 }
3241
3242 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL,
3243 M_COPYM_MUST_COPY_HDR);
3244 if (m == NULL) {
3245 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3246 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3247 error = ENOBUFS;
3248 break;
3249 }
3250
3251 /* Create a DSN mapping for the data (m_copym does it) */
3252 VERIFY(m->m_flags & M_PKTHDR);
3253 VERIFY(m->m_next == NULL);
3254
3255 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3256 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3257 m->m_pkthdr.mp_dsn = mpt_dsn;
3258 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3259 m->m_pkthdr.len = mlen;
3260
3261 if (head == NULL) {
3262 head = tail = m;
3263 } else {
3264 tail->m_next = m;
3265 tail = m;
3266 }
3267
3268 tot_sent += mlen;
3269 off = 0;
3270 next:
3271 mpt_mbuf = mpt_mbuf->m_next;
3272 }
3273
3274 if (reinjected) {
3275 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3276 struct mbuf *n = sb_mb;
3277
3278 while (n) {
3279 n->m_pkthdr.mp_dsn += sb_cc;
3280 n->m_pkthdr.mp_rlen -= sb_cc;
3281 n = n->m_next;
3282 }
3283 m_adj(sb_mb, sb_cc);
3284 } else {
3285 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3286 m_freem(sb_mb);
3287 }
3288 }
3289
3290 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3291 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3292 tot_sent);
3293 }
3294
3295 /* Now, let's update rel-seq and the data-level length */
3296 mpts->mpts_rel_seq += tot_sent;
3297 m = head;
3298 while (m) {
3299 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3300 m->m_pkthdr.mp_csum = dss_csum;
3301 }
3302 m->m_pkthdr.mp_rlen = tot_sent;
3303 m = m->m_next;
3304 }
3305
3306 if (head != NULL) {
3307 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3308 (tp->t_tfo_stats == 0)) {
3309 tp->t_mpflags |= TMPF_TFO_REQUEST;
3310 }
3311
3312 error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3313 head = NULL;
3314 }
3315
3316 done_sending:
3317 if (error == 0 ||
3318 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3319 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3320
3321 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3322 tcpstat.tcps_mp_num_probes++;
3323 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3324 mpts->mpts_probecnt += 1;
3325 } else {
3326 mpts->mpts_probecnt +=
3327 tot_sent / mpts->mpts_maxseg;
3328 }
3329 }
3330
3331 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3332 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3333 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3334 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3335 }
3336 mp_tp->mpt_sndnxt = new_sndnxt;
3337 }
3338
3339 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3340
3341 /* Must be here as mptcp_can_send_more() checks for this */
3342 soclearfastopen(mp_so);
3343
3344 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3345 mptcp_set_cellicon(mpte, mpts);
3346
3347 mpte->mpte_used_cell = 1;
3348 } else {
3349 /*
3350 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3351 * explicitly set the cellicon, then we unset it again.
3352 */
3353 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3354 mptcp_unset_cellicon(mpte, NULL, 1);
3355 }
3356
3357 mpte->mpte_used_wifi = 1;
3358 }
3359
3360 /*
3361 * Don't propagate EWOULDBLOCK - it's already taken care of
3362 * in mptcp_usr_send for TFO.
3363 */
3364 error = 0;
3365 } else {
3366 /* We need to revert our change to mpts_rel_seq */
3367 mpts->mpts_rel_seq -= tot_sent;
3368
3369 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3370 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3371 }
3372 out:
3373
3374 if (head != NULL) {
3375 m_freem(head);
3376 }
3377
3378 if (wakeup) {
3379 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3380 }
3381
3382 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3383 return error;
3384 }
3385
/*
 * Insert mbuf chain m (carrying an MPTCP DSN-mapping in its pkthdr) into
 * the session's reinject queue, which is kept sorted by data-sequence
 * number. A segment already fully covered by an existing entry is dropped;
 * an existing entry fully covered by m is removed. Ownership of m always
 * transfers to this function (it is either queued or freed).
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number. prev trails one entry behind n.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
			break;
		}

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);

			/* Unlink n, splice tmp in its place, then free n */
			m->m_nextpkt = NULL;
			if (prev == NULL) {
				mpte->mpte_reinjectq = tmp;
			} else {
				prev->m_nextpkt = tmp;
			}

			m_freem(n);
			n = tmp;
		}
	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue.
		 * NOTE(review): this comparison mixes mp_rlen (DSS-mapping
		 * length) on the prev side with m_pkthdr.len (packet length)
		 * on the m side — presumably equal for reinjected chains, but
		 * verify; the sums also use plain >= instead of the wrap-safe
		 * MPTCP_SEQ_* macros used elsewhere.
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
			goto dont_queue;
		}
	}

	/* Link m into the queue between prev and n */
	if (prev == NULL) {
		mpte->mpte_reinjectq = m;
	} else {
		prev->m_nextpkt = m;
	}

	m->m_nextpkt = n;

	return;

dont_queue:
	m_freem(m);
	return;
}
3463
3464 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3465 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3466 {
3467 struct socket *mp_so = mptetoso(mpte);
3468 struct mbuf *m;
3469
3470 m = mp_so->so_snd.sb_mb;
3471
3472 while (m) {
3473 /* If this segment covers what we are looking for, return it. */
3474 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3475 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3476 break;
3477 }
3478
3479
3480 /* Segment is no more in the queue */
3481 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3482 return NULL;
3483 }
3484
3485 m = m->m_next;
3486 }
3487
3488 return m;
3489 }
3490
3491 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3492 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3493 {
3494 struct mbuf *top = NULL, *tail = NULL;
3495 uint64_t dsn;
3496 uint32_t dlen, rseq;
3497
3498 dsn = m->m_pkthdr.mp_dsn;
3499 dlen = m->m_pkthdr.mp_rlen;
3500 rseq = m->m_pkthdr.mp_rseq;
3501
3502 while (len > 0) {
3503 struct mbuf *n;
3504
3505 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3506
3507 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
3508 if (n == NULL) {
3509 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3510 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3511 goto err;
3512 }
3513
3514 VERIFY(n->m_flags & M_PKTHDR);
3515 VERIFY(n->m_next == NULL);
3516 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3517 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3518 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3519 VERIFY(n->m_len == m->m_len);
3520
3521 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3522
3523 if (top == NULL) {
3524 top = n;
3525 }
3526
3527 if (tail != NULL) {
3528 tail->m_next = n;
3529 }
3530
3531 tail = n;
3532
3533 len -= m->m_len;
3534 m = m->m_next;
3535 }
3536
3537 return top;
3538
3539 err:
3540 if (top) {
3541 m_freem(top);
3542 }
3543
3544 return NULL;
3545 }
3546
/*
 * Walk the subflow's send buffer and copy every DSN-mapping that is not
 * yet acknowledged at the MPTCP data level onto the session's reinject
 * queue, so the data can be retransmitted over another subflow. Every
 * mbuf of a processed mapping is marked PKTF_MPTCP_REINJ so it will not
 * be copied again on a later pass.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* m is reassigned below (lookup/copy), so orig anchors the
		 * current mapping and n remembers the next mbuf in the buffer.
		 */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection on an earlier pass */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure - give up; a later pass retries */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			/* A different DSN means the next mapping has started */
			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3617
3618 void
mptcp_clean_reinjectq(struct mptses * mpte)3619 mptcp_clean_reinjectq(struct mptses *mpte)
3620 {
3621 struct mptcb *mp_tp = mpte->mpte_mptcb;
3622
3623 socket_lock_assert_owned(mptetoso(mpte));
3624
3625 while (mpte->mpte_reinjectq) {
3626 struct mbuf *m = mpte->mpte_reinjectq;
3627
3628 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3629 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3630 break;
3631 }
3632
3633 mpte->mpte_reinjectq = m->m_nextpkt;
3634 m->m_nextpkt = NULL;
3635 m_freem(m);
3636 }
3637 }
3638
3639 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3640 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3641 uint32_t *p_mpsofilt_hint, uint32_t event)
3642 {
3643 struct socket *mp_so, *so;
3644 struct mptcb *mp_tp;
3645
3646 mp_so = mptetoso(mpte);
3647 mp_tp = mpte->mpte_mptcb;
3648 so = mpts->mpts_socket;
3649
3650 /*
3651 * We got an event for this subflow that might need to be propagated,
3652 * based on the state of the MPTCP connection.
3653 */
3654 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3655 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3656 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3657 mp_so->so_error = so->so_error;
3658 *p_mpsofilt_hint |= event;
3659 }
3660
3661 return MPTS_EVRET_OK;
3662 }
3663
3664 /*
3665 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3666 */
3667 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3668 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3669 uint32_t *p_mpsofilt_hint, uint32_t event)
3670 {
3671 struct socket *mp_so;
3672 struct tcpcb *tp;
3673
3674 mp_so = mptetoso(mpte);
3675 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3676
3677 /*
3678 * This overwrites any previous mpte_lost_aid to avoid storing
3679 * too much state when the typical case has only two subflows.
3680 */
3681 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3682 mpte->mpte_lost_aid = tp->t_local_aid;
3683
3684 /*
3685 * The subflow connection has lost its source address.
3686 */
3687 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3688
3689 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3690 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3691 }
3692
3693 return MPTS_EVRET_DELETE;
3694 }
3695
3696 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3697 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3698 uint32_t *p_mpsofilt_hint, uint32_t event)
3699 {
3700 #pragma unused(event, p_mpsofilt_hint)
3701 struct socket *so, *mp_so;
3702
3703 so = mpts->mpts_socket;
3704
3705 if (so->so_error != ENODATA) {
3706 return MPTS_EVRET_OK;
3707 }
3708
3709
3710 mp_so = mptetoso(mpte);
3711
3712 mp_so->so_error = ENODATA;
3713
3714 sorwakeup(mp_so);
3715 sowwakeup(mp_so);
3716
3717 return MPTS_EVRET_OK;
3718 }
3719
3720
3721 /*
3722 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3723 * indicates that the remote side sent a Data FIN
3724 */
3725 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3726 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3727 uint32_t *p_mpsofilt_hint, uint32_t event)
3728 {
3729 #pragma unused(event, mpts)
3730 struct mptcb *mp_tp = mpte->mpte_mptcb;
3731
3732 /*
3733 * We got a Data FIN for the MPTCP connection.
3734 * The FIN may arrive with data. The data is handed up to the
3735 * mptcp socket and the user is notified so that it may close
3736 * the socket if needed.
3737 */
3738 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3739 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3740 }
3741
3742 return MPTS_EVRET_OK; /* keep the subflow socket around */
3743 }
3744
/*
 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event: queue the failing
 * subflow's unacked data for reinjection and, if an eligible alternate
 * subflow exists, switch traffic over to it.
 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct mptsub *mpts_alt = NULL;
	struct socket *alt_so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	mp_so = mptetoso(mpte);
	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

	/* Queue this subflow's unacked data for retransmission elsewhere */
	mptcp_reinject_mbufs(mpts->mpts_socket);

	mpts_alt = mptcp_get_subflow(mpte, NULL);

	/* If there is no alternate eligible subflow, ignore the failover hint. */
	if (mpts_alt == NULL || mpts_alt == mpts) {
		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		goto done;
	}

	altpath_exists = 1;
	alt_so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		/* All data acknowledged and no RTT spike */
		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
	}

	if (altpath_exists) {
		/* Promote the alternate subflow; demote the failing one */
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;

		mpte->mpte_active_sub = mpts_alt;
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;

		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);

		mptcpstats_inc_switch(mpte, mpts);

		sowwakeup(alt_so);
	} else {
		/* Note: the "done" label jumps into this else-branch, so the
		 * TRYFAILOVER flag is also cleared when no alternate subflow
		 * was found at all.
		 */
done:
		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
	}

	return MPTS_EVRET_OK;
}
3805
3806 /*
3807 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3808 */
3809 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3810 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3811 uint32_t *p_mpsofilt_hint, uint32_t event)
3812 {
3813 /*
3814 * The subflow connection cannot use the outgoing interface, let's
3815 * close this subflow.
3816 */
3817 mptcp_subflow_abort(mpts, EPERM);
3818
3819 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3820
3821 return MPTS_EVRET_DELETE;
3822 }
3823
/*
 * Recover the IPv4 address embedded in a NAT64-synthesized IPv6 address.
 *
 * https://tools.ietf.org/html/rfc6052#section-2
 * https://tools.ietf.org/html/rfc6147#section-5.2
 *
 * Returns true and stores the extracted IPv4 address in *addrv4 when
 * addr begins with the given NAT64 prefix; returns false otherwise.
 * The byte offsets below follow the RFC 6052 address format: for the
 * shorter prefixes the embedded IPv4 address straddles byte 8 (the "u"
 * octet), which is skipped.
 */
static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
    const struct ipv6_prefix *prefix,
    struct in_addr *addrv4)
{
	char buf[MAX_IPv4_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	const char *ptr = (const char *)addr;

	/* Address was not synthesized with this prefix */
	if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
		return false;
	}

	switch (prefix->prefix_len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptrv4, ptr + 12, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptrv4, ptr + 9, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptrv4, ptr + 7, 1);
		memcpy(ptrv4 + 1, ptr + 9, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptrv4, ptr + 6, 2);
		memcpy(ptrv4 + 2, ptr + 9, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptrv4, ptr + 5, 3);
		memcpy(ptrv4 + 3, ptr + 9, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptrv4, ptr + 4, 4);
		break;
	default:
		/* Any other prefix length is a programming error */
		panic("NAT64-prefix len is wrong: %u",
		    prefix->prefix_len);
	}

	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));

	return true;
}
3874
3875 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3876 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3877 {
3878 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3879 struct socket *so = mpts->mpts_socket;
3880 struct ifnet *ifp;
3881 int j;
3882
3883 /* Subflow IPs will be steered directly by the server - no need to
3884 * desynthesize.
3885 */
3886 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3887 return;
3888 }
3889
3890 ifp = sotoinpcb(so)->inp_last_outifp;
3891
3892 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3893 return;
3894 }
3895
3896 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3897 int success;
3898
3899 if (nat64prefixes[j].prefix_len == 0) {
3900 continue;
3901 }
3902
3903 success = mptcp_desynthesize_ipv6_addr(mpte,
3904 &mpte->__mpte_dst_v6.sin6_addr,
3905 &nat64prefixes[j],
3906 &mpte->mpte_sub_dst_v4.sin_addr);
3907 if (success) {
3908 mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3909 mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3910 mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3911
3912 /*
3913 * We connected to a NAT64'ed address. Let's remove it
3914 * from the potential IPs to use. Whenever we are back on
3915 * that network and need to connect, we can synthesize again.
3916 *
3917 * Otherwise, on different IPv6 networks we will attempt
3918 * to connect to that NAT64 address...
3919 */
3920 memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3921 break;
3922 }
3923 }
3924 }
3925
3926 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)3927 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3928 {
3929 struct inpcb *inp;
3930
3931 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3932 return;
3933 }
3934
3935 inp = sotoinpcb(mpts->mpts_socket);
3936 if (inp == NULL) {
3937 return;
3938 }
3939
3940 /* Should we try the alternate port? */
3941 if (mpte->mpte_alternate_port &&
3942 inp->inp_fport != mpte->mpte_alternate_port) {
3943 union sockaddr_in_4_6 dst;
3944 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3945
3946 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3947
3948 dst_in->sin_port = mpte->mpte_alternate_port;
3949
3950 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3951 mpts->mpts_ifscope, NULL);
3952 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3953 unsigned int i;
3954
3955 if (inp->inp_last_outifp == NULL) {
3956 return;
3957 }
3958
3959 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3960 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3961
3962 if (inp->inp_last_outifp->if_index == info->ifindex) {
3963 info->no_mptcp_support = 1;
3964 break;
3965 }
3966 }
3967 }
3968 }
3969
/* If TFO data is succesfully acked, it must be dropped from the mptcp so:
 * the bytes carried on the subflow's SYN already reached the peer, so the
 * MPTCP-level send state is rewound and the acked bytes are removed from
 * the MPTCP send buffer.
 */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* mp_droplen: data in flight at the MPTCP level;
		 * tcp_droplen: bytes the subflow acked beyond the SYN (the
		 * SYN itself consumes one sequence number, hence the -1).
		 */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acked bytes from the MPTCP-level send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
4009
/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event: classify the newly
 * connected subflow as MPTCP-capable or fallen-back TCP and update the
 * MPTCP connection state accordingly.
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mptcb *mp_tp;
	int af;
	boolean_t mpok = FALSE;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	tp = sototcpcb(so);
	/* NOTE(review): af is not read below in this function — confirm */
	af = mpts->mpts_dst.sa_family;

	/* Nothing to do if this event was already processed */
	if (mpts->mpts_flags & MPTSF_CONNECTED) {
		return MPTS_EVRET_OK;
	}

	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		return MPTS_EVRET_OK;
	}

	/*
	 * The subflow connection has been connected. Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
	 *
	 * a. If MPTCP connection is not yet established, then this must be
	 *    the first subflow connection. If MPTCP failed to negotiate,
	 *    fallback to regular TCP by degrading this subflow.
	 *
	 * b. If MPTCP connection has been established, then this must be
	 *    one of the subsequent subflow connections. If MPTCP failed
	 *    to negotiate, disconnect the connection.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		return MPTS_EVRET_OK;
	}

	/* Data sent on the SYN (TFO) and acked must be dropped first */
	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
		mptcp_drop_tfo_data(mpte, mpts);
	}

	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
	mpts->mpts_flags |= MPTSF_CONNECTED;

	/* TMPF_MPTCP_TRUE means the peer completed the MPTCP handshake */
	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	}

	tp->t_mpflags &= ~TMPF_TFO_REQUEST;

	/* get/verify the outbound interface */
	inp = sotoinpcb(so);

	mpts->mpts_maxseg = tp->t_maxseg;

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* This is the first subflow of the connection */
		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
		mpte->mpte_associd = mpts->mpts_connid;
		DTRACE_MPTCP2(state__change,
		    struct mptcb *, mp_tp,
		    uint32_t, 0 /* event */);

		/* Record the local address the subflow ended up using */
		if (SOCK_DOM(so) == AF_INET) {
			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
		} else {
			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
		}

		mpts->mpts_flags |= MPTSF_ACTIVE;

		/* case (a) above */
		if (!mpok) {
			/* MPTCP negotiation failed: degrade to plain TCP */
			tcpstat.tcps_mpcap_fallback++;

			tp->t_mpflags |= TMPF_INFIN_SENT;
			mptcp_notify_mpfail(so);
		} else {
			/* Cell subflows may need to be flagged as backup */
			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
			    mptcp_subflows_need_backup_flag(mpte)) {
				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			} else {
				mpts->mpts_flags |= MPTSF_PREFERRED;
			}
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;

			/* Recover a NAT64'ed IPv4 destination if applicable */
			if (SOCK_DOM(so) == AF_INET6) {
				mptcp_handle_ipv6_connection(mpte, mpts);
			}

			mptcp_check_subflows_and_add(mpte);

			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
				mpte->mpte_initial_cell = 1;
			}

			mpte->mpte_handshake_success = 1;
		}

		/* Seed the MPTCP-level send window from the subflow */
		mp_tp->mpt_sndwnd = tp->snd_wnd;
		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
		soisconnected(mp_so);
	} else if (mpok) {
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
		    mptcp_subflows_need_backup_flag(mpte)) {
			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			mpts->mpts_flags &= ~MPTSF_PREFERRED;
		} else {
			mpts->mpts_flags |= MPTSF_PREFERRED;
		}

		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;

		mpts->mpts_rel_seq = 1;

		mptcp_check_subflows_and_remove(mpte);
	} else {
		/* Join failed: retry on the alternate port, then give up */
		mptcp_try_alternate_port(mpte, mpts);

		tcpstat.tcps_join_fallback++;
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			tcpstat.tcps_mptcp_cell_proxy++;
		} else {
			tcpstat.tcps_mptcp_wifi_proxy++;
		}

		/* Reset this subflow; it cannot be used for MPTCP */
		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

		return MPTS_EVRET_OK;
	}

	/* This call, just to "book" an entry in the stats-table for this ifindex */
	mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	mptcp_output(mpte);

	return MPTS_EVRET_OK; /* keep the subflow socket around */
}
4180
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 *
 * Marks the subflow as disconnected, undoes MP-capable accounting, and
 * drops the MPTCP-level connection when this subflow was essential.
 * Always returns MPTS_EVRET_DELETE so the caller closes the subflow.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Disconnect already processed for this subflow; just delete it. */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
		return MPTS_EVRET_DELETE;
	}

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		/* Subflow was counted as MP-capable: undo the accounting. */
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	} else {
		/*
		 * A secondary subflow that never reached CONNECTED state:
		 * retry the join on the alternate port.
		 */
		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
			mptcp_try_alternate_port(mpte, mpts);
		}
	}

	/*
	 * Drop the MPTCP connection if the handshake never completed, or
	 * if we fell back to TCP and this was the active subflow.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

	return MPTS_EVRET_DELETE;
}
4232
4233 /*
4234 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4235 */
4236 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4237 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4238 uint32_t *p_mpsofilt_hint, uint32_t event)
4239 {
4240 #pragma unused(event, p_mpsofilt_hint)
4241 ev_ret_t ret = MPTS_EVRET_OK;
4242 struct socket *mp_so, *so;
4243 struct mptcb *mp_tp;
4244
4245 mp_so = mptetoso(mpte);
4246 mp_tp = mpte->mpte_mptcb;
4247 so = mpts->mpts_socket;
4248 struct inpcb *inp = sotoinpcb(so);
4249 struct tcpcb *tp = intotcpcb(inp);
4250
4251 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4252 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4253 } else {
4254 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4255 }
4256
4257 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4258 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4259 goto done;
4260 }
4261 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4262 } else {
4263 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4264 }
4265
4266 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4267 mpts->mpts_flags |= MPTSF_MP_READY;
4268 } else {
4269 mpts->mpts_flags &= ~MPTSF_MP_READY;
4270 }
4271
4272 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4273 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4274 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4275 tcp_cache_update_mptcp_version(tp, FALSE);
4276 }
4277
4278 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4279 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4280
4281 m_freem_list(mpte->mpte_reinjectq);
4282 mpte->mpte_reinjectq = NULL;
4283 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4284 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4285 ret = MPTS_EVRET_CONNECT_PENDING;
4286 }
4287
4288 done:
4289 return ret;
4290 }
4291
/*
 * Handle SO_FILT_HINT_MUSTRST subflow socket event.
 *
 * Sends a RST on the subflow and aborts it. If the peer performed an
 * MP_FASTCLOSE (and we have not fallen back to TCP), the whole MPTCP
 * association is torn down as well. Returns MPTS_EVRET_DELETE.
 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t is_fastclose;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* We got an invalid option or a fast close */
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;

	tp = intotcpcb(inp);
	so->so_error = ECONNABORTED;

	/* Latch the fastclose indication before altering tp state. */
	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

	tp->t_mpflags |= TMPF_RESET;

	/* Actively emit a RST segment unless the subflow is already closed. */
	if (tp->t_state != TCPS_CLOSED) {
		struct mbuf *m;
		struct tcptemp *t_template = tcp_maketemplate(tp, &m);

		if (t_template) {
			struct tcp_respond_args tra;

			bzero(&tra, sizeof(tra));
			/* Scope the RST to the bound interface, if any. */
			if (inp->inp_flags & INP_BOUND_IF) {
				tra.ifscope = inp->inp_boundifp->if_index;
			} else {
				tra.ifscope = IFSCOPE_NONE;
			}
			tra.awdl_unrestricted = 1;

			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
			(void) m_free(m);
		}
	}

	/*
	 * A fastclose on a non-fallback connection kills the whole MPTCP
	 * association: abort every sibling subflow and surface ECONNRESET.
	 */
	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
		struct mptsub *iter, *tmp;

		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

		mp_so->so_error = ECONNRESET;

		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
			if (iter == mpts) {
				continue;
			}
			mptcp_subflow_abort(iter, ECONNABORTED);
		}

		/*
		 * mptcp_drop is being called after processing the events, to fully
		 * close the MPTCP connection
		 */
		mptcp_drop(mpte, mp_tp, mp_so->so_error);
	}

	mptcp_subflow_abort(mpts, ECONNABORTED);

	/* Speed up garbage collection of the now-dead session. */
	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
	}

	return MPTS_EVRET_DELETE;
}
4370
4371 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4372 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4373 uint32_t *p_mpsofilt_hint, uint32_t event)
4374 {
4375 #pragma unused(event)
4376 bool found_active = false;
4377
4378 mpts->mpts_flags |= MPTSF_READ_STALL;
4379
4380 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4381 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4382
4383 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4384 TCPS_HAVERCVDFIN2(tp->t_state)) {
4385 continue;
4386 }
4387
4388 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4389 found_active = true;
4390 break;
4391 }
4392 }
4393
4394 if (!found_active) {
4395 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4396 }
4397
4398 return MPTS_EVRET_OK;
4399 }
4400
4401 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4402 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4403 uint32_t *p_mpsofilt_hint, uint32_t event)
4404 {
4405 #pragma unused(event)
4406 bool found_active = false;
4407
4408 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4409
4410 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4411 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4412
4413 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4414 tp->t_state > TCPS_CLOSE_WAIT) {
4415 continue;
4416 }
4417
4418 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4419 found_active = true;
4420 break;
4421 }
4422 }
4423
4424 if (!found_active) {
4425 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4426 }
4427
4428 return MPTS_EVRET_OK;
4429 }
4430
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success (including the cases where SO_MARK_CELLFALLBACK is
 * deliberately skipped), or the sosetoptlock() error otherwise.
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	socket_lock_assert_owned(mp_so);

	/*
	 * SO_MARK_CELLFALLBACK on an established connection is only set on
	 * genuine cell-fallback subflows; the checks below filter out the
	 * other cases and return early without applying the option.
	 */
	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			return 0;
		}

		/* Subflow demonstrably runs over a non-cellular interface. */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			return 0;
		}

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp)) {
			return 0;
		}
	}

	/* The option is being applied now; it is no longer pending. */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	/* Replay the option on the subflow as a kernel-originated request. */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof(int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
		    "val %d set error %d\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error);
	}
	return error;
}
4502
4503 /*
4504 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4505 * caller must ensure that the option can be issued on subflow sockets, via
4506 * MPOF_SUBFLOW_OK flag.
4507 */
4508 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4509 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4510 struct mptopt *mpo)
4511 {
4512 struct socket *mp_so;
4513 struct sockopt sopt;
4514 int error;
4515
4516 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4517 mp_so = mptetoso(mpte);
4518
4519 socket_lock_assert_owned(mp_so);
4520
4521 bzero(&sopt, sizeof(sopt));
4522 sopt.sopt_dir = SOPT_GET;
4523 sopt.sopt_level = mpo->mpo_level;
4524 sopt.sopt_name = mpo->mpo_name;
4525 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4526 sopt.sopt_valsize = sizeof(int);
4527 sopt.sopt_p = kernproc;
4528
4529 error = sogetoptlock(so, &sopt, 0); /* already locked */
4530 if (error) {
4531 os_log_error(mptcp_log_handle,
4532 "%s - %lx: sopt %s get error %d\n",
4533 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4534 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4535 }
4536 return error;
4537 }
4538
4539
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed. The callout will
 * repeat as long as this routine returns a non-zero value.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;	/* PCBs still alive; non-zero re-arms callout */

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		mp_so = mpp->mpp_socket;
		mpte = mptompte(mpp);
		mp_tp = mpte->mpte_mptcb;

		/* Don't block GC on a contended lock; revisit next pass. */
		if (!mpp_try_lock(mpp)) {
			active++;
			continue;
		}

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			/*
			 * While the connection is closing, count down the
			 * grace period; once it expires, force DISCONNECTED
			 * events on all subflows to release their references.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0) {
					mp_tp->mpt_gc_ticks--;
				}
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			socket_unlock(mp_so, 0);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("%s - %lx: skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
			mptcp_close(mpte, mp_tp);
		}

		/* No references left: tear down session, PCB and socket. */
		mptcp_session_destroy(mpte);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mptcp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return active;
}
4621
4622 /*
4623 * Drop a MPTCP connection, reporting the specified error.
4624 */
4625 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4626 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4627 {
4628 struct socket *mp_so = mptetoso(mpte);
4629
4630 VERIFY(mpte->mpte_mptcb == mp_tp);
4631
4632 socket_lock_assert_owned(mp_so);
4633
4634 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4635 uint32_t, 0 /* event */);
4636
4637 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4638 errno = mp_tp->mpt_softerror;
4639 }
4640 mp_so->so_error = errno;
4641
4642 return mptcp_close(mpte, mp_tp);
4643 }
4644
4645 /*
4646 * Close a MPTCP control block.
4647 */
4648 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4649 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4650 {
4651 struct mptsub *mpts = NULL, *tmpts = NULL;
4652 struct socket *mp_so = mptetoso(mpte);
4653
4654 socket_lock_assert_owned(mp_so);
4655 VERIFY(mpte->mpte_mptcb == mp_tp);
4656
4657 mp_tp->mpt_state = MPTCPS_TERMINATE;
4658
4659 mptcp_freeq(mp_tp);
4660
4661 soisdisconnected(mp_so);
4662
4663 /* Clean up all subflows */
4664 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4665 mptcp_subflow_disconnect(mpte, mpts);
4666 }
4667
4668 return NULL;
4669 }
4670
4671 void
mptcp_notify_close(struct socket * so)4672 mptcp_notify_close(struct socket *so)
4673 {
4674 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4675 }
4676
/*
 * Maps one (set of) SO_FILT_HINT_* bit(s) to the handler invoked by
 * mptcp_subflow_events() when that hint is pending on a subflow.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t sofilt_hint_mask;	/* SO_FILT_HINT_* bit(s) handled */
	ev_ret_t (*sofilt_hint_ev_hdlr)(	/* handler for those bits */
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;
4685
/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 *
 * mptcp_subflow_events() scans this table front to back and dispatches
 * each pending hint in table order; see that function for the severity
 * rules applied to the handlers' return values.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		/* Always dispatched, even after a fatal return; see the
		 * special-case in mptcp_subflow_events(). */
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4748
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 *
 * Dispatches every pending SO_FILT_HINT_* bit on the subflow through
 * mpsub_ev_entry_tbl (in table order) and combines the handlers'
 * dispositions into a single return value for the workloop.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl) {
		return ret;
	}

	/*
	 * Any event that may render this subflow unusable also triggers a
	 * failover attempt to another subflow.
	 */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			/* Clear the hint before the handler may re-raise it. */
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			/* Keep the most severe non-error disposition seen. */
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	return ret;
}
4798
/*
 * MPTCP workloop.
 *
 * Processes pending events on every subflow, applies the resulting
 * dispositions (delete / connect-pending / fallback), and propagates
 * accumulated socket-filter hints to the MPTCP socket. Re-entrant calls
 * are coalesced via MPTE_IN_WORKLOOP / MPTE_WORKLOOP_RELAUNCH.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
	struct mptsub *mpts, *tmpts;
	struct socket *mp_so;

	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already running: request a relaunch instead of recursing. */
	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
		return;
	}
	mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event handling. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING) {
			mptcp_subflow_disconnect(mpte, mpts);
		}

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			break;
		}
		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Forward any hints accumulated by the handlers to the MP socket. */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
			mp_so->so_state |= SS_CANTRCVMORE;
			sorwakeup(mp_so);
		}

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback) {
		goto exit;
	}

	/* Second pass: apply fallback or kick off pending joins. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
			    MPTSF_DISCONNECTED)) {
				continue;
			}

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error) {
					mptcp_subflow_abort(mpts, error);
				}
			}
		}
	}

exit:
	/* Re-run if another workloop request arrived while we were busy. */
	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
		goto relaunch;
	}

	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
4942
4943 /*
4944 * Protocol pr_lock callback.
4945 */
4946 int
mptcp_lock(struct socket * mp_so,int refcount,void * lr)4947 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4948 {
4949 struct mppcb *mpp = mpsotomppcb(mp_so);
4950 void *lr_saved;
4951
4952 if (lr == NULL) {
4953 lr_saved = __builtin_return_address(0);
4954 } else {
4955 lr_saved = lr;
4956 }
4957
4958 if (mpp == NULL) {
4959 panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4960 mp_so, lr_saved, solockhistory_nr(mp_so));
4961 /* NOTREACHED */
4962 }
4963 mpp_lock(mpp);
4964
4965 if (mp_so->so_usecount < 0) {
4966 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4967 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4968 solockhistory_nr(mp_so));
4969 /* NOTREACHED */
4970 }
4971 if (refcount != 0) {
4972 mp_so->so_usecount++;
4973 mpp->mpp_inside++;
4974 }
4975 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4976 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4977
4978 return 0;
4979 }
4980
4981 /*
4982 * Protocol pr_unlock callback.
4983 */
4984 int
mptcp_unlock(struct socket * mp_so,int refcount,void * lr)4985 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4986 {
4987 struct mppcb *mpp = mpsotomppcb(mp_so);
4988 void *lr_saved;
4989
4990 if (lr == NULL) {
4991 lr_saved = __builtin_return_address(0);
4992 } else {
4993 lr_saved = lr;
4994 }
4995
4996 if (mpp == NULL) {
4997 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
4998 mp_so, mp_so->so_usecount, lr_saved,
4999 solockhistory_nr(mp_so));
5000 /* NOTREACHED */
5001 }
5002 socket_lock_assert_owned(mp_so);
5003
5004 if (refcount != 0) {
5005 mp_so->so_usecount--;
5006 mpp->mpp_inside--;
5007 }
5008
5009 if (mp_so->so_usecount < 0) {
5010 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5011 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5012 /* NOTREACHED */
5013 }
5014 if (mpp->mpp_inside < 0) {
5015 panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5016 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5017 /* NOTREACHED */
5018 }
5019 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5020 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5021 mpp_unlock(mpp);
5022
5023 return 0;
5024 }
5025
/*
 * Protocol pr_getlock callback.
 *
 * Returns the mutex protecting the MPTCP PCB behind mp_so; panics on a
 * missing PCB or a negative use count (both indicate corruption).
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return mpp_getlock(mpp, flags);
}
5046
5047 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5048 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5049 u_int32_t *rrand)
5050 {
5051 struct mptcp_subf_auth_entry *sauth_entry;
5052
5053 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5054 if (sauth_entry->msae_laddr_id == addr_id) {
5055 if (lrand) {
5056 *lrand = sauth_entry->msae_laddr_rand;
5057 }
5058 if (rrand) {
5059 *rrand = sauth_entry->msae_raddr_rand;
5060 }
5061 break;
5062 }
5063 }
5064 }
5065
5066 void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id,struct mptcb * mp_tp,mptcp_addr_id raddr_id,u_int32_t raddr_rand)5067 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5068 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5069 {
5070 struct mptcp_subf_auth_entry *sauth_entry;
5071
5072 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5073 if (sauth_entry->msae_laddr_id == laddr_id) {
5074 if ((sauth_entry->msae_raddr_id != 0) &&
5075 (sauth_entry->msae_raddr_id != raddr_id)) {
5076 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5077 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5078 raddr_id, sauth_entry->msae_raddr_id);
5079 return;
5080 }
5081 sauth_entry->msae_raddr_id = raddr_id;
5082 if ((sauth_entry->msae_raddr_rand != 0) &&
5083 (sauth_entry->msae_raddr_rand != raddr_rand)) {
5084 os_log_error(mptcp_log_handle, "%s - %lx: "
5085 "dup SYN_ACK %d %d \n",
5086 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5087 raddr_rand, sauth_entry->msae_raddr_rand);
5088 return;
5089 }
5090 sauth_entry->msae_raddr_rand = raddr_rand;
5091 return;
5092 }
5093 }
5094 }
5095
5096 /*
5097 * SHA-256 support for MPTCP
5098 */
5099
5100 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5101 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5102 {
5103 const unsigned char *sha2_base;
5104 int sha2_size;
5105
5106 sha2_base = (const unsigned char *) key;
5107 sha2_size = sizeof(mptcp_key_t);
5108
5109 SHA256_CTX sha_ctx;
5110 SHA256_Init(&sha_ctx);
5111 SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5112 SHA256_Final(sha_digest, &sha_ctx);
5113 }
5114
5115 void
mptcp_hmac_sha256(mptcp_key_t key1,mptcp_key_t key2,u_char * msg,uint16_t msg_len,u_char * digest)5116 mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
5117 u_char *msg, uint16_t msg_len, u_char *digest)
5118 {
5119 SHA256_CTX sha_ctx;
5120 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5121 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5122 int i;
5123
5124 bzero(digest, SHA256_DIGEST_LENGTH);
5125
5126 /* Set up the Key for HMAC */
5127 key_ipad[0] = key1;
5128 key_ipad[1] = key2;
5129
5130 key_opad[0] = key1;
5131 key_opad[1] = key2;
5132
5133 /* Key is 512 block length, so no need to compute hash */
5134
5135 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5136
5137 for (i = 0; i < 8; i++) {
5138 key_ipad[i] ^= 0x3636363636363636;
5139 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5140 }
5141
5142 /* Perform inner SHA256 */
5143 SHA256_Init(&sha_ctx);
5144 SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
5145 SHA256_Update(&sha_ctx, msg, msg_len);
5146 SHA256_Final(digest, &sha_ctx);
5147
5148 /* Perform outer SHA256 */
5149 SHA256_Init(&sha_ctx);
5150 SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
5151 SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
5152 SHA256_Final(digest, &sha_ctx);
5153 }
5154
5155 /*
5156 * SHA1 support for MPTCP
5157 */
5158
5159 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5160 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5161 {
5162 SHA1_CTX sha1ctxt;
5163 const unsigned char *sha1_base;
5164 int sha1_size;
5165
5166 sha1_base = (const unsigned char *) key;
5167 sha1_size = sizeof(mptcp_key_t);
5168 SHA1Init(&sha1ctxt);
5169 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5170 SHA1Final(sha_digest, &sha1ctxt);
5171 }
5172
5173 void
mptcp_hmac_sha1(mptcp_key_t key1,mptcp_key_t key2,u_int32_t rand1,u_int32_t rand2,u_char * digest)5174 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5175 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5176 {
5177 SHA1_CTX sha1ctxt;
5178 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5179 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5180 u_int32_t data[2];
5181 int i;
5182
5183 bzero(digest, SHA1_RESULTLEN);
5184
5185 /* Set up the Key for HMAC */
5186 key_ipad[0] = key1;
5187 key_ipad[1] = key2;
5188
5189 key_opad[0] = key1;
5190 key_opad[1] = key2;
5191
5192 /* Set up the message for HMAC */
5193 data[0] = rand1;
5194 data[1] = rand2;
5195
5196 /* Key is 512 block length, so no need to compute hash */
5197
5198 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5199
5200 for (i = 0; i < 8; i++) {
5201 key_ipad[i] ^= 0x3636363636363636;
5202 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5203 }
5204
5205 /* Perform inner SHA1 */
5206 SHA1Init(&sha1ctxt);
5207 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5208 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5209 SHA1Final(digest, &sha1ctxt);
5210
5211 /* Perform outer SHA1 */
5212 SHA1Init(&sha1ctxt);
5213 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5214 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5215 SHA1Final(digest, &sha1ctxt);
5216 }
5217
5218 /*
5219 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5220 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5221 */
5222 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest,uint8_t digest_len)5223 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5224 {
5225 uint32_t lrand, rrand;
5226
5227 lrand = rrand = 0;
5228 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5229
5230 u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5231 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5232 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5233 } else {
5234 uint32_t data[2];
5235 data[0] = lrand;
5236 data[1] = rrand;
5237 mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5238 }
5239 bcopy(full_digest, digest, digest_len);
5240 }
5241
5242 /*
5243 * Authentication data generation
5244 */
5245 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5246 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5247 int token_len)
5248 {
5249 VERIFY(token_len == sizeof(u_int32_t));
5250 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5251 sha_digest_len == SHA256_DIGEST_LENGTH);
5252
5253 /* Most significant 32 bits of the SHA1/SHA256 hash */
5254 bcopy(sha_digest, token, sizeof(u_int32_t));
5255 return;
5256 }
5257
5258 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5259 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5260 int idsn_len, uint8_t mp_version)
5261 {
5262 VERIFY(idsn_len == sizeof(u_int64_t));
5263 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5264 sha_digest_len == SHA256_DIGEST_LENGTH);
5265 VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5266
5267 /*
5268 * Least significant 64 bits of the hash
5269 */
5270
5271 if (mp_version == MPTCP_VERSION_0) {
5272 idsn[7] = sha_digest[12];
5273 idsn[6] = sha_digest[13];
5274 idsn[5] = sha_digest[14];
5275 idsn[4] = sha_digest[15];
5276 idsn[3] = sha_digest[16];
5277 idsn[2] = sha_digest[17];
5278 idsn[1] = sha_digest[18];
5279 idsn[0] = sha_digest[19];
5280 } else {
5281 idsn[7] = sha_digest[24];
5282 idsn[6] = sha_digest[25];
5283 idsn[5] = sha_digest[26];
5284 idsn[4] = sha_digest[27];
5285 idsn[3] = sha_digest[28];
5286 idsn[2] = sha_digest[29];
5287 idsn[1] = sha_digest[30];
5288 idsn[0] = sha_digest[31];
5289 }
5290 return;
5291 }
5292
/*
 * Initialize connection-wide properties of the MPTCP control block:
 * DSS-checksum flag, initial receive window and garbage-collector ticks.
 */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* Set DSS checksum flag (global mptcp_dss_csum toggle) */
	if (mptcp_dss_csum) {
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
	}

	/* Set up receive window from the MP-socket's current buffer space */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
5307
5308 static void
mptcp_init_local_parms(struct mptses * mpte,struct sockaddr * dst)5309 mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
5310 {
5311 struct mptcb *mp_tp = mpte->mpte_mptcb;
5312 char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5313 uint16_t digest_len;
5314
5315 if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
5316 mp_tp->mpt_version = MPTCP_VERSION_0;
5317 } else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
5318 mp_tp->mpt_version = MPTCP_VERSION_1;
5319 } else {
5320 mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
5321 }
5322 VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
5323 mp_tp->mpt_version == MPTCP_VERSION_1);
5324
5325 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5326 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5327 digest_len = SHA1_RESULTLEN;
5328 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5329 } else {
5330 digest_len = SHA256_DIGEST_LENGTH;
5331 mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
5332 }
5333
5334 mptcp_generate_token(key_digest, digest_len,
5335 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5336 mptcp_generate_idsn(key_digest, digest_len,
5337 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5338 /* The subflow SYN is also first MPTCP byte */
5339 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5340 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5341
5342 mptcp_conn_properties(mp_tp);
5343 }
5344
5345 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5346 mptcp_init_remote_parms(struct mptcb *mp_tp)
5347 {
5348 /* Setup local and remote tokens and Initial DSNs */
5349 char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5350 uint16_t digest_len;
5351
5352 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5353 digest_len = SHA1_RESULTLEN;
5354 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5355 } else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5356 digest_len = SHA256_DIGEST_LENGTH;
5357 mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5358 } else {
5359 return -1;
5360 }
5361
5362 mptcp_generate_token(remote_digest, digest_len,
5363 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5364 mptcp_generate_idsn(remote_digest, digest_len,
5365 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5366 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5367 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5368 return 0;
5369 }
5370
5371 static void
mptcp_send_dfin(struct socket * so)5372 mptcp_send_dfin(struct socket *so)
5373 {
5374 struct tcpcb *tp = NULL;
5375 struct inpcb *inp = NULL;
5376
5377 inp = sotoinpcb(so);
5378 if (!inp) {
5379 return;
5380 }
5381
5382 tp = intotcpcb(inp);
5383 if (!tp) {
5384 return;
5385 }
5386
5387 if (!(tp->t_mpflags & TMPF_RESET)) {
5388 tp->t_mpflags |= TMPF_SEND_DFIN;
5389 }
5390 }
5391
5392 /*
5393 * Data Sequence Mapping routines
5394 */
5395 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5396 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5397 {
5398 struct mptcb *mp_tp;
5399
5400 if (m == NULL) {
5401 return;
5402 }
5403
5404 mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5405
5406 while (m) {
5407 VERIFY(m->m_flags & M_PKTHDR);
5408 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5409 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5410 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5411 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5412 mp_tp->mpt_sndmax += m_pktlen(m);
5413 m = m->m_next;
5414 }
5415 }
5416
/*
 * After a fallback to regular TCP, translate `len' bytes being dropped
 * from the subflow send-buffer (starting at mbuf m) into an MPTCP-level
 * DATA_ACK and deliver it via mptcp_data_ack_rcvd().
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the acked portion of the chain.  The loop body runs at
	 * least once (m != NULL && len > 0 checked above), so data_ack
	 * and dsn are always assigned before they are read below.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/* Optimistically assume the whole mapping gets acked */
		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5474
/*
 * Adjust the DSN mappings stored in the send-buffer mbufs before
 * sbdrop() removes `len' bytes: advance each mapping's dsn/relative-seq
 * and shrink its length.  The adjustment loop only runs for sockets
 * that are not MPTCP subflows, or while rewinding after TFO.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* This mapping is consumed entirely by the drop */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5527
5528 /* Obtain the DSN mapping stored in the mbuf */
5529 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5530 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5531 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5532 {
5533 u_int64_t dsn64;
5534
5535 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5536 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5537 }
5538
/*
 * Obtain the 64-bit DSN mapping covering byte `off' of the subflow
 * send-buffer: walk the mbuf chain by subflow-sequence offset and
 * return the mapping stored in the matching mbuf's packet header.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;

	VERIFY(off >= 0);

	/* A defunct socket may legitimately have an empty send-buffer */
	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	VERIFY(off >= 0);
	/*
	 * NOTE(review): m is dereferenced unconditionally below — callers
	 * are assumed to pass an `off' that lies within the buffered data
	 * (except for the defunct case handled above); confirm at call sites.
	 */
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;
}
5583
5584 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5585 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5586 {
5587 uint64_t dsn;
5588 uint32_t relseq;
5589
5590 mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5591 }
5592
5593 /*
5594 * Note that this is called only from tcp_input() via mptcp_input_preproc()
5595 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5596 * When it trims data tcp_input calls m_adj() which does not remove the
5597 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5598 * The dsn map insertion cannot be delayed after trim, because data can be in
5599 * the reassembly queue for a while and the DSN option info in tp will be
5600 * overwritten for every new packet received.
5601 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5602 * with mptcp_adj_rmap()
5603 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* Copy the mapping from the last-received DSS option state */
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin) {
			/* This mapping ends with a DATA_FIN */
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* Mapping consumed; make sure we DATA_ACK it */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* After fallback, a plain TCP FIN doubles as the DATA_FIN */
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5629
5630 /*
5631 * Following routines help with failure detection and failover of data
5632 * transfer from one subflow to another.
5633 */
5634 void
mptcp_act_on_txfail(struct socket * so)5635 mptcp_act_on_txfail(struct socket *so)
5636 {
5637 struct tcpcb *tp = NULL;
5638 struct inpcb *inp = sotoinpcb(so);
5639
5640 if (inp == NULL) {
5641 return;
5642 }
5643
5644 tp = intotcpcb(inp);
5645 if (tp == NULL) {
5646 return;
5647 }
5648
5649 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5650 return;
5651 }
5652
5653 so->so_flags |= SOF_MP_TRYFAILOVER;
5654 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5655 }
5656
5657 /*
5658 * Support for MP_FAIL option
5659 */
5660 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5661 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5662 {
5663 struct mbuf *m = so->so_snd.sb_mb;
5664 uint16_t datalen;
5665 uint64_t dsn;
5666 int off = 0;
5667
5668 if (m == NULL) {
5669 return -1;
5670 }
5671
5672 while (m != NULL) {
5673 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5674 VERIFY(m->m_flags & M_PKTHDR);
5675 dsn = m->m_pkthdr.mp_dsn;
5676 datalen = m->m_pkthdr.mp_rlen;
5677 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5678 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5679 off = (int)(dsn_fail - dsn);
5680 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5681 return 0;
5682 }
5683
5684 m = m->m_next;
5685 }
5686
5687 /*
5688 * If there was no mbuf data and a fallback to TCP occurred, there's
5689 * not much else to do.
5690 */
5691
5692 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5693 return -1;
5694 }
5695
5696 /*
5697 * Support for sending contiguous MPTCP bytes in subflow
5698 * Also for preventing sending data with ACK in 3-way handshake
5699 */
/*
 * Return how many bytes of the DSS mapping covering send-buffer offset
 * `off' remain from `off' onward — i.e. the largest contiguous MPTCP
 * send length starting there.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* Defunct socket with an empty send-buffer: nothing to send */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	/* Look up the DSS mapping covering send-buffer offset `off' */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Bytes of the mapping that lie at or beyond `off' */
	return mdss_data_len - (off - mdss_subflow_off);
}
5736
5737 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5738 mptcp_get_maxseg(struct mptses *mpte)
5739 {
5740 struct mptsub *mpts;
5741 uint32_t maxseg = 0;
5742
5743 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5744 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5745
5746 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5747 TCPS_HAVERCVDFIN2(tp->t_state)) {
5748 continue;
5749 }
5750
5751 if (tp->t_maxseg > maxseg) {
5752 maxseg = tp->t_maxseg;
5753 }
5754 }
5755
5756 return maxseg;
5757 }
5758
5759 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5760 mptcp_get_rcvscale(struct mptses *mpte)
5761 {
5762 struct mptsub *mpts;
5763 uint8_t rcvscale = UINT8_MAX;
5764
5765 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5766 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5767
5768 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5769 TCPS_HAVERCVDFIN2(tp->t_state)) {
5770 continue;
5771 }
5772
5773 if (tp->rcv_scale < rcvscale) {
5774 rcvscale = tp->rcv_scale;
5775 }
5776 }
5777
5778 return rcvscale;
5779 }
5780
5781 /* Similar to tcp_sbrcv_reserve */
5782 static void
mptcp_sbrcv_reserve(struct mptcb * mp_tp,struct sockbuf * sbrcv,u_int32_t newsize,u_int32_t idealsize)5783 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5784 u_int32_t newsize, u_int32_t idealsize)
5785 {
5786 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5787
5788 if (rcvscale == UINT8_MAX) {
5789 return;
5790 }
5791
5792 /* newsize should not exceed max */
5793 newsize = min(newsize, tcp_autorcvbuf_max);
5794
5795 /* The receive window scale negotiated at the
5796 * beginning of the connection will also set a
5797 * limit on the socket buffer size
5798 */
5799 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5800
5801 /* Set new socket buffer size */
5802 if (newsize > sbrcv->sb_hiwat &&
5803 (sbreserve(sbrcv, newsize) == 1)) {
5804 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5805 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5806
5807 /* Again check the limit set by the advertised
5808 * window scale
5809 */
5810 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5811 TCP_MAXWIN << rcvscale);
5812 }
5813 }
5814
5815 void
mptcp_sbrcv_grow(struct mptcb * mp_tp)5816 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5817 {
5818 struct mptses *mpte = mp_tp->mpt_mpte;
5819 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5820 struct sockbuf *sbrcv = &mp_so->so_rcv;
5821 uint32_t hiwat_sum = 0;
5822 uint32_t ideal_sum = 0;
5823 struct mptsub *mpts;
5824
5825 /*
5826 * Do not grow the receive socket buffer if
5827 * - auto resizing is disabled, globally or on this socket
5828 * - the high water mark already reached the maximum
5829 * - the stream is in background and receive side is being
5830 * throttled
5831 * - if there are segments in reassembly queue indicating loss,
5832 * do not need to increase recv window during recovery as more
5833 * data is not going to be sent. A duplicate ack sent during
5834 * recovery should not change the receive window
5835 */
5836 if (tcp_do_autorcvbuf == 0 ||
5837 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5838 tcp_cansbgrow(sbrcv) == 0 ||
5839 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5840 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5841 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5842 /* Can not resize the socket buffer, just return */
5843 return;
5844 }
5845
5846 /*
5847 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5848 *
5849 * But, for this we first need accurate receiver-RTT estimations, which
5850 * we currently don't have.
5851 *
5852 * Let's use a dummy algorithm for now, just taking the sum of all
5853 * subflow's receive-buffers. It's too low, but that's all we can get
5854 * for now.
5855 */
5856
5857 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5858 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5859 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5860 }
5861
5862 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5863 }
5864
5865 /*
5866 * Determine if we can grow the recieve socket buffer to avoid sending
5867 * a zero window update to the peer. We allow even socket buffers that
5868 * have fixed size (set by the application) to grow if the resource
5869 * constraints are met. They will also be trimmed after the application
5870 * reads data.
5871 *
5872 * Similar to tcp_sbrcv_grow_rwin
5873 */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow in increments of 16x the largest subflow maxseg */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background (receive-throttled) streams do not get to grow */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/* Grow only when the buffer is nearly full and within all limits */
	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
5896
5897 /* Similar to tcp_sbspace */
5898 int32_t
mptcp_sbspace(struct mptcb * mp_tp)5899 mptcp_sbspace(struct mptcb *mp_tp)
5900 {
5901 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5902 uint32_t rcvbuf;
5903 int32_t space;
5904 int32_t pending = 0;
5905
5906 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5907
5908 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5909
5910 /* hiwat might have changed */
5911 rcvbuf = sb->sb_hiwat;
5912
5913 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5914 (sb->sb_mbmax - sb->sb_mbcnt)));
5915 if (space < 0) {
5916 space = 0;
5917 }
5918
5919 #if CONTENT_FILTER
5920 /* Compensate for data being processed by content filters */
5921 pending = cfil_sock_data_space(sb);
5922 #endif /* CONTENT_FILTER */
5923 if (pending > space) {
5924 space = 0;
5925 } else {
5926 space -= pending;
5927 }
5928
5929 return space;
5930 }
5931
5932 /*
5933 * Support Fallback to Regular TCP
5934 */
5935 void
mptcp_notify_mpready(struct socket * so)5936 mptcp_notify_mpready(struct socket *so)
5937 {
5938 struct tcpcb *tp = NULL;
5939
5940 if (so == NULL) {
5941 return;
5942 }
5943
5944 tp = intotcpcb(sotoinpcb(so));
5945
5946 if (tp == NULL) {
5947 return;
5948 }
5949
5950 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5951 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5952 struct tcpcb *, tp);
5953
5954 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5955 return;
5956 }
5957
5958 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5959 return;
5960 }
5961
5962 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5963 tp->t_mpflags |= TMPF_MPTCP_READY;
5964
5965 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5966 }
5967
5968 void
mptcp_notify_mpfail(struct socket * so)5969 mptcp_notify_mpfail(struct socket *so)
5970 {
5971 struct tcpcb *tp = NULL;
5972
5973 if (so == NULL) {
5974 return;
5975 }
5976
5977 tp = intotcpcb(sotoinpcb(so));
5978
5979 if (tp == NULL) {
5980 return;
5981 }
5982
5983 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5984 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5985 struct tcpcb *, tp);
5986
5987 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5988 return;
5989 }
5990
5991 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5992 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5993
5994 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5995 }
5996
5997 /*
5998 * Keepalive helper function
5999 */
6000 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6001 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6002 {
6003 boolean_t ret = 1;
6004
6005 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6006
6007 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6008 ret = 0;
6009 }
6010 return ret;
6011 }
6012
6013 /*
6014 * MPTCP t_maxseg adjustment function
6015 */
/*
 * MPTCP t_maxseg adjustment function: returns the number of bytes to
 * subtract from the subflow's MSS to leave room for the most common
 * MPTCP option (DSS + DATA_ACK) on every segment.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * NOTE(review): both branches below add 2 — with checksum for the
 * 16-bit DSS-checksum field, without it for 32-bit padding + EOL —
 * so the adjustment comes out the same either way.
 */
#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
	        mss_lower += 2; \
	else \
	        /* adjust to 32-bit boundary + EOL */ \
	        mss_lower += 2; \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6059
/*
 * Fill one mptcp_flow_t entry for the pcblist sysctl from a subflow
 * socket and its mptsub.  Addresses/ports come straight from the inpcb;
 * a subflow that is neither IPv4 nor IPv6 leaves them zeroed.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	/* Subflow-level state and bookkeeping */
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6094
/*
 * sysctl handler for net.inet.mptcp.pcblist: walks every MPTCP PCB and
 * emits one conninfo_mptcp_t (plus one mptcp_flow_t per subflow) into
 * the user buffer.  Read-only; writes are rejected with EPERM.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: report a padded estimate (n + n/8 records,
		 * assuming up to 4 flows per connection) */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8) * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the MP-level connection state */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				/* Allocation failure: stop the walk (error stays 0) */
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/* conninfo_mptcp_t embeds one flow at its tail, hence
			 * the nflows - 1 accounting and the trimmed copy-out */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		/* Fill and emit one entry per subflow */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6191
/* Register net.inet.mptcp.pcblist (read-only), served by mptcp_pcblist */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6195
6196 /*
6197 * Set notsent lowat mark on the MPTCB
6198 */
6199 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6200 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6201 {
6202 struct mptcb *mp_tp = NULL;
6203 int error = 0;
6204
6205 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6206 mp_tp = mpte->mpte_mptcb;
6207 }
6208
6209 if (mp_tp) {
6210 mp_tp->mpt_notsent_lowat = optval;
6211 } else {
6212 error = EINVAL;
6213 }
6214
6215 return error;
6216 }
6217
6218 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6219 mptcp_get_notsent_lowat(struct mptses *mpte)
6220 {
6221 struct mptcb *mp_tp = NULL;
6222
6223 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6224 mp_tp = mpte->mpte_mptcb;
6225 }
6226
6227 if (mp_tp) {
6228 return mp_tp->mpt_notsent_lowat;
6229 } else {
6230 return 0;
6231 }
6232 }
6233
/*
 * Returns 1 when the amount of unsent data sits at or below the
 * notsent-lowat threshold (socket should be reported writable),
 * 0 otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/* Writable if nothing is queued, or the unsent backlog
	 * (queued minus in-flight) is within the threshold */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Only the first ACTIVE subflow is consulted — the loop
		 * returns from inside its body. */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			return retval;
		}
	}
	return 0;
}
6283
/*
 * Kernel-control connect handler for the Symptoms advisory socket.
 * Records the control unit so advisories can be sent back.  Only one
 * opener is expected; a second open is logged but not rejected.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* OSIncrementAtomic returns the pre-increment value */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
	}

	mptcp_kern_skt_unit = sac->sc_unit;

	return 0;
}
6298
/*
 * Symptoms granted cell access for the app identified by `uuid'.
 * Walk every MPTCP session belonging to that app and re-evaluate its
 * subflows with MPTE_ACCESS_GRANTED (and, depending on `rssi',
 * MPTE_CELL_PROHIBITED) temporarily set.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Per-socket lock taken while holding the PCB-list mutex. */
		socket_lock(mp_so, 1);

		/*
		 * Match against the effective UUID for delegated sockets,
		 * the last-used UUID otherwise (uuid_compare() == 0 means
		 * equal, so non-zero means "not this app").
		 */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		/* RSSI above the threshold: Wi-Fi is fine, keep us off cell. */
		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* Flags are only meaningful for the duration of this pass. */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6342
/*
 * Symptoms reported a change in Wi-Fi quality. Re-evaluate subflows of
 * every session whose service type reacts to the Wi-Fi status.
 */
static void
mptcp_wifi_status_changed(void)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Per-socket lock taken while holding the PCB-list mutex. */
		socket_lock(mp_so, 1);

		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
			goto next;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_check_subflows_and_remove(mpte);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6374
/*
 * Context handed to proc_iterate() when resolving an executable UUID to
 * its proc_t (see mptcp_find_proc()).
 */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;      /* executable UUID we are looking for */
	proc_t found_proc;       /* matching proc, or PROC_NULL if none */
	boolean_t is_proc_found; /* lets the filter short-circuit after a hit */
};
6380
6381 static int
mptcp_find_proc_filter(proc_t p,void * arg)6382 mptcp_find_proc_filter(proc_t p, void *arg)
6383 {
6384 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6385 int found;
6386
6387 if (info->is_proc_found) {
6388 return 0;
6389 }
6390
6391 /*
6392 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6393 * expects != 0 for a matching filter.
6394 */
6395 found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6396 if (found) {
6397 info->is_proc_found = true;
6398 }
6399
6400 return found;
6401 }
6402
6403 static int
mptcp_find_proc_callout(proc_t p,void * arg)6404 mptcp_find_proc_callout(proc_t p, void * arg)
6405 {
6406 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6407
6408 if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6409 info->found_proc = p;
6410 return PROC_CLAIMED_DONE;
6411 }
6412
6413 return PROC_RETURNED;
6414 }
6415
6416 static proc_t
mptcp_find_proc(const uuid_t uuid)6417 mptcp_find_proc(const uuid_t uuid)
6418 {
6419 struct mptcp_uuid_search_info info;
6420
6421 uuid_copy(info.target_uuid, uuid);
6422 info.found_proc = PROC_NULL;
6423 info.is_proc_found = false;
6424
6425 proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6426 mptcp_find_proc_filter, &info);
6427
6428 return info.found_proc;
6429 }
6430
/*
 * Ask the Symptoms daemon (via the kernel-control socket) whether this
 * MPTCP session's owning app may use cellular. Resolves the responsible
 * pid/UUID (effective for delegated sockets, last-used otherwise), tags
 * the request with the task's foreground/background priority and sends
 * an MPTCP_SYMPTOMS_ASK_UUID message.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* No Symptoms client connected yet - nothing to ask. */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		/* Try the cached effective pid first. */
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		/* Fall back to a UUID-based search over all procs. */
		if (p == PROC_NULL) {
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			/* Cache the resolved pid for next time. */
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task's role onto the Symptoms priority classes. */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	/* Drop the reference taken by proc_find()/mptcp_find_proc(). */
	proc_rele(p);
}
6512
6513 static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)6514 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6515 void *unitinfo)
6516 {
6517 #pragma unused(kctlref, kcunit, unitinfo)
6518
6519 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6520
6521 return 0;
6522 }
6523
6524 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6525 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6526 mbuf_t m, int flags)
6527 {
6528 #pragma unused(kctlref, unitinfo, flags)
6529 symptoms_advisory_t *sa = NULL;
6530
6531 if (kcunit != mptcp_kern_skt_unit) {
6532 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6533 __func__, kcunit, mptcp_kern_skt_unit);
6534 }
6535
6536 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6537 mbuf_freem(m);
6538 return EINVAL;
6539 }
6540
6541 if (mbuf_len(m) < sizeof(*sa)) {
6542 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6543 __func__, mbuf_len(m), sizeof(*sa));
6544 mbuf_freem(m);
6545 return EINVAL;
6546 }
6547
6548 sa = mbuf_data(m);
6549
6550 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6551 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6552 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6553 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6554
6555 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6556 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6557 mptcp_wifi_status_changed();
6558 }
6559 } else {
6560 struct mptcp_symptoms_answer answer;
6561 errno_t err;
6562
6563 /* We temporarily allow different sizes for ease of submission */
6564 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6565 mbuf_len(m) != sizeof(answer)) {
6566 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6567 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6568 sizeof(answer));
6569 mbuf_free(m);
6570 return EINVAL;
6571 }
6572
6573 memset(&answer, 0, sizeof(answer));
6574
6575 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6576 if (err) {
6577 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6578 mbuf_free(m);
6579 return err;
6580 }
6581
6582 mptcp_allow_uuid(answer.uuid, answer.rssi);
6583 }
6584
6585 mbuf_freem(m);
6586 return 0;
6587 }
6588
6589 void
mptcp_control_register(void)6590 mptcp_control_register(void)
6591 {
6592 /* Set up the advisory control socket */
6593 struct kern_ctl_reg mptcp_kern_ctl;
6594
6595 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6596 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6597 sizeof(mptcp_kern_ctl.ctl_name));
6598 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6599 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6600 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6601 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6602
6603 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6604 }
6605
/*
 * Classify the current Wi-Fi quality for this session, combining the
 * Symptoms advisory state with the session's entitlements and service
 * type. Returns MPTCP_WIFI_QUALITY_{GOOD,BAD,UNSURE}.
 */
mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		/*
		 * First-party, non-handover sessions trust the Symptoms
		 * advisory directly - provided we actually have one
		 * (sa_wifi_status != 0).
		 */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return MPTCP_WIFI_QUALITY_UNSURE;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return MPTCP_WIFI_QUALITY_BAD;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the when wifi is considered bad), we only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return MPTCP_WIFI_QUALITY_GOOD;
			}

			return MPTCP_WIFI_QUALITY_BAD;
		}

		return MPTCP_WIFI_QUALITY_GOOD;
	}
}
6648
6649 boolean_t
symptoms_is_wifi_lossy(void)6650 symptoms_is_wifi_lossy(void)
6651 {
6652 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6653 }
6654
6655 int
mptcp_freeq(struct mptcb * mp_tp)6656 mptcp_freeq(struct mptcb *mp_tp)
6657 {
6658 struct tseg_qent *q;
6659 int rv = 0;
6660 int count = 0;
6661
6662 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6663 LIST_REMOVE(q, tqe_q);
6664 m_freem(q->tqe_m);
6665 zfree(tcp_reass_zone, q);
6666 count++;
6667 rv = 1;
6668 }
6669 mp_tp->mpt_reassqlen = 0;
6670
6671 if (count > 0) {
6672 OSAddAtomic(-count, &mptcp_reass_total_qlen);
6673 }
6674
6675 return rv;
6676 }
6677
6678 static int
mptcp_post_event(u_int32_t event_code,int value)6679 mptcp_post_event(u_int32_t event_code, int value)
6680 {
6681 struct kev_mptcp_data event_data;
6682 struct kev_msg ev_msg;
6683
6684 memset(&ev_msg, 0, sizeof(ev_msg));
6685
6686 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6687 ev_msg.kev_class = KEV_NETWORK_CLASS;
6688 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6689 ev_msg.event_code = event_code;
6690
6691 event_data.value = value;
6692
6693 ev_msg.dv[0].data_ptr = &event_data;
6694 ev_msg.dv[0].data_length = sizeof(event_data);
6695
6696 return kev_post_msg(&ev_msg);
6697 }
6698
/*
 * Mark this subflow as using cellular and, if this is the first cell
 * user system-wide, flip the cell indicator in the status bar via a
 * kernel event. Per-session (mpte_cellicon_increments) and global
 * (mptcp_cellicon_refcount) counters track how many users keep the
 * icon alive.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Schedule the debounce timer that will eventually clear the icon. */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		/* Session thinks it holds the icon but the global count
		 * disagrees - log the inconsistency and re-set it. */
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	/* OSIncrementAtomic returns the pre-increment value. */
	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6768
6769 void
mptcp_clear_cellicon(void)6770 mptcp_clear_cellicon(void)
6771 {
6772 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6773
6774 if (error) {
6775 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6776 __func__, error);
6777 } else {
6778 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6779 __func__);
6780 }
6781 }
6782
6783 /*
6784 * Returns true if the icon has been flipped to WiFi.
6785 */
6786 static boolean_t
__mptcp_unset_cellicon(uint32_t val)6787 __mptcp_unset_cellicon(uint32_t val)
6788 {
6789 VERIFY(val < INT32_MAX);
6790 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
6791 return false;
6792 }
6793
6794 mptcp_clear_cellicon();
6795
6796 return true;
6797 }
6798
/*
 * Release `val' cell-icon references held by this session (optionally
 * tied to subflow `mpts'), clearing the status-bar indicator when the
 * global refcount drops to zero. Counter inconsistencies are logged and
 * clamped rather than trusted.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	/* Global count already zero: state is inconsistent, bail out. */
	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only release for subflows that actually hold a reference. */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Clamp so we never decrement more than this session holds. */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	/* Returns true only when the global refcount reached zero. */
	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6845
6846 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6847 mptcp_reset_rexmit_state(struct tcpcb *tp)
6848 {
6849 struct mptsub *mpts;
6850 struct inpcb *inp;
6851 struct socket *so;
6852
6853 inp = tp->t_inpcb;
6854 if (inp == NULL) {
6855 return;
6856 }
6857
6858 so = inp->inp_socket;
6859 if (so == NULL) {
6860 return;
6861 }
6862
6863 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6864 return;
6865 }
6866
6867 mpts = tp->t_mpsub;
6868
6869 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6870 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6871 }
6872
6873 void
mptcp_reset_keepalive(struct tcpcb * tp)6874 mptcp_reset_keepalive(struct tcpcb *tp)
6875 {
6876 struct mptsub *mpts = tp->t_mpsub;
6877
6878 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6879 }
6880
6881 static struct mppcb *
mtcp_alloc(void)6882 mtcp_alloc(void)
6883 {
6884 return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
6885 }
6886
6887 static void
mtcp_free(struct mppcb * mpp)6888 mtcp_free(struct mppcb *mpp)
6889 {
6890 struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);
6891
6892 kfree_type(struct mpp_mtp, mtp);
6893 }
6894
6895 /*
6896 * Protocol pr_init callback.
6897 */
/*
 * Protocol pr_init callback.
 *
 * One-time MPTCP initialization: clone the IPv4/IPv6 TCP protosw and
 * usrreqs into subflow-specific copies (overriding soreceive/sosend/
 * rcvoob), set up the MPTCP PCB info (allocator, lock, GC and timer
 * callbacks) and attach it to the MP domain.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	/* Until Symptoms says otherwise, assume Wi-Fi is usable. */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	/* Detach the copy from the global protosw list. */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same cloning procedure for the IPv6 TCP protosw. */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Set up the MPTCP PCB info: allocator, lock, GC/timer hooks. */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_alloc = mtcp_alloc;
	mtcbinfo.mppi_free = mtcp_free;

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
6981