1 /*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
111
/* Forward declarations of file-local helpers defined further below */
static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
static int mptcp_freeq(struct mptcb *mp_tp);

/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
 * indicate errors or actions which require immediate attention; they will
 * prevent the rest of the handlers from processing their respective events
 * until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE = 1,          /* delete this subflow */
	MPTS_EVRET_OK = 2,              /* OK */
	MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
} ev_ret_t;

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_do_sha256(mptcp_key_t *, char *);

static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);

/* Typed allocation zones for subflows, socket options and subflow auth */
static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
    NET_KT_DEFAULT);

/* Global PCB-info; tracks all Multipath PCB instances in the system */
struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");

/* 0 disables the alternate port; consumed in mptcp_session_create() */
static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;

static uint8_t mptcp_create_subflows_scheduled;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

/* Number of cell-icon increments currently outstanding across all sessions */
uint32_t mptcp_cellicon_refcount = 0;

os_log_t mptcp_log_handle;
172
173 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)174 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
175 {
176 int i, index = -1;
177
178 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
179 if (create && stats[i].ifindex == IFSCOPE_NONE) {
180 if (index < 0) {
181 index = i;
182 }
183 continue;
184 }
185
186 if (stats[i].ifindex == ifindex) {
187 index = i;
188 return index;
189 }
190 }
191
192 if (index != -1) {
193 stats[index].ifindex = ifindex;
194 }
195
196 return index;
197 }
198
199 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)200 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
201 {
202 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
203 int index;
204
205 if (ifp == NULL) {
206 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
207 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
208 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
209 return -1;
210 }
211
212 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
213
214 if (index != -1) {
215 if (stats[index].is_expensive == 0) {
216 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
217 }
218 }
219
220 return index;
221 }
222
223 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)224 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
225 {
226 int index;
227
228 tcpstat.tcps_mp_switches++;
229 mpte->mpte_subflow_switches++;
230
231 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
232
233 if (index != -1) {
234 mpte->mpte_itfstats[index].switches++;
235 }
236 }
237
238 /*
239 * Flushes all recorded socket options from an MP socket.
240 */
241 static void
mptcp_flush_sopts(struct mptses * mpte)242 mptcp_flush_sopts(struct mptses *mpte)
243 {
244 struct mptopt *mpo, *tmpo;
245
246 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
247 mptcp_sopt_remove(mpte, mpo);
248 mptcp_sopt_free(mpo);
249 }
250 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
251 }
252
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The Multipath PCB (mppcb), the session (mptses) and the MPTCP PCB (mptcb)
 * live in a single mpp_mtp allocation; the container is recovered from the
 * PCB pointer and the three structures are wired together here.
 * Always returns 0.
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mpp_mtp *mtp;
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* All three structures share one memory block (mpp_mtp layout) */
	mtp = __container_of(mpp, struct mpp_mtp, mpp);
	mpte = &mtp->mpp_ses;
	mp_tp = &mtp->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	/* No association/connection IDs assigned yet */
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start with the embedded itf-info array; may be grown later */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* 0 disables the alternate port; reject out-of-range sysctl values */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
304
305 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)306 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
307 {
308 if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
309 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
310 }
311
312 if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
313 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
314 }
315
316 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
317 * meaning we prefer IPv6 over IPv4.
318 */
319 if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
320 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
321 }
322
323 if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
324 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
325 }
326
327 /* We don't yet have a unicast IP */
328 return NULL;
329 }
330
331 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)332 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
333 uint64_t *cellbytes, uint64_t *allbytes)
334 {
335 int64_t mycellbytes = 0;
336 uint64_t myallbytes = 0;
337 int i;
338
339 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
340 if (mpte->mpte_itfstats[i].is_expensive) {
341 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
342 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
343 }
344
345 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
346 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
347 }
348
349 if (initial_cell) {
350 mycellbytes -= mpte->mpte_init_txbytes;
351 mycellbytes -= mpte->mpte_init_rxbytes;
352 }
353
354 if (mycellbytes < 0) {
355 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
356 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
357 *cellbytes = 0;
358 *allbytes = 0;
359 } else {
360 *cellbytes = mycellbytes;
361 *allbytes = myallbytes;
362 }
363 }
364
365 static void
mptcpstats_session_wrapup(struct mptses * mpte)366 mptcpstats_session_wrapup(struct mptses *mpte)
367 {
368 boolean_t cell = mpte->mpte_initial_cell;
369
370 switch (mpte->mpte_svctype) {
371 case MPTCP_SVCTYPE_HANDOVER:
372 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
373 tcpstat.tcps_mptcp_fp_handover_attempt++;
374
375 if (cell && mpte->mpte_handshake_success) {
376 tcpstat.tcps_mptcp_fp_handover_success_cell++;
377
378 if (mpte->mpte_used_wifi) {
379 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
380 }
381 } else if (mpte->mpte_handshake_success) {
382 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
383
384 if (mpte->mpte_used_cell) {
385 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
386 }
387 }
388 } else {
389 tcpstat.tcps_mptcp_handover_attempt++;
390
391 if (cell && mpte->mpte_handshake_success) {
392 tcpstat.tcps_mptcp_handover_success_cell++;
393
394 if (mpte->mpte_used_wifi) {
395 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
396 }
397 } else if (mpte->mpte_handshake_success) {
398 tcpstat.tcps_mptcp_handover_success_wifi++;
399
400 if (mpte->mpte_used_cell) {
401 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
402 }
403 }
404 }
405
406 if (mpte->mpte_handshake_success) {
407 uint64_t cellbytes;
408 uint64_t allbytes;
409
410 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
411
412 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
413 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
414 }
415 break;
416 case MPTCP_SVCTYPE_INTERACTIVE:
417 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
418 tcpstat.tcps_mptcp_fp_interactive_attempt++;
419
420 if (mpte->mpte_handshake_success) {
421 tcpstat.tcps_mptcp_fp_interactive_success++;
422
423 if (!cell && mpte->mpte_used_cell) {
424 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
425 }
426 }
427 } else {
428 tcpstat.tcps_mptcp_interactive_attempt++;
429
430 if (mpte->mpte_handshake_success) {
431 tcpstat.tcps_mptcp_interactive_success++;
432
433 if (!cell && mpte->mpte_used_cell) {
434 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
435 }
436 }
437 }
438
439 if (mpte->mpte_handshake_success) {
440 uint64_t cellbytes;
441 uint64_t allbytes;
442
443 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
444
445 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
446 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
447 }
448 break;
449 case MPTCP_SVCTYPE_AGGREGATE:
450 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
451 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
452
453 if (mpte->mpte_handshake_success) {
454 tcpstat.tcps_mptcp_fp_aggregate_success++;
455 }
456 } else {
457 tcpstat.tcps_mptcp_aggregate_attempt++;
458
459 if (mpte->mpte_handshake_success) {
460 tcpstat.tcps_mptcp_aggregate_success++;
461 }
462 }
463
464 if (mpte->mpte_handshake_success) {
465 uint64_t cellbytes;
466 uint64_t allbytes;
467
468 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
469
470 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
471 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
472 }
473 break;
474 }
475
476 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
477 tcpstat.tcps_mptcp_back_to_wifi++;
478 }
479
480 if (mpte->mpte_triggered_cell) {
481 tcpstat.tcps_mptcp_triggered_cell++;
482 }
483 }
484
/*
 * Destroy an MPTCP session.
 *
 * Invoked by the garbage collector once every subflow is gone
 * (mpte_numflows == 0).  Records final statistics, releases the
 * session's cell-icon increments, flushes recorded socket options
 * and frees the segment and reinject queues.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	mptcpstats_session_wrapup(mpte);
	/* Drop every cell-icon increment this session still holds */
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/* The itfinfo array is only heap-allocated if it was grown
	 * beyond the embedded _mpte_itfinfo storage. */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data(mpte->mpte_itfinfo,
		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;

	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
512
513 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)514 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
515 {
516 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
517 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
518 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
519 }
520
/*
 * Synthesize an IPv6 address by embedding the IPv4 address 'addrv4' into
 * the NAT64 prefix already present in 'addr' (prefix length 'len' in bits),
 * following the RFC 6052 address format.  Returns 0 on success, or -1 when
 * the IPv4 address must not be represented in a NAT64 mapping (special-use
 * ranges, and private/shared space under the well-known prefix).
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
	/* RFC 6052 well-known prefix 64:ff9b::/96 */
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00},
	};
	const char *ptrv4 = (const char *)addrv4;
	char *ptr = (char *)addr;

	/* Special-use IPv4 ranges are never eligible for NAT64 synthesis */
	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	/*
	 * Embed the four IPv4 octets after the prefix.  For prefixes
	 * shorter than 96 bits the IPv4 address straddles octet 8, which
	 * RFC 6052 reserves as the "u" octet (must be zero) - hence the
	 * split copies that skip from offset 7 to offset 9.
	 */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		/* Caller passed a length not in the NAT64_PREFIX_LEN_* set */
		panic("NAT64-prefix len is wrong: %u", len);
	}

	return 0;
}
580
581 static void
mptcp_trigger_cell_bringup(struct mptses * mpte)582 mptcp_trigger_cell_bringup(struct mptses *mpte)
583 {
584 struct socket *mp_so = mptetoso(mpte);
585
586 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
587 uuid_string_t uuidstr;
588 int err;
589
590 socket_unlock(mp_so, 0);
591 err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
592 TRUE);
593 socket_lock(mp_so, 0);
594
595 if (err == 0) {
596 mpte->mpte_triggered_cell = 1;
597 }
598
599 uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
600 os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
601 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
602 } else {
603 os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
604 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
605 }
606 }
607
608 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)609 mptcp_subflow_disconnecting(struct mptsub *mpts)
610 {
611 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
612 return true;
613 }
614
615 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
616 return true;
617 }
618
619 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
620 return true;
621 }
622
623 return false;
624 }
625
626 /*
627 * In Handover mode, only create cell subflow if
628 * - Symptoms marked WiFi as weak:
629 * Here, if we are sending data, then we can check the RTO-state. That is a
630 * stronger signal of WiFi quality than the Symptoms indicator.
631 * If however we are not sending any data, the only thing we can do is guess
632 * and thus bring up Cell.
633 *
634 * - Symptoms marked WiFi as unknown:
635 * In this state we don't know what the situation is and thus remain
636 * conservative, only bringing up cell if there are retransmissions going on.
637 */
638 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)639 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
640 {
641 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
642
643 if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
644 /* WiFi is good - don't use cell */
645 return false;
646 }
647
648 if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
649 /*
650 * We are in unknown state, only use Cell if we have confirmed
651 * that WiFi is bad.
652 */
653 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
654 return true;
655 } else {
656 return false;
657 }
658 }
659
660 if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
661 /*
662 * WiFi is confirmed to be bad from Symptoms-Framework.
663 * If we are sending data, check the RTOs.
664 * Otherwise, be pessimistic and use Cell.
665 */
666 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
667 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
668 return true;
669 } else {
670 return false;
671 }
672 } else {
673 return true;
674 }
675 }
676
677 return false;
678 }
679
/*
 * Walk the session's interface list and create a subflow on every
 * usable interface that does not have one yet.
 *
 * For cellular interfaces the handover/target-based policies decide
 * whether cell is actually wanted; third-party apps may first have to
 * get permission from Symptoms (mptcp_ask_symptoms()).  IPv4
 * destinations on NAT64-only interfaces are synthesized into IPv6.
 * If no cellular interface is currently viable but the policy wants
 * one, cell bring-up is triggered at the end.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* Handover modes skip cell entirely while WiFi is good */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check the existing subflows before creating a new one */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* WiFi suffices while we are before the target
				 * time, or while Symptoms says WiFi is good. */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Third-party apps without a grant must ask Symptoms first */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* IPv4 destination on a NAT64-only interface: synthesize IPv6 */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Use the first prefix with a non-zero length */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Interface cannot carry the destination's family - skip */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
893
894 static void
mptcp_remove_cell_subflows(struct mptses * mpte)895 mptcp_remove_cell_subflows(struct mptses *mpte)
896 {
897 struct mptsub *mpts, *tmpts;
898
899 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
900 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
901
902 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
903 continue;
904 }
905
906 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
907 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
908
909 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
910 }
911
912 return;
913 }
914
915 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)916 mptcp_remove_wifi_subflows(struct mptses *mpte)
917 {
918 struct mptsub *mpts, *tmpts;
919
920 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
921 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
922
923 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
924 continue;
925 }
926
927 os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
928 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
929
930 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
931 }
932
933 return;
934 }
935
/*
 * Pure-handover policy: keep exactly one "technology" alive.
 * If a working WiFi subflow exists (or WiFi quality is good), remove
 * all cellular subflows; otherwise, if a working cellular subflow
 * exists, remove the WiFi subflows instead.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working WiFi subflow, we keep the cellular
	 * subflows and - when a working cellular subflow exists and WiFi is
	 * not good - remove the WiFi subflows instead (see below).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-closing subflows count as working */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* WiFi only counts as working if handover would not use cell */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		/* Cell is carrying the connection - drop WiFi instead */
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
998
/*
 * Handover policy: once at least one non-cellular subflow is working
 * well (established and not in a state where handover would prefer
 * cell), the cellular subflows are no longer needed and are removed.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1046
1047 static void
mptcp_targetbased_subflows_remove(struct mptses * mpte)1048 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1049 {
1050 uint64_t time_now = mach_continuous_time();
1051 struct mptsub *mpts;
1052
1053 if (mpte->mpte_time_target != 0 &&
1054 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1055 mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
1056 /* WiFi is bad and we are below the target - don't remove any subflows */
1057 return;
1058 }
1059
1060 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1061 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1062
1063 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1064 continue;
1065 }
1066
1067 /* We have a functioning subflow on WiFi. No need for cell! */
1068 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1069 !mptcp_subflow_disconnecting(mpts)) {
1070 mptcp_remove_cell_subflows(mpte);
1071 break;
1072 }
1073 }
1074 }
1075
1076 /*
1077 * Based on the MPTCP Service-type and the state of the subflows, we
1078 * will destroy subflows here.
1079 */
1080 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1081 mptcp_check_subflows_and_remove(struct mptses *mpte)
1082 {
1083 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1084 return;
1085 }
1086
1087 socket_lock_assert_owned(mptetoso(mpte));
1088
1089 if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1090 mptcp_pure_handover_subflows_remove(mpte);
1091 }
1092
1093 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1094 mptcp_handover_subflows_remove(mpte);
1095 }
1096
1097 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1098 mptcp_targetbased_subflows_remove(mpte);
1099 }
1100 }
1101
/*
 * Remove subflows that no longer match the session's usable interfaces.
 *
 * A subflow is signalled for teardown (via SO_FILT_HINT_NOSRCADDR) when
 * either (a) NECP requested its closure (MPTSF_CLOSE_REQD), or (b) its
 * interface index no longer appears in mpte_itfinfo with an address
 * family compatible with the subflow's destination.
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	/* _SAFE variant: the soevent below may lead to removal of entries */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* NECP flagged this subflow as non-viable: close it. */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* Unbound and not yet routed out an interface: nothing to match. */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual output interface, fall back to the scope. */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/*
		 * Look for the subflow's interface in the session's
		 * interface-info array, requiring address-family
		 * compatibility (v6 needs a v6 or NAT64 path, v4 needs v4).
		 */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		/* Interface went away or lost connectivity for this family. */
		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1169
/*
 * Deferred-work callback (scheduled via timeout() from
 * mptcp_sched_create_subflows): walk all MPTCP PCBs and, for those marked
 * MPP_CREATE_SUBFLOWS, (re-)evaluate which subflows to add and remove.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
			continue;
		}

		socket_lock(mp_so, 1);
		/* scheduler took a use-count; the socket must still be alive */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1209
1210 /*
1211 * We need this because we are coming from an NECP-event. This event gets posted
1212 * while holding NECP-locks. The creation of the subflow however leads us back
1213 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1214 * So, we would deadlock there as we already hold the NECP-lock.
1215 *
1216 * So, let's schedule this separately. It also gives NECP the chance to make
1217 * progress, without having to wait for MPTCP to finish its subflow creation.
1218 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	/* Bail out if the MPTCP state no longer permits new subflows. */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/*
	 * Flag this connection for the deferred worker; the use-count taken
	 * here is dropped in mptcp_create_subflows() once processed.
	 */
	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	/* Already scheduled: one timeout services all flagged connections. */
	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}
1244
1245 /*
1246 * Allocate an MPTCP socket option structure.
1247 */
1248 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1249 mptcp_sopt_alloc(zalloc_flags_t how)
1250 {
1251 return zalloc_flags(mptopt_zone, how | Z_ZERO);
1252 }
1253
1254 /*
1255 * Free an MPTCP socket option structure.
1256 */
1257 void
mptcp_sopt_free(struct mptopt * mpo)1258 mptcp_sopt_free(struct mptopt *mpo)
1259 {
1260 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1261
1262 zfree(mptopt_zone, mpo);
1263 }
1264
1265 /*
1266 * Add a socket option to the MPTCP socket option list.
1267 */
1268 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1269 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1270 {
1271 socket_lock_assert_owned(mptetoso(mpte));
1272 mpo->mpo_flags |= MPOF_ATTACHED;
1273 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1274 }
1275
1276 /*
1277 * Remove a socket option from the MPTCP socket option list.
1278 */
1279 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1280 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1281 {
1282 socket_lock_assert_owned(mptetoso(mpte));
1283 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1284 mpo->mpo_flags &= ~MPOF_ATTACHED;
1285 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1286 }
1287
1288 /*
1289 * Search for an existing <sopt_level,sopt_name> socket option.
1290 */
1291 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1292 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1293 {
1294 struct mptopt *mpo;
1295
1296 socket_lock_assert_owned(mptetoso(mpte));
1297
1298 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1299 if (mpo->mpo_level == sopt->sopt_level &&
1300 mpo->mpo_name == sopt->sopt_name) {
1301 break;
1302 }
1303 }
1304 return mpo;
1305 }
1306
1307 /*
1308 * Allocate a MPTCP subflow structure.
1309 */
1310 static struct mptsub *
mptcp_subflow_alloc(void)1311 mptcp_subflow_alloc(void)
1312 {
1313 return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1314 }
1315
1316 /*
1317 * Deallocate a subflow structure, called when all of the references held
1318 * on it have been released. This implies that the subflow has been deleted.
1319 */
1320 static void
mptcp_subflow_free(struct mptsub * mpts)1321 mptcp_subflow_free(struct mptsub *mpts)
1322 {
1323 VERIFY(mpts->mpts_refcnt == 0);
1324 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1325 VERIFY(mpts->mpts_mpte == NULL);
1326 VERIFY(mpts->mpts_socket == NULL);
1327
1328 free_sockaddr(mpts->mpts_src);
1329
1330 zfree(mptsub_zone, mpts);
1331 }
1332
/* Take a reference on a subflow; panic on refcount wraparound. */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1341
1342 static void
mptcp_subflow_remref(struct mptsub * mpts)1343 mptcp_subflow_remref(struct mptsub *mpts)
1344 {
1345 if (mpts->mpts_refcnt == 0) {
1346 panic("%s: mpts %p negative refcnt", __func__, mpts);
1347 /* NOTREACHED */
1348 }
1349 if (--mpts->mpts_refcnt > 0) {
1350 return;
1351 }
1352
1353 /* callee will unlock and destroy lock */
1354 mptcp_subflow_free(mpts);
1355 }
1356
/*
 * Link a freshly created subflow socket into its MPTCP session: set the
 * TCP PCB's MPTCP back-pointers, mark the socket as a subflow, insert the
 * subflow into the session's list and take the list/socket references.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	/* atomic set: the flag word may be observed from other contexts */
	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts); /* for subflow socket */
}
1386
/*
 * NECP viability callback for a subflow's inpcb. When the subflow becomes
 * non-viable (or its interface enters low-power mode, which is treated the
 * same), flag the subflow for closure and kick the subflow scheduler so a
 * replacement can be created.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* A low-power interface is handled like a non-viable one. */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Processed later by mptcp_remove_subflows() */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/*
	 * For handover/target-based sessions, report the flow as still
	 * viable to NECP: MPTCP itself manages the failover.
	 */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1441
1442 /*
1443 * Create an MPTCP subflow socket.
1444 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	struct mppcb *mpp;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);
	mpp = mpsotomppcb(mp_so);

	/* Create the subflow on behalf of the process owning the MP socket. */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		mptcp_subflow_free(mpts);
		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file desciptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	}
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
	}
	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
	}

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpp->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		socket_unlock(mp_so, 0);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpp->necp_client_uuid, sotoinpcb(*so));
		socket_lock(mp_so, 0);

		if (error) {
			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

			goto out_err;
		}

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_tp->mpt_state, mp_tp->mpt_flags);

			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
	}

	/*
	 * Copy the NECP attribute strings (domain, account, domain owner,
	 * tracker domain) from the MP PCB onto the subflow's inpcb. An
	 * allocation failure here is tolerated: the attribute simply stays
	 * NULL on the subflow.
	 */
	if (mpp->inp_necp_attributes.inp_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_account != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
		}
	}

	/* Needs to happen prior to the delegation! */
	(*so)->last_pid = mp_so->last_pid;

	/* Propagate effective pid/uuid for delegated sockets. */
	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid) {
			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
		if (!uuid_is_null(mpte->mpte_euuid)) {
			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof(smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
			goto out_err;
		}
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
			continue;
		}

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE)) {
			continue;
		}

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			/* interim options that fail are dropped from the list */
			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return 0;

out_err:
	/* Abort tears the half-set-up subflow down again. */
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	return error;
}
1726
1727 /*
1728 * Close an MPTCP subflow socket.
1729 *
1730 * Note that this may be called on an embryonic subflow, and the only
1731 * thing that is guaranteed valid is the protocol-user request.
1732 */
1733 static void
mptcp_subflow_soclose(struct mptsub * mpts)1734 mptcp_subflow_soclose(struct mptsub *mpts)
1735 {
1736 struct socket *so = mpts->mpts_socket;
1737
1738 if (mpts->mpts_flags & MPTSF_CLOSED) {
1739 return;
1740 }
1741
1742 VERIFY(so != NULL);
1743 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1744 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1745
1746 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1747 struct socket *, so,
1748 struct sockbuf *, &so->so_rcv,
1749 struct sockbuf *, &so->so_snd,
1750 struct mptses *, mpts->mpts_mpte);
1751
1752 mpts->mpts_flags |= MPTSF_CLOSED;
1753
1754 if (so->so_retaincnt == 0) {
1755 soclose_locked(so);
1756
1757 return;
1758 } else {
1759 VERIFY(so->so_usecount > 0);
1760 so->so_usecount--;
1761 }
1762
1763 return;
1764 }
1765
1766 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1767 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1768 {
1769 struct tcpcb *tp = sototcpcb(so);
1770 struct mptcp_subf_auth_entry *sauth_entry;
1771
1772 /*
1773 * The address ID of the first flow is implicitly 0.
1774 */
1775 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1776 tp->t_local_aid = 0;
1777 } else {
1778 tp->t_local_aid = addr_id;
1779 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1780 so->so_flags |= SOF_MP_SEC_SUBFLOW;
1781 }
1782 sauth_entry = zalloc(mpt_subauth_zone);
1783 sauth_entry->msae_laddr_id = tp->t_local_aid;
1784 sauth_entry->msae_raddr_id = 0;
1785 sauth_entry->msae_raddr_rand = 0;
1786 try_again:
1787 sauth_entry->msae_laddr_rand = RandomULong();
1788 if (sauth_entry->msae_laddr_rand == 0) {
1789 goto try_again;
1790 }
1791 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1792 }
1793
1794 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1795 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1796 {
1797 struct mptcp_subf_auth_entry *sauth_entry;
1798 struct tcpcb *tp = NULL;
1799 int found = 0;
1800
1801 tp = sototcpcb(so);
1802 if (tp == NULL) {
1803 return;
1804 }
1805
1806 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1807 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1808 found = 1;
1809 break;
1810 }
1811 }
1812 if (found) {
1813 LIST_REMOVE(sauth_entry, msae_next);
1814 }
1815
1816 if (found) {
1817 zfree(mpt_subauth_zone, sauth_entry);
1818 }
1819 }
1820
1821 /*
1822 * Connect an MPTCP subflow socket.
1823 *
1824 * Note that in the pending connect case, the subflow socket may have been
1825 * bound to an interface and/or a source IP address which may no longer be
1826 * around by the time this routine is called; in that case the connect attempt
1827 * will most likely fail.
1828 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	/* Must be mid-connect: CONNECTING set, CONNECTED not yet. */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Render the destination address/port for logging. */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	/* Connect on behalf of the process owning the MP socket. */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	/* Set up the local address ID and subflow-auth entry. */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	/* Remember the subflow's initial send sequence number. */
	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		/* skip 0: it is reserved for the first flow */
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
1903
/*
 * Apply/validate an MPTCP receive mapping (DSN dsn, subflow-seq rseq,
 * length dlen, DATA_FIN flag dfin) on mbuf m at offset off within the
 * mapping. Splits m when it extends past the mapping's right edge, then
 * stamps the MPTCP metadata into the pkthdr. Returns 0 on success, -1
 * after posting a MUSTRST event on an invalid second mapping or a failed
 * split.
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	if (m_pktlen(m) == 0) {
		return 0;
	}

	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	/* An already-stamped mbuf must agree with the incoming mapping. */
	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			/* Mapping mismatch is fatal for this subflow. */
			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-chain the split-off tail behind m in the receive buffer. */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp the mapping (advanced by off) into the packet header. */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1978
1979 /*
1980 * Update the pid, upid, uuid of the subflow so, based on parent so
1981 */
1982 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1983 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1984 {
1985 if (so->last_pid != mp_so->last_pid ||
1986 so->last_upid != mp_so->last_upid) {
1987 so->last_upid = mp_so->last_upid;
1988 so->last_pid = mp_so->last_pid;
1989 uuid_copy(so->last_uuid, mp_so->last_uuid);
1990 }
1991 so_update_policy(so);
1992 }
1993
1994 /*
1995 * MPTCP subflow socket receive routine, derived from soreceive().
1996 */
1997 static int
mptcp_subflow_soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)1998 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1999 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2000 {
2001 #pragma unused(uio)
2002 struct socket *mp_so;
2003 struct mptses *mpte;
2004 struct mptcb *mp_tp;
2005 int flags, error = 0;
2006 struct mbuf *m, **mp = mp0;
2007 struct tcpcb *tp = sototcpcb(so);
2008
2009 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
2010 mp_so = mptetoso(mpte);
2011 mp_tp = mpte->mpte_mptcb;
2012
2013 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2014
2015 #ifdef MORE_LOCKING_DEBUG
2016 if (so->so_usecount == 1) {
2017 panic("%s: so=%x no other reference on socket", __func__, so);
2018 /* NOTREACHED */
2019 }
2020 #endif
2021 /*
2022 * We return all that is there in the subflow's socket receive buffer
2023 * to the MPTCP layer, so we require that the caller passes in the
2024 * expected parameters.
2025 */
2026 if (mp == NULL || controlp != NULL) {
2027 return EINVAL;
2028 }
2029
2030 *mp = NULL;
2031 if (psa != NULL) {
2032 *psa = NULL;
2033 }
2034 if (flagsp != NULL) {
2035 flags = *flagsp & ~MSG_EOR;
2036 } else {
2037 flags = 0;
2038 }
2039
2040 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2041 return EOPNOTSUPP;
2042 }
2043
2044 flags |= (MSG_DONTWAIT | MSG_NBIO);
2045
2046 /*
2047 * If a recv attempt is made on a previously-accepted socket
2048 * that has been marked as inactive (disconnected), reject
2049 * the request.
2050 */
2051 if (so->so_flags & SOF_DEFUNCT) {
2052 struct sockbuf *sb = &so->so_rcv;
2053
2054 error = ENOTCONN;
2055 /*
2056 * This socket should have been disconnected and flushed
2057 * prior to being returned from sodefunct(); there should
2058 * be no data on its receive list, so panic otherwise.
2059 */
2060 if (so->so_state & SS_DEFUNCT) {
2061 sb_empty_assert(sb, __func__);
2062 }
2063 return error;
2064 }
2065
2066 /*
2067 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2068 * and if so just return to the caller. This could happen when
2069 * soreceive() is called by a socket upcall function during the
2070 * time the socket is freed. The socket buffer would have been
2071 * locked across the upcall, therefore we cannot put this thread
2072 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2073 * we may livelock), because the lock on the socket buffer will
2074 * only be released when the upcall routine returns to its caller.
2075 * Because the socket has been officially closed, there can be
2076 * no further read on it.
2077 *
2078 * A multipath subflow socket would have its SS_NOFDREF set by
2079 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2080 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2081 */
2082 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2083 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2084 return 0;
2085 }
2086
2087 /*
2088 * For consistency with soreceive() semantics, we need to obey
2089 * SB_LOCK in case some other code path has locked the buffer.
2090 */
2091 error = sblock(&so->so_rcv, 0);
2092 if (error != 0) {
2093 return error;
2094 }
2095
2096 m = so->so_rcv.sb_mb;
2097 if (m == NULL) {
2098 /*
2099 * Panic if we notice inconsistencies in the socket's
2100 * receive list; both sb_mb and sb_cc should correctly
2101 * reflect the contents of the list, otherwise we may
2102 * end up with false positives during select() or poll()
2103 * which could put the application in a bad state.
2104 */
2105 SB_MB_CHECK(&so->so_rcv);
2106
2107 if (so->so_error != 0) {
2108 error = so->so_error;
2109 so->so_error = 0;
2110 goto release;
2111 }
2112
2113 if (so->so_state & SS_CANTRCVMORE) {
2114 goto release;
2115 }
2116
2117 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2118 error = ENOTCONN;
2119 goto release;
2120 }
2121
2122 /*
2123 * MSG_DONTWAIT is implicitly defined and this routine will
2124 * never block, so return EWOULDBLOCK when there is nothing.
2125 */
2126 error = EWOULDBLOCK;
2127 goto release;
2128 }
2129
2130 mptcp_update_last_owner(so, mp_so);
2131
2132 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2133 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2134
2135 while (m != NULL) {
2136 int dlen = 0, error_out = 0, off = 0;
2137 uint8_t dfin = 0;
2138 struct mbuf *start = m;
2139 uint64_t dsn;
2140 uint32_t sseq;
2141 uint16_t orig_dlen;
2142 uint16_t csum;
2143
2144 VERIFY(m->m_nextpkt == NULL);
2145
2146 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2147 fallback:
2148 /* Just move mbuf to MPTCP-level */
2149
2150 sbfree(&so->so_rcv, m);
2151
2152 if (mp != NULL) {
2153 *mp = m;
2154 mp = &m->m_next;
2155 so->so_rcv.sb_mb = m = m->m_next;
2156 *mp = NULL;
2157 }
2158
2159 if (m != NULL) {
2160 so->so_rcv.sb_lastrecord = m;
2161 } else {
2162 SB_EMPTY_FIXUP(&so->so_rcv);
2163 }
2164
2165 continue;
2166 } else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2167 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
2168 boolean_t found_mapping = false;
2169 int parsed_length = 0;
2170 struct mbuf *m_iter;
2171
2172 /*
2173 * No MPTCP-option in the header. Either fallback or
2174 * wait for additional mappings.
2175 */
2176 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
2177 /* data arrived without a DSS option mapping */
2178
2179 /* initial subflow can fallback right after SYN handshake */
2180 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
2181 mptcp_notify_mpfail(so);
2182
2183 goto fallback;
2184 } else {
2185 os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
2186 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2187 mpts->mpts_connid);
2188 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2189
2190 error = EIO;
2191 *mp0 = NULL;
2192 goto release;
2193 }
2194 }
2195
2196 /* Thus, let's look for an mbuf with the mapping */
2197 m_iter = m->m_next;
2198 parsed_length = m->m_len;
2199 while (m_iter != NULL && parsed_length < UINT16_MAX) {
2200 if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2201 parsed_length += m_iter->m_len;
2202 m_iter = m_iter->m_next;
2203 continue;
2204 }
2205
2206 found_mapping = true;
2207
2208 /* Found an mbuf with a DSS-mapping */
2209 orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
2210 dsn = m_iter->m_pkthdr.mp_dsn;
2211 sseq = m_iter->m_pkthdr.mp_rseq;
2212 csum = m_iter->m_pkthdr.mp_csum;
2213
2214 if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2215 dfin = 1;
2216 dlen--;
2217 }
2218
2219 break;
2220 }
2221
2222 if (!found_mapping && parsed_length < UINT16_MAX) {
2223 /* Mapping not yet present, we can wait! */
2224 if (*mp0 == NULL) {
2225 error = EWOULDBLOCK;
2226 }
2227 goto release;
2228 } else if (!found_mapping && parsed_length >= UINT16_MAX) {
2229 os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
2230 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2231 mpts->mpts_connid);
2232 /* Received 64KB without DSS-mapping. We should kill the subflow */
2233 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2234
2235 error = EIO;
2236 *mp0 = NULL;
2237 goto release;
2238 }
2239 } else {
2240 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2241 dsn = m->m_pkthdr.mp_dsn;
2242 sseq = m->m_pkthdr.mp_rseq;
2243 csum = m->m_pkthdr.mp_csum;
2244
2245 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2246 dfin = 1;
2247 dlen--;
2248 }
2249 }
2250
2251 /* Now, see if we need to remove previous packets */
2252 if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
2253 /* Ok, there is data in there that we don't need - let's throw it away! */
2254 int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);
2255
2256 sbdrop(&so->so_rcv, totrim);
2257
2258 m = so->so_rcv.sb_mb;
2259 }
2260
2261 /*
2262 * Check if the full mapping is now present
2263 */
2264 if ((int)so->so_rcv.sb_cc < dlen) {
2265 if (*mp0 == NULL) {
2266 error = EWOULDBLOCK;
2267 }
2268 goto release;
2269 }
2270
2271 /* Now, get the full mapping */
2272 off = 0;
2273 while (dlen > 0) {
2274 if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
2275 error_out = 1;
2276 error = EIO;
2277 dlen = 0;
2278 *mp0 = NULL;
2279 break;
2280 }
2281
2282 dlen -= m->m_len;
2283 off += m->m_len;
2284 sbfree(&so->so_rcv, m);
2285
2286 if (mp != NULL) {
2287 *mp = m;
2288 mp = &m->m_next;
2289 so->so_rcv.sb_mb = m = m->m_next;
2290 *mp = NULL;
2291 }
2292
2293 ASSERT(dlen == 0 || m);
2294 if (dlen != 0 && m == NULL) {
2295 /* "try" to gracefully recover on customer builds */
2296 error_out = 1;
2297 error = EIO;
2298 dlen = 0;
2299
2300 *mp0 = NULL;
2301
2302 SB_EMPTY_FIXUP(&so->so_rcv);
2303 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2304
2305 break;
2306 }
2307 }
2308
2309 VERIFY(dlen == 0);
2310
2311 if (m != NULL) {
2312 so->so_rcv.sb_lastrecord = m;
2313 } else {
2314 SB_EMPTY_FIXUP(&so->so_rcv);
2315 }
2316
2317 if (error_out) {
2318 goto release;
2319 }
2320
2321 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2322 error = EIO;
2323 *mp0 = NULL;
2324 goto release;
2325 }
2326
2327 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2328 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2329 }
2330
2331 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2332 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2333
2334 if (flagsp != NULL) {
2335 *flagsp |= flags;
2336 }
2337
2338 release:
2339 sbunlock(&so->so_rcv, TRUE);
2340
2341 return error;
2342 }
2343
2344 /*
2345 * MPTCP subflow socket send routine, derived from sosend().
2346 */
2347 static int
mptcp_subflow_sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)2348 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2349 struct mbuf *top, struct mbuf *control, int flags)
2350 {
2351 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2352 boolean_t en_tracing = FALSE, proc_held = FALSE;
2353 struct proc *p = current_proc();
2354 int en_tracing_val;
2355 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2356 int error;
2357
2358 VERIFY(control == NULL);
2359 VERIFY(addr == NULL);
2360 VERIFY(uio == NULL);
2361 VERIFY(flags == 0);
2362 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2363
2364 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2365 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2366
2367 /*
2368 * trace if tracing & network (vs. unix) sockets & and
2369 * non-loopback
2370 */
2371 if (ENTR_SHOULDTRACE &&
2372 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2373 struct inpcb *inp = sotoinpcb(so);
2374 if (inp->inp_last_outifp != NULL &&
2375 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2376 en_tracing = TRUE;
2377 en_tracing_val = top->m_pkthdr.len;
2378 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2379 (unsigned long)VM_KERNEL_ADDRPERM(so),
2380 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2381 (int64_t)en_tracing_val);
2382 }
2383 }
2384
2385 mptcp_update_last_owner(so, mp_so);
2386
2387 if (mp_so->last_pid != proc_pid(p)) {
2388 p = proc_find(mp_so->last_pid);
2389 if (p == PROC_NULL) {
2390 p = current_proc();
2391 } else {
2392 proc_held = TRUE;
2393 }
2394 }
2395
2396 #if NECP
2397 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2398 #endif /* NECP */
2399
2400 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
2401 if (error) {
2402 goto out;
2403 }
2404
2405 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2406 top = NULL;
2407
2408 out:
2409 if (top != NULL) {
2410 m_freem(top);
2411 }
2412
2413 if (proc_held) {
2414 proc_rele(p);
2415 }
2416
2417 soclearfastopen(so);
2418
2419 if (en_tracing) {
2420 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2421 (unsigned long)VM_KERNEL_ADDRPERM(so),
2422 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2423 (int64_t)en_tracing_val);
2424 }
2425
2426 return error;
2427 }
2428
2429 /*
2430 * Subflow socket write upcall.
2431 *
2432 * Called when the associated subflow socket posted a read event.
2433 */
2434 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2435 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2436 {
2437 #pragma unused(so, waitf)
2438 struct mptsub *mpts = arg;
2439 struct mptses *mpte = mpts->mpts_mpte;
2440
2441 VERIFY(mpte != NULL);
2442
2443 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2444 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2445 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2446 }
2447 return;
2448 }
2449
2450 mptcp_output(mpte);
2451 }
2452
2453 /*
2454 * Subflow socket control event upcall.
2455 */
2456 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2457 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2458 {
2459 #pragma unused(so)
2460 struct mptsub *mpts = arg;
2461 struct mptses *mpte = mpts->mpts_mpte;
2462
2463 socket_lock_assert_owned(mptetoso(mpte));
2464
2465 if ((mpts->mpts_evctl & events) == events) {
2466 return;
2467 }
2468
2469 mpts->mpts_evctl |= events;
2470
2471 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2472 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2473 return;
2474 }
2475
2476 mptcp_subflow_workloop(mpte);
2477 }
2478
2479 /*
2480 * Establish an initial MPTCP connection (if first subflow and not yet
2481 * connected), or add a subflow to an existing MPTCP connection.
2482 */
2483 int
mptcp_subflow_add(struct mptses * mpte,struct sockaddr * src,struct sockaddr * dst,uint32_t ifscope,sae_connid_t * pcid)2484 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2485 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2486 {
2487 struct socket *mp_so, *so = NULL;
2488 struct mptcb *mp_tp;
2489 struct mptsub *mpts = NULL;
2490 int af, error = 0;
2491
2492 mp_so = mptetoso(mpte);
2493 mp_tp = mpte->mpte_mptcb;
2494
2495 socket_lock_assert_owned(mp_so);
2496
2497 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2498 /* If the remote end sends Data FIN, refuse subflow adds */
2499 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2500 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2501 error = ENOTCONN;
2502 goto out_err;
2503 }
2504
2505 if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2506 error = EOVERFLOW;
2507 goto out_err;
2508 }
2509
2510 mpts = mptcp_subflow_alloc();
2511 if (mpts == NULL) {
2512 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2513 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2514 error = ENOMEM;
2515 goto out_err;
2516 }
2517
2518 if (src) {
2519 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2520 error = EAFNOSUPPORT;
2521 goto out_err;
2522 }
2523
2524 if (src->sa_family == AF_INET &&
2525 src->sa_len != sizeof(struct sockaddr_in)) {
2526 error = EINVAL;
2527 goto out_err;
2528 }
2529
2530 if (src->sa_family == AF_INET6 &&
2531 src->sa_len != sizeof(struct sockaddr_in6)) {
2532 error = EINVAL;
2533 goto out_err;
2534 }
2535
2536 mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
2537 Z_WAITOK | Z_NOFAIL);
2538
2539 bcopy(src, mpts->mpts_src, src->sa_len);
2540 }
2541
2542 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2543 error = EAFNOSUPPORT;
2544 goto out_err;
2545 }
2546
2547 if (dst->sa_family == AF_INET &&
2548 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2549 error = EINVAL;
2550 goto out_err;
2551 }
2552
2553 if (dst->sa_family == AF_INET6 &&
2554 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2555 error = EINVAL;
2556 goto out_err;
2557 }
2558
2559 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
2560
2561 af = mpts->mpts_dst.sa_family;
2562
2563 ifnet_head_lock_shared();
2564 if ((ifscope > (unsigned)if_index)) {
2565 ifnet_head_done();
2566 error = ENXIO;
2567 goto out_err;
2568 }
2569 ifnet_head_done();
2570
2571 mpts->mpts_ifscope = ifscope;
2572
2573 /* create the subflow socket */
2574 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2575 /*
2576 * Returning (error) and not cleaning up, because up to here
2577 * all we did is creating mpts.
2578 *
2579 * And the contract is that the call to mptcp_subflow_socreate,
2580 * moves ownership of mpts to mptcp_subflow_socreate.
2581 */
2582 return error;
2583 }
2584
2585 /*
2586 * We may be called from within the kernel. Still need to account this
2587 * one to the real app.
2588 */
2589 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2590
2591 /*
2592 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2593 * -1 (SAE_CONNID_ALL).
2594 */
2595 mpte->mpte_connid_last++;
2596 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2597 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2598 mpte->mpte_connid_last++;
2599 }
2600
2601 mpts->mpts_connid = mpte->mpte_connid_last;
2602
2603 mpts->mpts_rel_seq = 1;
2604
2605 /* Allocate a unique address id per subflow */
2606 mpte->mpte_addrid_last++;
2607 if (mpte->mpte_addrid_last == 0) {
2608 mpte->mpte_addrid_last++;
2609 }
2610
2611 /* register for subflow socket read/write events */
2612 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2613
2614 /* Register for subflow socket control events */
2615 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2616 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2617 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2618 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2619 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2620 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2621 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2622 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2623
2624 /* sanity check */
2625 VERIFY(!(mpts->mpts_flags &
2626 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2627
2628 /*
2629 * Indicate to the TCP subflow whether or not it should establish
2630 * the initial MPTCP connection, or join an existing one. Fill
2631 * in the connection request structure with additional info needed
2632 * by the underlying TCP (to be used in the TCP options, etc.)
2633 */
2634 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2635 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2636
2637 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2638 mptcp_init_local_parms(mpte, dst);
2639 }
2640 soisconnecting(mp_so);
2641
2642 /* If fastopen is requested, set state in mpts */
2643 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2644 mpts->mpts_flags |= MPTSF_TFO_REQD;
2645 }
2646 } else {
2647 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2648 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2649 }
2650 }
2651
2652 mpts->mpts_flags |= MPTSF_CONNECTING;
2653
2654 /* connect right away if first attempt, or if join can be done now */
2655 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2656 error = mptcp_subflow_soconnectx(mpte, mpts);
2657 }
2658
2659 if (error) {
2660 goto out_err_close;
2661 }
2662
2663 if (pcid) {
2664 *pcid = mpts->mpts_connid;
2665 }
2666
2667 return 0;
2668
2669 out_err_close:
2670 mptcp_subflow_abort(mpts, error);
2671
2672 return error;
2673
2674 out_err:
2675 if (mpts) {
2676 mptcp_subflow_free(mpts);
2677 }
2678
2679 return error;
2680 }
2681
2682 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2683 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2684 {
2685 int index = mptcpstats_get_index(stats, mpts);
2686
2687 if (index != -1) {
2688 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2689
2690 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2691 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2692
2693 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2694 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2695
2696 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2697 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2698
2699 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2700 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2701 }
2702 }
2703
2704 /*
2705 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2706 * will no longer be accessible after a subflow is deleted, thus this
2707 * should occur only after the subflow socket has been disconnected.
2708 */
2709 void
mptcp_subflow_del(struct mptses * mpte,struct mptsub * mpts)2710 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2711 {
2712 struct socket *mp_so = mptetoso(mpte);
2713 struct socket *so = mpts->mpts_socket;
2714 struct tcpcb *tp = sototcpcb(so);
2715
2716 socket_lock_assert_owned(mp_so);
2717 VERIFY(mpts->mpts_mpte == mpte);
2718 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2719 VERIFY(mpte->mpte_numflows != 0);
2720 VERIFY(mp_so->so_usecount > 0);
2721
2722 mptcpstats_update(mpte->mpte_itfstats, mpts);
2723
2724 mptcp_unset_cellicon(mpte, mpts, 1);
2725
2726 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2727 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2728
2729 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2730 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2731 mpte->mpte_numflows--;
2732 if (mpte->mpte_active_sub == mpts) {
2733 mpte->mpte_active_sub = NULL;
2734 }
2735
2736 /*
2737 * Drop references held by this subflow socket; there
2738 * will be no further upcalls made from this point.
2739 */
2740 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2741 sock_catchevents_locked(so, NULL, NULL, 0);
2742
2743 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2744
2745 mp_so->so_usecount--; /* for subflow socket */
2746 mpts->mpts_mpte = NULL;
2747 mpts->mpts_socket = NULL;
2748
2749 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2750 mptcp_subflow_remref(mpts); /* for subflow socket */
2751
2752 so->so_flags &= ~SOF_MP_SUBFLOW;
2753 tp->t_mptcb = NULL;
2754 tp->t_mpsub = NULL;
2755 }
2756
2757 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2758 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2759 {
2760 struct socket *so = mpts->mpts_socket;
2761 struct mptcb *mp_tp = mpte->mpte_mptcb;
2762 int send_dfin = 0;
2763
2764 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2765 send_dfin = 1;
2766 }
2767
2768 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2769 (so->so_state & SS_ISCONNECTED)) {
2770 if (send_dfin) {
2771 mptcp_send_dfin(so);
2772 }
2773 soshutdownlock(so, SHUT_WR);
2774 }
2775 }
2776
2777 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2778 mptcp_subflow_abort(struct mptsub *mpts, int error)
2779 {
2780 struct socket *so = mpts->mpts_socket;
2781 struct tcpcb *tp = sototcpcb(so);
2782
2783 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2784 return;
2785 }
2786
2787 if (tp->t_state != TCPS_CLOSED) {
2788 tcp_drop(tp, error);
2789 }
2790
2791 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2792 }
2793
2794 /*
2795 * Disconnect a subflow socket.
2796 */
2797 void
mptcp_subflow_disconnect(struct mptses * mpte,struct mptsub * mpts)2798 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2799 {
2800 struct socket *so, *mp_so;
2801 struct mptcb *mp_tp;
2802 int send_dfin = 0;
2803
2804 so = mpts->mpts_socket;
2805 mp_tp = mpte->mpte_mptcb;
2806 mp_so = mptetoso(mpte);
2807
2808 socket_lock_assert_owned(mp_so);
2809
2810 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2811 return;
2812 }
2813
2814 mptcp_unset_cellicon(mpte, mpts, 1);
2815
2816 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2817
2818 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2819 send_dfin = 1;
2820 }
2821
2822 if (mp_so->so_flags & SOF_DEFUNCT) {
2823 errno_t ret;
2824
2825 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2826 if (ret == 0) {
2827 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2828
2829 if (ret != 0) {
2830 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2831 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2832 }
2833 } else {
2834 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2835 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2836 }
2837 }
2838
2839 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2840 (so->so_state & SS_ISCONNECTED)) {
2841 if (send_dfin) {
2842 mptcp_send_dfin(so);
2843 }
2844
2845 (void) soshutdownlock(so, SHUT_RD);
2846 (void) soshutdownlock(so, SHUT_WR);
2847 (void) sodisconnectlocked(so);
2848 }
2849
2850 /*
2851 * Generate a disconnect event for this subflow socket, in case
2852 * the lower layer doesn't do it; this is needed because the
2853 * subflow socket deletion relies on it.
2854 */
2855 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2856 }
2857
2858 /*
2859 * Subflow socket input.
2860 */
2861 static void
mptcp_subflow_input(struct mptses * mpte,struct mptsub * mpts)2862 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2863 {
2864 struct socket *mp_so = mptetoso(mpte);
2865 struct mbuf *m = NULL;
2866 struct socket *so;
2867 int error, wakeup = 0;
2868
2869 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2870 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2871
2872 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2873 struct mptsub *, mpts);
2874
2875 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2876 goto out;
2877 }
2878
2879 so = mpts->mpts_socket;
2880
2881 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2882 if (error != 0 && error != EWOULDBLOCK) {
2883 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2884 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2885 if (error == ENODATA) {
2886 /*
2887 * Don't ignore ENODATA so as to discover
2888 * nasty middleboxes.
2889 */
2890 mp_so->so_error = ENODATA;
2891
2892 wakeup = 1;
2893 goto out;
2894 }
2895 }
2896
2897 /* In fallback, make sure to accept data on all but one subflow */
2898 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2899 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2900 m_freem(m);
2901 goto out;
2902 }
2903
2904 if (m != NULL) {
2905 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2906 mptcp_set_cellicon(mpte, mpts);
2907
2908 mpte->mpte_used_cell = 1;
2909 } else {
2910 /*
2911 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2912 * explicitly set the cellicon, then we unset it again.
2913 */
2914 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2915 mptcp_unset_cellicon(mpte, NULL, 1);
2916 }
2917
2918 mpte->mpte_used_wifi = 1;
2919 }
2920
2921 mptcp_input(mpte, m);
2922 }
2923
2924 out:
2925 if (wakeup) {
2926 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2927 }
2928
2929 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2930 }
2931
/*
 * Entry point for subflow read events: runs mptcp_subflow_input() over
 * every live subflow of the session owning `so', or defers the work by
 * setting MPP_SHOULD_RWAKEUP when upcalls cannot run right now.
 * No-op for sockets that are not MPTCP subflows.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/*
		 * Pin the subflow and its socket for the duration of the
		 * input call so neither can be torn down underneath us.
		 */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts); /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2974
2975 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)2976 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2977 {
2978 struct mbuf *so_m = so->so_snd.sb_mb;
2979 uint64_t dsn = m->m_pkthdr.mp_dsn;
2980
2981 while (so_m) {
2982 VERIFY(so_m->m_flags & M_PKTHDR);
2983 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2984
2985 /* Part of the segment is covered, don't reinject here */
2986 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2987 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2988 return TRUE;
2989 }
2990
2991 so_m = so_m->m_next;
2992 }
2993
2994 return FALSE;
2995 }
2996
2997 /*
2998 * Subflow socket output.
2999 *
3000 * Called for sending data from MPTCP to the underlying subflow socket.
3001 */
3002 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3003 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3004 {
3005 struct mptcb *mp_tp = mpte->mpte_mptcb;
3006 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3007 struct socket *mp_so, *so;
3008 struct tcpcb *tp;
3009 uint64_t mpt_dsn = 0, off = 0;
3010 int sb_cc = 0, error = 0, wakeup = 0;
3011 uint16_t dss_csum;
3012 uint16_t tot_sent = 0;
3013 boolean_t reinjected = FALSE;
3014
3015 mp_so = mptetoso(mpte);
3016 so = mpts->mpts_socket;
3017 tp = sototcpcb(so);
3018
3019 socket_lock_assert_owned(mp_so);
3020
3021 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3022 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3023
3024 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3025 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3026 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3027 (mpts->mpts_flags & MPTSF_TFO_REQD));
3028 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3029
3030 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3031 struct mptsub *, mpts);
3032
3033 /* Remove Addr Option is not sent reliably as per I-D */
3034 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3035 tp->t_rem_aid = mpte->mpte_lost_aid;
3036 tp->t_mpflags |= TMPF_SND_REM_ADDR;
3037 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3038 }
3039
3040 /*
3041 * The mbuf chains containing the metadata (as well as pointing to
3042 * the user data sitting at the MPTCP output queue) would then be
3043 * sent down to the subflow socket.
3044 *
3045 * Some notes on data sequencing:
3046 *
3047 * a. Each mbuf must be a M_PKTHDR.
3048 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3049 * in the mbuf pkthdr structure.
3050 * c. Each mbuf containing the MPTCP metadata must have its
3051 * pkt_flags marked with the PKTF_MPTCP flag.
3052 */
3053
3054 if (mpte->mpte_reinjectq) {
3055 sb_mb = mpte->mpte_reinjectq;
3056 } else {
3057 sb_mb = mp_so->so_snd.sb_mb;
3058 }
3059
3060 if (sb_mb == NULL) {
3061 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3062 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3063 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3064 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3065
3066 /* Fix it to prevent looping */
3067 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3068 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3069 }
3070 goto out;
3071 }
3072
3073 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3074
3075 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3076 !(so->so_state & SS_ISCONNECTED) &&
3077 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3078 tp->t_mpflags |= TMPF_TFO_REQUEST;
3079
3080 /* Opting to call pru_send as no mbuf at subflow level */
3081 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3082 NULL, current_proc());
3083
3084 goto done_sending;
3085 }
3086
3087 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3088
3089 /* First, drop acknowledged data */
3090 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3091 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3092 "dsn %u suna %u reinject? %u\n",
3093 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3094 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3095 if (mpte->mpte_reinjectq) {
3096 mptcp_clean_reinjectq(mpte);
3097 } else {
3098 uint64_t len = 0;
3099 len = mp_tp->mpt_snduna - mpt_dsn;
3100 sbdrop(&mp_so->so_snd, (int)len);
3101 wakeup = 1;
3102 }
3103 }
3104
3105 /* Check again because of above sbdrop */
3106 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3107 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3108 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3109 goto out;
3110 }
3111
3112 /*
3113 * In degraded mode, we don't receive data acks, so force free
3114 * mbufs less than snd_nxt
3115 */
3116 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3117 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3118 mp_so->so_snd.sb_mb) {
3119 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3120 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3121 uint64_t len = 0;
3122 len = mp_tp->mpt_snduna - mpt_dsn;
3123 sbdrop(&mp_so->so_snd, (int)len);
3124 wakeup = 1;
3125
3126 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3127 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3128 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3129 }
3130 }
3131
3132 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3133 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3134 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3135 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3136 }
3137
3138 /*
3139 * Adjust the top level notion of next byte used for retransmissions
3140 * and sending FINs.
3141 */
3142 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3143 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3144 }
3145
3146 /* Now determine the offset from which to start transmitting data */
3147 if (mpte->mpte_reinjectq) {
3148 sb_mb = mpte->mpte_reinjectq;
3149 } else {
3150 dont_reinject:
3151 sb_mb = mp_so->so_snd.sb_mb;
3152 }
3153 if (sb_mb == NULL) {
3154 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3155 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3156 goto out;
3157 }
3158
3159 if (sb_mb == mpte->mpte_reinjectq) {
3160 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3161 off = 0;
3162
3163 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3164 if (mptcp_can_send_more(mp_tp, TRUE)) {
3165 goto dont_reinject;
3166 }
3167
3168 error = ECANCELED;
3169 goto out;
3170 }
3171
3172 reinjected = TRUE;
3173 } else if (flags & MPTCP_SUBOUT_PROBING) {
3174 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3175 off = 0;
3176 } else {
3177 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3178
3179 /*
3180 * With TFO, there might be no data at all, thus still go into this
3181 * code-path here.
3182 */
3183 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3184 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3185 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3186 sb_cc -= off;
3187 } else {
3188 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3189 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3190 (uint32_t)mp_tp->mpt_sndmax);
3191
3192 goto out;
3193 }
3194 }
3195
3196 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3197 if (sb_cc <= 0) {
3198 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3199 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3200 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3201 mptcp_subflow_cwnd_space(so));
3202 }
3203
3204 sb_cc = min(sb_cc, UINT16_MAX);
3205
3206 /*
3207 * Create a DSN mapping for the data we are about to send. It all
3208 * has the same mapping.
3209 */
3210 if (reinjected) {
3211 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3212 } else {
3213 mpt_dsn = mp_tp->mpt_snduna + off;
3214 }
3215
3216 mpt_mbuf = sb_mb;
3217 while (mpt_mbuf && reinjected == FALSE &&
3218 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3219 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3220 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3221 mpt_mbuf = mpt_mbuf->m_next;
3222 }
3223 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3224
3225 head = tail = NULL;
3226
3227 while (tot_sent < sb_cc) {
3228 int32_t mlen;
3229
3230 mlen = mpt_mbuf->m_len;
3231 mlen -= off;
3232 mlen = MIN(mlen, sb_cc - tot_sent);
3233
3234 if (mlen < 0) {
3235 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3236 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3237 (uint32_t)off, sb_cc, tot_sent);
3238 goto out;
3239 }
3240
3241 if (mlen == 0) {
3242 goto next;
3243 }
3244
3245 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3246 M_COPYM_MUST_COPY_HDR);
3247 if (m == NULL) {
3248 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3249 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3250 error = ENOBUFS;
3251 break;
3252 }
3253
3254 /* Create a DSN mapping for the data (m_copym does it) */
3255 VERIFY(m->m_flags & M_PKTHDR);
3256 VERIFY(m->m_next == NULL);
3257
3258 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3259 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3260 m->m_pkthdr.mp_dsn = mpt_dsn;
3261 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3262 m->m_pkthdr.len = mlen;
3263
3264 if (head == NULL) {
3265 head = tail = m;
3266 } else {
3267 tail->m_next = m;
3268 tail = m;
3269 }
3270
3271 tot_sent += mlen;
3272 off = 0;
3273 next:
3274 mpt_mbuf = mpt_mbuf->m_next;
3275 }
3276
3277 if (reinjected) {
3278 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3279 struct mbuf *n = sb_mb;
3280
3281 while (n) {
3282 n->m_pkthdr.mp_dsn += sb_cc;
3283 n->m_pkthdr.mp_rlen -= sb_cc;
3284 n = n->m_next;
3285 }
3286 m_adj(sb_mb, sb_cc);
3287 } else {
3288 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3289 m_freem(sb_mb);
3290 }
3291 }
3292
3293 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3294 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3295 tot_sent);
3296 }
3297
3298 /* Now, let's update rel-seq and the data-level length */
3299 mpts->mpts_rel_seq += tot_sent;
3300 m = head;
3301 while (m) {
3302 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3303 m->m_pkthdr.mp_csum = dss_csum;
3304 }
3305 m->m_pkthdr.mp_rlen = tot_sent;
3306 m = m->m_next;
3307 }
3308
3309 if (head != NULL) {
3310 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3311 (tp->t_tfo_stats == 0)) {
3312 tp->t_mpflags |= TMPF_TFO_REQUEST;
3313 }
3314
3315 error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3316 head = NULL;
3317 }
3318
3319 done_sending:
3320 if (error == 0 ||
3321 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3322 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3323
3324 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3325 tcpstat.tcps_mp_num_probes++;
3326 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3327 mpts->mpts_probecnt += 1;
3328 } else {
3329 mpts->mpts_probecnt +=
3330 tot_sent / mpts->mpts_maxseg;
3331 }
3332 }
3333
3334 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3335 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3336 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3337 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3338 }
3339 mp_tp->mpt_sndnxt = new_sndnxt;
3340 }
3341
3342 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3343
3344 /* Must be here as mptcp_can_send_more() checks for this */
3345 soclearfastopen(mp_so);
3346
3347 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3348 mptcp_set_cellicon(mpte, mpts);
3349
3350 mpte->mpte_used_cell = 1;
3351 } else {
3352 /*
3353 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3354 * explicitly set the cellicon, then we unset it again.
3355 */
3356 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3357 mptcp_unset_cellicon(mpte, NULL, 1);
3358 }
3359
3360 mpte->mpte_used_wifi = 1;
3361 }
3362
3363 /*
3364 * Don't propagate EWOULDBLOCK - it's already taken care of
3365 * in mptcp_usr_send for TFO.
3366 */
3367 error = 0;
3368 } else {
3369 /* We need to revert our change to mpts_rel_seq */
3370 mpts->mpts_rel_seq -= tot_sent;
3371
3372 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3373 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3374 }
3375 out:
3376
3377 if (head != NULL) {
3378 m_freem(head);
3379 }
3380
3381 if (wakeup) {
3382 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3383 }
3384
3385 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3386 return error;
3387 }
3388
/*
 * Insert mbuf chain 'm' (a copy of un-acked data, carrying its DSN mapping
 * in the pkthdr) into the DSN-sorted reinject queue, eliminating segments
 * that are fully covered by their neighbors.  Takes ownership of 'm' on all
 * paths: it is either linked into mpte->mpte_reinjectq or freed.
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
			break;
		}

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);

			/* Unlink n and free it; n's successor becomes m's successor. */
			m->m_nextpkt = NULL;
			if (prev == NULL) {
				mpte->mpte_reinjectq = tmp;
			} else {
				prev->m_nextpkt = tmp;
			}

			m_freem(n);
			n = tmp;
		}
	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/* NOTE(review): this check reads m->m_pkthdr.len while every other
		 * coverage check in this function uses mp_rlen - correct only if
		 * len == mp_rlen for reinject chains; verify against the producer
		 * (mptcp_copy_mbuf_list).
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
			goto dont_queue;
		}
	}

	/* Link m between prev and n (n may be NULL: append at tail). */
	if (prev == NULL) {
		mpte->mpte_reinjectq = m;
	} else {
		prev->m_nextpkt = m;
	}

	m->m_nextpkt = n;

	return;

dont_queue:
	/* m added no new data - drop it */
	m_freem(m);
	return;
}
3466
3467 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3468 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3469 {
3470 struct socket *mp_so = mptetoso(mpte);
3471 struct mbuf *m;
3472
3473 m = mp_so->so_snd.sb_mb;
3474
3475 while (m) {
3476 /* If this segment covers what we are looking for, return it. */
3477 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3478 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3479 break;
3480 }
3481
3482
3483 /* Segment is no more in the queue */
3484 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3485 return NULL;
3486 }
3487
3488 m = m->m_next;
3489 }
3490
3491 return m;
3492 }
3493
/*
 * Duplicate the first 'len' bytes of the mbuf chain starting at 'm', one
 * copy per source mbuf (headers included, so the DSN mapping travels with
 * each copy).  Every source mbuf is verified to carry the same DSN mapping
 * as the first.  Copies are flagged PKTF_MPSO | PKTF_MPTCP.
 * Returns the head of the copied chain, or NULL on allocation failure
 * (any partial copy is freed).
 */
static struct mbuf *
mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
{
	struct mbuf *top = NULL, *tail = NULL;
	uint64_t dsn;
	uint32_t dlen, rseq;

	/* Mapping of the first mbuf - all others must match it (VERIFYd below) */
	dsn = m->m_pkthdr.mp_dsn;
	dlen = m->m_pkthdr.mp_rlen;
	rseq = m->m_pkthdr.mp_rseq;

	while (len > 0) {
		struct mbuf *n;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Copy this single mbuf in full, preserving the pkthdr */
		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
		if (n == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
			goto err;
		}

		VERIFY(n->m_flags & M_PKTHDR);
		VERIFY(n->m_next == NULL);
		VERIFY(n->m_pkthdr.mp_dsn == dsn);
		VERIFY(n->m_pkthdr.mp_rlen == dlen);
		VERIFY(n->m_pkthdr.mp_rseq == rseq);
		VERIFY(n->m_len == m->m_len);

		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);

		/* Append the copy to the output chain */
		if (top == NULL) {
			top = n;
		}

		if (tail != NULL) {
			tail->m_next = n;
		}

		tail = n;

		len -= m->m_len;
		m = m->m_next;
	}

	return top;

err:
	if (top) {
		m_freem(top);
	}

	return NULL;
}
3549
/*
 * Walk the subflow's send buffer and enqueue copies of every segment that
 * has not been fully acknowledged at the MPTCP data level onto the
 * connection-level reinject queue, so the data can be retransmitted on
 * another subflow.  Originals are flagged PKTF_MPTCP_REINJ so they are not
 * reinjected twice.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* 'orig' remembers the in-sendbuffer mbuf; 'm' may be redirected
		 * below to a looked-up or copied chain.
		 */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection on an earlier pass */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure - give up on the rest of the buffer */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			/* A different DSN means we reached the next record */
			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark all mbufs of this record as reinjected, too */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3620
3621 void
mptcp_clean_reinjectq(struct mptses * mpte)3622 mptcp_clean_reinjectq(struct mptses *mpte)
3623 {
3624 struct mptcb *mp_tp = mpte->mpte_mptcb;
3625
3626 socket_lock_assert_owned(mptetoso(mpte));
3627
3628 while (mpte->mpte_reinjectq) {
3629 struct mbuf *m = mpte->mpte_reinjectq;
3630
3631 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3632 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3633 break;
3634 }
3635
3636 mpte->mpte_reinjectq = m->m_nextpkt;
3637 m->m_nextpkt = NULL;
3638 m_freem(m);
3639 }
3640 }
3641
3642 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3643 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3644 uint32_t *p_mpsofilt_hint, uint32_t event)
3645 {
3646 struct socket *mp_so, *so;
3647 struct mptcb *mp_tp;
3648
3649 mp_so = mptetoso(mpte);
3650 mp_tp = mpte->mpte_mptcb;
3651 so = mpts->mpts_socket;
3652
3653 /*
3654 * We got an event for this subflow that might need to be propagated,
3655 * based on the state of the MPTCP connection.
3656 */
3657 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3658 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3659 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3660 mp_so->so_error = so->so_error;
3661 *p_mpsofilt_hint |= event;
3662 }
3663
3664 return MPTS_EVRET_OK;
3665 }
3666
3667 /*
3668 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3669 */
3670 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3671 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3672 uint32_t *p_mpsofilt_hint, uint32_t event)
3673 {
3674 struct socket *mp_so;
3675 struct tcpcb *tp;
3676
3677 mp_so = mptetoso(mpte);
3678 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3679
3680 /*
3681 * This overwrites any previous mpte_lost_aid to avoid storing
3682 * too much state when the typical case has only two subflows.
3683 */
3684 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3685 mpte->mpte_lost_aid = tp->t_local_aid;
3686
3687 /*
3688 * The subflow connection has lost its source address.
3689 */
3690 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3691
3692 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3693 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3694 }
3695
3696 return MPTS_EVRET_DELETE;
3697 }
3698
3699 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3700 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3701 uint32_t *p_mpsofilt_hint, uint32_t event)
3702 {
3703 #pragma unused(event, p_mpsofilt_hint)
3704 struct socket *so, *mp_so;
3705
3706 so = mpts->mpts_socket;
3707
3708 if (so->so_error != ENODATA) {
3709 return MPTS_EVRET_OK;
3710 }
3711
3712
3713 mp_so = mptetoso(mpte);
3714
3715 mp_so->so_error = ENODATA;
3716
3717 sorwakeup(mp_so);
3718 sowwakeup(mp_so);
3719
3720 return MPTS_EVRET_OK;
3721 }
3722
3723
3724 /*
3725 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3726 * indicates that the remote side sent a Data FIN
3727 */
3728 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3729 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3730 uint32_t *p_mpsofilt_hint, uint32_t event)
3731 {
3732 #pragma unused(event, mpts)
3733 struct mptcb *mp_tp = mpte->mpte_mptcb;
3734
3735 /*
3736 * We got a Data FIN for the MPTCP connection.
3737 * The FIN may arrive with data. The data is handed up to the
3738 * mptcp socket and the user is notified so that it may close
3739 * the socket if needed.
3740 */
3741 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3742 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3743 }
3744
3745 return MPTS_EVRET_OK; /* keep the subflow socket around */
3746 }
3747
3748 /*
3749 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3750 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct mptsub *mpts_alt = NULL;
	struct socket *alt_so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	mp_so = mptetoso(mpte);
	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

	/* Queue the failing subflow's un-acked data for resend on another path */
	mptcp_reinject_mbufs(mpts->mpts_socket);

	mpts_alt = mptcp_get_subflow(mpte, NULL);

	/* If there is no alternate eligible subflow, ignore the failover hint. */
	if (mpts_alt == NULL || mpts_alt == mpts) {
		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		goto done;
	}

	altpath_exists = 1;
	alt_so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		/* All data acknowledged and no RTT spike */
		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
			/* Alternate subflow recovered - usable again */
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
	}

	if (altpath_exists) {
		/* Switch the active subflow to the alternate path */
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;

		mpte->mpte_active_sub = mpts_alt;
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;

		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);

		mptcpstats_inc_switch(mpte, mpts);

		sowwakeup(alt_so);
	} else {
		/* Note: the 'goto done' above jumps into this else branch (legal
		 * in C) so the no-alternate case also clears the try-failover flag.
		 */
done:
		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
	}

	return MPTS_EVRET_OK;
}
3808
3809 /*
3810 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3811 */
3812 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3813 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3814 uint32_t *p_mpsofilt_hint, uint32_t event)
3815 {
3816 /*
3817 * The subflow connection cannot use the outgoing interface, let's
3818 * close this subflow.
3819 */
3820 mptcp_subflow_abort(mpts, EPERM);
3821
3822 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3823
3824 return MPTS_EVRET_DELETE;
3825 }
3826
3827 /*
3828 * https://tools.ietf.org/html/rfc6052#section-2
3829 * https://tools.ietf.org/html/rfc6147#section-5.2
3830 */
/*
 * Extract ("desynthesize") the IPv4 address embedded in a NAT64-synthesized
 * IPv6 address, given the NAT64 prefix in use (see the RFC links above).
 * Returns false when 'addr' does not start with the given prefix; on
 * success the IPv4 address is written to *addrv4 and true is returned.
 */
static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
    const struct ipv6_prefix *prefix,
    struct in_addr *addrv4)
{
	char buf[MAX_IPv4_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	const char *ptr = (const char *)addr;

	/* prefix_len is in bytes here - compare the leading prefix bytes */
	if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
		return false;
	}

	/* The four IPv4 octets sit at prefix-length-dependent offsets.  Byte 8
	 * is skipped for the 64/56/48/40-bit layouts - presumably the RFC 6052
	 * reserved "u" octet; confirm against the address-format table in the
	 * RFC linked above.
	 */
	switch (prefix->prefix_len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptrv4, ptr + 12, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptrv4, ptr + 9, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptrv4, ptr + 7, 1);
		memcpy(ptrv4 + 1, ptr + 9, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptrv4, ptr + 6, 2);
		memcpy(ptrv4 + 2, ptr + 9, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptrv4, ptr + 5, 3);
		memcpy(ptrv4 + 3, ptr + 9, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptrv4, ptr + 4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u",
		    prefix->prefix_len);
	}

	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));

	return true;
}
3877
3878 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3879 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3880 {
3881 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3882 struct socket *so = mpts->mpts_socket;
3883 struct ifnet *ifp;
3884 int j;
3885
3886 /* Subflow IPs will be steered directly by the server - no need to
3887 * desynthesize.
3888 */
3889 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3890 return;
3891 }
3892
3893 ifp = sotoinpcb(so)->inp_last_outifp;
3894
3895 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3896 return;
3897 }
3898
3899 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3900 int success;
3901
3902 if (nat64prefixes[j].prefix_len == 0) {
3903 continue;
3904 }
3905
3906 success = mptcp_desynthesize_ipv6_addr(mpte,
3907 &mpte->__mpte_dst_v6.sin6_addr,
3908 &nat64prefixes[j],
3909 &mpte->mpte_sub_dst_v4.sin_addr);
3910 if (success) {
3911 mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3912 mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3913 mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3914
3915 /*
3916 * We connected to a NAT64'ed address. Let's remove it
3917 * from the potential IPs to use. Whenever we are back on
3918 * that network and need to connect, we can synthesize again.
3919 *
3920 * Otherwise, on different IPv6 networks we will attempt
3921 * to connect to that NAT64 address...
3922 */
3923 memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3924 break;
3925 }
3926 }
3927 }
3928
3929 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)3930 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3931 {
3932 struct inpcb *inp;
3933
3934 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3935 return;
3936 }
3937
3938 inp = sotoinpcb(mpts->mpts_socket);
3939 if (inp == NULL) {
3940 return;
3941 }
3942
3943 /* Should we try the alternate port? */
3944 if (mpte->mpte_alternate_port &&
3945 inp->inp_fport != mpte->mpte_alternate_port) {
3946 union sockaddr_in_4_6 dst;
3947 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3948
3949 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3950
3951 dst_in->sin_port = mpte->mpte_alternate_port;
3952
3953 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3954 mpts->mpts_ifscope, NULL);
3955 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3956 unsigned int i;
3957
3958 if (inp->inp_last_outifp == NULL) {
3959 return;
3960 }
3961
3962 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3963 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3964
3965 if (inp->inp_last_outifp->if_index == info->ifindex) {
3966 info->no_mptcp_support = 1;
3967 break;
3968 }
3969 }
3970 }
3971 }
3972
3973 /* If TFO data is succesfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* MPTCP-level bytes outstanding vs TCP-level bytes acked on SYN
		 * (snd_una - iss - 1: the -1 discounts the SYN itself)
		 */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			/* Rewind MPTCP send-next to just past the acked part */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acked TFO bytes from the MPTCP send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
4012
4013 /*
4014 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4015 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mptcb *mp_tp;
	int af;
	boolean_t mpok = FALSE;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	tp = sototcpcb(so);
	/* NOTE(review): 'af' is set but not read in this function - possibly
	 * leftover from an earlier version; verify before removing.
	 */
	af = mpts->mpts_dst.sa_family;

	/* Event already processed for this subflow */
	if (mpts->mpts_flags & MPTSF_CONNECTED) {
		return MPTS_EVRET_OK;
	}

	/* Subflow is already on its way down - nothing to do */
	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		return MPTS_EVRET_OK;
	}

	/*
	 * The subflow connection has been connected. Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
	 *
	 * a. If MPTCP connection is not yet established, then this must be
	 *    the first subflow connection. If MPTCP failed to negotiate,
	 *    fallback to regular TCP by degrading this subflow.
	 *
	 * b. If MPTCP connection has been established, then this must be
	 *    one of the subsequent subflow connections. If MPTCP failed
	 *    to negotiate, disconnect the connection.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		return MPTS_EVRET_OK;
	}

	/* Data sent on the SYN (TFO) has been acked - rewind bookkeeping */
	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
		mptcp_drop_tfo_data(mpte, mpts);
	}

	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
	mpts->mpts_flags |= MPTSF_CONNECTED;

	/* TMPF_MPTCP_TRUE means the peer completed MPTCP negotiation */
	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	}

	tp->t_mpflags &= ~TMPF_TFO_REQUEST;

	/* get/verify the outbound interface */
	inp = sotoinpcb(so);

	mpts->mpts_maxseg = tp->t_maxseg;

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* First subflow: this connect establishes the MPTCP connection */
		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
		mpte->mpte_associd = mpts->mpts_connid;
		DTRACE_MPTCP2(state__change,
		    struct mptcb *, mp_tp,
		    uint32_t, 0 /* event */);

		/* Record the local address the first subflow bound to */
		if (SOCK_DOM(so) == AF_INET) {
			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
		} else {
			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
		}

		mpts->mpts_flags |= MPTSF_ACTIVE;

		/* case (a) above */
		if (!mpok) {
			/* Negotiation failed - fall back to plain TCP */
			tcpstat.tcps_mpcap_fallback++;

			tp->t_mpflags |= TMPF_INFIN_SENT;
			mptcp_notify_mpfail(so);
		} else {
			/* Cellular first subflows may be demoted to backup priority */
			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
			    mptcp_subflows_need_backup_flag(mpte)) {
				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			} else {
				mpts->mpts_flags |= MPTSF_PREFERRED;
			}
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;

			/* Recover the IPv4 destination if this was NAT64 */
			if (SOCK_DOM(so) == AF_INET6) {
				mptcp_handle_ipv6_connection(mpte, mpts);
			}

			mptcp_check_subflows_and_add(mpte);

			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
				mpte->mpte_initial_cell = 1;
			}

			mpte->mpte_handshake_success = 1;
		}

		/* Seed the MPTCP-level send window from the subflow's */
		mp_tp->mpt_sndwnd = tp->snd_wnd;
		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
		soisconnected(mp_so);
	} else if (mpok) {
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
		    mptcp_subflows_need_backup_flag(mpte)) {
			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			mpts->mpts_flags &= ~MPTSF_PREFERRED;
		} else {
			mpts->mpts_flags |= MPTSF_PREFERRED;
		}

		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;

		mpts->mpts_rel_seq = 1;

		mptcp_check_subflows_and_remove(mpte);
	} else {
		/* Join failed - maybe the alternate port works */
		mptcp_try_alternate_port(mpte, mpts);

		tcpstat.tcps_join_fallback++;
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			tcpstat.tcps_mptcp_cell_proxy++;
		} else {
			tcpstat.tcps_mptcp_wifi_proxy++;
		}

		/* Force a reset of this non-MPTCP join subflow */
		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

		return MPTS_EVRET_OK;
	}

	/* This call, just to "book" an entry in the stats-table for this ifindex */
	mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	mptcp_output(mpte);

	return MPTS_EVRET_OK; /* keep the subflow socket around */
}
4183
4184 /*
4185 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4186 */
4187 static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4188 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4189 uint32_t *p_mpsofilt_hint, uint32_t event)
4190 {
4191 #pragma unused(event, p_mpsofilt_hint)
4192 struct socket *mp_so, *so;
4193 struct mptcb *mp_tp;
4194
4195 mp_so = mptetoso(mpte);
4196 mp_tp = mpte->mpte_mptcb;
4197 so = mpts->mpts_socket;
4198
4199 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4200 return MPTS_EVRET_DELETE;
4201 }
4202
4203 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4204
4205 /* The subflow connection has been disconnected. */
4206
4207 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4208 mpte->mpte_nummpcapflows--;
4209 if (mpte->mpte_active_sub == mpts) {
4210 mpte->mpte_active_sub = NULL;
4211 }
4212 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4213 } else {
4214 if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
4215 !(mpts->mpts_flags & MPTSF_CONNECTED)) {
4216 mptcp_try_alternate_port(mpte, mpts);
4217 }
4218 }
4219
4220 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4221 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4222 mptcp_drop(mpte, mp_tp, so->so_error);
4223 }
4224
4225 /*
4226 * Clear flags that are used by getconninfo to return state.
4227 * Retain like MPTSF_DELETEOK for internal purposes.
4228 */
4229 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4230 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4231 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4232
4233 return MPTS_EVRET_DELETE;
4234 }
4235
4236 /*
4237 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4238 */
4239 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4240 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4241 uint32_t *p_mpsofilt_hint, uint32_t event)
4242 {
4243 #pragma unused(event, p_mpsofilt_hint)
4244 ev_ret_t ret = MPTS_EVRET_OK;
4245 struct socket *mp_so, *so;
4246 struct mptcb *mp_tp;
4247
4248 mp_so = mptetoso(mpte);
4249 mp_tp = mpte->mpte_mptcb;
4250 so = mpts->mpts_socket;
4251 struct inpcb *inp = sotoinpcb(so);
4252 struct tcpcb *tp = intotcpcb(inp);
4253
4254 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4255 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4256 } else {
4257 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4258 }
4259
4260 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4261 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4262 goto done;
4263 }
4264 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4265 } else {
4266 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4267 }
4268
4269 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4270 mpts->mpts_flags |= MPTSF_MP_READY;
4271 } else {
4272 mpts->mpts_flags &= ~MPTSF_MP_READY;
4273 }
4274
4275 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4276 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4277 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4278 tcp_cache_update_mptcp_version(tp, FALSE);
4279 }
4280
4281 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4282 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4283
4284 m_freem_list(mpte->mpte_reinjectq);
4285 mpte->mpte_reinjectq = NULL;
4286 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4287 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4288 ret = MPTS_EVRET_CONNECT_PENDING;
4289 }
4290
4291 done:
4292 return ret;
4293 }
4294
4295 /*
4296 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4297 */
4298 static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4299 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4300 uint32_t *p_mpsofilt_hint, uint32_t event)
4301 {
4302 #pragma unused(event)
4303 struct socket *mp_so, *so;
4304 struct mptcb *mp_tp;
4305 boolean_t is_fastclose;
4306
4307 mp_so = mptetoso(mpte);
4308 mp_tp = mpte->mpte_mptcb;
4309 so = mpts->mpts_socket;
4310
4311 /* We got an invalid option or a fast close */
4312 struct inpcb *inp = sotoinpcb(so);
4313 struct tcpcb *tp = NULL;
4314
4315 tp = intotcpcb(inp);
4316 so->so_error = ECONNABORTED;
4317
4318 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4319
4320 tp->t_mpflags |= TMPF_RESET;
4321
4322 if (tp->t_state != TCPS_CLOSED) {
4323 struct tcptemp *t_template = tcp_maketemplate(tp);
4324
4325 if (t_template) {
4326 struct tcp_respond_args tra;
4327
4328 bzero(&tra, sizeof(tra));
4329 if (inp->inp_flags & INP_BOUND_IF) {
4330 tra.ifscope = inp->inp_boundifp->if_index;
4331 } else {
4332 tra.ifscope = IFSCOPE_NONE;
4333 }
4334 tra.awdl_unrestricted = 1;
4335
4336 tcp_respond(tp, t_template->tt_ipgen,
4337 &t_template->tt_t, (struct mbuf *)NULL,
4338 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4339 (void) m_free(dtom(t_template));
4340 }
4341 }
4342
4343 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4344 struct mptsub *iter, *tmp;
4345
4346 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4347
4348 mp_so->so_error = ECONNRESET;
4349
4350 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4351 if (iter == mpts) {
4352 continue;
4353 }
4354 mptcp_subflow_abort(iter, ECONNABORTED);
4355 }
4356
4357 /*
4358 * mptcp_drop is being called after processing the events, to fully
4359 * close the MPTCP connection
4360 */
4361 mptcp_drop(mpte, mp_tp, mp_so->so_error);
4362 }
4363
4364 mptcp_subflow_abort(mpts, ECONNABORTED);
4365
4366 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4367 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4368 }
4369
4370 return MPTS_EVRET_DELETE;
4371 }
4372
4373 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4374 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4375 uint32_t *p_mpsofilt_hint, uint32_t event)
4376 {
4377 #pragma unused(event)
4378 bool found_active = false;
4379
4380 mpts->mpts_flags |= MPTSF_READ_STALL;
4381
4382 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4383 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4384
4385 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4386 TCPS_HAVERCVDFIN2(tp->t_state)) {
4387 continue;
4388 }
4389
4390 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4391 found_active = true;
4392 break;
4393 }
4394 }
4395
4396 if (!found_active) {
4397 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4398 }
4399
4400 return MPTS_EVRET_OK;
4401 }
4402
4403 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4404 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4405 uint32_t *p_mpsofilt_hint, uint32_t event)
4406 {
4407 #pragma unused(event)
4408 bool found_active = false;
4409
4410 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4411
4412 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4413 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4414
4415 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4416 tp->t_state > TCPS_CLOSE_WAIT) {
4417 continue;
4418 }
4419
4420 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4421 found_active = true;
4422 break;
4423 }
4424 }
4425
4426 if (!found_active) {
4427 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4428 }
4429
4430 return MPTS_EVRET_OK;
4431 }
4432
4433 /*
4434 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4435 * caller must ensure that the option can be issued on subflow sockets, via
4436 * MPOF_SUBFLOW_OK flag.
4437 */
4438 int
mptcp_subflow_sosetopt(struct mptses * mpte,struct mptsub * mpts,struct mptopt * mpo)4439 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4440 {
4441 struct socket *mp_so, *so;
4442 struct sockopt sopt;
4443 int error;
4444
4445 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4446
4447 mp_so = mptetoso(mpte);
4448 so = mpts->mpts_socket;
4449
4450 socket_lock_assert_owned(mp_so);
4451
4452 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4453 mpo->mpo_level == SOL_SOCKET &&
4454 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4455 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4456
4457 /*
4458 * When we open a new subflow, mark it as cell fallback, if
4459 * this subflow goes over cell.
4460 *
4461 * (except for first-party apps)
4462 */
4463
4464 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4465 return 0;
4466 }
4467
4468 if (sotoinpcb(so)->inp_last_outifp &&
4469 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4470 return 0;
4471 }
4472
4473 /*
4474 * This here is an OR, because if the app is not binding to the
4475 * interface, then it definitely is not a cell-fallback
4476 * connection.
4477 */
4478 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4479 !IFNET_IS_CELLULAR(ifp)) {
4480 return 0;
4481 }
4482 }
4483
4484 mpo->mpo_flags &= ~MPOF_INTERIM;
4485
4486 bzero(&sopt, sizeof(sopt));
4487 sopt.sopt_dir = SOPT_SET;
4488 sopt.sopt_level = mpo->mpo_level;
4489 sopt.sopt_name = mpo->mpo_name;
4490 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4491 sopt.sopt_valsize = sizeof(int);
4492 sopt.sopt_p = kernproc;
4493
4494 error = sosetoptlock(so, &sopt, 0);
4495 if (error) {
4496 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4497 "val %d set error %d\n", __func__,
4498 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4499 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4500 mpo->mpo_intval, error);
4501 }
4502 return error;
4503 }
4504
4505 /*
4506 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4507 * caller must ensure that the option can be issued on subflow sockets, via
4508 * MPOF_SUBFLOW_OK flag.
4509 */
4510 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4511 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4512 struct mptopt *mpo)
4513 {
4514 struct socket *mp_so;
4515 struct sockopt sopt;
4516 int error;
4517
4518 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4519 mp_so = mptetoso(mpte);
4520
4521 socket_lock_assert_owned(mp_so);
4522
4523 bzero(&sopt, sizeof(sopt));
4524 sopt.sopt_dir = SOPT_GET;
4525 sopt.sopt_level = mpo->mpo_level;
4526 sopt.sopt_name = mpo->mpo_name;
4527 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4528 sopt.sopt_valsize = sizeof(int);
4529 sopt.sopt_p = kernproc;
4530
4531 error = sogetoptlock(so, &sopt, 0); /* already locked */
4532 if (error) {
4533 os_log_error(mptcp_log_handle,
4534 "%s - %lx: sopt %s get error %d\n",
4535 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4536 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4537 }
4538 return error;
4539 }
4540
4541
4542 /*
4543 * MPTCP garbage collector.
4544 *
4545 * This routine is called by the MP domain on-demand, periodic callout,
4546 * which is triggered when a MPTCP socket is closed. The callout will
4547 * repeat as long as this routine returns a non-zero value.
4548 */
4549 static uint32_t
mptcp_gc(struct mppcbinfo * mppi)4550 mptcp_gc(struct mppcbinfo *mppi)
4551 {
4552 struct mppcb *mpp, *tmpp;
4553 uint32_t active = 0;
4554
4555 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4556
4557 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4558 struct socket *mp_so;
4559 struct mptses *mpte;
4560 struct mptcb *mp_tp;
4561
4562 mp_so = mpp->mpp_socket;
4563 mpte = mptompte(mpp);
4564 mp_tp = mpte->mpte_mptcb;
4565
4566 if (!mpp_try_lock(mpp)) {
4567 active++;
4568 continue;
4569 }
4570
4571 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4572
4573 /* check again under the lock */
4574 if (mp_so->so_usecount > 0) {
4575 boolean_t wakeup = FALSE;
4576 struct mptsub *mpts, *tmpts;
4577
4578 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4579 if (mp_tp->mpt_gc_ticks > 0) {
4580 mp_tp->mpt_gc_ticks--;
4581 }
4582 if (mp_tp->mpt_gc_ticks == 0) {
4583 wakeup = TRUE;
4584 }
4585 }
4586 if (wakeup) {
4587 TAILQ_FOREACH_SAFE(mpts,
4588 &mpte->mpte_subflows, mpts_entry, tmpts) {
4589 mptcp_subflow_eupcall1(mpts->mpts_socket,
4590 mpts, SO_FILT_HINT_DISCONNECTED);
4591 }
4592 }
4593 socket_unlock(mp_so, 0);
4594 active++;
4595 continue;
4596 }
4597
4598 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4599 panic("%s - %lx: skipped state "
4600 "[u=%d,r=%d,s=%d]\n", __func__,
4601 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4602 mp_so->so_usecount, mp_so->so_retaincnt,
4603 mpp->mpp_state);
4604 }
4605
4606 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4607 mptcp_close(mpte, mp_tp);
4608 }
4609
4610 mptcp_session_destroy(mpte);
4611
4612 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4613 struct sockbuf *, &mp_so->so_rcv,
4614 struct sockbuf *, &mp_so->so_snd,
4615 struct mppcb *, mpp);
4616
4617 mptcp_pcbdispose(mpp);
4618 sodealloc(mp_so);
4619 }
4620
4621 return active;
4622 }
4623
4624 /*
4625 * Drop a MPTCP connection, reporting the specified error.
4626 */
4627 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4628 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4629 {
4630 struct socket *mp_so = mptetoso(mpte);
4631
4632 VERIFY(mpte->mpte_mptcb == mp_tp);
4633
4634 socket_lock_assert_owned(mp_so);
4635
4636 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4637 uint32_t, 0 /* event */);
4638
4639 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4640 errno = mp_tp->mpt_softerror;
4641 }
4642 mp_so->so_error = errno;
4643
4644 return mptcp_close(mpte, mp_tp);
4645 }
4646
4647 /*
4648 * Close a MPTCP control block.
4649 */
4650 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4651 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4652 {
4653 struct mptsub *mpts = NULL, *tmpts = NULL;
4654 struct socket *mp_so = mptetoso(mpte);
4655
4656 socket_lock_assert_owned(mp_so);
4657 VERIFY(mpte->mpte_mptcb == mp_tp);
4658
4659 mp_tp->mpt_state = MPTCPS_TERMINATE;
4660
4661 mptcp_freeq(mp_tp);
4662
4663 soisdisconnected(mp_so);
4664
4665 /* Clean up all subflows */
4666 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4667 mptcp_subflow_disconnect(mpte, mpts);
4668 }
4669
4670 return NULL;
4671 }
4672
/* Post a DISCONNECTED event on the given socket. */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
4678
/*
 * Maps one SO_FILT_HINT_* event bit to its handler.  mptcp_subflow_events()
 * walks an array of these in order, handing each pending hint to its
 * handler along with the hint bit itself as 'event'.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t sofilt_hint_mask;
	ev_ret_t (*sofilt_hint_ev_hdlr)(
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;

/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4750
4751 /*
4752 * Subflow socket control events.
4753 *
4754 * Called for handling events related to the underlying subflow socket.
4755 */
4756 static ev_ret_t
mptcp_subflow_events(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint)4757 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
4758 uint32_t *p_mpsofilt_hint)
4759 {
4760 ev_ret_t ret = MPTS_EVRET_OK;
4761 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
4762 sizeof(mpsub_ev_entry_tbl[0]);
4763
4764 /* bail if there's nothing to process */
4765 if (!mpts->mpts_evctl) {
4766 return ret;
4767 }
4768
4769 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
4770 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
4771 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
4772 SO_FILT_HINT_DISCONNECTED)) {
4773 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
4774 }
4775
4776 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
4777 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
4778
4779 /*
4780 * Process all the socket filter hints and reset the hint
4781 * once it is handled
4782 */
4783 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
4784 /*
4785 * Always execute the DISCONNECTED event, because it will wakeup
4786 * the app.
4787 */
4788 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
4789 (ret >= MPTS_EVRET_OK ||
4790 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
4791 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
4792 ev_ret_t error =
4793 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
4794 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
4795 }
4796 }
4797
4798 return ret;
4799 }
4800
4801 /*
4802 * MPTCP workloop.
4803 */
4804 void
mptcp_subflow_workloop(struct mptses * mpte)4805 mptcp_subflow_workloop(struct mptses *mpte)
4806 {
4807 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4808 uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4809 struct mptsub *mpts, *tmpts;
4810 struct socket *mp_so;
4811
4812 mp_so = mptetoso(mpte);
4813
4814 socket_lock_assert_owned(mp_so);
4815
4816 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4817 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4818 return;
4819 }
4820 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4821
4822 relaunch:
4823 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4824
4825 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4826 ev_ret_t ret;
4827
4828 if (mpts->mpts_socket->so_usecount == 0) {
4829 /* Will be removed soon by tcp_garbage_collect */
4830 continue;
4831 }
4832
4833 mptcp_subflow_addref(mpts);
4834 mpts->mpts_socket->so_usecount++;
4835
4836 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4837
4838 /*
4839 * If MPTCP socket is closed, disconnect all subflows.
4840 * This will generate a disconnect event which will
4841 * be handled during the next iteration, causing a
4842 * non-zero error to be returned above.
4843 */
4844 if (mp_so->so_flags & SOF_PCBCLEARING) {
4845 mptcp_subflow_disconnect(mpte, mpts);
4846 }
4847
4848 switch (ret) {
4849 case MPTS_EVRET_OK:
4850 /* nothing to do */
4851 break;
4852 case MPTS_EVRET_DELETE:
4853 mptcp_subflow_soclose(mpts);
4854 break;
4855 case MPTS_EVRET_CONNECT_PENDING:
4856 connect_pending = TRUE;
4857 break;
4858 case MPTS_EVRET_DISCONNECT_FALLBACK:
4859 disconnect_fallback = TRUE;
4860 break;
4861 default:
4862 break;
4863 }
4864 mptcp_subflow_remref(mpts); /* ours */
4865
4866 VERIFY(mpts->mpts_socket->so_usecount != 0);
4867 mpts->mpts_socket->so_usecount--;
4868 }
4869
4870 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4871 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4872
4873 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4874 mp_so->so_state |= SS_CANTRCVMORE;
4875 sorwakeup(mp_so);
4876 }
4877
4878 soevent(mp_so, mpsofilt_hint_mask);
4879 }
4880
4881 if (!connect_pending && !disconnect_fallback) {
4882 goto exit;
4883 }
4884
4885 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4886 if (disconnect_fallback) {
4887 struct socket *so = NULL;
4888 struct inpcb *inp = NULL;
4889 struct tcpcb *tp = NULL;
4890
4891 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4892 continue;
4893 }
4894
4895 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4896
4897 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4898 MPTSF_DISCONNECTED)) {
4899 continue;
4900 }
4901
4902 so = mpts->mpts_socket;
4903
4904 /*
4905 * The MPTCP connection has degraded to a fallback
4906 * mode, so there is no point in keeping this subflow
4907 * regardless of its MPTCP-readiness state, unless it
4908 * is the primary one which we use for fallback. This
4909 * assumes that the subflow used for fallback is the
4910 * ACTIVE one.
4911 */
4912
4913 inp = sotoinpcb(so);
4914 tp = intotcpcb(inp);
4915 tp->t_mpflags &=
4916 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4917 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4918
4919 soevent(so, SO_FILT_HINT_MUSTRST);
4920 } else if (connect_pending) {
4921 /*
4922 * The MPTCP connection has progressed to a state
4923 * where it supports full multipath semantics; allow
4924 * additional joins to be attempted for all subflows
4925 * that are in the PENDING state.
4926 */
4927 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4928 int error = mptcp_subflow_soconnectx(mpte, mpts);
4929
4930 if (error) {
4931 mptcp_subflow_abort(mpts, error);
4932 }
4933 }
4934 }
4935 }
4936
4937 exit:
4938 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4939 goto relaunch;
4940 }
4941
4942 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4943 }
4944
4945 /*
4946 * Protocol pr_lock callback.
4947 */
4948 int
mptcp_lock(struct socket * mp_so,int refcount,void * lr)4949 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4950 {
4951 struct mppcb *mpp = mpsotomppcb(mp_so);
4952 void *lr_saved;
4953
4954 if (lr == NULL) {
4955 lr_saved = __builtin_return_address(0);
4956 } else {
4957 lr_saved = lr;
4958 }
4959
4960 if (mpp == NULL) {
4961 panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4962 mp_so, lr_saved, solockhistory_nr(mp_so));
4963 /* NOTREACHED */
4964 }
4965 mpp_lock(mpp);
4966
4967 if (mp_so->so_usecount < 0) {
4968 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4969 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4970 solockhistory_nr(mp_so));
4971 /* NOTREACHED */
4972 }
4973 if (refcount != 0) {
4974 mp_so->so_usecount++;
4975 mpp->mpp_inside++;
4976 }
4977 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4978 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4979
4980 return 0;
4981 }
4982
4983 /*
4984 * Protocol pr_unlock callback.
4985 */
4986 int
mptcp_unlock(struct socket * mp_so,int refcount,void * lr)4987 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4988 {
4989 struct mppcb *mpp = mpsotomppcb(mp_so);
4990 void *lr_saved;
4991
4992 if (lr == NULL) {
4993 lr_saved = __builtin_return_address(0);
4994 } else {
4995 lr_saved = lr;
4996 }
4997
4998 if (mpp == NULL) {
4999 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
5000 mp_so, mp_so->so_usecount, lr_saved,
5001 solockhistory_nr(mp_so));
5002 /* NOTREACHED */
5003 }
5004 socket_lock_assert_owned(mp_so);
5005
5006 if (refcount != 0) {
5007 mp_so->so_usecount--;
5008 mpp->mpp_inside--;
5009 }
5010
5011 if (mp_so->so_usecount < 0) {
5012 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5013 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5014 /* NOTREACHED */
5015 }
5016 if (mpp->mpp_inside < 0) {
5017 panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5018 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5019 /* NOTREACHED */
5020 }
5021 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5022 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5023 mpp_unlock(mpp);
5024
5025 return 0;
5026 }
5027
5028 /*
5029 * Protocol pr_getlock callback.
5030 */
5031 lck_mtx_t *
mptcp_getlock(struct socket * mp_so,int flags)5032 mptcp_getlock(struct socket *mp_so, int flags)
5033 {
5034 struct mppcb *mpp = mpsotomppcb(mp_so);
5035
5036 if (mpp == NULL) {
5037 panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
5038 solockhistory_nr(mp_so));
5039 /* NOTREACHED */
5040 }
5041 if (mp_so->so_usecount < 0) {
5042 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5043 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5044 /* NOTREACHED */
5045 }
5046 return mpp_getlock(mpp, flags);
5047 }
5048
5049 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5050 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5051 u_int32_t *rrand)
5052 {
5053 struct mptcp_subf_auth_entry *sauth_entry;
5054
5055 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5056 if (sauth_entry->msae_laddr_id == addr_id) {
5057 if (lrand) {
5058 *lrand = sauth_entry->msae_laddr_rand;
5059 }
5060 if (rrand) {
5061 *rrand = sauth_entry->msae_raddr_rand;
5062 }
5063 break;
5064 }
5065 }
5066 }
5067
/*
 * Record the peer's address id and random number on the subflow auth
 * entry matching 'laddr_id'.  Bails out (with a log) when a different
 * value was already recorded, e.g. on a duplicate SYN/ACK.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* Refuse to overwrite a different remote address id. */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_id, sauth_entry->msae_raddr_id);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* Refuse to overwrite a different remote random value. */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				os_log_error(mptcp_log_handle, "%s - %lx: "
				    "dup SYN_ACK %d %d \n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_rand, sauth_entry->msae_raddr_rand);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
5097
5098 /*
5099 * SHA-256 support for MPTCP
5100 */
5101
5102 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5103 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5104 {
5105 const unsigned char *sha2_base;
5106 int sha2_size;
5107
5108 sha2_base = (const unsigned char *) key;
5109 sha2_size = sizeof(mptcp_key_t);
5110
5111 SHA256_CTX sha_ctx;
5112 SHA256_Init(&sha_ctx);
5113 SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5114 SHA256_Final(sha_digest, &sha_ctx);
5115 }
5116
5117 void
mptcp_hmac_sha256(mptcp_key_t key1,mptcp_key_t key2,u_char * msg,uint16_t msg_len,u_char * digest)5118 mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
5119 u_char *msg, uint16_t msg_len, u_char *digest)
5120 {
5121 SHA256_CTX sha_ctx;
5122 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5123 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5124 int i;
5125
5126 bzero(digest, SHA256_DIGEST_LENGTH);
5127
5128 /* Set up the Key for HMAC */
5129 key_ipad[0] = key1;
5130 key_ipad[1] = key2;
5131
5132 key_opad[0] = key1;
5133 key_opad[1] = key2;
5134
5135 /* Key is 512 block length, so no need to compute hash */
5136
5137 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5138
5139 for (i = 0; i < 8; i++) {
5140 key_ipad[i] ^= 0x3636363636363636;
5141 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5142 }
5143
5144 /* Perform inner SHA256 */
5145 SHA256_Init(&sha_ctx);
5146 SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
5147 SHA256_Update(&sha_ctx, msg, msg_len);
5148 SHA256_Final(digest, &sha_ctx);
5149
5150 /* Perform outer SHA256 */
5151 SHA256_Init(&sha_ctx);
5152 SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
5153 SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
5154 SHA256_Final(digest, &sha_ctx);
5155 }
5156
5157 /*
5158 * SHA1 support for MPTCP
5159 */
5160
5161 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5162 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5163 {
5164 SHA1_CTX sha1ctxt;
5165 const unsigned char *sha1_base;
5166 int sha1_size;
5167
5168 sha1_base = (const unsigned char *) key;
5169 sha1_size = sizeof(mptcp_key_t);
5170 SHA1Init(&sha1ctxt);
5171 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5172 SHA1Final(sha_digest, &sha1ctxt);
5173 }
5174
5175 void
mptcp_hmac_sha1(mptcp_key_t key1,mptcp_key_t key2,u_int32_t rand1,u_int32_t rand2,u_char * digest)5176 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5177 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5178 {
5179 SHA1_CTX sha1ctxt;
5180 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5181 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5182 u_int32_t data[2];
5183 int i;
5184
5185 bzero(digest, SHA1_RESULTLEN);
5186
5187 /* Set up the Key for HMAC */
5188 key_ipad[0] = key1;
5189 key_ipad[1] = key2;
5190
5191 key_opad[0] = key1;
5192 key_opad[1] = key2;
5193
5194 /* Set up the message for HMAC */
5195 data[0] = rand1;
5196 data[1] = rand2;
5197
5198 /* Key is 512 block length, so no need to compute hash */
5199
5200 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5201
5202 for (i = 0; i < 8; i++) {
5203 key_ipad[i] ^= 0x3636363636363636;
5204 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5205 }
5206
5207 /* Perform inner SHA1 */
5208 SHA1Init(&sha1ctxt);
5209 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5210 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5211 SHA1Final(digest, &sha1ctxt);
5212
5213 /* Perform outer SHA1 */
5214 SHA1Init(&sha1ctxt);
5215 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5216 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5217 SHA1Final(digest, &sha1ctxt);
5218 }
5219
/*
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 */
void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
{
	uint32_t lrand, rrand;

	lrand = rrand = 0;
	/* Look up the local/remote randoms exchanged on the subflow 'aid' */
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);

	/* Scratch buffer sized for the larger of the two digest algorithms */
	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		/* MPTCP v0 authenticates MP_JOIN with HMAC-SHA1 */
		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
	} else {
		/* MPTCP v1 uses HMAC-SHA256 over the 8-byte R-A || R-B message */
		uint32_t data[2];
		data[0] = lrand;
		data[1] = rrand;
		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
	}
	/* Caller may request a truncated prefix of the MAC (e.g. 64 bits) */
	bcopy(full_digest, digest, digest_len);
}
5243
5244 /*
5245 * Authentication data generation
5246 */
5247 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5248 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5249 int token_len)
5250 {
5251 VERIFY(token_len == sizeof(u_int32_t));
5252 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5253 sha_digest_len == SHA256_DIGEST_LENGTH);
5254
5255 /* Most significant 32 bits of the SHA1/SHA256 hash */
5256 bcopy(sha_digest, token, sizeof(u_int32_t));
5257 return;
5258 }
5259
5260 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5261 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5262 int idsn_len, uint8_t mp_version)
5263 {
5264 VERIFY(idsn_len == sizeof(u_int64_t));
5265 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5266 sha_digest_len == SHA256_DIGEST_LENGTH);
5267 VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5268
5269 /*
5270 * Least significant 64 bits of the hash
5271 */
5272
5273 if (mp_version == MPTCP_VERSION_0) {
5274 idsn[7] = sha_digest[12];
5275 idsn[6] = sha_digest[13];
5276 idsn[5] = sha_digest[14];
5277 idsn[4] = sha_digest[15];
5278 idsn[3] = sha_digest[16];
5279 idsn[2] = sha_digest[17];
5280 idsn[1] = sha_digest[18];
5281 idsn[0] = sha_digest[19];
5282 } else {
5283 idsn[7] = sha_digest[24];
5284 idsn[6] = sha_digest[25];
5285 idsn[5] = sha_digest[26];
5286 idsn[4] = sha_digest[27];
5287 idsn[3] = sha_digest[28];
5288 idsn[2] = sha_digest[29];
5289 idsn[1] = sha_digest[30];
5290 idsn[0] = sha_digest[31];
5291 }
5292 return;
5293 }
5294
5295 static void
mptcp_conn_properties(struct mptcb * mp_tp)5296 mptcp_conn_properties(struct mptcb *mp_tp)
5297 {
5298 /* Set DSS checksum flag */
5299 if (mptcp_dss_csum) {
5300 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5301 }
5302
5303 /* Set up receive window */
5304 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5305
5306 /* Set up gc ticks */
5307 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5308 }
5309
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/*
	 * Pick the MPTCP version: explicit per-session overrides and the
	 * mptcp_enable_v1 sysctl win; otherwise fall back to the cached
	 * per-destination preference.
	 */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/*
	 * The local key is random; token and IDSN are derived from its
	 * hash (SHA1 for v0, SHA256 for v1).
	 */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5346
5347 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5348 mptcp_init_remote_parms(struct mptcb *mp_tp)
5349 {
5350 /* Setup local and remote tokens and Initial DSNs */
5351 char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5352 uint16_t digest_len;
5353
5354 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5355 digest_len = SHA1_RESULTLEN;
5356 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5357 } else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5358 digest_len = SHA256_DIGEST_LENGTH;
5359 mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5360 } else {
5361 return -1;
5362 }
5363
5364 mptcp_generate_token(remote_digest, digest_len,
5365 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5366 mptcp_generate_idsn(remote_digest, digest_len,
5367 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5368 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5369 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5370 return 0;
5371 }
5372
5373 static void
mptcp_send_dfin(struct socket * so)5374 mptcp_send_dfin(struct socket *so)
5375 {
5376 struct tcpcb *tp = NULL;
5377 struct inpcb *inp = NULL;
5378
5379 inp = sotoinpcb(so);
5380 if (!inp) {
5381 return;
5382 }
5383
5384 tp = intotcpcb(inp);
5385 if (!tp) {
5386 return;
5387 }
5388
5389 if (!(tp->t_mpflags & TMPF_RESET)) {
5390 tp->t_mpflags |= TMPF_SEND_DFIN;
5391 }
5392 }
5393
5394 /*
5395 * Data Sequence Mapping routines
5396 */
5397 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5398 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5399 {
5400 struct mptcb *mp_tp;
5401
5402 if (m == NULL) {
5403 return;
5404 }
5405
5406 mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5407
5408 while (m) {
5409 VERIFY(m->m_flags & M_PKTHDR);
5410 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5411 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5412 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5413 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5414 mp_tp->mpt_sndmax += m_pktlen(m);
5415 m = m->m_next;
5416 }
5417 }
5418
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	/*
	 * After a fallback to plain TCP, infer the MPTCP-level DATA_ACK
	 * from the 'len' subflow bytes being dropped from the send buffer
	 * starting at mbuf 'm', and feed it to mptcp_data_ack_rcvd().
	 */
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the acked mbufs; optimistically assume each traversed
	 * mapping is acked in full (dsn + rlen).
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5476
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	/*
	 * Before 'len' bytes are dropped from the send buffer, advance the
	 * DSN mappings stored in the mbuf chain so the remaining data still
	 * carries a correct mapping.  On a TFO rewind the chain is walked
	 * without moving the DSNs (the data will be retransmitted).
	 */
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* This mapping is consumed entirely; skip past it */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5529
5530 /* Obtain the DSN mapping stored in the mbuf */
5531 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5532 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5533 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5534 {
5535 u_int64_t dsn64;
5536
5537 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5538 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5539 }
5540
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	/*
	 * Return the DSN mapping (dsn, relative subflow seq, length,
	 * checksum) of the mbuf that covers offset 'off' in the send buffer.
	 */
	struct mbuf *m = so->so_snd.sb_mb;

	VERIFY(off >= 0);

	/* Defunct socket with an empty send buffer: report an empty mapping */
	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	/*
	 * NOTE(review): 'off' is assumed to lie within the send buffer —
	 * if it did not, m would be NULL here and the dereference below
	 * would fault.  Callers appear to guarantee this; confirm.
	 */
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;
}
5585
5586 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5587 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5588 {
5589 uint64_t dsn;
5590 uint32_t relseq;
5591
5592 mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5593 }
5594
/*
 * Note that this is called only from tcp_input() via mptcp_input_preproc()
 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
 * When it trims data tcp_input calls m_adj() which does not remove the
 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
 * The dsn map insertion cannot be delayed after trim, because data can be in
 * the reassembly queue for a while and the DSN option info in tp will be
 * overwritten for every new packet received.
 * The dsn map will be adjusted just prior to appending to subflow sockbuf
 * with mptcp_adj_rmap()
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* Copy the DSS mapping parsed from the options into the pkthdr */
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* Mapping consumed; make sure a DATA_ACK gets sent */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* After fallback, a plain FIN doubles as the DATA_FIN marker */
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5631
5632 /*
5633 * Following routines help with failure detection and failover of data
5634 * transfer from one subflow to another.
5635 */
5636 void
mptcp_act_on_txfail(struct socket * so)5637 mptcp_act_on_txfail(struct socket *so)
5638 {
5639 struct tcpcb *tp = NULL;
5640 struct inpcb *inp = sotoinpcb(so);
5641
5642 if (inp == NULL) {
5643 return;
5644 }
5645
5646 tp = intotcpcb(inp);
5647 if (tp == NULL) {
5648 return;
5649 }
5650
5651 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5652 return;
5653 }
5654
5655 so->so_flags |= SOF_MP_TRYFAILOVER;
5656 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5657 }
5658
5659 /*
5660 * Support for MP_FAIL option
5661 */
5662 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5663 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5664 {
5665 struct mbuf *m = so->so_snd.sb_mb;
5666 uint16_t datalen;
5667 uint64_t dsn;
5668 int off = 0;
5669
5670 if (m == NULL) {
5671 return -1;
5672 }
5673
5674 while (m != NULL) {
5675 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5676 VERIFY(m->m_flags & M_PKTHDR);
5677 dsn = m->m_pkthdr.mp_dsn;
5678 datalen = m->m_pkthdr.mp_rlen;
5679 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5680 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5681 off = (int)(dsn_fail - dsn);
5682 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5683 return 0;
5684 }
5685
5686 m = m->m_next;
5687 }
5688
5689 /*
5690 * If there was no mbuf data and a fallback to TCP occurred, there's
5691 * not much else to do.
5692 */
5693
5694 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5695 return -1;
5696 }
5697
/*
 * Support for sending contiguous MPTCP bytes in subflow
 * Also for preventing sending data with ACK in 3-way handshake
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	/*
	 * Return how many bytes of the DSN mapping covering send-buffer
	 * offset 'off' remain from that offset onward, so a transmit never
	 * straddles two mappings.
	 */
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* Defunct with nothing queued: nothing sendable */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	return mdss_data_len - (off - mdss_subflow_off);
}
5738
5739 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5740 mptcp_get_maxseg(struct mptses *mpte)
5741 {
5742 struct mptsub *mpts;
5743 uint32_t maxseg = 0;
5744
5745 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5746 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5747
5748 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5749 TCPS_HAVERCVDFIN2(tp->t_state)) {
5750 continue;
5751 }
5752
5753 if (tp->t_maxseg > maxseg) {
5754 maxseg = tp->t_maxseg;
5755 }
5756 }
5757
5758 return maxseg;
5759 }
5760
5761 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5762 mptcp_get_rcvscale(struct mptses *mpte)
5763 {
5764 struct mptsub *mpts;
5765 uint8_t rcvscale = UINT8_MAX;
5766
5767 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5768 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5769
5770 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5771 TCPS_HAVERCVDFIN2(tp->t_state)) {
5772 continue;
5773 }
5774
5775 if (tp->rcv_scale < rcvscale) {
5776 rcvscale = tp->rcv_scale;
5777 }
5778 }
5779
5780 return rcvscale;
5781 }
5782
/* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	/*
	 * Grow the MPTCP-level receive buffer to 'newsize' (clamped by the
	 * autorcvbuf maximum and the negotiated window scale), and raise
	 * sb_idealsize accordingly.
	 */
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* UINT8_MAX means no established subflow — nothing to size against */
	if (rcvscale == UINT8_MAX) {
		return;
	}

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5816
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	/*
	 * Auto-grow the MPTCP socket's receive buffer based on the
	 * aggregate sizing of all subflow receive buffers.
	 */
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5866
/*
 * Determine if we can grow the recieve socket buffer to avoid sending
 * a zero window update to the peer. We allow even socket buffers that
 * have fixed size (set by the application) to grow if the resource
 * constraints are met. They will also be trimmed after the application
 * reads data.
 *
 * Similar to tcp_sbrcv_grow_rwin
 */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow increment: 16x the largest subflow MSS */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background-throttled receivers don't get to grow */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/*
	 * Grow only when auto-sizing is on, resources allow, the buffer is
	 * nearly full (free space below one increment), and both the global
	 * max and the ideal-size ceiling have headroom left.
	 */
	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
5898
/* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	/*
	 * Space available in the MPTCP receive buffer (used as the
	 * advertised receive window); never negative.
	 */
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer to avoid zero-window updates */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Bounded by both byte count and mbuf-cluster accounting */
	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5933
5934 /*
5935 * Support Fallback to Regular TCP
5936 */
5937 void
mptcp_notify_mpready(struct socket * so)5938 mptcp_notify_mpready(struct socket *so)
5939 {
5940 struct tcpcb *tp = NULL;
5941
5942 if (so == NULL) {
5943 return;
5944 }
5945
5946 tp = intotcpcb(sotoinpcb(so));
5947
5948 if (tp == NULL) {
5949 return;
5950 }
5951
5952 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5953 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5954 struct tcpcb *, tp);
5955
5956 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5957 return;
5958 }
5959
5960 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5961 return;
5962 }
5963
5964 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5965 tp->t_mpflags |= TMPF_MPTCP_READY;
5966
5967 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5968 }
5969
5970 void
mptcp_notify_mpfail(struct socket * so)5971 mptcp_notify_mpfail(struct socket *so)
5972 {
5973 struct tcpcb *tp = NULL;
5974
5975 if (so == NULL) {
5976 return;
5977 }
5978
5979 tp = intotcpcb(sotoinpcb(so));
5980
5981 if (tp == NULL) {
5982 return;
5983 }
5984
5985 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5986 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5987 struct tcpcb *, tp);
5988
5989 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5990 return;
5991 }
5992
5993 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5994 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5995
5996 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5997 }
5998
5999 /*
6000 * Keepalive helper function
6001 */
6002 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6003 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6004 {
6005 boolean_t ret = 1;
6006
6007 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6008
6009 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6010 ret = 0;
6011 }
6012 return ret;
6013 }
6014
/*
 * MPTCP t_maxseg adjustment function
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	/*
	 * Return how much to lower t_maxseg to leave room for the most
	 * common MPTCP option (DSS+DATA_ACK).  Note both macro branches add
	 * 2: with checksums that's the csum field, without it's padding to
	 * a 32-bit boundary plus EOL.
	 */
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
	        mss_lower += 2; \
	else \
	        /* adjust to 32-bit boundary + EOL */ \
	        mss_lower += 2; \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6061
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	/*
	 * Populate one mptcp_flow_t record (for the pcblist sysctl) from a
	 * subflow's socket and mptsub state: endpoints, TCP conninfo, and
	 * per-subflow MPTCP bookkeeping.
	 */
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		/* IPv6 endpoints */
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		/* IPv4 endpoints */
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6096
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	/*
	 * Read-only sysctl handler: emit one conninfo_mptcp_t per MPTCP
	 * connection, followed by one mptcp_flow_t per subflow.  Holds
	 * mppi_lock across the walk and each PCB's socket lock while it
	 * is being serialized.
	 */
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	/* This sysctl is read-only */
	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: estimate with ~12.5% headroom for growth */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8) * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot connection-level state */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/* Record length covers the trailing flow array */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			/* kfree_data(NULL, 0) is safe when no flows alloc'd */
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		/* Serialize each subflow into the flow array */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6193
/* net.inet.mptcp.pcblist — read-only dump of all MPTCP connections */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6197
6198 /*
6199 * Set notsent lowat mark on the MPTCB
6200 */
6201 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6202 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6203 {
6204 struct mptcb *mp_tp = NULL;
6205 int error = 0;
6206
6207 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6208 mp_tp = mpte->mpte_mptcb;
6209 }
6210
6211 if (mp_tp) {
6212 mp_tp->mpt_notsent_lowat = optval;
6213 } else {
6214 error = EINVAL;
6215 }
6216
6217 return error;
6218 }
6219
6220 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6221 mptcp_get_notsent_lowat(struct mptses *mpte)
6222 {
6223 struct mptcb *mp_tp = NULL;
6224
6225 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6226 mp_tp = mpte->mpte_mptcb;
6227 }
6228
6229 if (mp_tp) {
6230 return mp_tp->mpt_notsent_lowat;
6231 } else {
6232 return 0;
6233 }
6234 }
6235
int
mptcp_notsent_lowat_check(struct socket *so)
{
	/*
	 * Return 1 when the MPTCP socket should be reported writable under
	 * the not-sent low-water mark policy, 0 otherwise.
	 */
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/* Writable if nothing is queued, or un-sent bytes are under lowat */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Decision is made on the first active subflow found */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			return retval;
		}
	}
	return 0;
}
6285
6286 static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref,struct sockaddr_ctl * sac,void ** unitinfo)6287 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6288 void **unitinfo)
6289 {
6290 #pragma unused(kctlref, sac, unitinfo)
6291
6292 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6293 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6294 }
6295
6296 mptcp_kern_skt_unit = sac->sc_unit;
6297
6298 return 0;
6299 }
6300
/*
 * Symptoms granted "use app" for the application identified by 'uuid':
 * walk every MPTCP session belonging to that app, temporarily mark it as
 * access-granted (and cell-prohibited when the RSSI is above the
 * target-based threshold), and re-evaluate its subflows.
 *
 * Takes the global mppi_lock, and the per-socket lock for each session.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/*
		 * Skip sessions not owned by the allowed app: delegated
		 * sockets are matched on the effective UUID, all others on
		 * the last-known UUID (uuid_compare returns 0 on a match).
		 */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		/* Good-enough Wi-Fi RSSI: forbid cell while re-evaluating */
		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* Flags above only influence the two calls right before this */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6344
6345 static void
mptcp_wifi_status_changed(void)6346 mptcp_wifi_status_changed(void)
6347 {
6348 struct mppcb *mpp;
6349
6350 /* Iterate over all MPTCP connections */
6351
6352 lck_mtx_lock(&mtcbinfo.mppi_lock);
6353
6354 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6355 struct socket *mp_so = mpp->mpp_socket;
6356 struct mptses *mpte = mpp->mpp_pcbe;
6357
6358 socket_lock(mp_so, 1);
6359
6360 /* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6361 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6362 mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6363 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6364 goto next;
6365 }
6366
6367 mptcp_check_subflows_and_add(mpte);
6368 mptcp_check_subflows_and_remove(mpte);
6369
6370 next:
6371 socket_unlock(mp_so, 1);
6372 }
6373
6374 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6375 }
6376
/* Search state shared by the proc-iteration filter and callout below */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;     /* executable UUID we are looking for */
	proc_t found_proc;      /* matching proc, or PROC_NULL */
	boolean_t is_proc_found; /* set by the filter once a match was seen */
};
6382
6383 static int
mptcp_find_proc_filter(proc_t p,void * arg)6384 mptcp_find_proc_filter(proc_t p, void *arg)
6385 {
6386 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6387 int found;
6388
6389 if (info->is_proc_found) {
6390 return 0;
6391 }
6392
6393 /*
6394 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6395 * expects != 0 for a matching filter.
6396 */
6397 found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6398 if (found) {
6399 info->is_proc_found = true;
6400 }
6401
6402 return found;
6403 }
6404
6405 static int
mptcp_find_proc_callout(proc_t p,void * arg)6406 mptcp_find_proc_callout(proc_t p, void * arg)
6407 {
6408 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6409
6410 if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6411 info->found_proc = p;
6412 return PROC_CLAIMED_DONE;
6413 }
6414
6415 return PROC_RETURNED;
6416 }
6417
6418 static proc_t
mptcp_find_proc(const uuid_t uuid)6419 mptcp_find_proc(const uuid_t uuid)
6420 {
6421 struct mptcp_uuid_search_info info;
6422
6423 uuid_copy(info.target_uuid, uuid);
6424 info.found_proc = PROC_NULL;
6425 info.is_proc_found = false;
6426
6427 proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6428 mptcp_find_proc_filter, &info);
6429
6430 return info.found_proc;
6431 }
6432
/*
 * Ask the Symptoms daemon (via the kernel-control socket) whether this
 * MPTCP session's owning app may use its subflows.
 *
 * Resolves the owning process - preferring the delegated (effective)
 * identity when SOF_DELEGATED is set - attaches its UUID and task
 * priority classification to the query, and enqueues it on the control
 * socket.  Silently returns if the control socket was never connected or
 * the process cannot be found.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* No Symptoms control socket connected yet - nobody to ask */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		/* Fall back to a UUID-based search across all processes */
		if (p == PROC_NULL) {
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			/* Cache the resolved pid for the next query */
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		/* Non-delegated socket: use the last-known pid/UUID */
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task's role onto the coarse Symptoms priority classes */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	/* Drop the reference taken by proc_find()/mptcp_find_proc() */
	proc_rele(p);
}
6514
6515 static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)6516 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6517 void *unitinfo)
6518 {
6519 #pragma unused(kctlref, kcunit, unitinfo)
6520
6521 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6522
6523 return 0;
6524 }
6525
6526 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6527 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6528 mbuf_t m, int flags)
6529 {
6530 #pragma unused(kctlref, unitinfo, flags)
6531 symptoms_advisory_t *sa = NULL;
6532
6533 if (kcunit != mptcp_kern_skt_unit) {
6534 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6535 __func__, kcunit, mptcp_kern_skt_unit);
6536 }
6537
6538 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6539 mbuf_freem(m);
6540 return EINVAL;
6541 }
6542
6543 if (mbuf_len(m) < sizeof(*sa)) {
6544 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6545 __func__, mbuf_len(m), sizeof(*sa));
6546 mbuf_freem(m);
6547 return EINVAL;
6548 }
6549
6550 sa = mbuf_data(m);
6551
6552 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6553 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6554 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6555 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6556
6557 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6558 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6559 mptcp_wifi_status_changed();
6560 }
6561 } else {
6562 struct mptcp_symptoms_answer answer;
6563 errno_t err;
6564
6565 /* We temporarily allow different sizes for ease of submission */
6566 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6567 mbuf_len(m) != sizeof(answer)) {
6568 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6569 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6570 sizeof(answer));
6571 mbuf_free(m);
6572 return EINVAL;
6573 }
6574
6575 memset(&answer, 0, sizeof(answer));
6576
6577 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6578 if (err) {
6579 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6580 mbuf_free(m);
6581 return err;
6582 }
6583
6584 mptcp_allow_uuid(answer.uuid, answer.rssi);
6585 }
6586
6587 mbuf_freem(m);
6588 return 0;
6589 }
6590
6591 void
mptcp_control_register(void)6592 mptcp_control_register(void)
6593 {
6594 /* Set up the advisory control socket */
6595 struct kern_ctl_reg mptcp_kern_ctl;
6596
6597 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6598 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6599 sizeof(mptcp_kern_ctl.ctl_name));
6600 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6601 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6602 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6603 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6604
6605 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6606 }
6607
/*
 * Classify the current Wi-Fi quality for this MPTCP session, based on the
 * Symptoms advisory and the session's service type.
 *
 * First-party sessions trust the advisory directly (and report UNSURE
 * when no advisory is available); all other sessions derive the quality
 * from symptoms_is_wifi_lossy() plus, for target-based sessions, the
 * access-granted / cell-prohibited flags set by mptcp_allow_uuid().
 */
mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return MPTCP_WIFI_QUALITY_UNSURE;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return MPTCP_WIFI_QUALITY_BAD;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the when wifi is considered bad), we only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			/* Cell prohibited implies Wi-Fi is good enough to stay on */
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return MPTCP_WIFI_QUALITY_GOOD;
			}

			return MPTCP_WIFI_QUALITY_BAD;
		}

		return MPTCP_WIFI_QUALITY_GOOD;
	}
}
6650
6651 boolean_t
symptoms_is_wifi_lossy(void)6652 symptoms_is_wifi_lossy(void)
6653 {
6654 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6655 }
6656
6657 int
mptcp_freeq(struct mptcb * mp_tp)6658 mptcp_freeq(struct mptcb *mp_tp)
6659 {
6660 struct tseg_qent *q;
6661 int rv = 0;
6662 int count = 0;
6663
6664 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6665 LIST_REMOVE(q, tqe_q);
6666 m_freem(q->tqe_m);
6667 zfree(tcp_reass_zone, q);
6668 count++;
6669 rv = 1;
6670 }
6671 mp_tp->mpt_reassqlen = 0;
6672
6673 if (count > 0) {
6674 OSAddAtomic(-count, &mptcp_reass_total_qlen);
6675 }
6676
6677 return rv;
6678 }
6679
6680 static int
mptcp_post_event(u_int32_t event_code,int value)6681 mptcp_post_event(u_int32_t event_code, int value)
6682 {
6683 struct kev_mptcp_data event_data;
6684 struct kev_msg ev_msg;
6685
6686 memset(&ev_msg, 0, sizeof(ev_msg));
6687
6688 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6689 ev_msg.kev_class = KEV_NETWORK_CLASS;
6690 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6691 ev_msg.event_code = event_code;
6692
6693 event_data.value = value;
6694
6695 ev_msg.dv[0].data_ptr = &event_data;
6696 ev_msg.dv[0].data_length = sizeof(event_data);
6697
6698 return kev_post_msg(&ev_msg);
6699 }
6700
/*
 * Mark that this subflow is actively using the cell interface and, when
 * this is the first user system-wide, post the event that turns on the
 * cellicon in the UI.
 *
 * Bookkeeping is two-level: MPTSF_CELLICON_SET on the subflow plus
 * mpte_cellicon_increments on the session feed the global
 * mptcp_cellicon_refcount, so teardown can decrement by exactly what was
 * added.  No-ops for first-party apps, disappearing subflows, and
 * sessions that fell back to plain TCP.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Arm the per-connection timer that toggles the icon back off */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6770
6771 void
mptcp_clear_cellicon(void)6772 mptcp_clear_cellicon(void)
6773 {
6774 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6775
6776 if (error) {
6777 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6778 __func__, error);
6779 } else {
6780 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6781 __func__);
6782 }
6783 }
6784
6785 /*
6786 * Returns true if the icon has been flipped to WiFi.
6787 */
6788 static boolean_t
__mptcp_unset_cellicon(uint32_t val)6789 __mptcp_unset_cellicon(uint32_t val)
6790 {
6791 VERIFY(val < INT32_MAX);
6792 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
6793 return false;
6794 }
6795
6796 mptcp_clear_cellicon();
6797
6798 return true;
6799 }
6800
/*
 * Release 'val' cellicon references held by this session (and optionally
 * a specific subflow 'mpts'), clearing the icon if the global refcount
 * drops to zero.
 *
 * Mirrors mptcp_set_cellicon(): no-op for first-party apps, for sessions
 * that never used cell, or for subflows that never set the icon.  'val'
 * is clamped to the session's outstanding increments to keep the global
 * counter consistent even if callers over-release.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only subflows that contributed a reference may release one */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Clamp: never release more than this session still holds */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6847
6848 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6849 mptcp_reset_rexmit_state(struct tcpcb *tp)
6850 {
6851 struct mptsub *mpts;
6852 struct inpcb *inp;
6853 struct socket *so;
6854
6855 inp = tp->t_inpcb;
6856 if (inp == NULL) {
6857 return;
6858 }
6859
6860 so = inp->inp_socket;
6861 if (so == NULL) {
6862 return;
6863 }
6864
6865 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6866 return;
6867 }
6868
6869 mpts = tp->t_mpsub;
6870
6871 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6872 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6873 }
6874
6875 void
mptcp_reset_keepalive(struct tcpcb * tp)6876 mptcp_reset_keepalive(struct tcpcb *tp)
6877 {
6878 struct mptsub *mpts = tp->t_mpsub;
6879
6880 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6881 }
6882
6883 static struct mppcb *
mtcp_alloc(void)6884 mtcp_alloc(void)
6885 {
6886 return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
6887 }
6888
6889 static void
mtcp_free(struct mppcb * mpp)6890 mtcp_free(struct mppcb *mpp)
6891 {
6892 struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);
6893
6894 kfree_type(struct mpp_mtp, mtp);
6895 }
6896
/*
 * Protocol pr_init callback.
 *
 * One-time initialization of the MPTCP subsystem: clones the IPv4 and
 * IPv6 TCP protosw/usrreqs into subflow-specific copies with overridden
 * soreceive/sosend/rcvoob handlers, sets up the multipath PCB info
 * (allocator, lock, GC and timer hooks), attaches it to the MP domain,
 * and creates the MPTCP os_log handle.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	/* Until Symptoms says otherwise, assume Wi-Fi is usable */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/* Clone the TCP protosw, then override the subflow entry points */
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same cloning dance for the IPv6 side */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Multipath PCB info: allocator, lock, GC and timer hooks */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_alloc = mtcp_alloc;
	mtcbinfo.mppi_free = mtcp_free;

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
6983