1 /*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
111
112 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
113 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114
115 static uint32_t mptcp_gc(struct mppcbinfo *);
116 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
118 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
120 static void mptcp_subflow_wupcall(struct socket *, void *, int);
121 static void mptcp_subflow_eupcall1(struct socket *so, void *arg, long events);
122 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
123 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
124
125 static void mptcp_subflow_abort(struct mptsub *, int);
126
127 static void mptcp_send_dfin(struct socket *so);
128 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
129 static int mptcp_freeq(struct mptcb *mp_tp);
130
131 /*
132 * Possible return values for subflow event handlers. Note that success
133 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
134 * indicate errors or actions which require immediate attention; they will
135 * prevent the rest of the handlers from processing their respective events
136 * until the next round of events processing.
137 */
138 typedef enum {
139 MPTS_EVRET_DELETE = 1, /* delete this subflow */
140 MPTS_EVRET_OK = 2, /* OK */
141 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
142 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
143 } ev_ret_t;
144
145 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, long *, long);
146 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, long *, long);
147 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, long *, long);
148 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, long *, long);
149 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, long *, long);
150 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, long *, long);
151 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, long *, long);
152 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, long *, long);
153 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, long *, long);
154 static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, long *, long);
155 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, long *, long);
156 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, long *, long);
157
158 static void mptcp_do_sha1(mptcp_key_t *, char *);
159 static void mptcp_do_sha256(mptcp_key_t *, char *);
160
161 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
162
/* Zones for the fixed-size MPTCP control structures (subflows, options, auth entries). */
static ZONE_DECLARE(mptsub_zone, "mptsub", sizeof(struct mptsub), ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(mptopt_zone, "mptopt", sizeof(struct mptopt), ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(mpt_subauth_zone, "mptauth",
    sizeof(struct mptcp_subf_auth_entry), ZC_NONE);

/* Global Multipath PCB info for all MPTCP instances; set up in mptcp_init(). */
struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_area, 0, "MPTCP debug area");

uint32_t mptcp_dbg_level = 1;
/* NOTE(review): SYSCTL_INT is used for a uint32_t variable (dbg_area above
 * uses SYSCTL_UINT) — presumably harmless, but confirm before changing. */
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_level, 0, "MPTCP debug level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");


/* Alternate destination port for MPTCP connections; 0 disables it.
 * Only values that fit in 16 bits are honored (see mptcp_session_create()). */
static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

/* MPTCP-private copies of the TCP protosw/usrreqs, populated in mptcp_init(). */
static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;

static uint8_t mptcp_create_subflows_scheduled;

/* One subflow-event dispatch entry: the SO_FILT_HINT_* bit and its handler. */
typedef struct mptcp_subflow_event_entry {
	long sofilt_hint_mask;
	ev_ret_t (*sofilt_hint_ev_hdlr)(
		struct mptses *mpte,
		struct mptsub *mpts,
		long *p_mpsofilt_hint,
		long event);
} mptsub_ev_entry_t;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

/* Number of MPTCP connections currently asserting the cellular icon. */
uint32_t mptcp_cellicon_refcount = 0;
213
/*
 * Subflow-event dispatch table: maps each SO_FILT_HINT_* event bit to
 * its handler (see mptsub_ev_entry_t).
 *
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
276
277 os_log_t mptcp_log_handle;
278
/*
 * Protocol pr_init callback for PF_MULTIPATH.
 *
 * One-shot initialization: clones the PF_INET and PF_INET6 TCP
 * protosw/usrreqs into MPTCP-private copies, overrides the entry points
 * that subflow sockets must route through MPTCP (soreceive/sosend and
 * rcvoob), sets up the global Multipath PCB info (mtcbinfo) and
 * registers it with the MP domain so garbage collection runs.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	/* Detach our private copy from the global protosw list. */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	/* Route subflow receive/send through MPTCP; OOB is unsupported. */
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same procedure for the IPv6 flavor of the TCP protosw. */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Global Multipath PCB info: zone, lock, and GC/timer hooks. */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	/* One zone element holds mppcb + mptses + mptcb (struct mpp_mtp). */
	mtcbinfo.mppi_zone = zone_create("mptc", sizeof(struct mpp_mtp), ZC_NONE);

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
364
365 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)366 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
367 {
368 int i, index = -1;
369
370 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
371 if (create && stats[i].ifindex == IFSCOPE_NONE) {
372 if (index < 0) {
373 index = i;
374 }
375 continue;
376 }
377
378 if (stats[i].ifindex == ifindex) {
379 index = i;
380 return index;
381 }
382 }
383
384 if (index != -1) {
385 stats[index].ifindex = ifindex;
386 }
387
388 return index;
389 }
390
391 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)392 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
393 {
394 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
395 int index;
396
397 if (ifp == NULL) {
398 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
399 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
400 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
401 return -1;
402 }
403
404 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
405
406 if (index != -1) {
407 if (stats[index].is_expensive == 0) {
408 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
409 }
410 }
411
412 return index;
413 }
414
415 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)416 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
417 {
418 int index;
419
420 tcpstat.tcps_mp_switches++;
421 mpte->mpte_subflow_switches++;
422
423 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
424
425 if (index != -1) {
426 mpte->mpte_itfstats[index].switches++;
427 }
428 }
429
430 /*
431 * Flushes all recorded socket options from an MP socket.
432 */
433 static void
mptcp_flush_sopts(struct mptses * mpte)434 mptcp_flush_sopts(struct mptses *mpte)
435 {
436 struct mptopt *mpo, *tmpo;
437
438 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
439 mptcp_sopt_remove(mpte, mpo);
440 mptcp_sopt_free(mpo);
441 }
442 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
443 }
444
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The MPTCP Session (mptses) and MPTCP PCB (mptcb) are co-allocated
 * with the Multipath PCB in one struct mpp_mtp (see the file-top
 * comment); this routine only zero-fills and cross-links the three
 * pieces.  Returns 0 (cannot fail).
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* Derive the adjacent, co-allocated structures from the mppcb. */
	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start with the statically embedded interface-info array. */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* Honor the alternate-port sysctl only if it fits in 16 bits. */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
494
495 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)496 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
497 {
498 if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
499 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
500 }
501
502 if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
503 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
504 }
505
506 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
507 * meaning we prefer IPv6 over IPv4.
508 */
509 if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
510 return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
511 }
512
513 if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
514 return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
515 }
516
517 /* We don't yet have a unicast IP */
518 return NULL;
519 }
520
521 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)522 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
523 uint64_t *cellbytes, uint64_t *allbytes)
524 {
525 int64_t mycellbytes = 0;
526 uint64_t myallbytes = 0;
527 int i;
528
529 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
530 if (mpte->mpte_itfstats[i].is_expensive) {
531 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
532 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
533 }
534
535 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
536 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
537 }
538
539 if (initial_cell) {
540 mycellbytes -= mpte->mpte_init_txbytes;
541 mycellbytes -= mpte->mpte_init_rxbytes;
542 }
543
544 if (mycellbytes < 0) {
545 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
546 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
547 *cellbytes = 0;
548 *allbytes = 0;
549 } else {
550 *cellbytes = mycellbytes;
551 *allbytes = myallbytes;
552 }
553 }
554
/*
 * Record final statistics for an MPTCP session just before destruction.
 *
 * Per service type (handover / interactive / aggregate) this bumps the
 * global tcpstat attempt and success counters, distinguishing
 * first-party sessions (MPTE_FIRSTPARTY), and accounts cell/total byte
 * counts for sessions whose MPTCP handshake succeeded.
 * Called from mptcp_session_destroy().
 *
 * NOTE(review): MPTCP_SVCTYPE_PURE_HANDOVER and _TARGET_BASED (used
 * elsewhere in this file) have no case here and record no per-type
 * stats — confirm that is intended.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial subflow was on cellular (mpte_initial_cell). */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell, later also used WiFi. */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi, later also used cell. */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	/* Successful session that started on cell and came back to WiFi. */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	/* Cell was brought up at this connection's request (see
	 * mptcp_trigger_cell_bringup()). */
	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
674
/*
 * Destroy an MPTCP session.
 *
 * Requires that all subflows are already gone (mpte_numflows == 0).
 * Wraps up statistics, releases any cellular-icon increments still held
 * by this session, flushes recorded socket options, frees a dynamically
 * grown interface-info array, the MPTCP reassembly queue and any
 * reinject-queue mbufs.  The mpte/mp_tp memory itself is co-allocated
 * with the mppcb and is not freed here.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	mptcpstats_session_wrapup(mpte);
	/* Drop every cell-icon reference this session still holds. */
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/* Only free when grown beyond the embedded _mpte_itfinfo array. */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data(mpte->mpte_itfinfo,
		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;

	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
702
703 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)704 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
705 {
706 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
707 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
708 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
709 }
710
711 static int
mptcp_synthesize_nat64(struct in6_addr * addr,uint32_t len,const struct in_addr * addrv4)712 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
713 const struct in_addr *addrv4)
714 {
715 static const struct in6_addr well_known_prefix = {
716 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
717 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
718 0x00, 0x00, 0x00, 0x00},
719 };
720 const char *ptrv4 = (const char *)addrv4;
721 char *ptr = (char *)addr;
722
723 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
724 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
725 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
726 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
727 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
728 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
729 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
730 return -1;
731 }
732
733 /* Check for the well-known prefix */
734 if (len == NAT64_PREFIX_LEN_96 &&
735 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
736 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
737 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
738 return -1;
739 }
740 }
741
742 switch (len) {
743 case NAT64_PREFIX_LEN_96:
744 memcpy(ptr + 12, ptrv4, 4);
745 break;
746 case NAT64_PREFIX_LEN_64:
747 memcpy(ptr + 9, ptrv4, 4);
748 break;
749 case NAT64_PREFIX_LEN_56:
750 memcpy(ptr + 7, ptrv4, 1);
751 memcpy(ptr + 9, ptrv4 + 1, 3);
752 break;
753 case NAT64_PREFIX_LEN_48:
754 memcpy(ptr + 6, ptrv4, 2);
755 memcpy(ptr + 9, ptrv4 + 2, 2);
756 break;
757 case NAT64_PREFIX_LEN_40:
758 memcpy(ptr + 5, ptrv4, 3);
759 memcpy(ptr + 9, ptrv4 + 3, 1);
760 break;
761 case NAT64_PREFIX_LEN_32:
762 memcpy(ptr + 4, ptrv4, 4);
763 break;
764 default:
765 panic("NAT64-prefix len is wrong: %u", len);
766 }
767
768 return 0;
769 }
770
/*
 * Ask the baseband radio manager (via NECP) to bring up cellular for
 * this connection's NECP client.  On success, marks the session so the
 * bring-up is counted at wrap-up time (tcps_mptcp_triggered_cell).
 */
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		/*
		 * NOTE(review): the MP-socket lock is dropped across the NECP
		 * call (presumably to avoid lock-ordering issues with NECP);
		 * socket state may change while unlocked — confirm callers
		 * re-validate after this returns.
		 */
		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		if (err == 0) {
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
797
798 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)799 mptcp_subflow_disconnecting(struct mptsub *mpts)
800 {
801 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
802 return true;
803 }
804
805 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
806 return true;
807 }
808
809 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
810 return true;
811 }
812
813 return false;
814 }
815
816 /*
817 * In Handover mode, only create cell subflow if
818 * - Symptoms marked WiFi as weak:
819 * Here, if we are sending data, then we can check the RTO-state. That is a
820 * stronger signal of WiFi quality than the Symptoms indicator.
821 * If however we are not sending any data, the only thing we can do is guess
822 * and thus bring up Cell.
823 *
824 * - Symptoms marked WiFi as unknown:
825 * In this state we don't know what the situation is and thus remain
826 * conservative, only bringing up cell if there are retransmissions going on.
827 */
828 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)829 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
830 {
831 int unusable_state = mptcp_is_wifi_unusable_for_session(mpte);
832
833 if (unusable_state == 0) {
834 /* WiFi is good - don't use cell */
835 return false;
836 }
837
838 if (unusable_state == -1) {
839 /*
840 * We are in unknown state, only use Cell if we have confirmed
841 * that WiFi is bad.
842 */
843 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
844 return true;
845 } else {
846 return false;
847 }
848 }
849
850 if (unusable_state == 1) {
851 /*
852 * WiFi is confirmed to be bad from Symptoms-Framework.
853 * If we are sending data, check the RTOs.
854 * Otherwise, be pessimistic and use Cell.
855 */
856 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
857 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
858 return true;
859 } else {
860 return false;
861 }
862 } else {
863 return true;
864 }
865 }
866
867 return false;
868 }
869
870 void
mptcp_check_subflows_and_add(struct mptses * mpte)871 mptcp_check_subflows_and_add(struct mptses *mpte)
872 {
873 struct mptcb *mp_tp = mpte->mpte_mptcb;
874 boolean_t cellular_viable = FALSE;
875 boolean_t want_cellular = TRUE;
876 uint32_t i;
877
878 if (!mptcp_ok_to_create_subflows(mp_tp)) {
879 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
880 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
881 return;
882 }
883
884 /* Just to see if we have an IP-address available */
885 if (mptcp_get_session_dst(mpte, false, false) == NULL) {
886 return;
887 }
888
889 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
890 boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
891 struct mpt_itf_info *info;
892 struct sockaddr_in6 nat64pre;
893 struct sockaddr *dst;
894 struct mptsub *mpts;
895 struct ifnet *ifp;
896 uint32_t ifindex;
897
898 info = &mpte->mpte_itfinfo[i];
899
900 ifindex = info->ifindex;
901 if (ifindex == IFSCOPE_NONE) {
902 continue;
903 }
904
905 os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
906 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
907 info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);
908
909 if (info->no_mptcp_support) {
910 continue;
911 }
912
913 ifnet_head_lock_shared();
914 ifp = ifindex2ifnet[ifindex];
915 ifnet_head_done();
916
917 if (ifp == NULL) {
918 continue;
919 }
920
921 if (IFNET_IS_CELLULAR(ifp)) {
922 cellular_viable = TRUE;
923
924 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
925 mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
926 if (!mptcp_is_wifi_unusable_for_session(mpte)) {
927 continue;
928 }
929 }
930 }
931
932 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
933 const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
934 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
935
936 if (subifp == NULL) {
937 continue;
938 }
939
940 /*
941 * If there is at least one functioning subflow on WiFi
942 * and we are checking for the cell interface, then
943 * we always need to ask symptoms for permission as
944 * cell is triggered even if WiFi is available.
945 */
946 if (!IFNET_IS_CELLULAR(subifp) &&
947 !mptcp_subflow_disconnecting(mpts) &&
948 IFNET_IS_CELLULAR(ifp)) {
949 need_to_ask_symptoms = TRUE;
950 }
951
952 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
953 os_log(mptcp_log_handle,
954 "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
955 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
956 mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
957 IFNET_IS_CELLULAR(subifp),
958 mptcp_is_wifi_unusable_for_session(mpte),
959 mpts->mpts_flags,
960 tp->t_rxtshift,
961 !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
962 mptetoso(mpte)->so_snd.sb_cc,
963 ifindex, subifp->if_index,
964 tp->t_srtt >> TCP_RTT_SHIFT,
965 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
966 tp->t_rxtcur);
967
968 if (!IFNET_IS_CELLULAR(subifp) &&
969 !mptcp_subflow_disconnecting(mpts) &&
970 (mpts->mpts_flags & MPTSF_CONNECTED) &&
971 !mptcp_handover_use_cellular(mpte, tp)) {
972 found = TRUE;
973
974 /* We found a proper subflow on WiFi - no need for cell */
975 want_cellular = FALSE;
976 break;
977 }
978 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
979 uint64_t time_now = mach_continuous_time();
980
981 os_log(mptcp_log_handle,
982 "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
983 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
984 time_now, mptcp_is_wifi_unusable_for_session(mpte),
985 IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
986 mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);
987
988 if (!IFNET_IS_CELLULAR(subifp) &&
989 !mptcp_subflow_disconnecting(mpts) &&
990 (mpte->mpte_time_target == 0 ||
991 (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
992 !mptcp_is_wifi_unusable_for_session(mpte))) {
993 found = TRUE;
994
995 want_cellular = FALSE;
996 break;
997 }
998 }
999
1000 if (subifp->if_index == ifindex &&
1001 !mptcp_subflow_disconnecting(mpts)) {
1002 /*
1003 * We found a subflow on this interface.
1004 * No need to create a new one.
1005 */
1006 found = TRUE;
1007 break;
1008 }
1009 }
1010
1011 if (found) {
1012 continue;
1013 }
1014
1015 if (need_to_ask_symptoms &&
1016 !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
1017 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
1018 mptcp_developer_mode == 0) {
1019 mptcp_ask_symptoms(mpte);
1020 return;
1021 }
1022
1023 dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);
1024
1025 if (dst->sa_family == AF_INET &&
1026 !info->has_v4_conn && info->has_nat64_conn) {
1027 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
1028 int error, j;
1029
1030 bzero(&nat64pre, sizeof(struct sockaddr_in6));
1031
1032 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
1033 if (error) {
1034 os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
1035 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
1036 continue;
1037 }
1038
1039 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
1040 if (nat64prefixes[j].prefix_len != 0) {
1041 break;
1042 }
1043 }
1044
1045 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
1046
1047 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
1048 nat64prefixes[j].prefix_len,
1049 &((struct sockaddr_in *)(void *)dst)->sin_addr);
1050 if (error != 0) {
1051 os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
1052 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1053 continue;
1054 }
1055
1056 memcpy(&nat64pre.sin6_addr,
1057 &nat64prefixes[j].ipv6_prefix,
1058 sizeof(nat64pre.sin6_addr));
1059 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
1060 nat64pre.sin6_family = AF_INET6;
1061 nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
1062 nat64pre.sin6_flowinfo = 0;
1063 nat64pre.sin6_scope_id = 0;
1064
1065 dst = (struct sockaddr *)&nat64pre;
1066 }
1067
1068 if (dst->sa_family == AF_INET && !info->has_v4_conn) {
1069 continue;
1070 }
1071 if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
1072 continue;
1073 }
1074
1075 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
1076 }
1077
1078 if (!cellular_viable && want_cellular) {
1079 /* Trigger Cell Bringup */
1080 mptcp_trigger_cell_bringup(mpte);
1081 }
1082 }
1083
1084 static void
mptcp_remove_cell_subflows(struct mptses * mpte)1085 mptcp_remove_cell_subflows(struct mptses *mpte)
1086 {
1087 struct mptsub *mpts, *tmpts;
1088
1089 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1090 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1091
1092 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
1093 continue;
1094 }
1095
1096 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
1097 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1098
1099 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1100 }
1101
1102 return;
1103 }
1104
1105 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)1106 mptcp_remove_wifi_subflows(struct mptses *mpte)
1107 {
1108 struct mptsub *mpts, *tmpts;
1109
1110 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1111 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1112
1113 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1114 continue;
1115 }
1116
1117 os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
1118 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1119
1120 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1121 }
1122
1123 return;
1124 }
1125
/*
 * Pure-handover policy: decide which side (Wi-Fi or cellular) keeps its
 * subflows. Classifies all established subflows, then removes the cellular
 * ones when Wi-Fi is carrying traffic fine, or the Wi-Fi ones when Wi-Fi is
 * unusable and cellular has a working subflow.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Classify the subflows: remember whether there is a working
	 * (established, not disconnecting) subflow on a cellular interface,
	 * and whether there is a working one on a non-cellular interface
	 * that does not warrant falling back to cellular.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* No outgoing interface recorded yet - cannot classify */
		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-closing subflows count as working */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
			/* Wi-Fi subflow that does not need the cellular fallback */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * No working Wi-Fi subflow while Wi-Fi is reported unusable: keep the
	 * cellular subflows. If cellular is actually carrying the traffic,
	 * drop the Wi-Fi subflows instead.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_unusable) {
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	/* Wi-Fi works (or is not unusable) - the cellular subflows can go */
	mptcp_remove_cell_subflows(mpte);
}
1188
1189 static void
mptcp_handover_subflows_remove(struct mptses * mpte)1190 mptcp_handover_subflows_remove(struct mptses *mpte)
1191 {
1192 int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
1193 boolean_t found_working_subflow = false;
1194 struct mptsub *mpts;
1195
1196 /*
1197 * Look for a subflow that is on a non-cellular interface
1198 * and actually works (aka, no retransmission timeout).
1199 */
1200 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1201 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1202 struct socket *so;
1203 struct tcpcb *tp;
1204
1205 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1206 continue;
1207 }
1208
1209 so = mpts->mpts_socket;
1210 tp = sototcpcb(so);
1211
1212 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1213 tp->t_state != TCPS_ESTABLISHED) {
1214 continue;
1215 }
1216
1217 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
1218 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
1219
1220 if (!mptcp_handover_use_cellular(mpte, tp)) {
1221 found_working_subflow = true;
1222 break;
1223 }
1224 }
1225
1226 /*
1227 * Couldn't find a working subflow, let's not remove those on a cellular
1228 * interface.
1229 */
1230 if (!found_working_subflow) {
1231 return;
1232 }
1233
1234 mptcp_remove_cell_subflows(mpte);
1235 }
1236
1237 static void
mptcp_targetbased_subflows_remove(struct mptses * mpte)1238 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1239 {
1240 uint64_t time_now = mach_continuous_time();
1241 struct mptsub *mpts;
1242
1243 if (mpte->mpte_time_target != 0 &&
1244 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1245 mptcp_is_wifi_unusable_for_session(mpte)) {
1246 /* WiFi is bad and we are below the target - don't remove any subflows */
1247 return;
1248 }
1249
1250 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1251 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1252
1253 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1254 continue;
1255 }
1256
1257 /* We have a functioning subflow on WiFi. No need for cell! */
1258 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1259 !mptcp_subflow_disconnecting(mpts)) {
1260 mptcp_remove_cell_subflows(mpte);
1261 break;
1262 }
1263 }
1264 }
1265
1266 /*
1267 * Based on the MPTCP Service-type and the state of the subflows, we
1268 * will destroy subflows here.
1269 */
1270 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1271 mptcp_check_subflows_and_remove(struct mptses *mpte)
1272 {
1273 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1274 return;
1275 }
1276
1277 socket_lock_assert_owned(mptetoso(mpte));
1278
1279 if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1280 mptcp_pure_handover_subflows_remove(mpte);
1281 }
1282
1283 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1284 mptcp_handover_subflows_remove(mpte);
1285 }
1286
1287 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1288 mptcp_targetbased_subflows_remove(mpte);
1289 }
1290 }
1291
/*
 * Kill subflows whose interface no longer appears usable in the session's
 * interface-info table (or that explicitly requested closure via
 * MPTSF_CLOSE_REQD, e.g. from the NECP callback).
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* Subflow was flagged for closure (see mptcp_subflow_necp_cb) */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* Neither an outgoing interface nor a bound scope - nothing to match */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual outgoing interface over the bound scope */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/*
		 * Check whether the interface is still listed as usable for
		 * this subflow's destination address family.
		 */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		/* Interface vanished or lost connectivity for this family - kill it */
		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1359
/*
 * Deferred worker (scheduled via timeout() from mptcp_sched_create_subflows)
 * that walks all MPTCP connections and creates/removes subflows for those
 * marked with MPP_CREATE_SUBFLOWS.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Only process connections that asked for subflow creation */
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
			continue;
		}

		socket_lock(mp_so, 1);
		/* mptcp_sched_create_subflows took a usecount on our behalf */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1399
/*
 * We need this because we are coming from an NECP-event. This event gets posted
 * while holding NECP-locks. The creation of the subflow however leads us back
 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
 * So, we would deadlock there as we already hold the NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress, without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	/* Bail out early if the connection state does not allow subflows */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Mark this connection; take a usecount so the worker finds it alive */
	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	/* Worker already scheduled - nothing more to do */
	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}
1434
1435 /*
1436 * Allocate an MPTCP socket option structure.
1437 */
1438 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1439 mptcp_sopt_alloc(zalloc_flags_t how)
1440 {
1441 return zalloc_flags(mptopt_zone, how | Z_ZERO);
1442 }
1443
1444 /*
1445 * Free an MPTCP socket option structure.
1446 */
1447 void
mptcp_sopt_free(struct mptopt * mpo)1448 mptcp_sopt_free(struct mptopt *mpo)
1449 {
1450 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1451
1452 zfree(mptopt_zone, mpo);
1453 }
1454
1455 /*
1456 * Add a socket option to the MPTCP socket option list.
1457 */
1458 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1459 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1460 {
1461 socket_lock_assert_owned(mptetoso(mpte));
1462 mpo->mpo_flags |= MPOF_ATTACHED;
1463 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1464 }
1465
1466 /*
1467 * Remove a socket option from the MPTCP socket option list.
1468 */
1469 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1470 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1471 {
1472 socket_lock_assert_owned(mptetoso(mpte));
1473 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1474 mpo->mpo_flags &= ~MPOF_ATTACHED;
1475 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1476 }
1477
1478 /*
1479 * Search for an existing <sopt_level,sopt_name> socket option.
1480 */
1481 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1482 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1483 {
1484 struct mptopt *mpo;
1485
1486 socket_lock_assert_owned(mptetoso(mpte));
1487
1488 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1489 if (mpo->mpo_level == sopt->sopt_level &&
1490 mpo->mpo_name == sopt->sopt_name) {
1491 break;
1492 }
1493 }
1494 return mpo;
1495 }
1496
1497 /*
1498 * Allocate a MPTCP subflow structure.
1499 */
1500 static struct mptsub *
mptcp_subflow_alloc(void)1501 mptcp_subflow_alloc(void)
1502 {
1503 return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1504 }
1505
1506 /*
1507 * Deallocate a subflow structure, called when all of the references held
1508 * on it have been released. This implies that the subflow has been deleted.
1509 */
1510 static void
mptcp_subflow_free(struct mptsub * mpts)1511 mptcp_subflow_free(struct mptsub *mpts)
1512 {
1513 VERIFY(mpts->mpts_refcnt == 0);
1514 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1515 VERIFY(mpts->mpts_mpte == NULL);
1516 VERIFY(mpts->mpts_socket == NULL);
1517
1518 free_sockaddr(mpts->mpts_src);
1519
1520 zfree(mptsub_zone, mpts);
1521 }
1522
/*
 * Take an additional reference on the subflow. A post-increment value of 0
 * means the reference counter wrapped around, which is fatal.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1531
1532 static void
mptcp_subflow_remref(struct mptsub * mpts)1533 mptcp_subflow_remref(struct mptsub *mpts)
1534 {
1535 if (mpts->mpts_refcnt == 0) {
1536 panic("%s: mpts %p negative refcnt", __func__, mpts);
1537 /* NOTREACHED */
1538 }
1539 if (--mpts->mpts_refcnt > 0) {
1540 return;
1541 }
1542
1543 /* callee will unlock and destroy lock */
1544 mptcp_subflow_free(mpts);
1545 }
1546
/*
 * Link a freshly created subflow socket to its MPTCP session: wire up the
 * TCP PCB, insert the subflow into the session's list, and take the
 * references that the list and the socket hold on the subflow.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;	/* the subflow keeps the MP-socket alive */

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts); /* for subflow socket */
}
1576
/*
 * NECP callback on a subflow's inpcb. When the flow becomes non-viable
 * (or the interface enters low-power mode), flag the subflow for closure
 * and schedule subflow (re-)creation. For handover/target-based sessions,
 * report the overall connection as still viable so NECP does not tear it
 * down while MPTCP migrates to another interface.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* Treat a low-power interface the same as a non-viable flow */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	/* Only the non-viable transition is of interest here */
	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Ask mptcp_remove_subflows() to reset this subflow */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/* Handover-style sessions survive the loss of a single subflow */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1631
/*
 * Create an MPTCP subflow socket for domain "dom" (PF_INET or PF_INET6),
 * attach it to the session, propagate NECP registration/attributes and
 * eligible socket options from the MP-socket, and install the MPTCP
 * subflow protosw. On error, the subflow is aborted (which also frees it)
 * and a non-zero errno is returned.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	struct mppcb *mpp;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);
	mpp = mpsotomppcb(mp_so);

	/* The subflow is created on behalf of the MP-socket's owning process */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		mptcp_subflow_free(mpts);
		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	}
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
	}
	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
	}

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpp->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		socket_unlock(mp_so, 0);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpp->necp_client_uuid, sotoinpcb(*so));
		socket_lock(mp_so, 0);

		if (error) {
			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

			goto out_err;
		}

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_tp->mpt_state, mp_tp->mpt_flags);

			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
	}

	/*
	 * Copy the MP-socket's NECP attribute strings (domain, account,
	 * domain owner, tracker domain) onto the subflow's inpcb. Each copy
	 * is best-effort: on allocation failure the attribute is simply
	 * left NULL on the subflow.
	 */
	if (mpp->inp_necp_attributes.inp_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_account != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
		}
	}

	/* Needs to happen prior to the delegation! */
	(*so)->last_pid = mp_so->last_pid;

	/* Forward the MP-socket's effective pid/uuid delegation, if any */
	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid) {
			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
		if (!uuid_is_null(mpte->mpte_euuid)) {
			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof(smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
			goto out_err;
		}
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
			continue;
		}

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE)) {
			continue;
		}

		/* Interim records that fail to apply are dropped from the list */
		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return 0;

out_err:
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	return error;
}
1916
1917 /*
1918 * Close an MPTCP subflow socket.
1919 *
1920 * Note that this may be called on an embryonic subflow, and the only
1921 * thing that is guaranteed valid is the protocol-user request.
1922 */
1923 static void
mptcp_subflow_soclose(struct mptsub * mpts)1924 mptcp_subflow_soclose(struct mptsub *mpts)
1925 {
1926 struct socket *so = mpts->mpts_socket;
1927
1928 if (mpts->mpts_flags & MPTSF_CLOSED) {
1929 return;
1930 }
1931
1932 VERIFY(so != NULL);
1933 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1934 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1935
1936 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1937 struct socket *, so,
1938 struct sockbuf *, &so->so_rcv,
1939 struct sockbuf *, &so->so_snd,
1940 struct mptses *, mpts->mpts_mpte);
1941
1942 mpts->mpts_flags |= MPTSF_CLOSED;
1943
1944 if (so->so_retaincnt == 0) {
1945 soclose_locked(so);
1946
1947 return;
1948 } else {
1949 VERIFY(so->so_usecount > 0);
1950 so->so_usecount--;
1951 }
1952
1953 return;
1954 }
1955
/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 *
 * Returns 0 on success, ESRCH if the owning process cannot be found, or
 * the error from soconnectxlocked().
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];	/* printable destination address, for logging */
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	/* Caller must have marked us connecting, and we must not be connected yet. */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Stringify the destination address/port for the log line below. */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	/*
	 * The connect is accounted to the process that owns the MPTCP socket,
	 * not necessarily the current one; proc_find takes a reference that
	 * is dropped below.
	 */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	/* No longer pending; we are attempting the connect now. */
	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	/* Record the subflow's initial send sequence number (set even on error). */
	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
2038
/*
 * Adjust mbuf `m` so that it carries the DSS mapping (dsn/rseq/dlen/dfin)
 * shifted by `off` bytes into that mapping.  If the mbuf extends past the
 * right edge of the mapping, it is split and the remainder is chained back
 * into the subflow's receive buffer.
 *
 * Returns 0 on success, -1 on failure (inconsistent second mapping from
 * the peer, or m_split() failure); on failure the subflow is flagged for
 * reset via SO_FILT_HINT_MUSTRST.
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Nothing to map for an empty mbuf. */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	/* Only packet headers can carry a mapping. */
	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * The mbuf already carries a mapping.  When we are beyond
		 * the start of the mapping (off != 0), it must agree with
		 * the one being applied; otherwise the peer sent an
		 * inconsistent second mapping and the subflow is reset.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-chain the tail after m and fix up the sockbuf accounting. */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp m with the portion of the mapping it now covers. */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* A valid mapping was seen: the subflow is fully established. */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
2113
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Drains everything in the subflow's receive buffer into *mp0 for the
 * MPTCP layer, one complete DSS mapping at a time.  Never blocks
 * (MSG_DONTWAIT|MSG_NBIO are forced); returns EWOULDBLOCK when a mapping
 * is not yet complete, EINVAL for unsupported parameter combinations,
 * EIO when the subflow must be reset, or 0 on success.
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/* Peeking, OOB data and blocking-wait modes are not supported here. */
	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	/* This routine must never sleep. */
	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* Main loop: consume one DSS mapping (or fallback mbuf) per iteration. */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;	/* first mbuf of this mapping, for csum */
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			/* Scan forward, bounded to 64KB of unmapped data. */
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				/* DATA_FIN occupies one byte of the mapping. */
				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* The leading mbuf itself carries the DSS mapping. */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			/* Stamp/split each mbuf so it covers exactly its slice. */
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			/* Detach m from the receive buffer and append to *mp0. */
			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			VERIFY(dlen == 0 || m);
		}

		VERIFY(dlen == 0);

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the whole mapping just taken. */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2439
/*
 * MPTCP subflow socket send routine, derived from sosend().
 *
 * Sends the fully-formed mbuf chain `top` (which must carry a DSS mapping,
 * PKTF_MPTCP) down to the subflow's TCP via pru_send.  addr/uio/control/
 * flags are unsupported and must be NULL/0, as asserted below.
 * Consumes `top` on success; frees it on failure.  Returns 0 or an errno.
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/*
	 * Account the send to the process that owns the MPTCP socket; take
	 * a proc reference only if it differs from the current process.
	 */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			/* Owner is gone; fall back to the current process. */
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/* Hand the chain to TCP; pru_send consumes top regardless of outcome. */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	/* top is only non-NULL here if sosendcheck() failed. */
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2524
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * mpte:    MPTCP session; its MP socket must be locked by the caller.
 * src:     optional source address (AF_INET/AF_INET6) to bind to; copied.
 * dst:     destination address (AF_INET/AF_INET6); copied.
 * ifscope: interface index to scope the subflow to (0 = unscoped).
 * pcid:    if non-NULL, receives the new subflow's connection ID.
 *
 * Returns 0 on success or an errno.  Ownership note: before
 * mptcp_subflow_socreate() succeeds, failures free mpts here (out_err);
 * afterwards ownership has moved to the subflow socket and failures go
 * through mptcp_subflow_abort() (out_err_close) instead.
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	/* Cap the number of concurrent subflows per session. */
	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* Validate and copy the optional source address. */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
		    Z_WAITOK | Z_NOFAIL);

		bcopy(src, mpts->mpts_src, src->sa_len);
	}

	/* Validate and copy the destination address into the subflow. */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Reject interface scopes beyond the current interface table. */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one. Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		/* Joins must wait until the session is ready to accept them. */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	/* Subflow socket exists: tear it down via the abort path. */
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	/* Subflow socket was never created: we still own mpts. */
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2727
2728 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2729 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2730 {
2731 int index = mptcpstats_get_index(stats, mpts);
2732
2733 if (index != -1) {
2734 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2735
2736 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2737 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2738
2739 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2740 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2741
2742 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2743 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2744
2745 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2746 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2747 }
2748 }
2749
/*
 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Fold this subflow's final byte counters into the session stats. */
	mptcpstats_update(mpte->mpte_itfstats, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	/* Remember byte counts of the departing subflow at session level. */
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	/* Unlink the subflow from the session. */
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--; /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts); /* for subflow socket */

	/* Sever the TCP-side back-pointers to MPTCP state. */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2802
2803 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2804 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2805 {
2806 struct socket *so = mpts->mpts_socket;
2807 struct mptcb *mp_tp = mpte->mpte_mptcb;
2808 int send_dfin = 0;
2809
2810 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2811 send_dfin = 1;
2812 }
2813
2814 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2815 (so->so_state & SS_ISCONNECTED)) {
2816 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2817 __func__, mpts->mpts_connid, send_dfin),
2818 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2819
2820 if (send_dfin) {
2821 mptcp_send_dfin(so);
2822 }
2823 soshutdownlock(so, SHUT_WR);
2824 }
2825 }
2826
2827 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2828 mptcp_subflow_abort(struct mptsub *mpts, int error)
2829 {
2830 struct socket *so = mpts->mpts_socket;
2831 struct tcpcb *tp = sototcpcb(so);
2832
2833 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2834 return;
2835 }
2836
2837 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2838 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2839
2840 if (tp->t_state != TCPS_CLOSED) {
2841 tcp_drop(tp, error);
2842 }
2843
2844 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2845 }
2846
/*
 * Disconnect a subflow socket.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Idempotent: nothing to do if a disconnect is already in flight. */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/* Past CLOSE_WAIT at MPTCP level, a DATA_FIN must accompany the close. */
	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		send_dfin = 1;
	}

	/* If the MP socket went defunct, defunct the subflow as well. */
	if (mp_so->so_flags & SOF_DEFUNCT) {
		errno_t ret;

		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
		if (ret == 0) {
			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);

			if (ret != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
			}
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
		}
	}

	/* Only a connected subflow with no disconnect in progress is shut down. */
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
		    __func__, mpts->mpts_connid, send_dfin),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		if (send_dfin) {
			mptcp_send_dfin(so);
		}

		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}

	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2914
/*
 * Subflow socket input.
 *
 * Pulls everything out of the subflow's receive buffer (via the
 * customized mptcp_subflow_soreceive) and feeds it to mptcp_input(),
 * updating the cell-icon bookkeeping along the way.  Guarded against
 * reentry with MPP_INSIDE_INPUT.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Reentrancy guard: input processing must not nest. */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* Only fully connected subflows carry data. */
	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			/* Propagate the error to the MP socket's readers. */
			wakeup = 1;
			goto out;
		}
	} else if (error == 0) {
		mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
		    __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track whether traffic flowed over cellular or wifi. */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		/* Hand the data chain up to the MPTCP reassembly layer. */
		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	/* Drop the guard and run any upcalls deferred while we were inside. */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2993
/*
 * Handle input on a subflow socket: run mptcp_subflow_input() for every
 * subflow of the owning MPTCP session.  If upcalls are currently being
 * deferred, just flag the session for a later read wakeup instead.
 * No-op for sockets that are not (or no longer) MPTCP subflows.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		/* Defer, unless input handling is already running. */
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Pin both the subflow and its socket across the call. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts); /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Clear MPP_INPUT_HANDLE and run any upcalls deferred meanwhile. */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
3036
3037 /*
3038 * Subflow socket write upcall.
3039 *
3040 * Called when the associated subflow socket posted a read event.
3041 */
3042 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)3043 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
3044 {
3045 #pragma unused(so, waitf)
3046 struct mptsub *mpts = arg;
3047 struct mptses *mpte = mpts->mpts_mpte;
3048
3049 VERIFY(mpte != NULL);
3050
3051 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3052 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
3053 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3054 }
3055 return;
3056 }
3057
3058 mptcp_output(mpte);
3059 }
3060
3061 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)3062 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
3063 {
3064 struct mbuf *so_m = so->so_snd.sb_mb;
3065 uint64_t dsn = m->m_pkthdr.mp_dsn;
3066
3067 while (so_m) {
3068 VERIFY(so_m->m_flags & M_PKTHDR);
3069 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
3070
3071 /* Part of the segment is covered, don't reinject here */
3072 if (so_m->m_pkthdr.mp_dsn <= dsn &&
3073 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
3074 return TRUE;
3075 }
3076
3077 so_m = so_m->m_next;
3078 }
3079
3080 return FALSE;
3081 }
3082
3083 /*
3084 * Subflow socket output.
3085 *
3086 * Called for sending data from MPTCP to the underlying subflow socket.
3087 */
3088 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3089 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3090 {
3091 struct mptcb *mp_tp = mpte->mpte_mptcb;
3092 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3093 struct socket *mp_so, *so;
3094 struct tcpcb *tp;
3095 uint64_t mpt_dsn = 0, off = 0;
3096 int sb_cc = 0, error = 0, wakeup = 0;
3097 uint16_t dss_csum;
3098 uint16_t tot_sent = 0;
3099 boolean_t reinjected = FALSE;
3100
3101 mp_so = mptetoso(mpte);
3102 so = mpts->mpts_socket;
3103 tp = sototcpcb(so);
3104
3105 socket_lock_assert_owned(mp_so);
3106
3107 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3108 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3109
3110 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3111 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3112 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3113 (mpts->mpts_flags & MPTSF_TFO_REQD));
3114 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3115
3116 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
3117 __func__, mpts->mpts_flags, mpte->mpte_flags,
3118 mptcp_subflow_cwnd_space(so)),
3119 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3120 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3121 struct mptsub *, mpts);
3122
3123 /* Remove Addr Option is not sent reliably as per I-D */
3124 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3125 tp->t_rem_aid = mpte->mpte_lost_aid;
3126 tp->t_mpflags |= TMPF_SND_REM_ADDR;
3127 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3128 }
3129
3130 /*
3131 * The mbuf chains containing the metadata (as well as pointing to
3132 * the user data sitting at the MPTCP output queue) would then be
3133 * sent down to the subflow socket.
3134 *
3135 * Some notes on data sequencing:
3136 *
3137 * a. Each mbuf must be a M_PKTHDR.
3138 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3139 * in the mbuf pkthdr structure.
3140 * c. Each mbuf containing the MPTCP metadata must have its
3141 * pkt_flags marked with the PKTF_MPTCP flag.
3142 */
3143
3144 if (mpte->mpte_reinjectq) {
3145 sb_mb = mpte->mpte_reinjectq;
3146 } else {
3147 sb_mb = mp_so->so_snd.sb_mb;
3148 }
3149
3150 if (sb_mb == NULL) {
3151 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3152 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3153 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3154 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3155
3156 /* Fix it to prevent looping */
3157 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3158 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3159 }
3160 goto out;
3161 }
3162
3163 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3164
3165 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3166 !(so->so_state & SS_ISCONNECTED) &&
3167 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3168 tp->t_mpflags |= TMPF_TFO_REQUEST;
3169
3170 /* Opting to call pru_send as no mbuf at subflow level */
3171 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3172 NULL, current_proc());
3173
3174 goto done_sending;
3175 }
3176
3177 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3178
3179 /* First, drop acknowledged data */
3180 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3181 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3182 "dsn %u suna %u reinject? %u\n",
3183 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3184 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3185 if (mpte->mpte_reinjectq) {
3186 mptcp_clean_reinjectq(mpte);
3187 } else {
3188 uint64_t len = 0;
3189 len = mp_tp->mpt_snduna - mpt_dsn;
3190 sbdrop(&mp_so->so_snd, (int)len);
3191 wakeup = 1;
3192 }
3193 }
3194
3195 /* Check again because of above sbdrop */
3196 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3197 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3198 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3199 goto out;
3200 }
3201
3202 /*
3203 * In degraded mode, we don't receive data acks, so force free
3204 * mbufs less than snd_nxt
3205 */
3206 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3207 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3208 mp_so->so_snd.sb_mb) {
3209 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3210 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3211 uint64_t len = 0;
3212 len = mp_tp->mpt_snduna - mpt_dsn;
3213 sbdrop(&mp_so->so_snd, (int)len);
3214 wakeup = 1;
3215
3216 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3217 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3218 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3219 }
3220 }
3221
3222 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3223 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3224 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3225 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3226 }
3227
3228 /*
3229 * Adjust the top level notion of next byte used for retransmissions
3230 * and sending FINs.
3231 */
3232 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3233 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3234 }
3235
3236 /* Now determine the offset from which to start transmitting data */
3237 if (mpte->mpte_reinjectq) {
3238 sb_mb = mpte->mpte_reinjectq;
3239 } else {
3240 dont_reinject:
3241 sb_mb = mp_so->so_snd.sb_mb;
3242 }
3243 if (sb_mb == NULL) {
3244 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3245 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3246 goto out;
3247 }
3248
3249 if (sb_mb == mpte->mpte_reinjectq) {
3250 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3251 off = 0;
3252
3253 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3254 if (mptcp_can_send_more(mp_tp, TRUE)) {
3255 goto dont_reinject;
3256 }
3257
3258 error = ECANCELED;
3259 goto out;
3260 }
3261
3262 reinjected = TRUE;
3263 } else if (flags & MPTCP_SUBOUT_PROBING) {
3264 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3265 off = 0;
3266 } else {
3267 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3268
3269 /*
3270 * With TFO, there might be no data at all, thus still go into this
3271 * code-path here.
3272 */
3273 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3274 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3275 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3276 sb_cc -= off;
3277 } else {
3278 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3279 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3280 (uint32_t)mp_tp->mpt_sndmax);
3281
3282 goto out;
3283 }
3284 }
3285
3286 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3287 if (sb_cc <= 0) {
3288 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3289 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3290 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3291 mptcp_subflow_cwnd_space(so));
3292 }
3293
3294 sb_cc = min(sb_cc, UINT16_MAX);
3295
3296 /*
3297 * Create a DSN mapping for the data we are about to send. It all
3298 * has the same mapping.
3299 */
3300 if (reinjected) {
3301 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3302 } else {
3303 mpt_dsn = mp_tp->mpt_snduna + off;
3304 }
3305
3306 mpt_mbuf = sb_mb;
3307 while (mpt_mbuf && reinjected == FALSE &&
3308 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3309 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3310 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3311 mpt_mbuf = mpt_mbuf->m_next;
3312 }
3313 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3314 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
3315 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3316 mpts->mpts_probecnt),
3317 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3318 }
3319
3320 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3321
3322 head = tail = NULL;
3323
3324 while (tot_sent < sb_cc) {
3325 int32_t mlen;
3326
3327 mlen = mpt_mbuf->m_len;
3328 mlen -= off;
3329 mlen = MIN(mlen, sb_cc - tot_sent);
3330
3331 if (mlen < 0) {
3332 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3333 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3334 (uint32_t)off, sb_cc, tot_sent);
3335 goto out;
3336 }
3337
3338 if (mlen == 0) {
3339 goto next;
3340 }
3341
3342 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3343 M_COPYM_MUST_COPY_HDR);
3344 if (m == NULL) {
3345 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3346 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3347 error = ENOBUFS;
3348 break;
3349 }
3350
3351 /* Create a DSN mapping for the data (m_copym does it) */
3352 VERIFY(m->m_flags & M_PKTHDR);
3353 VERIFY(m->m_next == NULL);
3354
3355 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3356 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3357 m->m_pkthdr.mp_dsn = mpt_dsn;
3358 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3359 m->m_pkthdr.len = mlen;
3360
3361 if (head == NULL) {
3362 head = tail = m;
3363 } else {
3364 tail->m_next = m;
3365 tail = m;
3366 }
3367
3368 tot_sent += mlen;
3369 off = 0;
3370 next:
3371 mpt_mbuf = mpt_mbuf->m_next;
3372 }
3373
3374 if (reinjected) {
3375 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3376 struct mbuf *n = sb_mb;
3377
3378 while (n) {
3379 n->m_pkthdr.mp_dsn += sb_cc;
3380 n->m_pkthdr.mp_rlen -= sb_cc;
3381 n = n->m_next;
3382 }
3383 m_adj(sb_mb, sb_cc);
3384 } else {
3385 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3386 m_freem(sb_mb);
3387 }
3388 }
3389
3390 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
3391 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
3392 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3393
3394 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3395 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3396 tot_sent);
3397 }
3398
3399 /* Now, let's update rel-seq and the data-level length */
3400 mpts->mpts_rel_seq += tot_sent;
3401 m = head;
3402 while (m) {
3403 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3404 m->m_pkthdr.mp_csum = dss_csum;
3405 }
3406 m->m_pkthdr.mp_rlen = tot_sent;
3407 m = m->m_next;
3408 }
3409
3410 if (head != NULL) {
3411 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3412 (tp->t_tfo_stats == 0)) {
3413 tp->t_mpflags |= TMPF_TFO_REQUEST;
3414 }
3415
3416 error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3417 head = NULL;
3418 }
3419
3420 done_sending:
3421 if (error == 0 ||
3422 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3423 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3424
3425 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3426 tcpstat.tcps_mp_num_probes++;
3427 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3428 mpts->mpts_probecnt += 1;
3429 } else {
3430 mpts->mpts_probecnt +=
3431 tot_sent / mpts->mpts_maxseg;
3432 }
3433 }
3434
3435 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3436 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3437 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3438 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3439 }
3440 mp_tp->mpt_sndnxt = new_sndnxt;
3441 }
3442
3443 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3444
3445 /* Must be here as mptcp_can_send_more() checks for this */
3446 soclearfastopen(mp_so);
3447
3448 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3449 (mpts->mpts_probesoon != 0)) {
3450 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3451 __func__, mpts->mpts_connid,
3452 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3453 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3454 (tcp_now - mpts->mpts_probesoon)),
3455 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3456 }
3457
3458 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3459 mptcp_set_cellicon(mpte, mpts);
3460
3461 mpte->mpte_used_cell = 1;
3462 } else {
3463 /*
3464 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3465 * explicitly set the cellicon, then we unset it again.
3466 */
3467 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3468 mptcp_unset_cellicon(mpte, NULL, 1);
3469 }
3470
3471 mpte->mpte_used_wifi = 1;
3472 }
3473
3474 /*
3475 * Don't propagate EWOULDBLOCK - it's already taken care of
3476 * in mptcp_usr_send for TFO.
3477 */
3478 error = 0;
3479 } else {
3480 /* We need to revert our change to mpts_rel_seq */
3481 mpts->mpts_rel_seq -= tot_sent;
3482
3483 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3484 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3485 }
3486 out:
3487
3488 if (head != NULL) {
3489 m_freem(head);
3490 }
3491
3492 if (wakeup) {
3493 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3494 }
3495
3496 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3497 return error;
3498 }
3499
/*
 * Insert mbuf chain m into the DSN-sorted reinject queue, eliminating
 * overlaps: m is dropped when an already-queued segment fully covers
 * it, and any queued segment that m fully covers is removed first.
 * m is consumed in all cases (either queued or freed).
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
	    __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
	    m->m_pkthdr.mp_rseq),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
			break;
		}

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);

			/* Unlink n; m will take its place below */
			m->m_nextpkt = NULL;
			if (prev == NULL) {
				mpte->mpte_reinjectq = tmp;
			} else {
				prev->m_nextpkt = tmp;
			}

			m_freem(n);
			n = tmp;
		}
	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/*
		 * NOTE(review): this check uses m->m_pkthdr.len while every
		 * other coverage check in this function uses mp_rlen.  For a
		 * chain built by mptcp_copy_mbuf_list the two can differ
		 * (pkthdr.len is per-mbuf, mp_rlen spans the whole mapping) -
		 * confirm whether mp_rlen was intended here.
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
			goto dont_queue;
		}
	}

	/* Link m between prev and n (both may be NULL) */
	if (prev == NULL) {
		mpte->mpte_reinjectq = m;
	} else {
		prev->m_nextpkt = m;
	}

	m->m_nextpkt = n;

	return;

dont_queue:
	m_freem(m);
	return;
}
3582
3583 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3584 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3585 {
3586 struct socket *mp_so = mptetoso(mpte);
3587 struct mbuf *m;
3588
3589 m = mp_so->so_snd.sb_mb;
3590
3591 while (m) {
3592 /* If this segment covers what we are looking for, return it. */
3593 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3594 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3595 break;
3596 }
3597
3598
3599 /* Segment is no more in the queue */
3600 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3601 return NULL;
3602 }
3603
3604 m = m->m_next;
3605 }
3606
3607 return m;
3608 }
3609
3610 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3611 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3612 {
3613 struct mbuf *top = NULL, *tail = NULL;
3614 uint64_t dsn;
3615 uint32_t dlen, rseq;
3616
3617 dsn = m->m_pkthdr.mp_dsn;
3618 dlen = m->m_pkthdr.mp_rlen;
3619 rseq = m->m_pkthdr.mp_rseq;
3620
3621 while (len > 0) {
3622 struct mbuf *n;
3623
3624 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3625
3626 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3627 if (n == NULL) {
3628 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3629 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3630 goto err;
3631 }
3632
3633 VERIFY(n->m_flags & M_PKTHDR);
3634 VERIFY(n->m_next == NULL);
3635 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3636 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3637 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3638 VERIFY(n->m_len == m->m_len);
3639
3640 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3641
3642 if (top == NULL) {
3643 top = n;
3644 }
3645
3646 if (tail != NULL) {
3647 tail->m_next = n;
3648 }
3649
3650 tail = n;
3651
3652 len -= m->m_len;
3653 m = m->m_next;
3654 }
3655
3656 return top;
3657
3658 err:
3659 if (top) {
3660 m_freem(top);
3661 }
3662
3663 return NULL;
3664 }
3665
/*
 * Walk the subflow socket's send buffer and copy every DSN-mapping that
 * has not yet been fully acknowledged at the MPTCP data level onto the
 * reinject queue, so it can be retransmitted on another subflow.
 * Mappings already marked PKTF_MPTCP_REINJ are skipped.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* orig keeps our position in sb; m may be reassigned below */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
		    __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
		    m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection earlier */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure - give up for now */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark the whole mapping as reinjected, not just its head */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3741
3742 void
mptcp_clean_reinjectq(struct mptses * mpte)3743 mptcp_clean_reinjectq(struct mptses *mpte)
3744 {
3745 struct mptcb *mp_tp = mpte->mpte_mptcb;
3746
3747 socket_lock_assert_owned(mptetoso(mpte));
3748
3749 while (mpte->mpte_reinjectq) {
3750 struct mbuf *m = mpte->mpte_reinjectq;
3751
3752 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3753 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3754 break;
3755 }
3756
3757 mpte->mpte_reinjectq = m->m_nextpkt;
3758 m->m_nextpkt = NULL;
3759 m_freem(m);
3760 }
3761 }
3762
3763 /*
3764 * Subflow socket control event upcall.
3765 */
3766 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,long events)3767 mptcp_subflow_eupcall1(struct socket *so, void *arg, long events)
3768 {
3769 #pragma unused(so)
3770 struct mptsub *mpts = arg;
3771 struct mptses *mpte = mpts->mpts_mpte;
3772
3773 socket_lock_assert_owned(mptetoso(mpte));
3774
3775 if ((mpts->mpts_evctl & events) == events) {
3776 return;
3777 }
3778
3779 mpts->mpts_evctl |= events;
3780
3781 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3782 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3783 return;
3784 }
3785
3786 mptcp_subflow_workloop(mpte);
3787 }
3788
3789 /*
3790 * Subflow socket control events.
3791 *
3792 * Called for handling events related to the underlying subflow socket.
3793 */
3794 static ev_ret_t
mptcp_subflow_events(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint)3795 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3796 long *p_mpsofilt_hint)
3797 {
3798 ev_ret_t ret = MPTS_EVRET_OK;
3799 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3800 sizeof(mpsub_ev_entry_tbl[0]);
3801
3802 /* bail if there's nothing to process */
3803 if (!mpts->mpts_evctl) {
3804 return ret;
3805 }
3806
3807 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3808 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3809 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
3810 SO_FILT_HINT_DISCONNECTED)) {
3811 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3812 }
3813
3814 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3815 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3816
3817 /*
3818 * Process all the socket filter hints and reset the hint
3819 * once it is handled
3820 */
3821 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3822 /*
3823 * Always execute the DISCONNECTED event, because it will wakeup
3824 * the app.
3825 */
3826 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3827 (ret >= MPTS_EVRET_OK ||
3828 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3829 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3830 ev_ret_t error =
3831 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3832 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3833 }
3834 }
3835
3836 return ret;
3837 }
3838
3839 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3840 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3841 long *p_mpsofilt_hint, long event)
3842 {
3843 struct socket *mp_so, *so;
3844 struct mptcb *mp_tp;
3845
3846 mp_so = mptetoso(mpte);
3847 mp_tp = mpte->mpte_mptcb;
3848 so = mpts->mpts_socket;
3849
3850 /*
3851 * We got an event for this subflow that might need to be propagated,
3852 * based on the state of the MPTCP connection.
3853 */
3854 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3855 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3856 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3857 mp_so->so_error = so->so_error;
3858 *p_mpsofilt_hint |= event;
3859 }
3860
3861 return MPTS_EVRET_OK;
3862 }
3863
3864 /*
3865 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3866 */
3867 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3868 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3869 long *p_mpsofilt_hint, long event)
3870 {
3871 #pragma unused(p_mpsofilt_hint, event)
3872 struct socket *mp_so;
3873 struct tcpcb *tp;
3874
3875 mp_so = mptetoso(mpte);
3876 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3877
3878 /*
3879 * This overwrites any previous mpte_lost_aid to avoid storing
3880 * too much state when the typical case has only two subflows.
3881 */
3882 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3883 mpte->mpte_lost_aid = tp->t_local_aid;
3884
3885 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3886 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3887
3888 /*
3889 * The subflow connection has lost its source address.
3890 */
3891 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3892
3893 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3894 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3895 }
3896
3897 return MPTS_EVRET_DELETE;
3898 }
3899
3900 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3901 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3902 long *p_mpsofilt_hint, long event)
3903 {
3904 #pragma unused(event, p_mpsofilt_hint)
3905 struct socket *so, *mp_so;
3906
3907 so = mpts->mpts_socket;
3908
3909 if (so->so_error != ENODATA) {
3910 return MPTS_EVRET_OK;
3911 }
3912
3913
3914 mp_so = mptetoso(mpte);
3915
3916 mp_so->so_error = ENODATA;
3917
3918 sorwakeup(mp_so);
3919 sowwakeup(mp_so);
3920
3921 return MPTS_EVRET_OK;
3922 }
3923
3924
3925 /*
3926 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3927 * indicates that the remote side sent a Data FIN
3928 */
3929 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3930 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3931 long *p_mpsofilt_hint, long event)
3932 {
3933 #pragma unused(event)
3934 struct mptcb *mp_tp = mpte->mpte_mptcb;
3935
3936 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3937 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3938
3939 /*
3940 * We got a Data FIN for the MPTCP connection.
3941 * The FIN may arrive with data. The data is handed up to the
3942 * mptcp socket and the user is notified so that it may close
3943 * the socket if needed.
3944 */
3945 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3946 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3947 }
3948
3949 return MPTS_EVRET_OK; /* keep the subflow socket around */
3950 }
3951
3952 /*
3953 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3954 */
3955 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3956 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3957 long *p_mpsofilt_hint, long event)
3958 {
3959 #pragma unused(event, p_mpsofilt_hint)
3960 struct mptsub *mpts_alt = NULL;
3961 struct socket *alt_so = NULL;
3962 struct socket *mp_so;
3963 int altpath_exists = 0;
3964
3965 mp_so = mptetoso(mpte);
3966 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3967
3968 mptcp_reinject_mbufs(mpts->mpts_socket);
3969
3970 mpts_alt = mptcp_get_subflow(mpte, NULL);
3971
3972 /* If there is no alternate eligible subflow, ignore the failover hint. */
3973 if (mpts_alt == NULL || mpts_alt == mpts) {
3974 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3975 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3976
3977 goto done;
3978 }
3979
3980 altpath_exists = 1;
3981 alt_so = mpts_alt->mpts_socket;
3982 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3983 /* All data acknowledged and no RTT spike */
3984 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3985 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3986 } else {
3987 /* no alternate path available */
3988 altpath_exists = 0;
3989 }
3990 }
3991
3992 if (altpath_exists) {
3993 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3994
3995 mpte->mpte_active_sub = mpts_alt;
3996 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3997 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3998
3999 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
4000 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
4001
4002 mptcpstats_inc_switch(mpte, mpts);
4003
4004 sowwakeup(alt_so);
4005 } else {
4006 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
4007 mpts->mpts_connid),
4008 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4009 done:
4010 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
4011 }
4012
4013 return MPTS_EVRET_OK;
4014 }
4015
4016 /*
4017 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
4018 */
4019 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4020 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
4021 long *p_mpsofilt_hint, long event)
4022 {
4023 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
4024 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4025
4026 /*
4027 * The subflow connection cannot use the outgoing interface, let's
4028 * close this subflow.
4029 */
4030 mptcp_subflow_abort(mpts, EPERM);
4031
4032 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
4033
4034 return MPTS_EVRET_DELETE;
4035 }
4036
4037 /*
4038 * https://tools.ietf.org/html/rfc6052#section-2
4039 * https://tools.ietf.org/html/rfc6147#section-5.2
4040 */
4041 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr,const struct ipv6_prefix * prefix,struct in_addr * addrv4)4042 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
4043 const struct ipv6_prefix *prefix,
4044 struct in_addr *addrv4)
4045 {
4046 char buf[MAX_IPv4_STR_LEN];
4047 char *ptrv4 = (char *)addrv4;
4048 const char *ptr = (const char *)addr;
4049
4050 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
4051 return false;
4052 }
4053
4054 switch (prefix->prefix_len) {
4055 case NAT64_PREFIX_LEN_96:
4056 memcpy(ptrv4, ptr + 12, 4);
4057 break;
4058 case NAT64_PREFIX_LEN_64:
4059 memcpy(ptrv4, ptr + 9, 4);
4060 break;
4061 case NAT64_PREFIX_LEN_56:
4062 memcpy(ptrv4, ptr + 7, 1);
4063 memcpy(ptrv4 + 1, ptr + 9, 3);
4064 break;
4065 case NAT64_PREFIX_LEN_48:
4066 memcpy(ptrv4, ptr + 6, 2);
4067 memcpy(ptrv4 + 2, ptr + 9, 2);
4068 break;
4069 case NAT64_PREFIX_LEN_40:
4070 memcpy(ptrv4, ptr + 5, 3);
4071 memcpy(ptrv4 + 3, ptr + 9, 1);
4072 break;
4073 case NAT64_PREFIX_LEN_32:
4074 memcpy(ptrv4, ptr + 4, 4);
4075 break;
4076 default:
4077 panic("NAT64-prefix len is wrong: %u",
4078 prefix->prefix_len);
4079 }
4080
4081 os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
4082 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4083 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
4084
4085 return true;
4086 }
4087
/*
 * After an IPv6 subflow connects, check whether the destination was a
 * NAT64-synthesized address.  If so, recover and record the real IPv4
 * destination in mpte_sub_dst_v4 and clear the synthesized IPv6
 * destination, so later subflows on other networks don't try to reach
 * the NAT64 address directly.
 */
static void
mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
{
        struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
        struct socket *so = mpts->mpts_socket;
        struct ifnet *ifp;
        int j;

        /* Subflow IPs will be steered directly by the server - no need to
         * desynthesize.
         */
        if (mpte->mpte_flags & MPTE_UNICAST_IP) {
                return;
        }

        /*
         * NOTE(review): inp_last_outifp is passed to ifnet_get_nat64prefix
         * without a NULL check here — presumably the callee tolerates it;
         * confirm against ifnet_get_nat64prefix().
         */
        ifp = sotoinpcb(so)->inp_last_outifp;

        /* No NAT64 prefixes on this interface: nothing to undo */
        if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
                return;
        }

        for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
                int success;

                /* Unused slots have a zero prefix length */
                if (nat64prefixes[j].prefix_len == 0) {
                        continue;
                }

                success = mptcp_desynthesize_ipv6_addr(mpte,
                    &mpte->__mpte_dst_v6.sin6_addr,
                    &nat64prefixes[j],
                    &mpte->mpte_sub_dst_v4.sin_addr);
                if (success) {
                        /* Complete the recovered IPv4 sockaddr */
                        mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
                        mpte->mpte_sub_dst_v4.sin_family = AF_INET;
                        mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;

                        /*
                         * We connected to a NAT64'ed address. Let's remove it
                         * from the potential IPs to use. Whenever we are back on
                         * that network and need to connect, we can synthesize again.
                         *
                         * Otherwise, on different IPv6 networks we will attempt
                         * to connect to that NAT64 address...
                         */
                        memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
                        break;
                }
        }
}
4138
4139 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)4140 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
4141 {
4142 struct inpcb *inp;
4143
4144 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
4145 return;
4146 }
4147
4148 inp = sotoinpcb(mpts->mpts_socket);
4149 if (inp == NULL) {
4150 return;
4151 }
4152
4153 /* Should we try the alternate port? */
4154 if (mpte->mpte_alternate_port &&
4155 inp->inp_fport != mpte->mpte_alternate_port) {
4156 union sockaddr_in_4_6 dst;
4157 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
4158
4159 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
4160
4161 dst_in->sin_port = mpte->mpte_alternate_port;
4162
4163 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
4164 mpts->mpts_ifscope, NULL);
4165 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
4166 unsigned int i;
4167
4168 if (inp->inp_last_outifp == NULL) {
4169 return;
4170 }
4171
4172 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
4173 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
4174
4175 if (inp->inp_last_outifp->if_index == info->ifindex) {
4176 info->no_mptcp_support = 1;
4177 break;
4178 }
4179 }
4180 }
4181 }
4182
/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
 *
 * Runs when a subflow's TCP handshake completes.  Determines whether
 * the peer negotiated MPTCP on this subflow and, depending on whether
 * this is the first subflow or a later join, either establishes the
 * MPTCP-level connection, accounts a new capable flow, falls back to
 * plain TCP, or resets the subflow.
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event, p_mpsofilt_hint)
        struct socket *mp_so, *so;
        struct inpcb *inp;
        struct tcpcb *tp;
        struct mptcb *mp_tp;
        int af;
        boolean_t mpok = FALSE;

        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
        tp = sototcpcb(so);
        af = mpts->mpts_dst.sa_family;

        /* Event already processed once for this subflow */
        if (mpts->mpts_flags & MPTSF_CONNECTED) {
                return MPTS_EVRET_OK;
        }

        if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
            (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
                /*
                 * A disconnect was requested before TCP finished
                 * connecting; now that the TCP connection exists,
                 * complete the shutdown/disconnect sequence.
                 */
                if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
                    (so->so_state & SS_ISCONNECTED)) {
                        mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
                            __func__, mpts->mpts_connid),
                            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
                        (void) soshutdownlock(so, SHUT_RD);
                        (void) soshutdownlock(so, SHUT_WR);
                        (void) sodisconnectlocked(so);
                }
                return MPTS_EVRET_OK;
        }

        /*
         * The subflow connection has been connected. Find out whether it
         * is connected as a regular TCP or as a MPTCP subflow. The idea is:
         *
         *   a. If MPTCP connection is not yet established, then this must be
         *      the first subflow connection. If MPTCP failed to negotiate,
         *      fallback to regular TCP by degrading this subflow.
         *
         *   b. If MPTCP connection has been established, then this must be
         *      one of the subsequent subflow connections. If MPTCP failed
         *      to negotiate, disconnect the connection.
         *
         * Right now, we simply unblock any waiters at the MPTCP socket layer
         * if the MPTCP connection has not been established.
         */

        if (so->so_state & SS_ISDISCONNECTED) {
                /*
                 * With MPTCP joins, a connection is connected at the subflow
                 * level, but the 4th ACK from the server elevates the MPTCP
                 * subflow to connected state. So there is a small window
                 * where the subflow could get disconnected before the
                 * connected event is processed.
                 */
                return MPTS_EVRET_OK;
        }

        /* TFO data that TCP buffered must not linger on the subflow */
        if (mpts->mpts_flags & MPTSF_TFO_REQD) {
                mptcp_drop_tfo_data(mpte, mpts);
        }

        mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
        mpts->mpts_flags |= MPTSF_CONNECTED;

        /* TMPF_MPTCP_TRUE means the peer agreed to MPTCP on this subflow */
        if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
                mpts->mpts_flags |= MPTSF_MP_CAPABLE;
        }

        tp->t_mpflags &= ~TMPF_TFO_REQUEST;

        /* get/verify the outbound interface */
        inp = sotoinpcb(so);

        mpts->mpts_maxseg = tp->t_maxseg;

        mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
            ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
            ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
            (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);

        mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

        if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
                /* First (initial) subflow: the MPTCP connection comes up now */
                mp_tp->mpt_state = MPTCPS_ESTABLISHED;
                mpte->mpte_associd = mpts->mpts_connid;
                DTRACE_MPTCP2(state__change,
                    struct mptcb *, mp_tp,
                    uint32_t, 0 /* event */);

                /* Record the local address the kernel picked for this flow */
                if (SOCK_DOM(so) == AF_INET) {
                        in_getsockaddr_s(so, &mpte->__mpte_src_v4);
                } else {
                        in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
                }

                mpts->mpts_flags |= MPTSF_ACTIVE;

                /* case (a) above */
                if (!mpok) {
                        /* Peer is not MPTCP-capable: degrade to plain TCP */
                        tcpstat.tcps_mpcap_fallback++;

                        tp->t_mpflags |= TMPF_INFIN_SENT;
                        mptcp_notify_mpfail(so);
                } else {
                        /* Cell subflows may need to be advertised as backup */
                        if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
                            mptcp_subflows_need_backup_flag(mpte)) {
                                tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        } else {
                                mpts->mpts_flags |= MPTSF_PREFERRED;
                        }
                        mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
                        mpte->mpte_nummpcapflows++;

                        /* Undo NAT64 synthesis of the destination, if any */
                        if (SOCK_DOM(so) == AF_INET6) {
                                mptcp_handle_ipv6_connection(mpte, mpts);
                        }

                        mptcp_check_subflows_and_add(mpte);

                        if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
                                mpte->mpte_initial_cell = 1;
                        }

                        mpte->mpte_handshake_success = 1;
                }

                /* Seed MPTCP-level send window state from the subflow */
                mp_tp->mpt_sndwnd = tp->snd_wnd;
                mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
                mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
                soisconnected(mp_so);
        } else if (mpok) {
                /*
                 * case (b) above
                 * In case of additional flows, the MPTCP socket is not
                 * MPTSF_MP_CAPABLE until an ACK is received from server
                 * for 3-way handshake.  TCP would have guaranteed that this
                 * is an MPTCP subflow.
                 */
                if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
                    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
                    mptcp_subflows_need_backup_flag(mpte)) {
                        tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        mpts->mpts_flags &= ~MPTSF_PREFERRED;
                } else {
                        mpts->mpts_flags |= MPTSF_PREFERRED;
                }

                mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
                mpte->mpte_nummpcapflows++;

                /* Data-sequence mapping on a join starts at relative seq 1 */
                mpts->mpts_rel_seq = 1;

                mptcp_check_subflows_and_remove(mpte);
        } else {
                /* Join failed: maybe retry on the alternate port... */
                mptcp_try_alternate_port(mpte, mpts);

                /* ...and account the fallback, then reset this subflow */
                tcpstat.tcps_join_fallback++;
                if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
                        tcpstat.tcps_mptcp_cell_proxy++;
                } else {
                        tcpstat.tcps_mptcp_wifi_proxy++;
                }

                soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

                return MPTS_EVRET_OK;
        }

        /* This call, just to "book" an entry in the stats-table for this ifindex */
        mptcpstats_get_index(mpte->mpte_itfstats, mpts);

        mptcp_output(mpte);

        return MPTS_EVRET_OK;    /* keep the subflow socket around */
}
4367
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 *
 * Tears down the per-subflow MPTCP accounting, possibly retries on the
 * alternate port for never-connected secondary subflows, and drops the
 * whole MPTCP connection when the disconnected subflow was the one
 * carrying it (pre-established, or active after a fallback).
 *
 * Always returns MPTS_EVRET_DELETE so the workloop closes the subflow.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event, p_mpsofilt_hint)
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;

        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;

        mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
            __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
            !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
            !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

        /* Already handled; just ask for deletion again */
        if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
                return MPTS_EVRET_DELETE;
        }

        mpts->mpts_flags |= MPTSF_DISCONNECTED;

        /* The subflow connection has been disconnected. */

        if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
                /* Undo the capable-flow accounting done at connect time */
                mpte->mpte_nummpcapflows--;
                if (mpte->mpte_active_sub == mpts) {
                        mpte->mpte_active_sub = NULL;
                        mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
                            __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
                }
                mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
        } else {
                /* A join that never connected may still try the alternate port */
                if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
                    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
                        mptcp_try_alternate_port(mpte, mpts);
                }
        }

        /*
         * Losing this subflow kills the whole connection if MPTCP never
         * got established, or if we already fell back to TCP on it.
         */
        if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
            ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
                mptcp_drop(mpte, mp_tp, so->so_error);
        }

        /*
         * Clear flags that are used by getconninfo to return state.
         * Retain like MPTSF_DELETEOK for internal purposes.
         */
        mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
            MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
            MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

        return MPTS_EVRET_DELETE;
}
4427
4428 /*
4429 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4430 */
4431 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4432 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4433 long *p_mpsofilt_hint, long event)
4434 {
4435 #pragma unused(event, p_mpsofilt_hint)
4436 ev_ret_t ret = MPTS_EVRET_OK;
4437 struct socket *mp_so, *so;
4438 struct mptcb *mp_tp;
4439
4440 mp_so = mptetoso(mpte);
4441 mp_tp = mpte->mpte_mptcb;
4442 so = mpts->mpts_socket;
4443 struct inpcb *inp = sotoinpcb(so);
4444 struct tcpcb *tp = intotcpcb(inp);
4445
4446 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4447 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4448 } else {
4449 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4450 }
4451
4452 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4453 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4454 goto done;
4455 }
4456 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4457 } else {
4458 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4459 }
4460
4461 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4462 mpts->mpts_flags |= MPTSF_MP_READY;
4463 } else {
4464 mpts->mpts_flags &= ~MPTSF_MP_READY;
4465 }
4466
4467 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4468 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4469 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4470 tcp_cache_update_mptcp_version(tp, FALSE);
4471 }
4472
4473 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4474 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4475
4476 m_freem_list(mpte->mpte_reinjectq);
4477 mpte->mpte_reinjectq = NULL;
4478 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4479 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4480 ret = MPTS_EVRET_CONNECT_PENDING;
4481 }
4482
4483 done:
4484 return ret;
4485 }
4486
/*
 * Handle SO_FILT_HINT_MUSTRST subflow socket event
 *
 * The subflow must be reset — either we received an invalid option or a
 * peer MP_FASTCLOSE.  Sends a RST segment on the subflow, and on a
 * FASTCLOSE of a non-fallback connection also resets every other
 * subflow and drops the whole MPTCP connection.
 *
 * Returns MPTS_EVRET_DELETE so the workloop disposes of this subflow.
 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event)
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
        boolean_t is_fastclose;

        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;

        /* We got an invalid option or a fast close */
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp = NULL;

        /*
         * NOTE(review): tp is dereferenced below without a NULL check —
         * presumably a subflow delivering this event always has a valid
         * tcpcb; confirm against the event-delivery path.
         */
        tp = intotcpcb(inp);
        so->so_error = ECONNABORTED;

        is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

        tp->t_mpflags |= TMPF_RESET;

        if (tp->t_state != TCPS_CLOSED) {
                /* Build a template segment so we can emit a RST ourselves */
                struct tcptemp *t_template = tcp_maketemplate(tp);

                if (t_template) {
                        struct tcp_respond_args tra;

                        bzero(&tra, sizeof(tra));
                        /* Scope the RST to the bound interface, if any */
                        if (inp->inp_flags & INP_BOUND_IF) {
                                tra.ifscope = inp->inp_boundifp->if_index;
                        } else {
                                tra.ifscope = IFSCOPE_NONE;
                        }
                        tra.awdl_unrestricted = 1;

                        tcp_respond(tp, t_template->tt_ipgen,
                            &t_template->tt_t, (struct mbuf *)NULL,
                            tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
                        (void) m_free(dtom(t_template));
                }
        }

        if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
                struct mptsub *iter, *tmp;

                /* FASTCLOSE without fallback kills the whole connection */
                *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

                mp_so->so_error = ECONNRESET;

                /* Abort every sibling subflow; this one is aborted below */
                TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
                        if (iter == mpts) {
                                continue;
                        }
                        mptcp_subflow_abort(iter, ECONNABORTED);
                }

                /*
                 * mptcp_drop is being called after processing the events, to fully
                 * close the MPTCP connection
                 */
                mptcp_drop(mpte, mp_tp, mp_so->so_error);
        }

        mptcp_subflow_abort(mpts, ECONNABORTED);

        /* Speed up garbage collection of the dying connection */
        if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
                mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
        }

        return MPTS_EVRET_DELETE;
}
4564
4565 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4566 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4567 long *p_mpsofilt_hint, long event)
4568 {
4569 #pragma unused(event)
4570 bool found_active = false;
4571
4572 mpts->mpts_flags |= MPTSF_READ_STALL;
4573
4574 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4575 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4576
4577 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4578 TCPS_HAVERCVDFIN2(tp->t_state)) {
4579 continue;
4580 }
4581
4582 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4583 found_active = true;
4584 break;
4585 }
4586 }
4587
4588 if (!found_active) {
4589 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4590 }
4591
4592 return MPTS_EVRET_OK;
4593 }
4594
4595 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4596 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4597 long *p_mpsofilt_hint, long event)
4598 {
4599 #pragma unused(event)
4600 bool found_active = false;
4601
4602 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4603
4604 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4605 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4606
4607 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4608 tp->t_state > TCPS_CLOSE_WAIT) {
4609 continue;
4610 }
4611
4612 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4613 found_active = true;
4614 break;
4615 }
4616 }
4617
4618 if (!found_active) {
4619 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4620 }
4621
4622 return MPTS_EVRET_OK;
4623 }
4624
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * For SO_MARK_CELLFALLBACK on an established connection, the option is
 * suppressed (returns 0 without setting it) unless the subflow really is
 * a cell-fallback one: not first-party, last output interface cellular
 * (or unknown), and explicitly bound to a cellular interface.
 *
 * Returns 0 on success or the sosetoptlock() error.
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
        struct socket *mp_so, *so;
        struct sockopt sopt;
        int error;

        VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

        mp_so = mptetoso(mpte);
        so = mpts->mpts_socket;

        socket_lock_assert_owned(mp_so);

        if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
            mpo->mpo_level == SOL_SOCKET &&
            mpo->mpo_name == SO_MARK_CELLFALLBACK) {
                struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

                mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
                    __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
                    sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
                    mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

                /*
                 * When we open a new subflow, mark it as cell fallback, if
                 * this subflow goes over cell.
                 *
                 * (except for first-party apps)
                 */

                if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
                        return 0;
                }

                /* Known non-cellular output interface: not a fallback */
                if (sotoinpcb(so)->inp_last_outifp &&
                    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
                        return 0;
                }

                /*
                 * This here is an OR, because if the app is not binding to the
                 * interface, then it definitely is not a cell-fallback
                 * connection.
                 */
                if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
                    !IFNET_IS_CELLULAR(ifp)) {
                        return 0;
                }
        }

        /* The option is being applied now; it is no longer pending */
        mpo->mpo_flags &= ~MPOF_INTERIM;

        bzero(&sopt, sizeof(sopt));
        sopt.sopt_dir = SOPT_SET;
        sopt.sopt_level = mpo->mpo_level;
        sopt.sopt_name = mpo->mpo_name;
        sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
        sopt.sopt_valsize = sizeof(int);
        sopt.sopt_p = kernproc;

        error = sosetoptlock(so, &sopt, 0);
        if (error) {
                os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
                    "val %d set error %d\n", __func__,
                    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
                    mpo->mpo_intval, error);
        }
        return error;
}
4702
4703 /*
4704 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4705 * caller must ensure that the option can be issued on subflow sockets, via
4706 * MPOF_SUBFLOW_OK flag.
4707 */
4708 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4709 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4710 struct mptopt *mpo)
4711 {
4712 struct socket *mp_so;
4713 struct sockopt sopt;
4714 int error;
4715
4716 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4717 mp_so = mptetoso(mpte);
4718
4719 socket_lock_assert_owned(mp_so);
4720
4721 bzero(&sopt, sizeof(sopt));
4722 sopt.sopt_dir = SOPT_GET;
4723 sopt.sopt_level = mpo->mpo_level;
4724 sopt.sopt_name = mpo->mpo_name;
4725 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4726 sopt.sopt_valsize = sizeof(int);
4727 sopt.sopt_p = kernproc;
4728
4729 error = sogetoptlock(so, &sopt, 0); /* already locked */
4730 if (error) {
4731 os_log_error(mptcp_log_handle,
4732 "%s - %lx: sopt %s get error %d\n",
4733 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4734 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4735 }
4736 return error;
4737 }
4738
4739
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed.  The callout will
 * repeat as long as this routine returns a non-zero value.
 *
 * Walks all MPTCP PCBs under mppi_lock; PCBs that are still referenced
 * (or whose lock can't be taken without blocking) are counted as active
 * and revisited on the next callout, while fully dead ones are torn
 * down and freed here.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
        struct mppcb *mpp, *tmpp;
        uint32_t active = 0;

        LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

        TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
                struct socket *mp_so;
                struct mptses *mpte;
                struct mptcb *mp_tp;

                mp_so = mpp->mpp_socket;
                mpte = mptompte(mpp);
                mp_tp = mpte->mpte_mptcb;

                /* Don't block while holding mppi_lock; retry next tick */
                if (!mpp_try_lock(mpp)) {
                        active++;
                        continue;
                }

                VERIFY(mpp->mpp_flags & MPP_ATTACHED);

                /* check again under the lock */
                if (mp_so->so_usecount > 0) {
                        boolean_t wakeup = FALSE;
                        struct mptsub *mpts, *tmpts;

                        /*
                         * Connections past FIN_WAIT_1 get a countdown;
                         * when it expires, nudge every subflow with a
                         * DISCONNECTED event so references drain.
                         */
                        if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
                                if (mp_tp->mpt_gc_ticks > 0) {
                                        mp_tp->mpt_gc_ticks--;
                                }
                                if (mp_tp->mpt_gc_ticks == 0) {
                                        wakeup = TRUE;
                                }
                        }
                        if (wakeup) {
                                TAILQ_FOREACH_SAFE(mpts,
                                    &mpte->mpte_subflows, mpts_entry, tmpts) {
                                        mptcp_subflow_eupcall1(mpts->mpts_socket,
                                            mpts, SO_FILT_HINT_DISCONNECTED);
                                }
                        }
                        socket_unlock(mp_so, 0);
                        active++;
                        continue;
                }

                /* Zero usecount but not DEAD means refcounting went wrong */
                if (mpp->mpp_state != MPPCB_STATE_DEAD) {
                        panic("%s - %lx: skipped state "
                            "[u=%d,r=%d,s=%d]\n", __func__,
                            (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                            mp_so->so_usecount, mp_so->so_retaincnt,
                            mpp->mpp_state);
                }

                if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
                        mptcp_close(mpte, mp_tp);
                }

                mptcp_session_destroy(mpte);

                DTRACE_MPTCP4(dispose, struct socket *, mp_so,
                    struct sockbuf *, &mp_so->so_rcv,
                    struct sockbuf *, &mp_so->so_snd,
                    struct mppcb *, mpp);

                mptcp_pcbdispose(mpp);
                sodealloc(mp_so);
        }

        return active;
}
4821
/*
 * Drop a MPTCP connection, reporting the specified error.
 *
 * Records the error on the MPTCP-level socket (preferring any recorded
 * soft error over ETIMEDOUT, mirroring TCP's tcp_drop behavior) and
 * closes the control block.  Returns the result of mptcp_close().
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
{
        struct socket *mp_so = mptetoso(mpte);

        VERIFY(mpte->mpte_mptcb == mp_tp);

        socket_lock_assert_owned(mp_so);

        DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
            uint32_t, 0 /* event */);

        /* A timeout with a stored soft error reports the soft error instead */
        if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
                errno = mp_tp->mpt_softerror;
        }
        mp_so->so_error = errno;

        return mptcp_close(mpte, mp_tp);
}
4844
/*
 * Close a MPTCP control block.
 *
 * Moves the connection to TERMINATE, frees any queued reassembly data,
 * marks the MPTCP-level socket disconnected, and initiates disconnect
 * on every subflow.  Always returns NULL (the session is gone from the
 * caller's perspective).
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
        struct mptsub *mpts = NULL, *tmpts = NULL;
        struct socket *mp_so = mptetoso(mpte);

        socket_lock_assert_owned(mp_so);
        VERIFY(mpte->mpte_mptcb == mp_tp);

        mp_tp->mpt_state = MPTCPS_TERMINATE;

        /* Release the MPTCP-level receive reassembly queue */
        mptcp_freeq(mp_tp);

        soisdisconnected(mp_so);

        /* Clean up all subflows */
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
                mptcp_subflow_disconnect(mpte, mpts);
        }

        return NULL;
}
4870
/*
 * Post a DISCONNECTED event on the given (subflow) socket, triggering
 * the normal event-processing path for its teardown.
 */
void
mptcp_notify_close(struct socket *so)
{
        soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
4876
/*
 * MPTCP workloop.
 *
 * Drains pending events on every subflow of the session and acts on the
 * aggregate result: deleting dead subflows, kicking off pending joins,
 * or degrading all subflows on a fallback to TCP.  Re-entrant calls are
 * coalesced via MPTE_IN_WORKLOOP/MPTE_WORKLOOP_RELAUNCH so only one
 * instance runs at a time.  Called with the MPTCP socket locked.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
        boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
        long mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
        struct mptsub *mpts, *tmpts;
        struct socket *mp_so;

        mp_so = mptetoso(mpte);

        socket_lock_assert_owned(mp_so);

        /* Another instance is running; ask it to go around once more */
        if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
                mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
                return;
        }
        mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
        mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
                ev_ret_t ret;

                if (mpts->mpts_socket->so_usecount == 0) {
                        /* Will be removed soon by tcp_garbage_collect */
                        continue;
                }

                /* Hold the subflow and its socket across event handling */
                mptcp_subflow_addref(mpts);
                mpts->mpts_socket->so_usecount++;

                ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

                /*
                 * If MPTCP socket is closed, disconnect all subflows.
                 * This will generate a disconnect event which will
                 * be handled during the next iteration, causing a
                 * non-zero error to be returned above.
                 */
                if (mp_so->so_flags & SOF_PCBCLEARING) {
                        mptcp_subflow_disconnect(mpte, mpts);
                }

                switch (ret) {
                case MPTS_EVRET_OK:
                        /* nothing to do */
                        break;
                case MPTS_EVRET_DELETE:
                        mptcp_subflow_soclose(mpts);
                        break;
                case MPTS_EVRET_CONNECT_PENDING:
                        connect_pending = TRUE;
                        break;
                case MPTS_EVRET_DISCONNECT_FALLBACK:
                        disconnect_fallback = TRUE;
                        break;
                default:
                        mptcplog((LOG_DEBUG,
                            "MPTCP Socket: %s: mptcp_subflow_events "
                            "returned invalid value: %d\n", __func__,
                            ret),
                            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
                        break;
                }
                mptcp_subflow_remref(mpts);             /* ours */

                VERIFY(mpts->mpts_socket->so_usecount != 0);
                mpts->mpts_socket->so_usecount--;
        }

        /* Propagate accumulated hints (beyond LOCKED) to the MPTCP socket */
        if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
                VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

                if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
                        mp_so->so_state |= SS_CANTRCVMORE;
                        sorwakeup(mp_so);
                }

                soevent(mp_so, mpsofilt_hint_mask);
        }

        if (!connect_pending && !disconnect_fallback) {
                goto exit;
        }

        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
                if (disconnect_fallback) {
                        struct socket *so = NULL;
                        struct inpcb *inp = NULL;
                        struct tcpcb *tp = NULL;

                        if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
                                continue;
                        }

                        mpts->mpts_flags |= MPTSF_MP_DEGRADED;

                        if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
                            MPTSF_DISCONNECTED)) {
                                continue;
                        }

                        so = mpts->mpts_socket;

                        /*
                         * The MPTCP connection has degraded to a fallback
                         * mode, so there is no point in keeping this subflow
                         * regardless of its MPTCP-readiness state, unless it
                         * is the primary one which we use for fallback.  This
                         * assumes that the subflow used for fallback is the
                         * ACTIVE one.
                         */

                        inp = sotoinpcb(so);
                        tp = intotcpcb(inp);
                        tp->t_mpflags &=
                            ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
                        tp->t_mpflags |= TMPF_TCP_FALLBACK;

                        soevent(so, SO_FILT_HINT_MUSTRST);
                } else if (connect_pending) {
                        /*
                         * The MPTCP connection has progressed to a state
                         * where it supports full multipath semantics; allow
                         * additional joins to be attempted for all subflows
                         * that are in the PENDING state.
                         */
                        if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
                                int error = mptcp_subflow_soconnectx(mpte, mpts);

                                if (error) {
                                        mptcp_subflow_abort(mpts, error);
                                }
                        }
                }
        }

exit:
        /* Re-run if another workloop request arrived while we were busy */
        if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
                goto relaunch;
        }

        mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
5025
/*
 * Protocol pr_lock callback.
 *
 * Locks the MPTCP PCB backing mp_so; when refcount is non-zero, also
 * takes a use-count reference on the socket.  The caller's return
 * address (or the supplied lr) is recorded in the socket's lock history
 * for debugging.  Panics on a missing PCB or negative usecount.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
        struct mppcb *mpp = mpsotomppcb(mp_so);
        void *lr_saved;

        if (lr == NULL) {
                lr_saved = __builtin_return_address(0);
        } else {
                lr_saved = lr;
        }

        if (mpp == NULL) {
                panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
                    mp_so, lr_saved, solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        mpp_lock(mpp);

        if (mp_so->so_usecount < 0) {
                panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
                    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
                    solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        if (refcount != 0) {
                mp_so->so_usecount++;
                mpp->mpp_inside++;
        }
        /* Record the caller in the circular lock-history buffer */
        mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
        mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

        return 0;
}
5063
/*
 * Protocol pr_unlock callback.
 *
 * Counterpart of mptcp_lock(): optionally drops a use-count reference,
 * records the caller in the unlock history, and releases the PCB lock.
 * Panics on a missing PCB or on refcount underflow.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
        struct mppcb *mpp = mpsotomppcb(mp_so);
        void *lr_saved;

        if (lr == NULL) {
                lr_saved = __builtin_return_address(0);
        } else {
                lr_saved = lr;
        }

        if (mpp == NULL) {
                panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
                    mp_so, mp_so->so_usecount, lr_saved,
                    solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        socket_lock_assert_owned(mp_so);

        if (refcount != 0) {
                mp_so->so_usecount--;
                mpp->mpp_inside--;
        }

        if (mp_so->so_usecount < 0) {
                panic("%s: so=%p usecount=%x lrh= %s", __func__,
                    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        if (mpp->mpp_inside < 0) {
                panic("%s: mpp=%p inside=%x lrh= %s", __func__,
                    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        /* Record the caller in the circular unlock-history buffer */
        mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
        mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
        mpp_unlock(mpp);

        return 0;
}
5108
/*
 * Protocol pr_getlock callback.
 *
 * Returns the mutex protecting the MPTCP PCB behind mp_so.  Panics when
 * the PCB is missing or the socket's usecount has gone negative.
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
        struct mppcb *mpp = mpsotomppcb(mp_so);

        if (mpp == NULL) {
                panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
                    solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        if (mp_so->so_usecount < 0) {
                panic("%s: so=%p usecount=%x lrh= %s", __func__,
                    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
        return mpp_getlock(mpp, flags);
}
5129
5130 /*
5131 * MPTCP Join support
5132 */
5133
5134 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)5135 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
5136 {
5137 struct tcpcb *tp = sototcpcb(so);
5138 struct mptcp_subf_auth_entry *sauth_entry;
5139
5140 /*
5141 * The address ID of the first flow is implicitly 0.
5142 */
5143 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
5144 tp->t_local_aid = 0;
5145 } else {
5146 tp->t_local_aid = addr_id;
5147 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
5148 so->so_flags |= SOF_MP_SEC_SUBFLOW;
5149 }
5150 sauth_entry = zalloc(mpt_subauth_zone);
5151 sauth_entry->msae_laddr_id = tp->t_local_aid;
5152 sauth_entry->msae_raddr_id = 0;
5153 sauth_entry->msae_raddr_rand = 0;
5154 try_again:
5155 sauth_entry->msae_laddr_rand = RandomULong();
5156 if (sauth_entry->msae_laddr_rand == 0) {
5157 goto try_again;
5158 }
5159 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
5160 }
5161
5162 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)5163 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
5164 {
5165 struct mptcp_subf_auth_entry *sauth_entry;
5166 struct tcpcb *tp = NULL;
5167 int found = 0;
5168
5169 tp = sototcpcb(so);
5170 if (tp == NULL) {
5171 return;
5172 }
5173
5174 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5175 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
5176 found = 1;
5177 break;
5178 }
5179 }
5180 if (found) {
5181 LIST_REMOVE(sauth_entry, msae_next);
5182 }
5183
5184 if (found) {
5185 zfree(mpt_subauth_zone, sauth_entry);
5186 }
5187 }
5188
5189 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5190 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5191 u_int32_t *rrand)
5192 {
5193 struct mptcp_subf_auth_entry *sauth_entry;
5194
5195 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5196 if (sauth_entry->msae_laddr_id == addr_id) {
5197 if (lrand) {
5198 *lrand = sauth_entry->msae_laddr_rand;
5199 }
5200 if (rrand) {
5201 *rrand = sauth_entry->msae_raddr_rand;
5202 }
5203 break;
5204 }
5205 }
5206 }
5207
/*
 * Record the peer's address ID and random value (learned during the
 * MP_JOIN handshake) on the auth entry matching laddr_id.  Bails out,
 * with an error log, when the peer presents a conflicting address ID
 * or a duplicate SYN/ACK carrying a different random value.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A non-zero, different remote ID means a conflict. */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_id, sauth_entry->msae_raddr_id);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* A different random on a re-received SYN/ACK is bogus. */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				os_log_error(mptcp_log_handle, "%s - %lx: "
				    "dup SYN_ACK %d %d \n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_rand, sauth_entry->msae_raddr_rand);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
5237
5238 /*
5239 * SHA-256 support for MPTCP
5240 */
5241
5242 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5243 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5244 {
5245 const unsigned char *sha2_base;
5246 int sha2_size;
5247
5248 sha2_base = (const unsigned char *) key;
5249 sha2_size = sizeof(mptcp_key_t);
5250
5251 SHA256_CTX sha_ctx;
5252 SHA256_Init(&sha_ctx);
5253 SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5254 SHA256_Final(sha_digest, &sha_ctx);
5255 }
5256
/*
 * HMAC-SHA256 over msg, keyed with the 128-bit concatenation of key1
 * and key2 (MPTCP v1, RFC 8684 MP_JOIN authentication).  digest must
 * hold SHA256_DIGEST_LENGTH (32) bytes; it is used as scratch for the
 * inner hash and then overwritten with the final HMAC.
 */
void
mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
    u_char *msg, uint16_t msg_len, u_char *digest)
{
	SHA256_CTX sha_ctx;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	int i;

	bzero(digest, SHA256_DIGEST_LENGTH);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA256(Key XOR opad, SHA256(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA256_Update(&sha_ctx, msg, msg_len);
	SHA256_Final(digest, &sha_ctx);

	/* Perform outer SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
	SHA256_Final(digest, &sha_ctx);
}
5296
5297 /*
5298 * SHA1 support for MPTCP
5299 */
5300
5301 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5302 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5303 {
5304 SHA1_CTX sha1ctxt;
5305 const unsigned char *sha1_base;
5306 int sha1_size;
5307
5308 sha1_base = (const unsigned char *) key;
5309 sha1_size = sizeof(mptcp_key_t);
5310 SHA1Init(&sha1ctxt);
5311 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5312 SHA1Final(sha_digest, &sha1ctxt);
5313 }
5314
/*
 * HMAC-SHA1 over the two 32-bit randoms, keyed with the 128-bit
 * concatenation of key1 and key2 (MPTCP v0, RFC 6824 MP_JOIN
 * authentication).  digest must hold SHA1_RESULTLEN (20) bytes; it is
 * used as scratch for the inner hash and then overwritten with the
 * final HMAC.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
5359
5360 /*
5361 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5362 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5363 */
5364 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest,uint8_t digest_len)5365 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5366 {
5367 uint32_t lrand, rrand;
5368
5369 lrand = rrand = 0;
5370 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5371
5372 u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5373 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5374 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5375 } else {
5376 uint32_t data[2];
5377 data[0] = lrand;
5378 data[1] = rrand;
5379 mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5380 }
5381 bcopy(full_digest, digest, digest_len);
5382 }
5383
5384 /*
5385 * Authentication data generation
5386 */
5387 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5388 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5389 int token_len)
5390 {
5391 VERIFY(token_len == sizeof(u_int32_t));
5392 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5393 sha_digest_len == SHA256_DIGEST_LENGTH);
5394
5395 /* Most significant 32 bits of the SHA1/SHA256 hash */
5396 bcopy(sha_digest, token, sizeof(u_int32_t));
5397 return;
5398 }
5399
5400 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5401 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5402 int idsn_len, uint8_t mp_version)
5403 {
5404 VERIFY(idsn_len == sizeof(u_int64_t));
5405 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5406 sha_digest_len == SHA256_DIGEST_LENGTH);
5407 VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5408
5409 /*
5410 * Least significant 64 bits of the hash
5411 */
5412
5413 if (mp_version == MPTCP_VERSION_0) {
5414 idsn[7] = sha_digest[12];
5415 idsn[6] = sha_digest[13];
5416 idsn[5] = sha_digest[14];
5417 idsn[4] = sha_digest[15];
5418 idsn[3] = sha_digest[16];
5419 idsn[2] = sha_digest[17];
5420 idsn[1] = sha_digest[18];
5421 idsn[0] = sha_digest[19];
5422 } else {
5423 idsn[7] = sha_digest[24];
5424 idsn[6] = sha_digest[25];
5425 idsn[5] = sha_digest[26];
5426 idsn[4] = sha_digest[27];
5427 idsn[3] = sha_digest[28];
5428 idsn[2] = sha_digest[29];
5429 idsn[1] = sha_digest[30];
5430 idsn[0] = sha_digest[31];
5431 }
5432 return;
5433 }
5434
5435 static void
mptcp_conn_properties(struct mptcb * mp_tp)5436 mptcp_conn_properties(struct mptcb *mp_tp)
5437 {
5438 /* Set DSS checksum flag */
5439 if (mptcp_dss_csum) {
5440 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5441 }
5442
5443 /* Set up receive window */
5444 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5445
5446 /* Set up gc ticks */
5447 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5448 }
5449
/*
 * Initialize the local side of a new MPTCP connection: pick the MPTCP
 * version, generate the local key, and derive the local token and IDSN
 * from it.  Also seeds the data-level send sequence state and the
 * general connection properties.
 */
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/*
	 * Version selection: an explicit v0 request (or v1 globally
	 * disabled) forces v0; an explicit v1 request (with v1 enabled)
	 * forces v1; otherwise consult the per-destination TCP cache.
	 */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/* v0 derives token/IDSN from SHA-1 of the key, v1 from SHA-256. */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5486
5487 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5488 mptcp_init_remote_parms(struct mptcb *mp_tp)
5489 {
5490 /* Setup local and remote tokens and Initial DSNs */
5491 char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5492 uint16_t digest_len;
5493
5494 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5495 digest_len = SHA1_RESULTLEN;
5496 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5497 } else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5498 digest_len = SHA256_DIGEST_LENGTH;
5499 mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5500 } else {
5501 return -1;
5502 }
5503
5504 mptcp_generate_token(remote_digest, digest_len,
5505 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5506 mptcp_generate_idsn(remote_digest, digest_len,
5507 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5508 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5509 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5510 return 0;
5511 }
5512
5513 static void
mptcp_send_dfin(struct socket * so)5514 mptcp_send_dfin(struct socket *so)
5515 {
5516 struct tcpcb *tp = NULL;
5517 struct inpcb *inp = NULL;
5518
5519 inp = sotoinpcb(so);
5520 if (!inp) {
5521 return;
5522 }
5523
5524 tp = intotcpcb(inp);
5525 if (!tp) {
5526 return;
5527 }
5528
5529 if (!(tp->t_mpflags & TMPF_RESET)) {
5530 tp->t_mpflags |= TMPF_SEND_DFIN;
5531 }
5532 }
5533
5534 /*
5535 * Data Sequence Mapping routines
5536 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL) {
		return;
	}

	/* The mptcb is embedded in the mpp_mtp container of the PCB. */
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/*
	 * Stamp every packet in the chain with a DSN mapping starting
	 * at mpt_sndmax, advancing mpt_sndmax past the mapped bytes.
	 */
	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		/* The mapping length must fit the 16-bit mp_rlen field. */
		VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
		m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
}
5558
/*
 * After a fallback to regular TCP, infer a data-level DATA_ACK from a
 * plain subflow ACK that is dropping len bytes starting at mbuf m, and
 * feed it to mptcp_data_ack_rcvd() if it is new.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the acked portion of the chain.  Because of the early
	 * return above this loop runs at least once, so data_ack and
	 * dsn are always initialized below.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/* Optimistically assume the whole mapping is acked. */
		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5619
/*
 * Before sbdrop() removes len bytes from the send buffer, adjust the
 * DSN mappings of the affected mbufs so the remaining data still maps
 * correctly.  Also detects a fallback condition: an ACK received on a
 * subflow that never saw a DATA_ACK.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	/*
	 * Mappings are only adjusted on the MP socket (or while
	 * rewinding TFO data); subflow sockets keep theirs intact.
	 */
	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* Whole mapping consumed: empty it and continue. */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
			    __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
			    m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5676
5677 /* Obtain the DSN mapping stored in the mbuf */
5678 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5679 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5680 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5681 {
5682 u_int64_t dsn64;
5683
5684 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5685 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5686 }
5687
/*
 * Look up the DSN mapping covering byte offset off of the subflow's
 * send buffer and return its DSN, relative subflow sequence, length
 * and DSS checksum.  On a defunct socket with an empty send buffer
 * all outputs are zeroed.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;
	int off_orig = off;

	VERIFY(off >= 0);

	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	/* Return the mapping of the mbuf that contains the offset. */
	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;

	mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
	    __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
}
5737
5738 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5739 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5740 {
5741 uint64_t dsn;
5742 uint32_t relseq;
5743
5744 mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5745 }
5746
5747 /*
5748 * Note that this is called only from tcp_input() via mptcp_input_preproc()
5749 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5750 * When it trims data tcp_input calls m_adj() which does not remove the
5751 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5752 * The dsn map insertion cannot be delayed after trim, because data can be in
5753 * the reassembly queue for a while and the DSN option info in tp will be
5754 * overwritten for every new packet received.
5755 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5756 * with mptcp_adj_rmap()
5757 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	/*
	 * Copy the receive-side DSN mapping parsed from the DSS option
	 * (stashed in tp->t_rcv_map) into the packet header, then clear
	 * the embed flag and schedule an immediate MPTCP-level ACK.
	 */
	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* After fallback a plain TCP FIN doubles as a DATA_FIN. */
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5783
5784 /*
5785 * Following routines help with failure detection and failover of data
5786 * transfer from one subflow to another.
5787 */
5788 void
mptcp_act_on_txfail(struct socket * so)5789 mptcp_act_on_txfail(struct socket *so)
5790 {
5791 struct tcpcb *tp = NULL;
5792 struct inpcb *inp = sotoinpcb(so);
5793
5794 if (inp == NULL) {
5795 return;
5796 }
5797
5798 tp = intotcpcb(inp);
5799 if (tp == NULL) {
5800 return;
5801 }
5802
5803 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5804 return;
5805 }
5806
5807 so->so_flags |= SOF_MP_TRYFAILOVER;
5808 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5809 }
5810
5811 /*
5812 * Support for MP_FAIL option
5813 */
int
mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	uint16_t datalen;
	uint64_t dsn;
	int off = 0;

	if (m == NULL) {
		return -1;
	}

	/*
	 * Scan the send buffer for the mapping that covers dsn_fail and
	 * translate it into the corresponding subflow sequence number.
	 */
	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			/* Offset of dsn_fail within this mapping. */
			off = (int)(dsn_fail - dsn);
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			return 0;
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
	return -1;
}
5849
5850 /*
5851 * Support for sending contiguous MPTCP bytes in subflow
5852 * Also for preventing sending data with ACK in 3-way handshake
5853 */
/*
 * Return how many bytes of the DSN mapping at offset off are still to
 * be sent on this subflow, so the caller can keep MPTCP data mapped
 * contiguously.  Returns 0 for a defunct socket with an empty send
 * buffer.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Bytes of this mapping that remain past the requested offset. */
	return mdss_data_len - (off - mdss_subflow_off);
}
5890
5891 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5892 mptcp_get_maxseg(struct mptses *mpte)
5893 {
5894 struct mptsub *mpts;
5895 uint32_t maxseg = 0;
5896
5897 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5898 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5899
5900 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5901 TCPS_HAVERCVDFIN2(tp->t_state)) {
5902 continue;
5903 }
5904
5905 if (tp->t_maxseg > maxseg) {
5906 maxseg = tp->t_maxseg;
5907 }
5908 }
5909
5910 return maxseg;
5911 }
5912
5913 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5914 mptcp_get_rcvscale(struct mptses *mpte)
5915 {
5916 struct mptsub *mpts;
5917 uint8_t rcvscale = UINT8_MAX;
5918
5919 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5920 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5921
5922 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5923 TCPS_HAVERCVDFIN2(tp->t_state)) {
5924 continue;
5925 }
5926
5927 if (tp->rcv_scale < rcvscale) {
5928 rcvscale = tp->rcv_scale;
5929 }
5930 }
5931
5932 return rcvscale;
5933 }
5934
5935 /* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	/* The smallest subflow window scale bounds what we can advertise. */
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		/* Grow the ideal size monotonically, capped at the max. */
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5964
/*
 * Auto-resize the MPTCP socket's receive buffer based on the sum of
 * the subflows' receive buffers, when auto-resizing is permitted.
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
6014
6015 /*
 * Determine if we can grow the receive socket buffer to avoid sending
6017 * a zero window update to the peer. We allow even socket buffers that
6018 * have fixed size (set by the application) to grow if the resource
6019 * constraints are met. They will also be trimmed after the application
6020 * reads data.
6021 *
6022 * Similar to tcp_sbrcv_grow_rwin
6023 */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow increment: 16x the largest established subflow MSS. */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background (throttled) receivers keep their window as-is. */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/* Grow only when auto-resizing is on, resources allow it, the
	 * buffer is nearly full, and we stay within the ideal size. */
	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
6046
6047 /* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer before measuring it. */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Free space is bounded by both the byte and the mbuf limits. */
	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
6081
6082 /*
6083 * Support Fallback to Regular TCP
6084 */
6085 void
mptcp_notify_mpready(struct socket * so)6086 mptcp_notify_mpready(struct socket *so)
6087 {
6088 struct tcpcb *tp = NULL;
6089
6090 if (so == NULL) {
6091 return;
6092 }
6093
6094 tp = intotcpcb(sotoinpcb(so));
6095
6096 if (tp == NULL) {
6097 return;
6098 }
6099
6100 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
6101 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6102 struct tcpcb *, tp);
6103
6104 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
6105 return;
6106 }
6107
6108 if (tp->t_mpflags & TMPF_MPTCP_READY) {
6109 return;
6110 }
6111
6112 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
6113 tp->t_mpflags |= TMPF_MPTCP_READY;
6114
6115 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6116 }
6117
6118 void
mptcp_notify_mpfail(struct socket * so)6119 mptcp_notify_mpfail(struct socket *so)
6120 {
6121 struct tcpcb *tp = NULL;
6122
6123 if (so == NULL) {
6124 return;
6125 }
6126
6127 tp = intotcpcb(sotoinpcb(so));
6128
6129 if (tp == NULL) {
6130 return;
6131 }
6132
6133 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
6134 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6135 struct tcpcb *, tp);
6136
6137 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
6138 return;
6139 }
6140
6141 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
6142 tp->t_mpflags |= TMPF_TCP_FALLBACK;
6143
6144 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6145 }
6146
6147 /*
6148 * Keepalive helper function
6149 */
6150 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6151 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6152 {
6153 boolean_t ret = 1;
6154
6155 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6156
6157 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6158 ret = 0;
6159 }
6160 return ret;
6161 }
6162
6163 /*
6164 * MPTCP t_maxseg adjustment function
6165 */
/*
 * Return the number of bytes to shave off t_maxseg to leave room for
 * the most common MPTCP option (DSS/ack), both at option-processing
 * time (mtudisc == FALSE) and during MTU discovery (mtudisc == TRUE).
 * Returns 0 when the socket is not MPTCP or no adjustment applies.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/* Option size: DSS+ack, plus 2 bytes for the checksum field or for
 * 32-bit alignment padding + EOL (both branches add 2). */
#define MPTCP_COMPUTE_LEN {                              \
	mss_lower = sizeof (struct mptcp_dss_ack_opt);   \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)          \
	        mss_lower += 2;                          \
	else                                             \
	        /* adjust to 32-bit boundary + EOL */    \
	        mss_lower += 2;                          \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6209
6210 /*
6211 * Update the pid, upid, uuid of the subflow so, based on parent so
6212 */
6213 void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)6214 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
6215 {
6216 if (so->last_pid != mp_so->last_pid ||
6217 so->last_upid != mp_so->last_upid) {
6218 so->last_upid = mp_so->last_upid;
6219 so->last_pid = mp_so->last_pid;
6220 uuid_copy(so->last_uuid, mp_so->last_uuid);
6221 }
6222 so_update_policy(so);
6223 }
6224
6225 static void
fill_mptcp_subflow(struct socket * so,mptcp_flow_t * flow,struct mptsub * mpts)6226 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
6227 {
6228 struct inpcb *inp;
6229
6230 tcp_getconninfo(so, &flow->flow_ci);
6231 inp = sotoinpcb(so);
6232 if ((inp->inp_vflag & INP_IPV6) != 0) {
6233 flow->flow_src.ss_family = AF_INET6;
6234 flow->flow_dst.ss_family = AF_INET6;
6235 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
6236 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
6237 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
6238 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
6239 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
6240 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
6241 } else if ((inp->inp_vflag & INP_IPV4) != 0) {
6242 flow->flow_src.ss_family = AF_INET;
6243 flow->flow_dst.ss_family = AF_INET;
6244 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
6245 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
6246 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
6247 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
6248 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
6249 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
6250 }
6251 flow->flow_len = sizeof(*flow);
6252 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
6253 flow->flow_flags = mpts->mpts_flags;
6254 flow->flow_cid = mpts->mpts_connid;
6255 flow->flow_relseq = mpts->mpts_rel_seq;
6256 flow->flow_soerror = mpts->mpts_socket->so_error;
6257 flow->flow_probecnt = mpts->mpts_probecnt;
6258 }
6259
/*
 * Sysctl handler for net.inet.mptcp.pcblist.
 *
 * Read-only: write attempts return EPERM.  Without an output buffer it
 * reports a size estimate in oldidx.  Otherwise it walks every MPTCP
 * PCB under mppi_lock and emits one conninfo_mptcp_t per connection
 * followed by one mptcp_flow_t per subflow.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/*
		 * Size probe: estimate with ~12% headroom and assume up
		 * to four flows per connection so userland can size its
		 * buffer before the real call.
		 */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8) * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the connection-level state under the socket lock */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/*
			 * conninfo_mptcp_t embeds one mptcp_flow_t; report
			 * the full length but copy out only the header here
			 * (the flow array itself is copied out below).
			 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		/* Fill one flow record per subflow, then copy them out */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6356
/* Read-only sysctl node exporting all MPTCP connections (handler above) */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6360
6361 /*
6362 * Set notsent lowat mark on the MPTCB
6363 */
6364 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6365 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6366 {
6367 struct mptcb *mp_tp = NULL;
6368 int error = 0;
6369
6370 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6371 mp_tp = mpte->mpte_mptcb;
6372 }
6373
6374 if (mp_tp) {
6375 mp_tp->mpt_notsent_lowat = optval;
6376 } else {
6377 error = EINVAL;
6378 }
6379
6380 return error;
6381 }
6382
6383 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6384 mptcp_get_notsent_lowat(struct mptses *mpte)
6385 {
6386 struct mptcb *mp_tp = NULL;
6387
6388 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6389 mp_tp = mpte->mpte_mptcb;
6390 }
6391
6392 if (mp_tp) {
6393 return mp_tp->mpt_notsent_lowat;
6394 } else {
6395 return 0;
6396 }
6397 }
6398
/*
 * Check whether the MPTCP socket should be considered writable with
 * respect to the configured notsent-lowat mark.
 *
 * Returns 1 when nothing is queued or the not-yet-sent backlog is at
 * or below the lowat mark.  Otherwise the decision is delegated to the
 * first ACTIVE subflow: with Nagle enabled and less than one maxseg
 * unsent, wake the client early (return 1).  Returns 0 in all other
 * cases.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	/* Total bytes queued in the MPTCP send buffer */
	notsent = so->so_snd.sb_cc;

	/* Writable if the unsent portion is within the lowat mark */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: "
		    "lowat %d notsent %d actual %llu \n",
		    mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			/* Unsent bytes from this subflow's point of view */
			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
			    " nodelay false \n",
			    mp_tp->mpt_notsent_lowat, notsent),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			/* NOTE(review): only the first ACTIVE subflow is consulted */
			return retval;
		}
	}
	return 0;
}
6457
6458 static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref,struct sockaddr_ctl * sac,void ** unitinfo)6459 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6460 void **unitinfo)
6461 {
6462 #pragma unused(kctlref, sac, unitinfo)
6463
6464 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6465 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6466 }
6467
6468 mptcp_kern_skt_unit = sac->sc_unit;
6469
6470 return 0;
6471 }
6472
/*
 * Symptoms granted "useApp" for the given application UUID: for every
 * MPTCP session owned by that app, temporarily mark access as granted
 * (and, when RSSI exceeds the target threshold, cell as prohibited),
 * then re-evaluate which subflows to add or remove.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Match the effective UUID for delegated sockets, else the owner's */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* The flags only steer the two re-evaluation calls above */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6516
6517 static void
mptcp_wifi_status_changed(void)6518 mptcp_wifi_status_changed(void)
6519 {
6520 struct mppcb *mpp;
6521
6522 /* Iterate over all MPTCP connections */
6523
6524 lck_mtx_lock(&mtcbinfo.mppi_lock);
6525
6526 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6527 struct socket *mp_so = mpp->mpp_socket;
6528 struct mptses *mpte = mpp->mpp_pcbe;
6529
6530 socket_lock(mp_so, 1);
6531
6532 /* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6533 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6534 mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6535 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6536 goto next;
6537 }
6538
6539 mptcp_check_subflows_and_add(mpte);
6540 mptcp_check_subflows_and_remove(mpte);
6541
6542 next:
6543 socket_unlock(mp_so, 1);
6544 }
6545
6546 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6547 }
6548
/* State threaded through the proc iterator when resolving a UUID to a proc */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;      /* executable UUID being searched for */
	proc_t found_proc;       /* matching proc, PROC_NULL if none found */
	boolean_t is_proc_found; /* set once the filter has seen a match */
};
6554
6555 static int
mptcp_find_proc_filter(proc_t p,void * arg)6556 mptcp_find_proc_filter(proc_t p, void *arg)
6557 {
6558 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6559 int found;
6560
6561 if (info->is_proc_found) {
6562 return 0;
6563 }
6564
6565 /*
6566 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6567 * expects != 0 for a matching filter.
6568 */
6569 found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6570 if (found) {
6571 info->is_proc_found = true;
6572 }
6573
6574 return found;
6575 }
6576
6577 static int
mptcp_find_proc_callout(proc_t p,void * arg)6578 mptcp_find_proc_callout(proc_t p, void * arg)
6579 {
6580 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6581
6582 if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6583 info->found_proc = p;
6584 return PROC_CLAIMED_DONE;
6585 }
6586
6587 return PROC_RETURNED;
6588 }
6589
6590 static proc_t
mptcp_find_proc(const uuid_t uuid)6591 mptcp_find_proc(const uuid_t uuid)
6592 {
6593 struct mptcp_uuid_search_info info;
6594
6595 uuid_copy(info.target_uuid, uuid);
6596 info.found_proc = PROC_NULL;
6597 info.is_proc_found = false;
6598
6599 proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6600 mptcp_find_proc_filter, &info);
6601
6602 return info.found_proc;
6603 }
6604
/*
 * Ask the Symptoms daemon (through the kernel-control socket) whether
 * this session's owning application may use MPTCP.  Sends the app's
 * UUID plus a coarse priority derived from the task's policy role.
 * No-op when the control socket is not connected or the owning process
 * cannot be identified.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		/* Delegated socket: identify the effective (delegated-to) process */
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		if (p == PROC_NULL) {
			/* Fall back to a UUID-based search across all processes */
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		/* Non-delegated: the last process that used the socket owns it */
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task's policy role onto the coarse Symptoms priority classes */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	proc_rele(p);
}
6686
6687 static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)6688 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6689 void *unitinfo)
6690 {
6691 #pragma unused(kctlref, kcunit, unitinfo)
6692
6693 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6694
6695 return 0;
6696 }
6697
6698 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6699 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6700 mbuf_t m, int flags)
6701 {
6702 #pragma unused(kctlref, unitinfo, flags)
6703 symptoms_advisory_t *sa = NULL;
6704
6705 if (kcunit != mptcp_kern_skt_unit) {
6706 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6707 __func__, kcunit, mptcp_kern_skt_unit);
6708 }
6709
6710 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6711 mbuf_freem(m);
6712 return EINVAL;
6713 }
6714
6715 if (mbuf_len(m) < sizeof(*sa)) {
6716 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6717 __func__, mbuf_len(m), sizeof(*sa));
6718 mbuf_freem(m);
6719 return EINVAL;
6720 }
6721
6722 sa = mbuf_data(m);
6723
6724 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6725 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6726 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6727 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6728
6729 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6730 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6731 mptcp_wifi_status_changed();
6732 }
6733 } else {
6734 struct mptcp_symptoms_answer answer;
6735 errno_t err;
6736
6737 /* We temporarily allow different sizes for ease of submission */
6738 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6739 mbuf_len(m) != sizeof(answer)) {
6740 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6741 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6742 sizeof(answer));
6743 mbuf_free(m);
6744 return EINVAL;
6745 }
6746
6747 memset(&answer, 0, sizeof(answer));
6748
6749 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6750 if (err) {
6751 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6752 mbuf_free(m);
6753 return err;
6754 }
6755
6756 mptcp_allow_uuid(answer.uuid, answer.rssi);
6757 }
6758
6759 mbuf_freem(m);
6760 return 0;
6761 }
6762
6763 void
mptcp_control_register(void)6764 mptcp_control_register(void)
6765 {
6766 /* Set up the advisory control socket */
6767 struct kern_ctl_reg mptcp_kern_ctl;
6768
6769 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6770 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6771 sizeof(mptcp_kern_ctl.ctl_name));
6772 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6773 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6774 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6775 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6776
6777 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6778 }
6779
6780 /*
6781 * Three return-values:
6782 * 1 : WiFi is bad
6783 * 0 : WiFi is good
6784 * -1 : WiFi-state is unknown
6785 */
6786 int
mptcp_is_wifi_unusable_for_session(struct mptses * mpte)6787 mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
6788 {
6789 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6790 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6791 mptcp_advisory.sa_wifi_status) {
6792 return symptoms_is_wifi_lossy() ? 1 : 0;
6793 }
6794
6795 /*
6796 * If it's a first-party app and we don't have any info
6797 * about the Wi-Fi state, let's be pessimistic.
6798 */
6799 return -1;
6800 } else {
6801 if (symptoms_is_wifi_lossy()) {
6802 return 1;
6803 }
6804
6805 /*
6806 * If we are target-based (meaning, we allow to be more lax on
6807 * the "unusable" target. We only *know* about the state once
6808 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6809 *
6810 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6811 * be set.
6812 *
6813 * In any other case (while in target-mode), consider WiFi bad
6814 * and we are going to ask for allowance from Symptoms anyway.
6815 */
6816 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6817 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6818 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6819 return 0;
6820 }
6821
6822 return 1;
6823 }
6824
6825 return 0;
6826 }
6827 }
6828
6829 boolean_t
symptoms_is_wifi_lossy(void)6830 symptoms_is_wifi_lossy(void)
6831 {
6832 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6833 }
6834
/* If TFO data is successfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes outstanding at the MPTCP level */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/* SYN-data bytes the peer acknowledged (excluding the SYN itself) */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acknowledged bytes from the MPTCP send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
6874
6875 int
mptcp_freeq(struct mptcb * mp_tp)6876 mptcp_freeq(struct mptcb *mp_tp)
6877 {
6878 struct tseg_qent *q;
6879 int rv = 0;
6880
6881 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6882 LIST_REMOVE(q, tqe_q);
6883 m_freem(q->tqe_m);
6884 zfree(tcp_reass_zone, q);
6885 rv = 1;
6886 }
6887 mp_tp->mpt_reassqlen = 0;
6888 return rv;
6889 }
6890
6891 static int
mptcp_post_event(u_int32_t event_code,int value)6892 mptcp_post_event(u_int32_t event_code, int value)
6893 {
6894 struct kev_mptcp_data event_data;
6895 struct kev_msg ev_msg;
6896
6897 memset(&ev_msg, 0, sizeof(ev_msg));
6898
6899 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6900 ev_msg.kev_class = KEV_NETWORK_CLASS;
6901 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6902 ev_msg.event_code = event_code;
6903
6904 event_data.value = value;
6905
6906 ev_msg.dv[0].data_ptr = &event_data;
6907 ev_msg.dv[0].data_length = sizeof(event_data);
6908
6909 return kev_post_msg(&ev_msg);
6910 }
6911
/*
 * Account for cell usage by this subflow and, if needed, raise the
 * global cell-in-use indicator ("cellicon").  The indicator is
 * reference counted across all MPTCP sessions; only the 0 -> 1
 * transition posts the kernel event.  First-party, disappearing, and
 * fallen-back flows never flip the icon.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Re-arm the periodic cellicon toggle timer on this subflow */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6981
6982 void
mptcp_clear_cellicon(void)6983 mptcp_clear_cellicon(void)
6984 {
6985 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6986
6987 if (error) {
6988 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6989 __func__, error);
6990 } else {
6991 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6992 __func__);
6993 }
6994 }
6995
6996 /*
6997 * Returns true if the icon has been flipped to WiFi.
6998 */
6999 static boolean_t
__mptcp_unset_cellicon(uint32_t val)7000 __mptcp_unset_cellicon(uint32_t val)
7001 {
7002 VERIFY(val < INT32_MAX);
7003 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
7004 return false;
7005 }
7006
7007 mptcp_clear_cellicon();
7008
7009 return true;
7010 }
7011
/*
 * Release 'val' cellicon references held by this session (optionally
 * tied to subflow 'mpts').  Clears the global indicator when the last
 * reference across all sessions is dropped.  First-party apps and
 * sessions that never used cell are no-ops.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only decrement for a subflow that actually holds a reference */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never drop more references than this session ever took */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
7058
7059 void
mptcp_reset_rexmit_state(struct tcpcb * tp)7060 mptcp_reset_rexmit_state(struct tcpcb *tp)
7061 {
7062 struct mptsub *mpts;
7063 struct inpcb *inp;
7064 struct socket *so;
7065
7066 inp = tp->t_inpcb;
7067 if (inp == NULL) {
7068 return;
7069 }
7070
7071 so = inp->inp_socket;
7072 if (so == NULL) {
7073 return;
7074 }
7075
7076 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
7077 return;
7078 }
7079
7080 mpts = tp->t_mpsub;
7081
7082 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
7083 so->so_flags &= ~SOF_MP_TRYFAILOVER;
7084 }
7085
7086 void
mptcp_reset_keepalive(struct tcpcb * tp)7087 mptcp_reset_keepalive(struct tcpcb *tp)
7088 {
7089 struct mptsub *mpts = tp->t_mpsub;
7090
7091 mpts->mpts_flags &= ~MPTSF_READ_STALL;
7092 }
7093