/*
 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include "tcp_includes.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
#include <sys/syslog.h>
#include <sys/mcache.h>
#include <kern/locks.h>
#include <kern/zalloc.h>

#include <dev/random/randomdev.h>

#include <net/route.h>
#include <net/if.h>
#include <net/content_filter.h>
#include <net/ntstat.h>
#include <net/multi_layer_pkt_log.h>

#define tcp_minmssoverload fring
#define _IP_VHL
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/icmp_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#include <netinet/tcp_cache.h>
#include <kern/thread_call.h>

#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_log.h>

#include <netinet6/ip6protosw.h>
#include <netinet6/esp.h>

#if IPSEC
#include <netinet6/ipsec.h>
#include <netinet6/ipsec6.h>
#endif /* IPSEC */

#if NECP
#include <net/necp.h>
#endif /* NECP */

#undef tcp_minmssoverload

#include <net/sockaddr_utils.h>

#include <corecrypto/ccaes.h>
#include <libkern/crypto/aes.h>
#include <libkern/crypto/md5.h>
#include <sys/kdebug.h>
#include <mach/sdt.h>
#include <pexpert/pexpert.h>
#include <mach/mach_time.h>

#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))

static tcp_cc tcp_ccgen;

extern struct tcptimerlist tcp_timer_list;
extern struct tcptailq tcp_tw_tailq;

extern int tcp_awdl_rtobase;

SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");

SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
    "Default TCP Maximum Segment Size for IPv6");

int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int,
    struct sysctl_req *);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR,
    0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");

/* Current count of half-open TFO connections */
int tcp_tfo_halfcnt = 0;

/* Maximum of half-open TFO connection backlog */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_tfo_backlog, 10,
    "Backlog queue for half-open TFO connections");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER,
    "Enable TCP Fastopen (RFC 7413)");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED,
    uint32_t, tcp_now_init, 0, "Initial tcp now value");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED,
    uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in micro seconds");

/*
 * Minimum MSS we accept and use. This prevents DoS attacks where
 * we are forced to a ridiculously low MSS, such as 20, and send hundreds
 * of packets instead of one. The effect scales with the available
 * bandwidth and quickly saturates the CPU and network interface
 * with packet generation and sending. Set to zero to disable MINMSS
 * checking. This setting keeps us from sending overly small packets.
 */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_minmss, TCP_MINMSS, "Minimum TCP Maximum Segment Size");

SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, icmp_may_rst, 1,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

int tcp_do_timestamps = 1;
#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_timestamps,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_timestamps, 0, "enable TCP timestamps");
#endif /* (DEVELOPMENT || DEBUG) */

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_TCPTV_MIN, 100, "min rtt value allowed");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
    int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
    __private_extern__ int, tcp_use_randomport, 0,
    "Randomize TCP port numbers");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
    __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");

#if (DEVELOPMENT || DEBUG)
SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
    CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
    "Initialize RTT from route cache");
#else
SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
    CTLFLAG_RD | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
    "Initialize RTT from route cache");
#endif /* (DEVELOPMENT || DEBUG) */

static int tso_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tso_debug, 0, "TSO verbosity");

static int tcp_rxt_seg_max = 1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rxt_seg_max, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_rxt_seg_max, 0, "");

static unsigned long tcp_rxt_seg_drop = 0;
SYSCTL_ULONG(_net_inet_tcp, OID_AUTO, rxt_seg_drop, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_rxt_seg_drop, "");

static void tcp_notify(struct inpcb *, int);

static KALLOC_TYPE_DEFINE(tcp_bwmeas_zone, struct bwmeas, NET_KT_DEFAULT);
KALLOC_TYPE_DEFINE(tcp_reass_zone, struct tseg_qent, NET_KT_DEFAULT);
KALLOC_TYPE_DEFINE(tcp_rxt_seg_zone, struct tcp_rxt_seg, NET_KT_DEFAULT);
KALLOC_TYPE_DEFINE(tcp_seg_sent_zone, struct tcp_seg_sent, NET_KT_DEFAULT);

extern int slowlink_wsize;      /* window correction for slow links */
extern int path_mtu_discovery;

uint32_t tcp_now_remainder_us = 0;      /* remaining micro seconds for tcp_now */

static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);

#define TCP_BWMEAS_BURST_MINSIZE 6
#define TCP_BWMEAS_BURST_MAXSIZE 25

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE CONFIG_TCBHASHSIZE
#endif

__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

/*
 * This is the actual shape of what we allocate using the zone
 * allocator. Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately. By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 */
#define ALIGNMENT 32
struct inp_tp {
    struct inpcb inp;
    struct tcpcb tcb __attribute__((aligned(ALIGNMENT)));
};
#undef ALIGNMENT

static KALLOC_TYPE_DEFINE(tcpcbzone, struct inp_tp, NET_KT_DEFAULT);
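
/*
 * The tcpcb for an inpcb allocated from tcpcbzone is recovered with a
 * simple cast, as tcp_newtcpcb() does below:
 *
 *     struct inp_tp *it = (struct inp_tp *)(void *)inp;
 *     struct tcpcb *tp = &it->tcb;
 */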

int get_inpcb_str_size(void);
int get_tcp_str_size(void);

os_log_t tcp_mpkl_log_object = NULL;

static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);

int tcp_notsent_lowat_check(struct socket *so);
static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
    struct if_lim_perf_stat *stat);
static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
    struct if_tcp_ecn_perf_stat *stat);

static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */

void
tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out __sized_by(blk_size), size_t blk_size)
{
    u_char in[CCAES_BLOCK_SIZE];
    int isipv6 = inp->inp_vflag & INP_IPV6;

    VERIFY(blk_size == CCAES_BLOCK_SIZE);

    bzero(&in[0], CCAES_BLOCK_SIZE);
    bzero(&out[0], CCAES_BLOCK_SIZE);

    if (isipv6) {
        memcpy(in, &inp->in6p_faddr, sizeof(struct in6_addr));
    } else {
        memcpy(in, &inp->inp_faddr, sizeof(struct in_addr));
    }

    aes_encrypt_cbc(in, NULL, 1, out, &tfo_ctx);
}
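
/*
 * Concretely: for an IPv4 peer the AES input block is the 4-byte foreign
 * address zero-padded to CCAES_BLOCK_SIZE (16 bytes); for IPv6 it is the
 * full 16-byte address. The TFO cookie is that single block encrypted
 * with AES-128-CBC under the key held in tfo_ctx.
 */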

__private_extern__ int
tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
    int error = 0;
    /*
     * The TFO key is expressed as a string in hex format.
     * +1 to account for the \0 char
     * +1 because sysctl_io_string() expects a string length but the sysctl
     * command now includes the terminating \0 in newlen -- see rdar://77205344
     */
    char keystring[TCP_FASTOPEN_KEYLEN * 2 + 2];
    u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
    int i;
    size_t ks_len;

    /*
     * sysctl_io_string copies keystring into the oldptr of the sysctl_req.
     * Make sure everything is zero, to avoid putting garbage in there or
     * leaking the stack.
     */
    bzero(keystring, sizeof(keystring));

    error = sysctl_io_string(req, keystring, sizeof(keystring), 0, NULL);
    if (error) {
        os_log(OS_LOG_DEFAULT,
            "%s: sysctl_io_string() error %d, req->newlen %lu, sizeof(keystring) %lu",
            __func__, error, req->newlen, sizeof(keystring));
        goto exit;
    }
    if (req->newptr == USER_ADDR_NULL) {
        goto exit;
    }

    ks_len = strbuflen(keystring, sizeof(keystring));
    if (ks_len != TCP_FASTOPEN_KEYLEN * 2) {
        os_log(OS_LOG_DEFAULT,
            "%s: strlen(keystring) %lu != TCP_FASTOPEN_KEYLEN * 2 %u, newlen %lu",
            __func__, ks_len, TCP_FASTOPEN_KEYLEN * 2, req->newlen);
        error = EINVAL;
        goto exit;
    }

    for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
        /*
         * We step through the keystring in 8-character
         * (4-byte) chunks of hex digits.
         */
        if (sscanf(__unsafe_null_terminated_from_indexable(&keystring[i * 8]), "%8x", &key[i]) != 1) {
            error = EINVAL;
            os_log(OS_LOG_DEFAULT,
                "%s: sscanf() != 1, error EINVAL", __func__);
            goto exit;
        }
    }

    aes_encrypt_key128((u_char *)key, &tfo_ctx);

exit:
    return error;
}
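
/*
 * Illustrative (hypothetical) usage from user space, assuming
 * TCP_FASTOPEN_KEYLEN is 16 bytes as in RFC 7413, i.e. 32 hex digits:
 *
 *     sysctl net.inet.tcp.fastopen_key=000102030405060708090a0b0c0d0e0f
 */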

int
get_inpcb_str_size(void)
{
    return sizeof(struct inpcb);
}

int
get_tcp_str_size(void)
{
    return sizeof(struct tcpcb);
}

static int scale_to_powerof2(int size);

/*
 * This helper routine returns one of the following scaled values of size:
 * 1. The power of two rounded down from size, if size is not a power of
 *    two and rounding up would overflow.
 * OR
 * 2. The power of two rounded up from size, if size is not a power of
 *    two and rounding up does not overflow.
 * OR
 * 3. size itself, if it is already a power of two.
 */
static int
scale_to_powerof2(int size)
{
    /* Handle special case of size = 0 */
    int ret = size ? size : 1;

    if (!powerof2(ret)) {
        while (!powerof2(size)) {
            /*
             * Clear the least significant set bit
             * until only the highest set bit is
             * left, at which point size has been
             * rounded down to a power of two.
             */
            size = size & (size - 1);
        }

        /* Check for overflow when rounding up */
        if (0 == (size << 1)) {
            ret = size;
        } else {
            ret = size << 1;
        }
    }

    return ret;
}
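
/*
 * Worked examples: scale_to_powerof2(0) returns 1, scale_to_powerof2(500)
 * returns 512 (500 first rounds down to 256, then doubles), and
 * scale_to_powerof2(512) returns 512 unchanged.
 */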

/*
 * Round a floating point value up to the next integer.
 * E.g. 1.3 rounds up to 2.
 */
uint32_t
tcp_ceil(double a)
{
    double res = (uint32_t) a;
    return (uint32_t)(res + (res < a));
}
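
/*
 * E.g. tcp_ceil(1.3) truncates to 1.0, then adds 1 because 1.0 < 1.3,
 * yielding 2; tcp_ceil(2.0) yields 2 since nothing was truncated.
 */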

uint32_t
tcp_round_to(uint32_t val, uint32_t round)
{
    /*
     * Round up or down to the nearest multiple, based on the midpoint.
     * E.g., when rounding to a multiple of 10, 16 rounds to 20 and
     * 14 rounds to 10.
     */
    return ((val + (round / 2)) / round) * round;
}
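
/*
 * Worked example: tcp_round_to(16, 10) = ((16 + 5) / 10) * 10 = 20 and
 * tcp_round_to(14, 10) = ((14 + 5) / 10) * 10 = 10; the exact midpoint
 * rounds up, so tcp_round_to(15, 10) = 20.
 */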

/*
 * Round up to the next multiple of base.
 * E.g. for a base of 64, 65 will become 128,
 * 2896 will become 2944.
 */
uint32_t
tcp_round_up(uint32_t val, uint32_t base)
{
    if (base == 1 || val % base == 0) {
        return val;
    }

    return ((val + base) / base) * base;
}
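
/*
 * The early return is what keeps exact multiples intact: without it,
 * ((val + base) / base) * base would turn val 128 with base 64 into 192.
 */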

uint32_t
ntoh24(u_char *p __sized_by(3))
{
    uint32_t v;

    v = (uint32_t)(p[0] << 16);
    v |= (uint32_t)(p[1] << 8);
    v |= (uint32_t)(p[2] << 0);
    return v;
}
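
/*
 * E.g. the network-order bytes { 0x01, 0x02, 0x03 } yield the host-order
 * value 0x010203.
 */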

uint32_t
tcp_packets_this_ack(struct tcpcb *tp, uint32_t acked)
{
    return acked / tp->t_maxseg +
           (((acked % tp->t_maxseg) != 0) ? 1 : 0);
}
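
/*
 * This is a ceiling division: e.g. with t_maxseg 1448, an ACK covering
 * 3000 bytes counts as 3 packets (two full segments plus a partial one).
 */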

static void
tcp_tfo_init(void)
{
    u_char key[TCP_FASTOPEN_KEYLEN];

    read_frandom(key, sizeof(key));
    aes_encrypt_key128(key, &tfo_ctx);
}
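
/*
 * The TFO key therefore defaults to a random value chosen at
 * initialization; it can be replaced at runtime through the
 * net.inet.tcp.fastopen_key sysctl handled by tcp_sysctl_fastopenkey()
 * above.
 */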

static u_char isn_secret[32];

/*
 * TCP initialization
 */
void
tcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
    static int tcp_initialized = 0;
    struct inpcbinfo *pcbinfo;

    VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

    if (tcp_initialized) {
        return;
    }
    tcp_initialized = 1;

#if DEBUG || DEVELOPMENT
    (void) PE_parse_boot_argn("tcp_rxt_seg_max", &tcp_rxt_seg_max,
        sizeof(tcp_rxt_seg_max));
#endif /* DEBUG || DEVELOPMENT */

    tcp_ccgen = 1;
    tcp_keepinit = TCPTV_KEEP_INIT;
    tcp_keepidle = TCPTV_KEEP_IDLE;
    tcp_keepintvl = TCPTV_KEEPINTVL;
    tcp_keepcnt = TCPTV_KEEPCNT;
    tcp_maxpersistidle = TCPTV_KEEP_IDLE;
    tcp_msl = TCPTV_MSL;

    microuptime(&tcp_uptime);
    read_frandom(&tcp_now, sizeof(tcp_now));

    /* Start the tcp internal clock at a random value */
    tcp_now = tcp_now & 0x3fffffff;

    /* expose initial uptime/now via sysctl for utcp to keep time sync */
    tcp_now_init = tcp_now;
    tcp_microuptime_init =
        (uint32_t)(tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC));
    SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
    SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);

    tcp_tfo_init();

    LIST_INIT(&tcb);
    tcbinfo.ipi_listhead = &tcb;

    pcbinfo = &tcbinfo;

    /*
     * allocate group, lock attributes and lock for tcp pcb mutexes
     */
    pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb",
        LCK_GRP_ATTR_NULL);
    lck_attr_setdefault(&pcbinfo->ipi_lock_attr);
    lck_rw_init(&pcbinfo->ipi_lock, pcbinfo->ipi_lock_grp,
        &pcbinfo->ipi_lock_attr);

    if (tcp_tcbhashsize == 0) {
        /* Set to default */
        tcp_tcbhashsize = 512;
    }

    if (!powerof2(tcp_tcbhashsize)) {
        int old_hash_size = tcp_tcbhashsize;
        tcp_tcbhashsize = scale_to_powerof2(tcp_tcbhashsize);
        /* Lower limit of 16 */
        if (tcp_tcbhashsize < 16) {
            tcp_tcbhashsize = 16;
        }
        printf("WARNING: TCB hash size not a power of 2, "
            "scaled from %d to %d.\n",
            old_hash_size,
            tcp_tcbhashsize);
    }

    hashinit_counted_by(tcp_tcbhashsize, tcbinfo.ipi_hashbase,
        tcbinfo.ipi_hashbase_count);
    tcbinfo.ipi_hashmask = tcbinfo.ipi_hashbase_count - 1;
    hashinit_counted_by(tcp_tcbhashsize, tcbinfo.ipi_porthashbase,
        tcbinfo.ipi_porthashbase_count);
    tcbinfo.ipi_porthashmask = tcbinfo.ipi_porthashbase_count - 1;
    tcbinfo.ipi_zone = tcpcbzone;

    tcbinfo.ipi_gc = tcp_gc;
    tcbinfo.ipi_timer = tcp_itimer;
    in_pcbinfo_attach(&tcbinfo);

#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
    if (max_protohdr < TCP_MINPROTOHDR) {
        max_protohdr = (int)P2ROUNDUP(TCP_MINPROTOHDR, sizeof(uint32_t));
    }
    if (max_linkhdr + max_protohdr > MCLBYTES) {
        panic("tcp_init");
    }
#undef TCP_MINPROTOHDR

    /* Initialize time wait and timer lists */
    TAILQ_INIT(&tcp_tw_tailq);

    bzero(&tcp_timer_list, sizeof(tcp_timer_list));
    LIST_INIT(&tcp_timer_list.lhead);
    /*
     * allocate group and attribute for the tcp timer list
     */
    tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist",
        LCK_GRP_ATTR_NULL);
    lck_mtx_init(&tcp_timer_list.mtx, tcp_timer_list.mtx_grp,
        LCK_ATTR_NULL);

    tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL);
    if (tcp_timer_list.call == NULL) {
        panic("failed to allocate call entry 1 in tcp_init");
    }

    /* Initialize TCP Cache */
    tcp_cache_init();

    tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp");
    if (tcp_mpkl_log_object == NULL) {
        panic("MPKL_CREATE_LOGOBJECT failed");
    }

    if (PE_parse_boot_argn("tcp_log", &tcp_log_enable_flags, sizeof(tcp_log_enable_flags))) {
        os_log(OS_LOG_DEFAULT, "tcp_init: set tcp_log_enable_flags to 0x%x", tcp_log_enable_flags);
    }

    if (PE_parse_boot_argn("tcp_link_heuristics", &tcp_link_heuristics_flags, sizeof(tcp_link_heuristics_flags))) {
        os_log(OS_LOG_DEFAULT, "tcp_init: set tcp_link_heuristics_flags to 0x%x", tcp_link_heuristics_flags);
    }

    /*
     * If more than 4GB of actual memory is available, increase the
     * maximum allowed receive and send socket buffer size.
     */
    if (mem_actual >= (1ULL << (GBSHIFT + 2))) {
        if (serverperfmode) {
            tcp_autorcvbuf_max = 8 * 1024 * 1024;
            tcp_autosndbuf_max = 8 * 1024 * 1024;
        } else {
            tcp_autorcvbuf_max = 4 * 1024 * 1024;
            tcp_autosndbuf_max = 4 * 1024 * 1024;
        }

        SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
        SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
    }

    /* Initialize the TCP CCA array */
    tcp_cc_init();

    read_frandom(&isn_secret, sizeof(isn_secret));
}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 */
void
tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr)
{
    struct inpcb *inp = tp->t_inpcb;
    struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;

    if ((inp->inp_vflag & INP_IPV6) != 0) {
        struct ip6_hdr *ip6;

        ip6 = (struct ip6_hdr *)ip_ptr;
        ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
            (inp->inp_flow & IPV6_FLOWINFO_MASK);
        ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
            (IPV6_VERSION & IPV6_VERSION_MASK);
        ip6->ip6_plen = htons(sizeof(struct tcphdr));
        ip6->ip6_nxt = IPPROTO_TCP;
        ip6->ip6_hlim = 0;
        ip6->ip6_src = inp->in6p_laddr;
        ip6->ip6_dst = inp->in6p_faddr;
        if (m->m_flags & M_PKTHDR) {
            uint32_t lifscope = inp->inp_lifscope != 0 ? inp->inp_lifscope : inp->inp_fifscope;
            uint32_t fifscope = inp->inp_fifscope != 0 ? inp->inp_fifscope : inp->inp_lifscope;
            ip6_output_setsrcifscope(m, lifscope, NULL);
            ip6_output_setdstifscope(m, fifscope, NULL);
        }
        tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
            htonl(sizeof(struct tcphdr) + IPPROTO_TCP));
    } else {
        struct ip *ip = (struct ip *) ip_ptr;

        ip->ip_vhl = IP_VHL_BORING;
        ip->ip_tos = 0;
        ip->ip_len = 0;
        ip->ip_id = 0;
        ip->ip_off = 0;
        ip->ip_ttl = 0;
        ip->ip_sum = 0;
        ip->ip_p = IPPROTO_TCP;
        ip->ip_src = inp->inp_laddr;
        ip->ip_dst = inp->inp_faddr;
        tcp_hdr->th_sum =
            in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
            htons(sizeof(struct tcphdr) + IPPROTO_TCP));
    }

    tcp_hdr->th_sport = inp->inp_lport;
    tcp_hdr->th_dport = inp->inp_fport;
    tcp_hdr->th_seq = 0;
    tcp_hdr->th_ack = 0;
    tcp_hdr->th_x2 = 0;
    tcp_hdr->th_off = 5;
    tcp_hdr->th_flags = 0;
    tcp_hdr->th_win = 0;
    tcp_hdr->th_urp = 0;
}
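
/*
 * Note that th_sum is left holding only the pseudo-header checksum
 * (addresses, protocol and TCP length); the checksum over the TCP header
 * and payload is folded in later, typically by checksum offload or the
 * output path.
 */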

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
 * use for this function is in keepalives, which use tcp_respond.
 */
struct tcptemp *
tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp)
{
    struct mbuf *m;
    struct tcptemp *n;

    *mp = m = m_get(M_DONTWAIT, MT_HEADER);
    if (m == NULL) {
        return NULL;
    }
    m->m_len = sizeof(struct tcptemp);
    n = mtod(m, struct tcptemp *);

    tcp_fillheaders(m, tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
    return n;
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header. If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection. If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_size __unused, struct tcphdr *th, struct mbuf *m,
    tcp_seq ack, tcp_seq seq, uint8_t flags, struct tcp_respond_args *tra)
{
    uint16_t tlen;
    int win = 0;
    struct route *ro = 0;
    struct route sro;
    struct ip *ip;
    struct tcphdr *nth;
    struct route_in6 *ro6 = 0;
    struct route_in6 sro6;
    struct ip6_hdr *ip6;
    int isipv6;
    struct ifnet *outif;
    int sotc = SO_TC_UNSPEC;
    bool check_qos_marking_again = FALSE;
    uint32_t sifscope = IFSCOPE_NONE, fifscope = IFSCOPE_NONE;

    isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
    ip6 = ipgen;
    ip = ipgen;

    if (tp) {
        check_qos_marking_again = tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE ? FALSE : TRUE;
        sifscope = tp->t_inpcb->inp_lifscope;
        fifscope = tp->t_inpcb->inp_fifscope;
        if (!(flags & TH_RST)) {
            win = tcp_sbspace(tp);
            if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) {
                win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
            }
        }
        if (isipv6) {
            ro6 = &tp->t_inpcb->in6p_route;
        } else {
            ro = &tp->t_inpcb->inp_route;
        }
    } else {
        if (isipv6) {
            ro6 = &sro6;
            bzero(ro6, sizeof(*ro6));
        } else {
            ro = &sro;
            bzero(ro, sizeof(*ro));
        }
    }
    if (m == 0) {
        m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
        if (m == NULL) {
            return;
        }
        tlen = 0;
        m->m_data += max_linkhdr;
        if (isipv6) {
            VERIFY((MHLEN - max_linkhdr) >=
                (sizeof(*ip6) + sizeof(*nth)));
            bcopy((caddr_t)ip6, mtod(m, caddr_t),
                sizeof(struct ip6_hdr));
            ip6 = mtod(m, struct ip6_hdr *);
            nth = (struct tcphdr *)(void *)(ip6 + 1);
        } else {
            VERIFY((MHLEN - max_linkhdr) >=
                (sizeof(*ip) + sizeof(*nth)));
            bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
            ip = mtod(m, struct ip *);
            nth = (struct tcphdr *)(void *)(ip + 1);
        }
        bcopy(th, nth, sizeof(struct tcphdr));
#if MPTCP
        if ((tp) && (tp->t_mpflags & TMPF_RESET)) {
            flags = (TH_RST | TH_ACK);
        } else
#endif
        flags = TH_ACK;
    } else {
        m_freem(m->m_next);
        m->m_next = 0;
        m->m_data = (uintptr_t)ipgen;
        /* m_len is set later */
        tlen = 0;
#define xchg(a, b, type) { type t; t = a; a = b; b = t; }
        if (isipv6) {
            ip6_getsrcifaddr_info(m, &sifscope, NULL);
            ip6_getdstifaddr_info(m, &fifscope, NULL);
            if (!in6_embedded_scope) {
                m->m_pkthdr.pkt_flags &= ~PKTF_IFAINFO;
            }
            /* Expect 32-bit aligned IP on strict-align platforms */
            IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
            xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
            nth = (struct tcphdr *)(void *)(ip6 + 1);
        } else {
            /* Expect 32-bit aligned IP on strict-align platforms */
            IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
            xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
            nth = (struct tcphdr *)(void *)(ip + 1);
        }
        if (th != nth) {
            /*
             * this is usually a case when an extension header
             * exists between the IPv6 header and the
             * TCP header.
             */
            nth->th_sport = th->th_sport;
            nth->th_dport = th->th_dport;
        }
        xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
    }
    if (isipv6) {
        ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
            tlen));
        tlen += sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
        ip6_output_setsrcifscope(m, sifscope, NULL);
        ip6_output_setdstifscope(m, fifscope, NULL);
    } else {
        tlen += sizeof(struct tcpiphdr);
        ip->ip_len = tlen;
        ip->ip_ttl = (uint8_t)ip_defttl;
    }
    m->m_len = tlen;
    m->m_pkthdr.len = tlen;
    m->m_pkthdr.rcvif = 0;
    if (tra->keep_alive) {
        m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE;
    }

    nth->th_seq = htonl(seq);
    nth->th_ack = htonl(ack);
    nth->th_x2 = 0;
    nth->th_off = sizeof(struct tcphdr) >> 2;
    nth->th_flags = flags;
    if (tp) {
        nth->th_win = htons((u_short) (win >> tp->rcv_scale));
    } else {
        nth->th_win = htons((u_short)win);
    }
    nth->th_urp = 0;
    if (isipv6) {
        nth->th_sum = 0;
        nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
            htonl((tlen - sizeof(struct ip6_hdr)) + IPPROTO_TCP));
        m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
        ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
            ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
    } else {
        nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
            htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
        m->m_pkthdr.csum_flags = CSUM_TCP;
        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
    }
#if NECP
    necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0, 0, 0);
#endif /* NECP */

#if IPSEC
    if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
        ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
        m_freem(m);
        return;
    }
#endif

    if (tp != NULL) {
        u_int32_t svc_flags = 0;
        if (isipv6) {
            svc_flags |= PKT_SCF_IPV6;
        }
        sotc = tp->t_inpcb->inp_socket->so_traffic_class;
        if ((flags & TH_RST) == 0) {
            set_packet_service_class(m, tp->t_inpcb->inp_socket,
                sotc, svc_flags);
        } else {
            m_set_service_class(m, MBUF_SC_BK_SYS);
        }

        /* Embed flowhash and flow control flags */
        m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
        m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
        m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
        m->m_pkthdr.pkt_proto = IPPROTO_TCP;
        m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
        m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;

        if (flags & TH_RST) {
            m->m_pkthdr.comp_gencnt = tp->t_comp_ack_gencnt;
        }
    } else {
        if (flags & TH_RST) {
            m->m_pkthdr.comp_gencnt = TCP_ACK_COMPRESSION_DUMMY;
            m_set_service_class(m, MBUF_SC_BK_SYS);
        }
    }

    if (isipv6) {
        struct ip6_out_args ip6oa;
        bzero(&ip6oa, sizeof(ip6oa));
        ip6oa.ip6oa_boundif = tra->ifscope;
        ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
        ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
        ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

        if (tra->ifscope != IFSCOPE_NONE) {
            ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
        }
        if (tra->nocell) {
            ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
        }
        if (tra->noexpensive) {
            ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
        }
        if (tra->noconstrained) {
            ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
        }
        if (tra->awdl_unrestricted) {
            ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
        }
        if (tra->intcoproc_allowed) {
            ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
        }
        if (tra->management_allowed) {
            ip6oa.ip6oa_flags |= IP6OAF_MANAGEMENT_ALLOWED;
        }
        if (tra->ultra_constrained_allowed) {
            ip6oa.ip6oa_flags |= IP6OAF_ULTRA_CONSTRAINED_ALLOWED;
        }
        ip6oa.ip6oa_sotc = sotc;
        if (tp != NULL) {
            if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
                ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
            }
            ip6oa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
            if (check_qos_marking_again) {
                ip6oa.ip6oa_flags |= IP6OAF_REDO_QOSMARKING_POLICY;
            }
            ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
        }
        (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
            NULL, &ip6oa);

        if (check_qos_marking_again) {
            struct inpcb *inp = tp->t_inpcb;
            inp->inp_policyresult.results.qos_marking_gencount = ip6oa.qos_marking_gencount;
            if (ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED) {
                inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
            } else {
                inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
            }
        }

        if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
            (outif = ro6->ro_rt->rt_ifp) !=
            tp->t_inpcb->in6p_last_outifp) {
            tp->t_inpcb->in6p_last_outifp = outif;
#if SKYWALK
            if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
                netns_set_ifnet(&tp->t_inpcb->inp_netns_token,
                    tp->t_inpcb->in6p_last_outifp);
            }
#endif /* SKYWALK */
        }

        if (ro6 == &sro6) {
            ROUTE_RELEASE(ro6);
        }
    } else {
        struct ip_out_args ipoa;
        bzero(&ipoa, sizeof(ipoa));
        ipoa.ipoa_boundif = tra->ifscope;
        ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
        ipoa.ipoa_sotc = SO_TC_UNSPEC;
        ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

        if (tra->ifscope != IFSCOPE_NONE) {
            ipoa.ipoa_flags |= IPOAF_BOUND_IF;
        }
        if (tra->nocell) {
            ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
        }
        if (tra->noexpensive) {
            ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
        }
        if (tra->noconstrained) {
            ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
        }
        if (tra->awdl_unrestricted) {
            ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
        }
        if (tra->management_allowed) {
            ipoa.ipoa_flags |= IPOAF_MANAGEMENT_ALLOWED;
        }
        ipoa.ipoa_sotc = sotc;
        if (tp != NULL) {
            if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
                ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
            }
            if (!(tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE)) {
                ipoa.ipoa_flags |= IPOAF_REDO_QOSMARKING_POLICY;
            }
            ipoa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
            ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
        }
        if (ro != &sro) {
            /* Copy the cached route and take an extra reference */
            inp_route_copyout(tp->t_inpcb, &sro);
        }
        /*
         * For consistency, pass a local route copy.
         */
        (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);

        if (check_qos_marking_again) {
            struct inpcb *inp = tp->t_inpcb;
            inp->inp_policyresult.results.qos_marking_gencount = ipoa.qos_marking_gencount;
            if (ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED) {
                inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
            } else {
                inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
            }
        }
        if (tp != NULL && sro.ro_rt != NULL &&
            (outif = sro.ro_rt->rt_ifp) !=
            tp->t_inpcb->inp_last_outifp) {
            tp->t_inpcb->inp_last_outifp = outif;
#if SKYWALK
            if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
                netns_set_ifnet(&tp->t_inpcb->inp_netns_token, outif);
            }
#endif /* SKYWALK */
        }
        if (ro != &sro) {
            /* Synchronize cached PCB route */
            inp_route_copyin(tp->t_inpcb, &sro);
        } else {
            ROUTE_RELEASE(&sro);
        }
    }
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block. The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
    struct inp_tp *it;
    struct tcpcb *tp;
    struct socket *so = inp->inp_socket;
    int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
    uint32_t random_32;

    calculate_tcp_clock();

    if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
        it = (struct inp_tp *)(void *)inp;
        tp = &it->tcb;
    } else {
        tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
    }

    bzero((char *) tp, sizeof(struct tcpcb));
    LIST_INIT(&tp->t_segq);
    tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;

    tp->t_flags = TF_REQ_SCALE | (tcp_do_timestamps ? TF_REQ_TSTMP : 0);
    tp->t_flagsext |= TF_SACK_ENABLE;

    if (tcp_rack) {
        tp->t_flagsext |= TF_RACK_ENABLED;
    }

    TAILQ_INIT(&tp->snd_holes);
    SLIST_INIT(&tp->t_rxt_segments);
    TAILQ_INIT(&tp->t_segs_sent);
    RB_INIT(&tp->t_segs_sent_tree);
    TAILQ_INIT(&tp->t_segs_acked);
    TAILQ_INIT(&tp->seg_pool.free_segs);
    SLIST_INIT(&tp->t_notify_ack);
    tp->t_inpcb = inp;
    /*
     * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
     * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
     * reasonable initial retransmit time.
     */
    tp->t_srtt = TCPTV_SRTTBASE;
    tp->t_rttvar =
        ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
    tp->t_rttmin = tcp_TCPTV_MIN;
    tp->t_rxtcur = TCPTV_RTOBASE;

    if (tcp_use_newreno) {
        /* use newreno by default */
        tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
#if (DEVELOPMENT || DEBUG)
    } else if (tcp_use_ledbat) {
        /* use ledbat for testing */
        tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX;
#endif
    } else {
        if (TCP_L4S_ENABLED(tp)) {
            tp->tcp_cc_index = TCP_CC_ALGO_PRAGUE_INDEX;
        } else {
            tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
        }
    }

    tcp_cc_allocate_state(tp);

    if (CC_ALGO(tp)->init != NULL) {
        CC_ALGO(tp)->init(tp);
    }

    /* Initialize rledbat if we are using recv_bg */
    if (tcp_rledbat == 1 && TCP_RECV_BG(inp->inp_socket) &&
        tcp_cc_rledbat.init != NULL) {
        tcp_cc_rledbat.init(tp);
    }

    tp->snd_cwnd = tcp_initial_cwnd(tp);
    tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    tp->t_rcvtime = tcp_now;
    tp->tentry.timer_start = tcp_now;
    tp->rcv_unackwin = tcp_now;
    tp->t_persist_timeout = tcp_max_persist_timeout;
    tp->t_persist_stop = 0;
    tp->t_flagsext |= TF_RCVUNACK_WAITSS;
    tp->t_rexmtthresh = (uint8_t)tcprexmtthresh;
    tp->rack.reo_wnd_multi = 1;
    tp->rfbuf_ts = tcp_now;
    tp->rfbuf_space = tcp_initial_cwnd(tp);
    tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
    tp->bytes_lost = tp->bytes_sacked = tp->bytes_retransmitted = 0;

    /* Enable bandwidth measurement on this connection */
    tp->t_flagsext |= TF_MEASURESNDBW;
    if (tp->t_bwmeas == NULL) {
        tp->t_bwmeas = tcp_bwmeas_alloc(tp);
        if (tp->t_bwmeas == NULL) {
            tp->t_flagsext &= ~TF_MEASURESNDBW;
        }
    }

    /* Clear time wait tailq entry */
    tp->t_twentry.tqe_next = NULL;
    tp->t_twentry.tqe_prev = NULL;

    read_frandom(&random_32, sizeof(random_32));
    tp->t_comp_ack_gencnt = random_32;
    if (tp->t_comp_ack_gencnt <= TCP_ACK_COMPRESSION_DUMMY ||
        tp->t_comp_ack_gencnt > INT_MAX) {
        tp->t_comp_ack_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1;
    }
    tp->t_comp_ack_lastinc = tcp_now;

    /* Initialize Accurate ECN state */
    tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_disabled;
    tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_disabled;

    /*
     * IPv4 TTL initialization is necessary for an IPv6 socket as well,
     * because the socket may be bound to an IPv6 wildcard address,
     * which may match an IPv4-mapped IPv6 address.
     */
    inp->inp_ip_ttl = (uint8_t)ip_defttl;
    inp->inp_ppcb = (caddr_t)tp;
    return tp;      /* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error. If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
    struct socket *so = tp->t_inpcb->inp_socket;
#if CONFIG_DTRACE
    struct inpcb *inp = tp->t_inpcb;
#endif

    if (TCPS_HAVERCVDSYN(tp->t_state)) {
        DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
            struct tcpcb *, tp, int32_t, TCPS_CLOSED);
        TCP_LOG_STATE(tp, TCPS_CLOSED);
        tp->t_state = TCPS_CLOSED;
        (void) tcp_output(tp);
        tcpstat.tcps_drops++;
    } else {
        tcpstat.tcps_conndrops++;
    }
    if (errno == ETIMEDOUT && tp->t_softerror) {
        errno = tp->t_softerror;
    }
    so->so_error = (u_short)errno;

    TCP_LOG_CONNECTION_SUMMARY(tp);

    return tcp_close(tp);
}

void
tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
{
    uint32_t rtt = rt->rt_rmx.rmx_rtt;

    TCP_LOG_RTM_RTT(tp, rt);

    if (rtt != 0 && tcp_init_rtt_from_cache != 0) {
        /*
         * XXX the lock bit for RTT indicates that the value
         * is also a minimum value; this is subject to time.
         */
        if (rt->rt_rmx.rmx_locks & RTV_RTT) {
            tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
        } else {
            tp->t_rttmin = TCPTV_REXMTMIN;
        }

        tp->t_srtt =
            rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
        tcpstat.tcps_usedrtt++;

        if (rt->rt_rmx.rmx_rttvar) {
            tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
                (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
            tcpstat.tcps_usedrttvar++;
        } else {
            /* default variation is +- 1 rtt */
            tp->t_rttvar =
                tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
        }

        /*
         * The RTO formula in the route metric case is based on:
         *     srtt + 4 * rttvar
         * modulo the min, max and slop
         */
        TCPT_RANGESET(tp->t_rxtcur,
            TCP_REXMTVAL(tp),
            tp->t_rttmin, TCPTV_REXMTMAX,
            TCP_ADD_REXMTSLOP(tp));
    } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_srtt == 0 &&
        tp->t_rxtshift == 0) {
        struct ifnet *ifp = rt->rt_ifp;

        if (ifp != NULL && (ifp->if_eflags & IFEF_AWDL) != 0) {
            /*
             * AWDL needs a special value for the default
             * initial retransmission timeout
             */
            if (tcp_awdl_rtobase > tcp_TCPTV_MIN) {
                tp->t_rttvar = ((tcp_awdl_rtobase - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
            } else {
                tp->t_rttvar = ((tcp_TCPTV_MIN - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
            }
            TCPT_RANGESET(tp->t_rxtcur,
                TCP_REXMTVAL(tp),
                tp->t_rttmin, TCPTV_REXMTMAX,
                TCP_ADD_REXMTSLOP(tp));
        }
    }

    TCP_LOG_RTT_INFO(tp);
}

static inline void
tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
    struct ifnet_stats_per_flow *ifs)
{
    struct inpcb *inp;
    struct socket *so;
    if (tp == NULL || ifs == NULL) {
        return;
    }

    bzero(ifs, sizeof(*ifs));
    inp = tp->t_inpcb;
    so = inp->inp_socket;

    ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? 0 : 1;
    ifs->local = (tp->t_flags & TF_LOCAL) ? 1 : 0;
    ifs->connreset = (so->so_error == ECONNRESET) ? 1 : 0;
    ifs->conntimeout = (so->so_error == ETIMEDOUT) ? 1 : 0;
    ifs->ecn_flags = tp->ecn_flags;
    ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
    ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
    ifs->rxmitpkts = tp->t_stat.rxmitpkts;
    ifs->rcvoopack = tp->t_rcvoopack;
    ifs->pawsdrop = tp->t_pawsdrop;
    ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
    ifs->reordered_pkts = tp->t_reordered_pkts;
    ifs->dsack_sent = tp->t_dsack_sent;
    ifs->dsack_recvd = tp->t_dsack_recvd;
    ifs->srtt = tp->t_srtt;
    ifs->rttupdated = tp->t_rttupdated;
    ifs->rttvar = tp->t_rttvar;
    ifs->rttmin = get_base_rtt(tp);
    if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > 0) {
        ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
    } else {
        ifs->bw_sndbw_max = 0;
    }
    if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > 0) {
        ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
    } else {
        ifs->bw_rcvbw_max = 0;
    }
    ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
    ifs->txpackets = inp->inp_stat->txpackets;
    ifs->rxpackets = inp->inp_stat->rxpackets;
}

static inline void
tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
    struct if_tcp_ecn_perf_stat *stat)
{
    u_int64_t curval, oldval;
    stat->total_txpkts += ifs->txpackets;
    stat->total_rxpkts += ifs->rxpackets;
    stat->total_rxmitpkts += ifs->rxmitpkts;
    stat->total_oopkts += ifs->rcvoopack;
    stat->total_reorderpkts += (ifs->reordered_pkts +
        ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);

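    /*
     * The running averages below are 1/16-gain exponentially weighted
     * moving averages: new = (old * 15 + sample) / 16, computed as
     * ((oldval << 4) - oldval + curval) >> 4.
     */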
    /* Average RTT */
    curval = ifs->srtt >> TCP_RTT_SHIFT;
    if (curval > 0 && ifs->rttupdated >= 16) {
        if (stat->rtt_avg == 0) {
            stat->rtt_avg = curval;
        } else {
            oldval = stat->rtt_avg;
            stat->rtt_avg = ((oldval << 4) - oldval + curval) >> 4;
        }
    }

    /* RTT variance */
    curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
    if (curval > 0 && ifs->rttupdated >= 16) {
        if (stat->rtt_var == 0) {
            stat->rtt_var = curval;
        } else {
            oldval = stat->rtt_var;
            stat->rtt_var =
                ((oldval << 4) - oldval + curval) >> 4;
        }
    }

    /* SACK episodes */
    stat->sack_episodes += ifs->sack_recovery_episodes;
    if (ifs->connreset) {
        stat->rst_drop++;
    }
}

static inline void
tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
    struct if_lim_perf_stat *stat)
{
    u_int64_t curval, oldval;

    stat->lim_total_txpkts += ifs->txpackets;
    stat->lim_total_rxpkts += ifs->rxpackets;
    stat->lim_total_retxpkts += ifs->rxmitpkts;
    stat->lim_total_oopkts += ifs->rcvoopack;

    if (ifs->bw_sndbw_max > 0) {
        /* convert from bytes per ms to bits per second */
        ifs->bw_sndbw_max *= 8000;
        stat->lim_ul_max_bandwidth = MAX(stat->lim_ul_max_bandwidth,
            ifs->bw_sndbw_max);
    }

    if (ifs->bw_rcvbw_max > 0) {
        /* convert from bytes per ms to bits per second */
        ifs->bw_rcvbw_max *= 8000;
        stat->lim_dl_max_bandwidth = MAX(stat->lim_dl_max_bandwidth,
            ifs->bw_rcvbw_max);
    }

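    /*
     * E.g. a peak measured rate of 125 bytes/ms scales by 8000 to
     * 1,000,000 bits per second (1 Mbit/s).
     */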
    /* Average RTT */
    curval = ifs->srtt >> TCP_RTT_SHIFT;
    if (curval > 0 && ifs->rttupdated >= 16) {
        if (stat->lim_rtt_average == 0) {
            stat->lim_rtt_average = curval;
        } else {
            oldval = stat->lim_rtt_average;
            stat->lim_rtt_average =
                ((oldval << 4) - oldval + curval) >> 4;
        }
    }

    /* RTT variance */
    curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
    if (curval > 0 && ifs->rttupdated >= 16) {
        if (stat->lim_rtt_variance == 0) {
            stat->lim_rtt_variance = curval;
        } else {
            oldval = stat->lim_rtt_variance;
            stat->lim_rtt_variance =
                ((oldval << 4) - oldval + curval) >> 4;
        }
    }

    if (stat->lim_rtt_min == 0) {
        stat->lim_rtt_min = ifs->rttmin;
    } else {
        stat->lim_rtt_min = MIN(stat->lim_rtt_min, ifs->rttmin);
    }

    /* connection timeouts */
    stat->lim_conn_attempts++;
    if (ifs->conntimeout) {
        stat->lim_conn_timeouts++;
    }

    /* bytes sent using background delay-based algorithms */
    stat->lim_bk_txpkts += ifs->bk_txpackets;
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
    struct inpcb *inp = tp->t_inpcb;
    struct socket *so = inp->inp_socket;
    int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
    struct route *ro;
    struct rtentry *rt;
    int dosavessthresh;
    struct ifnet_stats_per_flow ifs;

    /* tcp_close was called previously, bail */
    if (inp->inp_ppcb == NULL) {
        return NULL;
    }

    tcp_del_fsw_flow(tp);

    tcp_canceltimers(tp);
    KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);

    /*
     * If another thread for this tcp is currently in ip (indicated by
     * the TF_SENDINPROG flag), defer the cleanup until after it returns
     * back to tcp. This is done to serialize the close until after all
     * pending output is finished, in order to avoid having the PCB be
     * detached and the cached route cleaned, only for ip to cache the
     * route back into the PCB again. Note that we've cleared all the
     * timers at this point. Set TF_CLOSING to indicate to tcp_output()
     * that it should call us again once it returns from ip; at that
     * point both flags should be cleared and we can proceed further
     * with the cleanup.
     */
    if ((tp->t_flags & TF_CLOSING) ||
        inp->inp_sndinprog_cnt > 0) {
        tp->t_flags |= TF_CLOSING;
        return NULL;
    }

    TCP_LOG_CONNECTION_SUMMARY(tp);

    DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
        struct tcpcb *, tp, int32_t, TCPS_CLOSED);

    ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
    rt = ro->ro_rt;
    if (rt != NULL) {
        RT_LOCK_SPIN(rt);
    }

    /*
     * If we got enough samples through the srtt filter,
     * save the rtt and rttvar in the routing entry.
     * 'Enough' is arbitrarily defined as 16 samples.
     * 16 samples is enough for the srtt filter to converge
     * to within 5% of the correct value; fewer samples and
     * we could save a very bogus rtt.
     *
     * Don't update the default route's characteristics and don't
     * update anything that the user "locked".
     */
    if (tp->t_rttupdated >= 16) {
        u_int32_t i = 0;
        bool log_rtt = false;

        if (isipv6) {
            struct sockaddr_in6 *sin6;

            if (rt == NULL) {
                goto no_valid_rt;
            }
            sin6 = SIN6(rt_key(rt));
            if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
                goto no_valid_rt;
            }
        } else if (ROUTE_UNUSABLE(ro) ||
            SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
            DTRACE_TCP4(state__change, void, NULL,
                struct inpcb *, inp, struct tcpcb *, tp,
                int32_t, TCPS_CLOSED);
            TCP_LOG_STATE(tp, TCPS_CLOSED);
            tp->t_state = TCPS_CLOSED;
            goto no_valid_rt;
        }

        RT_LOCK_ASSERT_HELD(rt);
        if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
            i = tp->t_srtt *
                (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
            if (rt->rt_rmx.rmx_rtt && i) {
                /*
                 * filter this update to half the old & half
                 * the new values, converting scale.
                 * See route.h and tcp_var.h for a
                 * description of the scaling constants.
                 */
                rt->rt_rmx.rmx_rtt =
                    (rt->rt_rmx.rmx_rtt + i) / 2;
            } else {
                rt->rt_rmx.rmx_rtt = i;
            }
            tcpstat.tcps_cachedrtt++;
            log_rtt = true;
        }
        if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
            i = tp->t_rttvar *
                (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
            if (rt->rt_rmx.rmx_rttvar && i) {
                rt->rt_rmx.rmx_rttvar =
                    (rt->rt_rmx.rmx_rttvar + i) / 2;
            } else {
                rt->rt_rmx.rmx_rttvar = i;
            }
            tcpstat.tcps_cachedrttvar++;
            log_rtt = true;
        }
        if (log_rtt) {
            TCP_LOG_RTM_RTT(tp, rt);
            TCP_LOG_RTT_INFO(tp);
        }
        /*
         * The old comment here said:
         * update the pipelimit (ssthresh) if it has been updated
         * already or if a pipesize was specified & the threshold
         * got below half the pipesize. I.e., wait for bad news
         * before we start updating, then update on both good
         * and bad news.
         *
         * But we want to save the ssthresh even if no pipesize is
         * specified explicitly in the route, because such
         * connections still have an implicit pipesize specified
         * by the global tcp_sendspace. In the absence of a reliable
         * way to calculate the pipesize, it will have to do.
         */
        i = tp->snd_ssthresh;
        if (rt->rt_rmx.rmx_sendpipe != 0) {
            dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
        } else {
            dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
        }
        if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
            i != 0 && rt->rt_rmx.rmx_ssthresh != 0) ||
            dosavessthresh) {
            /*
             * convert the limit from user data bytes to
             * packets then to packet data bytes.
             */
            i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
            if (i < 2) {
                i = 2;
            }
            i *= (u_int32_t)(tp->t_maxseg +
                (isipv6 ? sizeof(struct ip6_hdr) +
                sizeof(struct tcphdr) :
                sizeof(struct tcpiphdr)));
            if (rt->rt_rmx.rmx_ssthresh) {
                rt->rt_rmx.rmx_ssthresh =
                    (rt->rt_rmx.rmx_ssthresh + i) / 2;
            } else {
                rt->rt_rmx.rmx_ssthresh = i;
            }
            tcpstat.tcps_cachedssthresh++;
        }
    }

    /*
     * Mark route for deletion if no information is cached.
     */
    if (rt != NULL && (so->so_flags & SOF_OVERFLOW)) {
        if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
            rt->rt_rmx.rmx_rtt == 0) {
            rt->rt_flags |= RTF_DELCLONE;
        }
    }

no_valid_rt:
    if (rt != NULL) {
        RT_UNLOCK(rt);
    }

    /* free the reassembly queue, if any */
    (void) tcp_freeq(tp);

    /* performance stats per interface */
    tcp_create_ifnet_stats_per_flow(tp, &ifs);
    tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);

    tcp_free_sackholes(tp);
    tcp_notify_ack_free(tp);

    inp_decr_sndbytes_allunsent(so, tp->snd_una);

    if (tp->t_bwmeas != NULL) {
        tcp_bwmeas_free(tp);
    }
    tcp_rxtseg_clean(tp);
    tcp_segs_sent_clean(tp, true);

    /* Free the packet list */
    if (tp->t_pktlist_head != NULL) {
        m_freem_list(tp->t_pktlist_head);
    }
    TCP_PKTLIST_CLEAR(tp);

    if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
        inp->inp_saved_ppcb = (caddr_t) tp;
    }

    TCP_LOG_STATE(tp, TCPS_CLOSED);
    tp->t_state = TCPS_CLOSED;

    /*
     * Issue a wakeup before detach so that we don't miss
     * a wakeup
     */
    sodisconnectwakeup(so);

    /*
     * Make sure to clear the TCP Keep Alive Offload as it is
     * ref counted on the interface
     */
    tcp_clear_keep_alive_offload(so);

    /*
     * If this is a socket that does not want to wake the device
     * for its traffic, the application might need to know that the
     * socket is closed, so send a notification.
     */
    if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
        inp->inp_state != INPCB_STATE_DEAD &&
        !(inp->inp_flags2 & INP2_TIMEWAIT)) {
        socket_post_kev_msg_closed(so);
    }

    if (CC_ALGO(tp)->cleanup != NULL) {
        CC_ALGO(tp)->cleanup(tp);
    }

    tp->tcp_cc_index = TCP_CC_ALGO_NONE;

    if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.cleanup != NULL) {
        tcp_cc_rledbat.cleanup(tp);
    }

    /* Can happen if we close the socket before receiving the third ACK */
    if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
        OSDecrementAtomic(&tcp_tfo_halfcnt);

        /* Panic if something has gone terribly wrong. */
        VERIFY(tcp_tfo_halfcnt >= 0);

        tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
    }

    if (SOCK_CHECK_DOM(so, PF_INET6)) {
        in6_pcbdetach(inp);
    } else {
        in_pcbdetach(inp);
    }

    /*
     * Call soisdisconnected after detach because it might unlock the socket
     */
    soisdisconnected(so);
    tcpstat.tcps_closed++;
    KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
        tcpstat.tcps_closed, 0, 0, 0, 0);
    return NULL;
}
1761
1762 int
tcp_freeq(struct tcpcb * tp)1763 tcp_freeq(struct tcpcb *tp)
1764 {
1765 struct tseg_qent *q;
1766 int rv = 0;
1767 int count = 0;
1768
1769 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1770 LIST_REMOVE(q, tqe_q);
        tp->t_reassq_mbcnt -= _MSIZE + ((q->tqe_m->m_flags & M_EXT) ?
            q->tqe_m->m_ext.ext_size : 0);
        m_freem(q->tqe_m);
        zfree(tcp_reass_zone, q);
        rv = 1;
        count++;
    }
    tp->t_reassqlen = 0;
    if (count > 0) {
        OSAddAtomic(-count, &tcp_reass_total_qlen);
    }
    return rv;
}


void
tcp_drain(void)
{
    struct inpcb *inp;
    struct tcpcb *tp;

    if (!lck_rw_try_lock_exclusive(&tcbinfo.ipi_lock)) {
        return;
    }

    LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
        if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
            WNT_STOPUSING) {
            socket_lock(inp->inp_socket, 1);
            if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
                == WNT_STOPUSING) {
                /* lost a race, try the next one */
                socket_unlock(inp->inp_socket, 1);
                continue;
            }
            tp = intotcpcb(inp);

            so_drain_extended_bk_idle(inp->inp_socket);

            socket_unlock(inp->inp_socket, 1);
        }
    }
    lck_rw_done(&tcbinfo.ipi_lock);
}

/*
 * Notify a tcp user of an asynchronous error:
 * store the error as a soft error.
 *
 * Do not wake up the user, since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
static void
tcp_notify(struct inpcb *inp, int error)
{
    struct tcpcb *tp;

    if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) {
        return; /* pcb is gone already */
    }
    tp = (struct tcpcb *)inp->inp_ppcb;

    VERIFY(tp != NULL);
    /*
     * Ignore some errors if we are hooked up.
     * If connection hasn't completed, has retransmitted several times,
     * and receives a second error, give up now. This is better
     * than waiting a long time to establish a connection that
     * can never complete.
     */
    if (tp->t_state == TCPS_ESTABLISHED &&
        (error == EHOSTUNREACH || error == ENETUNREACH ||
        error == EHOSTDOWN)) {
        if (inp->inp_route.ro_rt) {
            rtfree(inp->inp_route.ro_rt);
            inp->inp_route.ro_rt = (struct rtentry *)NULL;
        }
    } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
        tp->t_softerror) {
        tcp_drop(tp, error);
    } else {
        tp->t_softerror = error;
    }
}

struct bwmeas *
tcp_bwmeas_alloc(struct tcpcb *tp)
{
    struct bwmeas *elm;
    elm = zalloc_flags(tcp_bwmeas_zone, Z_ZERO | Z_WAITOK);
    elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
    elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
    return elm;
}

void
tcp_bwmeas_free(struct tcpcb *tp)
{
    zfree(tcp_bwmeas_zone, tp->t_bwmeas);
    tp->t_bwmeas = NULL;
    tp->t_flagsext &= ~(TF_MEASURESNDBW);
}

int
get_tcp_inp_list(struct inpcb * __single *inp_list __counted_by(n), size_t n, inp_gen_t gencnt)
{
    struct tcpcb *tp;
    struct inpcb *inp;
    int i = 0;

    LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
        if (i >= n) {
            break;
        }
        if (inp->inp_gencnt <= gencnt &&
            inp->inp_state != INPCB_STATE_DEAD) {
            inp_list[i++] = inp;
        }
    }

    TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
        if (i >= n) {
            break;
        }
        inp = tp->t_inpcb;
        if (inp->inp_gencnt <= gencnt &&
            inp->inp_state != INPCB_STATE_DEAD) {
            inp_list[i++] = inp;
        }
    }
    return i;
}

/*
 * tcpcb_to_otcpcb copies specific bits of a tcpcb to an otcpcb format.
 * The otcpcb data structure is passed to user space and must not change.
 */
static void
tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
{
    otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
    otp->t_dupacks = tp->t_dupacks;
    otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
    otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
    otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
    otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
    otp->t_inpcb =
        (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRHASH(tp->t_inpcb);
    otp->t_state = tp->t_state;
    otp->t_flags = tp->t_flags;
    otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
    otp->snd_una = tp->snd_una;
    otp->snd_max = tp->snd_max;
    otp->snd_nxt = tp->snd_nxt;
    otp->snd_up = tp->snd_up;
    otp->snd_wl1 = tp->snd_wl1;
    otp->snd_wl2 = tp->snd_wl2;
    otp->iss = tp->iss;
    otp->irs = tp->irs;
    otp->rcv_nxt = tp->rcv_nxt;
    otp->rcv_adv = tp->rcv_adv;
    otp->rcv_wnd = tp->rcv_wnd;
    otp->rcv_up = tp->rcv_up;
    otp->snd_wnd = tp->snd_wnd;
    otp->snd_cwnd = tp->snd_cwnd;
    otp->snd_ssthresh = tp->snd_ssthresh;
    otp->t_maxopd = tp->t_maxopd;
    otp->t_rcvtime = tp->t_rcvtime;
    otp->t_starttime = tp->t_starttime;
    otp->t_rtttime = tp->t_rtttime;
    otp->t_rtseq = tp->t_rtseq;
    otp->t_rxtcur = tp->t_rxtcur;
    otp->t_maxseg = tp->t_maxseg;
    otp->t_srtt = tp->t_srtt;
    otp->t_rttvar = tp->t_rttvar;
    otp->t_rxtshift = tp->t_rxtshift;
    otp->t_rttmin = tp->t_rttmin;
    otp->t_rttupdated = tp->t_rttupdated;
    otp->max_sndwnd = tp->max_sndwnd;
    otp->t_softerror = tp->t_softerror;
    otp->t_oobflags = tp->t_oobflags;
    otp->t_iobc = tp->t_iobc;
    otp->snd_scale = tp->snd_scale;
    otp->rcv_scale = tp->rcv_scale;
    otp->request_r_scale = tp->request_r_scale;
    otp->requested_s_scale = tp->requested_s_scale;
    otp->ts_recent = tp->ts_recent;
    otp->ts_recent_age = tp->ts_recent_age;
    otp->last_ack_sent = tp->last_ack_sent;
    otp->cc_send = 0;
    otp->cc_recv = 0;
    otp->snd_recover = tp->snd_recover;
    otp->snd_cwnd_prev = tp->snd_cwnd_prev;
    otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
    otp->t_badrxtwin = 0;
}

static int
tcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
    int error, i = 0, n, sz;
    struct inpcb **inp_list;
    inp_gen_t gencnt;
    struct xinpgen xig;

    /*
     * The process of preparing the TCB list is too time-consuming and
     * resource-intensive to perform twice on every request (once for
     * the size estimate, once for the actual copy-out).
     */
    lck_rw_lock_shared(&tcbinfo.ipi_lock);
    if (req->oldptr == USER_ADDR_NULL) {
        n = tcbinfo.ipi_count;
        req->oldidx = 2 * (sizeof(xig))
            + (n + n / 8) * sizeof(struct xtcpcb);
        lck_rw_done(&tcbinfo.ipi_lock);
        return 0;
    }
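    /*
     * A rough sketch of the estimate above (numbers hypothetical):
     * with n == 1000 PCBs the reply is sized for two generation
     * headers plus 1000 + 1000 / 8 = 1125 xtcpcb records, i.e. about
     * 12% slack, so that connections created while the caller
     * allocates its buffer still fit on the second pass.
     */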

    if (req->newptr != USER_ADDR_NULL) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return EPERM;
    }

    /*
     * OK, now we're committed to doing something.
     */
    gencnt = tcbinfo.ipi_gencnt;
    sz = n = tcbinfo.ipi_count;

    bzero(&xig, sizeof(xig));
    xig.xig_len = sizeof(xig);
    xig.xig_count = n;
    xig.xig_gen = gencnt;
    xig.xig_sogen = so_gencnt;
    error = SYSCTL_OUT(req, &xig, sizeof(xig));
    if (error) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return error;
    }
    /*
     * We are done if there is no pcb
     */
    if (n == 0) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return 0;
    }

    inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
    if (inp_list == NULL) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return ENOMEM;
    }

    n = get_tcp_inp_list(inp_list, n, gencnt);

    error = 0;
    for (i = 0; i < n; i++) {
        struct xtcpcb xt;
        caddr_t inp_ppcb __single;
        struct inpcb *inp;

        inp = inp_list[i];

        if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
            continue;
        }
        socket_lock(inp->inp_socket, 1);
        if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
            socket_unlock(inp->inp_socket, 1);
            continue;
        }
        if (inp->inp_gencnt > gencnt) {
            socket_unlock(inp->inp_socket, 1);
            continue;
        }

        bzero(&xt, sizeof(xt));
        xt.xt_len = sizeof(xt);
        /* XXX should avoid extra copy */
        inpcb_to_compat(inp, &xt.xt_inp);
        inp_ppcb = inp->inp_ppcb;
        if (inp_ppcb != NULL) {
            tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb,
                &xt.xt_tp);
        } else {
            bzero((char *) &xt.xt_tp, sizeof(xt.xt_tp));
        }
        if (inp->inp_socket) {
            sotoxsocket(inp->inp_socket, &xt.xt_socket);
        }

        socket_unlock(inp->inp_socket, 1);

        error = SYSCTL_OUT(req, &xt, sizeof(xt));
    }
    if (!error) {
        /*
         * Give the user an updated idea of our state.
         * If the generation differs from what we told
         * her before, she knows that something happened
         * while we were processing this request, and it
         * might be necessary to retry.
         */
        bzero(&xig, sizeof(xig));
        xig.xig_len = sizeof(xig);
        xig.xig_gen = tcbinfo.ipi_gencnt;
        xig.xig_sogen = so_gencnt;
        xig.xig_count = tcbinfo.ipi_count;
        error = SYSCTL_OUT(req, &xig, sizeof(xig));
    }

    lck_rw_done(&tcbinfo.ipi_lock);
    kfree_type(struct inpcb *, sz, inp_list);
    return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");

#if XNU_TARGET_OS_OSX

static void
tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
{
    otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
    otp->t_dupacks = tp->t_dupacks;
    otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
    otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
    otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
    otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
    otp->t_state = tp->t_state;
    otp->t_flags = tp->t_flags;
    otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
    otp->snd_una = tp->snd_una;
    otp->snd_max = tp->snd_max;
    otp->snd_nxt = tp->snd_nxt;
    otp->snd_up = tp->snd_up;
    otp->snd_wl1 = tp->snd_wl1;
    otp->snd_wl2 = tp->snd_wl2;
    otp->iss = tp->iss;
    otp->irs = tp->irs;
    otp->rcv_nxt = tp->rcv_nxt;
    otp->rcv_adv = tp->rcv_adv;
    otp->rcv_wnd = tp->rcv_wnd;
    otp->rcv_up = tp->rcv_up;
    otp->snd_wnd = tp->snd_wnd;
    otp->snd_cwnd = tp->snd_cwnd;
    otp->snd_ssthresh = tp->snd_ssthresh;
    otp->t_maxopd = tp->t_maxopd;
    otp->t_rcvtime = tp->t_rcvtime;
    otp->t_starttime = tp->t_starttime;
    otp->t_rtttime = tp->t_rtttime;
    otp->t_rtseq = tp->t_rtseq;
    otp->t_rxtcur = tp->t_rxtcur;
    otp->t_maxseg = tp->t_maxseg;
    otp->t_srtt = tp->t_srtt;
    otp->t_rttvar = tp->t_rttvar;
    otp->t_rxtshift = tp->t_rxtshift;
    otp->t_rttmin = tp->t_rttmin;
    otp->t_rttupdated = tp->t_rttupdated;
    otp->max_sndwnd = tp->max_sndwnd;
    otp->t_softerror = tp->t_softerror;
    otp->t_oobflags = tp->t_oobflags;
    otp->t_iobc = tp->t_iobc;
    otp->snd_scale = tp->snd_scale;
    otp->rcv_scale = tp->rcv_scale;
    otp->request_r_scale = tp->request_r_scale;
    otp->requested_s_scale = tp->requested_s_scale;
    otp->ts_recent = tp->ts_recent;
    otp->ts_recent_age = tp->ts_recent_age;
    otp->last_ack_sent = tp->last_ack_sent;
    otp->cc_send = 0;
    otp->cc_recv = 0;
    otp->snd_recover = tp->snd_recover;
    otp->snd_cwnd_prev = tp->snd_cwnd_prev;
    otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
    otp->t_badrxtwin = 0;
}


static int
tcp_pcblist64 SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
    int error, i = 0, n, sz;
    struct inpcb **inp_list;
    inp_gen_t gencnt;
    struct xinpgen xig;

    /*
     * The process of preparing the TCB list is too time-consuming and
     * resource-intensive to perform twice on every request (once for
     * the size estimate, once for the actual copy-out).
     */
    lck_rw_lock_shared(&tcbinfo.ipi_lock);
    if (req->oldptr == USER_ADDR_NULL) {
        n = tcbinfo.ipi_count;
        req->oldidx = 2 * (sizeof(xig))
            + (n + n / 8) * sizeof(struct xtcpcb64);
        lck_rw_done(&tcbinfo.ipi_lock);
        return 0;
    }

    if (req->newptr != USER_ADDR_NULL) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return EPERM;
    }

    /*
     * OK, now we're committed to doing something.
     */
    gencnt = tcbinfo.ipi_gencnt;
    sz = n = tcbinfo.ipi_count;

    bzero(&xig, sizeof(xig));
    xig.xig_len = sizeof(xig);
    xig.xig_count = n;
    xig.xig_gen = gencnt;
    xig.xig_sogen = so_gencnt;
    error = SYSCTL_OUT(req, &xig, sizeof(xig));
    if (error) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return error;
    }
    /*
     * We are done if there is no pcb
     */
    if (n == 0) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return 0;
    }

    inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
    if (inp_list == NULL) {
        lck_rw_done(&tcbinfo.ipi_lock);
        return ENOMEM;
    }

    n = get_tcp_inp_list(inp_list, n, gencnt);

    error = 0;
    for (i = 0; i < n; i++) {
        struct xtcpcb64 xt;
        struct inpcb *inp;

        inp = inp_list[i];

        if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
            continue;
        }
        socket_lock(inp->inp_socket, 1);
        if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
            socket_unlock(inp->inp_socket, 1);
            continue;
        }
        if (inp->inp_gencnt > gencnt) {
            socket_unlock(inp->inp_socket, 1);
            continue;
        }

        bzero(&xt, sizeof(xt));
        xt.xt_len = sizeof(xt);
        inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
        xt.xt_inpcb.inp_ppcb =
            (uint64_t)VM_KERNEL_ADDRHASH(inp->inp_ppcb);
        if (inp->inp_ppcb != NULL) {
            tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb,
                &xt);
        }
        if (inp->inp_socket) {
            sotoxsocket64(inp->inp_socket,
                &xt.xt_inpcb.xi_socket);
        }

        socket_unlock(inp->inp_socket, 1);

        error = SYSCTL_OUT(req, &xt, sizeof(xt));
    }
    if (!error) {
        /*
         * Give the user an updated idea of our state.
         * If the generation differs from what we told
         * her before, she knows that something happened
         * while we were processing this request, and it
         * might be necessary to retry.
         */
        bzero(&xig, sizeof(xig));
        xig.xig_len = sizeof(xig);
        xig.xig_gen = tcbinfo.ipi_gencnt;
        xig.xig_sogen = so_gencnt;
        xig.xig_count = tcbinfo.ipi_count;
        error = SYSCTL_OUT(req, &xig, sizeof(xig));
    }

    lck_rw_done(&tcbinfo.ipi_lock);
    kfree_type(struct inpcb *, sz, inp_list);
    return error;
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");

#endif /* XNU_TARGET_OS_OSX */

static int
tcp_pcblist_n SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
    int error = 0;

    error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);

    return error;
}


SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");

static int
tcp_progress_probe_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

    return ntstat_tcp_progress_enable(req);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress_enable,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
    tcp_progress_probe_enable, "S", "Enable/disable TCP keepalive probing on the specified link(s)");


__private_extern__ void
tcp_get_ports_used(ifnet_t ifp, int protocol, uint32_t flags,
    bitstr_t *__counted_by(bitstr_size(IP_PORTRANGE_SIZE)) bitfield)
{
    inpcb_get_ports_used(ifp, protocol, flags, bitfield,
        &tcbinfo);
}

__private_extern__ uint32_t
tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
{
    return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
}

__private_extern__ uint32_t
tcp_find_anypcb_byaddr(struct ifaddr *ifa)
{
#if SKYWALK
    if (netns_is_enabled()) {
        return netns_find_anyres_byaddr(ifa, IPPROTO_TCP);
    } else
#endif /* SKYWALK */
    return inpcb_find_anypcb_byaddr(ifa, &tcbinfo);
}

static void
tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
{
    struct rtentry *rt = NULL;
    u_short ifscope = IFSCOPE_NONE;
    int mtu;
    struct sockaddr_in icmpsrc = {
        .sin_len = sizeof(struct sockaddr_in),
        .sin_family = AF_INET, .sin_port = 0, .sin_addr = { .s_addr = 0 },
        .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
    };
    struct icmp *icp = NULL;

    icp = __container_of(ip, struct icmp, icmp_ip);
    icmpsrc.sin_addr = icp->icmp_ip.ip_dst;

    /*
     * MTU discovery:
     * If we got a needfrag and there is a host route to the
     * original destination, and the MTU is not locked, then
     * set the MTU in the route to the suggested new value
     * (if given) and then notify as usual. The ULPs will
     * notice that the MTU has changed and adapt accordingly.
     * If no new MTU was suggested, then we guess a new one
     * less than the current value. If the new MTU is
     * unreasonably small (defined by sysctl tcp_minmss), then
     * we reset the MTU to the interface value and enable the
     * lock bit, indicating that we are no longer doing MTU
     * discovery.
     */
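    /*
     * Sketch of the fallback below when the router offers no next-hop
     * MTU (icmp_nextmtu == 0): ip_next_mtu() is assumed to step down
     * through a table of common MTU plateaus (e.g. 1500 -> 1492 ->
     * 1006 -> 508; the exact values are an implementation detail of
     * the ICMP code), so a route at 1500 is retried at the next
     * smaller plateau rather than probed byte-by-byte.
     */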
    if (ROUTE_UNUSABLE(&(inp->inp_route)) == false) {
        rt = inp->inp_route.ro_rt;
    }

    /*
     * icmp6_mtudisc_update scopes the routing lookup
     * to the incoming interface (delivered from the mbuf
     * packet header).
     * That is mostly fine, but for asymmetric networks
     * it may be an issue.
     * Fragmentation-needed or Packet-too-big really
     * communicates the MTU for the outbound data path.
     * So take the interface scope from the cached route or
     * from the last outgoing interface of the inp.
     */
    if (rt != NULL) {
        ifscope = (rt->rt_ifp != NULL) ?
            rt->rt_ifp->if_index : IFSCOPE_NONE;
    } else {
        ifscope = (inp->inp_last_outifp != NULL) ?
            inp->inp_last_outifp->if_index : IFSCOPE_NONE;
    }

    if ((rt == NULL) ||
        !(rt->rt_flags & RTF_HOST) ||
        (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) {
        rt = rtalloc1_scoped(SA(&icmpsrc), 0, RTF_CLONING | RTF_PRCLONING, ifscope);
    } else if (rt) {
        RT_LOCK(rt);
        rtref(rt);
        RT_UNLOCK(rt);
    }

    if (rt != NULL) {
        RT_LOCK(rt);
        if ((rt->rt_flags & RTF_HOST) &&
            !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
            mtu = ntohs(icp->icmp_nextmtu);
            /*
             * XXX Stock BSD has changed the following
             * to compare with icp->icmp_ip.ip_len
             * to converge faster when sent packet
             * < route's MTU. We may want to adopt
             * that change.
             */
            if (mtu == 0) {
                mtu = ip_next_mtu(rt->rt_rmx.
                    rmx_mtu, 1);
            }
#if DEBUG_MTUDISC
            printf("MTU for %s reduced to %d\n",
                inet_ntop(AF_INET,
                &icmpsrc.sin_addr, ipv4str,
                sizeof(ipv4str)), mtu);
#endif
            if (mtu < max(296, (tcp_minmss +
                sizeof(struct tcpiphdr)))) {
                rt->rt_rmx.rmx_locks |= RTV_MTU;
            } else if (rt->rt_rmx.rmx_mtu > mtu) {
                rt->rt_rmx.rmx_mtu = mtu;
            }
        }
        RT_UNLOCK(rt);
        rtfree(rt);
    }
}

void
tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip, __unused struct ifnet *ifp)
{
    tcp_seq icmp_tcp_seq;
    struct ipctlparam *ctl_param __single = vip;
    struct ip *ip = NULL;
    struct mbuf *m = NULL;
    struct in_addr faddr;
    struct inpcb *inp;
    struct tcpcb *tp;
    struct tcphdr *th;
    struct icmp *icp;
    size_t off;
#if SKYWALK
    union sockaddr_in_4_6 sock_laddr;
    struct protoctl_ev_val prctl_ev_val;
#endif /* SKYWALK */
    void (*notify)(struct inpcb *, int) = tcp_notify;

    if (ctl_param != NULL) {
        ip = ctl_param->ipc_icmp_ip;
        icp = ctl_param->ipc_icmp;
        m = ctl_param->ipc_m;
        off = ctl_param->ipc_off;
    } else {
        ip = NULL;
        icp = NULL;
        m = NULL;
        off = 0;
    }

    faddr = SIN(sa)->sin_addr;
    if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) {
        return;
    }

    if ((unsigned)cmd >= PRC_NCMDS) {
        return;
    }

    /* Source quench is deprecated */
    if (cmd == PRC_QUENCH) {
        return;
    }

    if (cmd == PRC_MSGSIZE) {
        notify = tcp_mtudisc;
    } else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
        cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
        cmd == PRC_TIMXCEED_INTRANS) && ip) {
        notify = tcp_drop_syn_sent;
    }
    /*
     * Hostdead is ugly because it goes linearly through all PCBs.
     * XXX: We never get this from ICMP, otherwise it makes an
     * excellent DoS attack on machines with many connections.
     */
    else if (cmd == PRC_HOSTDEAD) {
        ip = NULL;
    } else if (inetctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
        return;
    }

#if SKYWALK
    bzero(&prctl_ev_val, sizeof(prctl_ev_val));
    bzero(&sock_laddr, sizeof(sock_laddr));
#endif /* SKYWALK */

    if (ip == NULL) {
        in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
#if SKYWALK
        protoctl_event_enqueue_nwk_wq_entry(ifp, NULL,
            sa, 0, 0, IPPROTO_TCP, cmd, NULL);
#endif /* SKYWALK */
        return;
    }

    /* Check if we can safely get the sport, dport and the sequence number from the tcp header. */
    if (m == NULL ||
        (m->m_len < off + (sizeof(unsigned short) + sizeof(unsigned short) + sizeof(tcp_seq)))) {
        /* Insufficient length */
        return;
    }

    th = (struct tcphdr*)(void*)(mtod(m, uint8_t*) + off);
    icmp_tcp_seq = ntohl(th->th_seq);

    inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
        ip->ip_src, th->th_sport, 0, NULL);

    if (inp == NULL ||
        inp->inp_socket == NULL) {
#if SKYWALK
        if (cmd == PRC_MSGSIZE) {
            prctl_ev_val.val = ntohs(icp->icmp_nextmtu);
        }
        prctl_ev_val.tcp_seq_number = icmp_tcp_seq;

        sock_laddr.sin.sin_family = AF_INET;
        sock_laddr.sin.sin_len = sizeof(sock_laddr.sin);
        sock_laddr.sin.sin_addr = ip->ip_src;

        protoctl_event_enqueue_nwk_wq_entry(ifp,
            SA(&sock_laddr), sa,
            th->th_sport, th->th_dport, IPPROTO_TCP,
            cmd, &prctl_ev_val);
#endif /* SKYWALK */
        return;
    }

    socket_lock(inp->inp_socket, 1);
    if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
        WNT_STOPUSING) {
        socket_unlock(inp->inp_socket, 1);
        return;
    }

    if (PRC_IS_REDIRECT(cmd)) {
        /* signal EHOSTDOWN, as it flushes the cached route */
        (*notify)(inp, EHOSTDOWN);
    } else {
        tp = intotcpcb(inp);
        if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
            SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
            if (cmd == PRC_MSGSIZE) {
                tcp_handle_msgsize(ip, inp);
            }

            (*notify)(inp, inetctlerrmap[cmd]);
        }
    }
    socket_unlock(inp->inp_socket, 1);
}

void
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
{
    tcp_seq icmp_tcp_seq;
    struct in6_addr *dst;
    void (*notify)(struct inpcb *, int) = tcp_notify;
    struct ip6_hdr *ip6;
    struct mbuf *m;
    struct inpcb *inp;
    struct tcpcb *tp;
    struct icmp6_hdr *icmp6;
    struct ip6ctlparam *ip6cp = NULL;
    const struct sockaddr_in6 *sa6_src = NULL;
    unsigned int mtu;
    unsigned int off;

    struct tcp_ports {
        uint16_t th_sport;
        uint16_t th_dport;
    } t_ports;
#if SKYWALK
    union sockaddr_in_4_6 sock_laddr;
    struct protoctl_ev_val prctl_ev_val;
#endif /* SKYWALK */

    if (sa->sa_family != AF_INET6 ||
        sa->sa_len != sizeof(struct sockaddr_in6)) {
        return;
    }

    /* Source quench is deprecated */
    if (cmd == PRC_QUENCH) {
        return;
    }

    if ((unsigned)cmd >= PRC_NCMDS) {
        return;
    }

    /* if the parameter is from icmp6, decode it. */
    if (d != NULL) {
        ip6cp = (struct ip6ctlparam *)d;
        icmp6 = ip6cp->ip6c_icmp6;
        m = ip6cp->ip6c_m;
        ip6 = ip6cp->ip6c_ip6;
        off = ip6cp->ip6c_off;
        sa6_src = ip6cp->ip6c_src;
        dst = ip6cp->ip6c_finaldst;
    } else {
        m = NULL;
        ip6 = NULL;
        off = 0; /* fool gcc */
        sa6_src = &sa6_any;
        dst = NULL;
    }

    if (cmd == PRC_MSGSIZE) {
        notify = tcp_mtudisc;
    } else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
        cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
        ip6 != NULL) {
        notify = tcp_drop_syn_sent;
    }
    /*
     * Hostdead is ugly because it goes linearly through all PCBs.
     * XXX: We never get this from ICMP, otherwise it makes an
     * excellent DoS attack on machines with many connections.
     */
    else if (cmd == PRC_HOSTDEAD) {
        ip6 = NULL;
    } else if (inet6ctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
        return;
    }

#if SKYWALK
    bzero(&prctl_ev_val, sizeof(prctl_ev_val));
    bzero(&sock_laddr, sizeof(sock_laddr));
#endif /* SKYWALK */

    if (ip6 == NULL) {
        in6_pcbnotify(&tcbinfo, sa, 0, SA(sa6_src), 0, cmd, NULL, notify);
#if SKYWALK
        protoctl_event_enqueue_nwk_wq_entry(ifp, NULL, sa,
            0, 0, IPPROTO_TCP, cmd, NULL);
#endif /* SKYWALK */
        return;
    }

    /* Check if we can safely get the ports from the tcp hdr */
    if (m == NULL ||
        (m->m_pkthdr.len <
        (int32_t) (off + sizeof(struct tcp_ports)))) {
        return;
    }
    bzero(&t_ports, sizeof(struct tcp_ports));
    m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);

    off += sizeof(struct tcp_ports);
    if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
        return;
    }
    m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
    icmp_tcp_seq = ntohl(icmp_tcp_seq);

    if (cmd == PRC_MSGSIZE) {
        mtu = ntohl(icmp6->icmp6_mtu);
        /*
         * If no alternative MTU was proposed, or the proposed
         * MTU was too small, set to the min.
         */
        if (mtu < IPV6_MMTU) {
            mtu = IPV6_MMTU - 8;
        }
    }

    inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, t_ports.th_dport, ip6_input_getdstifscope(m),
        &ip6->ip6_src, t_ports.th_sport, ip6_input_getsrcifscope(m), 0, NULL);

    if (inp == NULL ||
        inp->inp_socket == NULL) {
#if SKYWALK
        if (cmd == PRC_MSGSIZE) {
            prctl_ev_val.val = mtu;
        }
        prctl_ev_val.tcp_seq_number = icmp_tcp_seq;

        sock_laddr.sin6.sin6_family = AF_INET6;
        sock_laddr.sin6.sin6_len = sizeof(sock_laddr.sin6);
        sock_laddr.sin6.sin6_addr = ip6->ip6_src;

        protoctl_event_enqueue_nwk_wq_entry(ifp,
            SA(&sock_laddr), sa,
            t_ports.th_sport, t_ports.th_dport, IPPROTO_TCP,
            cmd, &prctl_ev_val);
#endif /* SKYWALK */
        return;
    }

    socket_lock(inp->inp_socket, 1);
    if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
        WNT_STOPUSING) {
        socket_unlock(inp->inp_socket, 1);
        return;
    }

    if (PRC_IS_REDIRECT(cmd)) {
        /* signal EHOSTDOWN, as it flushes the cached route */
        (*notify)(inp, EHOSTDOWN);
    } else {
        tp = intotcpcb(inp);
        if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
            SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
            if (cmd == PRC_MSGSIZE) {
                /*
                 * Only process the offered MTU if it
                 * is smaller than the current one.
                 */
                if (mtu < tp->t_maxseg +
                    (sizeof(struct tcphdr) + sizeof(struct ip6_hdr))) {
                    (*notify)(inp, inetctlerrmap[cmd]);
                }
            } else {
                (*notify)(inp, inetctlerrmap[cmd]);
            }
        }
    }
    socket_unlock(inp->inp_socket, 1);
}


/*
 * Following is where TCP initial sequence number generation occurs.
 *
 * There are two places where we must use initial sequence numbers:
 * 1. In SYN-ACK packets.
 * 2. In SYN packets.
 *
 * The ISNs in SYN-ACK packets have no monotonicity requirement,
 * and should be as unpredictable as possible to avoid the possibility
 * of spoofing and/or connection hijacking. To satisfy this
 * requirement, SYN-ACK ISNs are generated via the arc4random()
 * function. If exact RFC 1948 compliance is requested via sysctl,
 * these ISNs will be generated just like those in SYN packets.
 *
 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
 * depends on this property. In addition, these ISNs should be
 * unguessable so as to prevent connection hijacking. To satisfy
 * the requirements of this situation, the algorithm outlined in
 * RFC 9293 is used to generate sequence numbers.
 *
 * For more information on the theory of operation, please see
 * RFC 9293.
 *
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second. This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 */

#define ISN_BYTES_PER_SECOND 1048576
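/*
 * A minimal sketch of the computation in tcp_new_isn() below, in the
 * style of RFC 6528:
 *
 *     ISN = hash(localport, remoteport, localip, remoteip, secret) + M(t)
 *
 * where the hash is MD5 over the connection 4-tuple plus the random
 * isn_secret, and M(t) is the 128ns uptime clock added afterwards. An
 * otherwise unused word of the same digest seeds the per-connection
 * timestamp offset. Note that ISN_BYTES_PER_SECOND and the "one megabyte
 * per second" text above predate the 128ns clock the code now uses.
 */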

tcp_seq
tcp_new_isn(struct tcpcb *tp)
{
    uint32_t md5_buffer[4];
    tcp_seq new_isn;
    struct timespec timenow;
    MD5_CTX isn_ctx;

    nanouptime(&timenow);

    /* Compute the md5 hash and return the ISN. */
    MD5Init(&isn_ctx);
    MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
        sizeof(u_short));
    MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
        sizeof(u_short));
    if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
        MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
            sizeof(struct in6_addr));
        MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
            sizeof(struct in6_addr));
    } else {
        MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
            sizeof(struct in_addr));
        MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
            sizeof(struct in_addr));
    }
    MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
    MD5Final((u_char *) &md5_buffer, &isn_ctx);

    new_isn = (tcp_seq) md5_buffer[0];

    /*
     * We use a 128ns clock, which is equivalent to 600 Mbps and wraps at
     * 549 seconds, thus safe for 2 MSL lifetime of TIME-WAIT-state.
     */
    new_isn += (timenow.tv_sec * NSEC_PER_SEC + timenow.tv_nsec) >> 7;
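    /*
     * Worked numbers for the shift above: >> 7 divides the nanosecond
     * uptime by 128, so the ISN clock ticks once every 128ns and the
     * 32-bit sequence space wraps after 2^32 * 128ns, roughly 549.8
     * seconds.
     */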

    if (__probable(tcp_randomize_timestamps)) {
        tp->t_ts_offset = md5_buffer[1];
    }

    return new_isn;
}


/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection. This behavior
 * is controlled by the icmp_may_rst sysctl.
 */
void
tcp_drop_syn_sent(struct inpcb *inp, int errno)
{
    struct tcpcb *tp = intotcpcb(inp);

    if (tp && tp->t_state == TCPS_SYN_SENT) {
        tcp_drop(tp, errno);
    }
}

/*
 * Get the effective MTU for a redirect virtual interface. A redirect
 * virtual interface switches between multiple delegated interfaces.
 * For cases where redirect forwards packets to an IPsec interface,
 * the MTU should be adjusted to account for the ESP encapsulation
 * overhead.
 */
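/*
 * Hypothetical example of the computation below: a redirect interface
 * with MTU 1500 delegating to an IPsec interface with MTU 1400, which in
 * turn delegates to a physical interface with MTU 1500, yields
 * min(1400, current_mtu), unless the physical MTU minus the ESP plus
 * outer-IPv6 overhead is smaller still, in which case that value wins.
 */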
uint32_t
tcp_get_effective_mtu(struct rtentry *rt, uint32_t current_mtu)
{
    ifnet_t ifp = NULL;
    ifnet_t delegated_ifp = NULL;
    ifnet_t outgoing_ifp = NULL;
    uint32_t min_mtu = 0;
    uint32_t outgoing_mtu = 0;
    uint32_t tunnel_overhead = 0;

    if (rt == NULL || rt->rt_ifp == NULL) {
        return current_mtu;
    }

    ifp = rt->rt_ifp;
    if (ifp->if_subfamily != IFNET_SUBFAMILY_REDIRECT) {
        return current_mtu;
    }

    delegated_ifp = ifp->if_delegated.ifp;
    if (delegated_ifp == NULL || delegated_ifp->if_family != IFNET_FAMILY_IPSEC) {
        return current_mtu;
    }

    min_mtu = MIN(delegated_ifp->if_mtu, current_mtu);

    outgoing_ifp = delegated_ifp->if_delegated.ifp;
    if (outgoing_ifp == NULL) {
        return min_mtu;
    }

    outgoing_mtu = outgoing_ifp->if_mtu;
    if (outgoing_mtu > 0) {
        tunnel_overhead = (u_int32_t)(esp_hdrsiz(NULL) + sizeof(struct ip6_hdr));
        if (outgoing_mtu > tunnel_overhead) {
            outgoing_mtu -= tunnel_overhead;
        }
        if (outgoing_mtu < min_mtu) {
            return outgoing_mtu;
        }
    }

    return min_mtu;
}

/*
 * When a `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route. Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
 * This duplicates some code in the tcp_mss() function in tcp_input.c.
 */
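/*
 * For a sense of the arithmetic below: protoHdrOverhead is 40 bytes for
 * IPv4 (sizeof(struct tcpiphdr)) and 60 for IPv6 (ip6_hdr plus tcphdr),
 * so a reported MTU of 1400 yields an MSS of 1360 or 1340 before the
 * timestamp-option and MPTCP adjustments that follow.
 */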
void
tcp_mtudisc(struct inpcb *inp, __unused int errno)
{
    struct tcpcb *tp = intotcpcb(inp);
    struct rtentry *rt;
    struct socket *so = inp->inp_socket;
    int mss;
    u_int32_t mtu;
    u_int32_t protoHdrOverhead = sizeof(struct tcpiphdr);
    int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;

    /*
     * Nothing left to send after the socket is defunct or TCP is in the closed state
     */
    if ((so->so_state & SS_DEFUNCT) || (tp != NULL && tp->t_state == TCPS_CLOSED)) {
        return;
    }

    if (isipv6) {
        protoHdrOverhead = sizeof(struct ip6_hdr) +
            sizeof(struct tcphdr);
    }

    if (tp != NULL) {
        if (isipv6) {
            rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
        } else {
            rt = tcp_rtlookup(inp, IFSCOPE_NONE);
        }
        if (!rt || !rt->rt_rmx.rmx_mtu) {
            tp->t_maxopd = tp->t_maxseg =
                isipv6 ? tcp_v6mssdflt :
                tcp_mssdflt;

            /* Route locked during lookup above */
            if (rt != NULL) {
                RT_UNLOCK(rt);
            }
            return;
        }
        mtu = rt->rt_rmx.rmx_mtu;

        mtu = tcp_get_effective_mtu(rt, mtu);

        /* Route locked during lookup above */
        RT_UNLOCK(rt);

#if NECP
        // Adjust MTU if necessary.
        mtu = necp_socket_get_effective_mtu(inp, mtu);
#endif /* NECP */
        mss = mtu - protoHdrOverhead;

        if (tp->t_maxopd) {
            mss = min(mss, tp->t_maxopd);
        }
        /*
         * XXX - The above conditional probably violates the TCP
         * spec. The problem is that, since we don't know the
         * other end's MSS, we are supposed to use a conservative
         * default. But, if we do that, then MTU discovery will
         * never actually take place, because the conservative
         * default is much less than the MTUs typically seen
         * on the Internet today. For the moment, we'll sweep
         * this under the carpet.
         *
         * The conservative default might not actually be a problem
         * if the only case this occurs is when sending an initial
         * SYN with options and data to a host we've never talked
         * to before. Then, they will reply with an MSS value which
         * will get recorded and the new parameters should get
         * recomputed. For Further Study.
         */
        if (tp->t_maxopd <= mss) {
            return;
        }
        tp->t_maxopd = mss;

        if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
            (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) {
            mss -= TCPOLEN_TSTAMP_APPA;
        }

#if MPTCP
        mss -= mptcp_adj_mss(tp, TRUE);
#endif
        if (so->so_snd.sb_hiwat < mss) {
            mss = so->so_snd.sb_hiwat;
        }

        tp->t_maxseg = mss;

        ASSERT(tp->t_maxseg);

        /*
         * Reset the slow-start flight size, as it may depend on
         * the new MSS
         */
        if (CC_ALGO(tp)->cwnd_init != NULL) {
            CC_ALGO(tp)->cwnd_init(tp);
        }

        if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) {
            tcp_cc_rledbat.rwnd_init(tp);
        }

        tcpstat.tcps_mturesent++;
        tp->t_rtttime = 0;
        tp->snd_nxt = tp->snd_una;
        tcp_output(tp);
    }
}

/*
 * Look up the routing entry to the peer of this inpcb. If no route
 * is found, or one cannot be allocated, return NULL. This routine
 * is called by TCP routines that access the rmx structure and by tcp_mss
 * to get the interface MTU. If a route is found, this routine will
 * hold the rtentry lock; the caller is responsible for unlocking.
 */
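/*
 * A minimal sketch of the caller-side locking contract (hypothetical
 * caller, for illustration only):
 *
 *     struct rtentry *rt = tcp_rtlookup(inp, IFSCOPE_NONE);
 *     if (rt != NULL) {
 *         mtu = rt->rt_rmx.rmx_mtu;    // rmx access is safe here
 *         RT_UNLOCK(rt);               // caller must unlock
 *     }
 */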
struct rtentry *
tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope)
{
    struct route *ro;
    struct rtentry *rt;
    struct tcpcb *tp;

    LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);

    ro = &inp->inp_route;
    if ((rt = ro->ro_rt) != NULL) {
        RT_LOCK(rt);
    }

    if (ROUTE_UNUSABLE(ro)) {
        if (rt != NULL) {
            RT_UNLOCK(rt);
            rt = NULL;
        }
        ROUTE_RELEASE(ro);
        /* No route yet, so try to acquire one */
        if (inp->inp_faddr.s_addr != INADDR_ANY) {
            unsigned int ifscope;

            ro->ro_dst.sa_family = AF_INET;
            ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
            SIN(&ro->ro_dst)->sin_addr = inp->inp_faddr;

            /*
             * If the socket was bound to an interface, then
             * the bound-to-interface takes precedence over
             * the inbound interface passed in by the caller
             * (if we get here as part of the output path then
             * input_ifscope is IFSCOPE_NONE).
             */
            ifscope = (inp->inp_flags & INP_BOUND_IF) ?
                inp->inp_boundifp->if_index : input_ifscope;

            rtalloc_scoped(ro, ifscope);
            if ((rt = ro->ro_rt) != NULL) {
                RT_LOCK(rt);
            }
        }
    }
    if (rt != NULL) {
        RT_LOCK_ASSERT_HELD(rt);
    }

    /*
     * Update MTU discovery determination. Don't do it if:
     *  1) it is disabled via the sysctl
     *  2) the route isn't up
     *  3) the MTU is locked (if it is, then discovery has been
     *     disabled)
     */

    tp = intotcpcb(inp);

    if (!path_mtu_discovery || ((rt != NULL) &&
        (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
        tp->t_flags &= ~TF_PMTUD;
    } else {
        tp->t_flags |= TF_PMTUD;
    }

    if (rt != NULL && rt->rt_ifp != NULL) {
        somultipages(inp->inp_socket,
            (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
        tcp_set_tso(tp, rt->rt_ifp);
        soif2kcl(inp->inp_socket,
            (rt->rt_ifp->if_eflags & IFEF_2KCL));
        tcp_set_ecn(tp, rt->rt_ifp);
        if (inp->inp_last_outifp == NULL) {
            inp->inp_last_outifp = rt->rt_ifp;
#if SKYWALK
            if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
                netns_set_ifnet(&inp->inp_netns_token,
                    inp->inp_last_outifp);
            }
#endif /* SKYWALK */
        }
    }

    /* Note if the peer is local */
    if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
        (rt->rt_gateway->sa_family == AF_LINK ||
        rt->rt_ifp->if_flags & IFF_LOOPBACK ||
        in_localaddr(inp->inp_faddr))) {
        tp->t_flags |= TF_LOCAL;
    }

    /*
     * Caller needs to call RT_UNLOCK(rt).
     */
    return rt;
}

struct rtentry *
tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
{
    struct route_in6 *ro6;
    struct rtentry *rt;
    struct tcpcb *tp;

    LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);

    ro6 = &inp->in6p_route;
    if ((rt = ro6->ro_rt) != NULL) {
        RT_LOCK(rt);
    }

    if (ROUTE_UNUSABLE(ro6)) {
        if (rt != NULL) {
            RT_UNLOCK(rt);
            rt = NULL;
        }
        ROUTE_RELEASE(ro6);
        /* No route yet, so try to acquire one */
        if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
            struct sockaddr_in6 *dst6;
            unsigned int ifscope;

            dst6 = SIN6(&ro6->ro_dst);
            dst6->sin6_family = AF_INET6;
            dst6->sin6_len = sizeof(*dst6);
            dst6->sin6_addr = inp->in6p_faddr;

            /*
             * If the socket was bound to an interface, then
             * the bound-to-interface takes precedence over
             * the inbound interface passed in by the caller
             * (if we get here as part of the output path then
             * input_ifscope is IFSCOPE_NONE).
             */
            ifscope = (inp->inp_flags & INP_BOUND_IF) ?
                inp->inp_boundifp->if_index : input_ifscope;

            rtalloc_scoped((struct route *)ro6, ifscope);
            if ((rt = ro6->ro_rt) != NULL) {
                RT_LOCK(rt);
            }
        }
    }
    if (rt != NULL) {
        RT_LOCK_ASSERT_HELD(rt);
    }

    tp = intotcpcb(inp);

    /*
     * Update MTU discovery determination while looking up the route.
     * Don't do it if:
     *  1) it is disabled via the sysctl
     *  2) the route isn't up
     *  3) the MTU is locked (if it is, then discovery has been
     *     disabled)
     */

    if (!path_mtu_discovery || ((rt != NULL) &&
        (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
        tp->t_flags &= ~TF_PMTUD;
    } else {
        tp->t_flags |= TF_PMTUD;
    }

    if (rt != NULL && rt->rt_ifp != NULL) {
        somultipages(inp->inp_socket,
            (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
        tcp_set_tso(tp, rt->rt_ifp);
        soif2kcl(inp->inp_socket,
            (rt->rt_ifp->if_eflags & IFEF_2KCL));
        tcp_set_ecn(tp, rt->rt_ifp);
        if (inp->inp_last_outifp == NULL) {
            inp->inp_last_outifp = rt->rt_ifp;
#if SKYWALK
            if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
                netns_set_ifnet(&inp->inp_netns_token,
                    inp->inp_last_outifp);
            }
#endif /* SKYWALK */
        }

        /* Note if the peer is local */
        if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
            (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
            IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
            rt->rt_gateway->sa_family == AF_LINK ||
            in6_localaddr(&inp->in6p_faddr))) {
            tp->t_flags |= TF_LOCAL;
        }
    }

    /*
     * Caller needs to call RT_UNLOCK(rt).
     */
    return rt;
}

#if IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(struct tcpcb *tp)
{
    struct inpcb *inp;
    struct mbuf *m;
    size_t hdrsiz;
    struct ip *ip;
    struct ip6_hdr *ip6 = NULL;
    struct tcphdr *th;

    if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) {
        return 0;
    }
    MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */
    if (!m) {
        return 0;
    }

    if ((inp->inp_vflag & INP_IPV6) != 0) {
        ip6 = mtod(m, struct ip6_hdr *);
        th = (struct tcphdr *)(void *)(ip6 + 1);
        m->m_pkthdr.len = m->m_len =
            sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
        tcp_fillheaders(m, tp, ip6, th);
        hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
    } else {
        ip = mtod(m, struct ip *);
        th = (struct tcphdr *)(ip + 1);
        m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
        tcp_fillheaders(m, tp, ip, th);
        hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
    }
    m_free(m);
    return hdrsiz;
}
#endif /* IPSEC */

int
tcp_lock(struct socket *so, int refcount, void *lr)
{
    lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);

retry:
    if (so->so_pcb != NULL) {
        if (so->so_flags & SOF_MP_SUBFLOW) {
            struct mptcb *mp_tp = tptomptp(sototcpcb(so));
            struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

            socket_lock(mp_so, refcount);

            /*
             * Check if we became non-MPTCP while waiting for the lock.
             * If yes, we have to retry to grab the right lock.
             */
            if (!(so->so_flags & SOF_MP_SUBFLOW)) {
                socket_unlock(mp_so, refcount);
                goto retry;
            }
        } else {
            lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);

            if (so->so_flags & SOF_MP_SUBFLOW) {
                /*
                 * While waiting for the lock, we might have
                 * become MPTCP-enabled (see mptcp_subflow_socreate).
                 */
                lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
                goto retry;
            }
        }
    } else {
        panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s",
            so, lr_saved, solockhistory_nr(so));
        /* NOTREACHED */
    }

    if (so->so_usecount < 0) {
        panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s",
            so, so->so_pcb, lr_saved, so->so_usecount,
            solockhistory_nr(so));
        /* NOTREACHED */
    }
    if (refcount) {
        so->so_usecount++;
    }
    so->lock_lr[so->next_lock_lr] = lr_saved;
    so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
    return 0;
}

int
tcp_unlock(struct socket *so, int refcount, void *lr)
{
    lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);


#ifdef MORE_TCPLOCK_DEBUG
    printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
        "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
        (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
        (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
        so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
#endif
    if (refcount) {
        so->so_usecount--;
    }

    if (so->so_usecount < 0) {
        panic("tcp_unlock: so=%p usecount=%x lrh= %s",
            so, so->so_usecount, solockhistory_nr(so));
        /* NOTREACHED */
    }
    if (so->so_pcb == NULL) {
        panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s",
            so, so->so_usecount, lr_saved, solockhistory_nr(so));
        /* NOTREACHED */
    } else {
        so->unlock_lr[so->next_unlock_lr] = lr_saved;
        so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

        if (so->so_flags & SOF_MP_SUBFLOW) {
            struct mptcb *mp_tp = tptomptp(sototcpcb(so));
            struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

            socket_lock_assert_owned(mp_so);

            socket_unlock(mp_so, refcount);
        } else {
            LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
                LCK_MTX_ASSERT_OWNED);
            lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
        }
    }
    return 0;
}

lck_mtx_t *
tcp_getlock(struct socket *so, int flags)
{
    struct inpcb *inp = sotoinpcb(so);

    if (so->so_pcb) {
        if (so->so_usecount < 0) {
            panic("tcp_getlock: so=%p usecount=%x lrh= %s",
                so, so->so_usecount, solockhistory_nr(so));
        }

        if (so->so_flags & SOF_MP_SUBFLOW) {
            struct mptcb *mp_tp = tptomptp(sototcpcb(so));
            struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

            return mp_so->so_proto->pr_getlock(mp_so, flags);
        } else {
            return &inp->inpcb_mtx;
        }
    } else {
        panic("tcp_getlock: so=%p NULL so_pcb %s",
            so, solockhistory_nr(so));
        return so->so_proto->pr_domain->dom_mtx;
    }
}

/*
 * Determine if we can grow the receive socket buffer to avoid sending
 * a zero window update to the peer. We allow even socket buffers that
 * have fixed size (set by the application) to grow if the resource
 * constraints are met. They will also be trimmed after the application
 * reads data.
 */
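/*
 * For a sense of scale (hypothetical numbers): with t_maxseg == 1448 the
 * increment below is 1448 << 4 = 23168 bytes, so the buffer grows only
 * when less than ~23KB of space remains, the autotuning cap has not been
 * reached, and the ideal size has not been exceeded by more than one
 * increment.
 */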
static void
tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb)
{
    u_int32_t rcvbufinc = tp->t_maxseg << 4;
    u_int32_t rcvbuf = sb->sb_hiwat;
    struct socket *so = tp->t_inpcb->inp_socket;

    if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
        return;
    }

    if (tcp_do_autorcvbuf == 1 &&
        (tp->t_flags & TF_SLOWLINK) == 0 &&
        (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
        (rcvbuf - sb->sb_cc) < rcvbufinc &&
        rcvbuf < tcp_autorcvbuf_max &&
        (sb->sb_idealsize > 0 &&
        sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
        sbreserve(sb,
            min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
    }
}

int32_t
tcp_sbspace(struct tcpcb *tp)
{
    struct socket *so = tp->t_inpcb->inp_socket;
    struct sockbuf *sb = &so->so_rcv;
    u_int32_t rcvbuf;
    int32_t space;
    int32_t pending = 0;

    if (so->so_flags & SOF_MP_SUBFLOW) {
        /* We still need to grow TCP's buffer to have a BDP-estimate */
        tcp_sbrcv_grow_rwin(tp, sb);

        return mptcp_sbspace(tptomptp(tp));
    }

    tcp_sbrcv_grow_rwin(tp, sb);

    /* hiwat might have changed */
    rcvbuf = sb->sb_hiwat;

    space = ((int32_t) imin((rcvbuf - sb->sb_cc),
        (sb->sb_mbmax - sb->sb_mbcnt)));
    if (space < 0) {
        space = 0;
    }

#if CONTENT_FILTER
    /* Compensate for data being processed by content filters */
    pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
    if (pending > space) {
        space = 0;
    } else {
        space -= pending;
    }

    /*
     * Avoid increasing window size if the current window
     * is already very low, we could be in "persist" mode and
     * we could break some apps (see rdar://5409343)
     */

    if (space < tp->t_maxseg) {
        return space;
    }

    /* Clip window size for slower link */

    if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0) {
        return imin(space, slowlink_wsize);
    }

    return space;
}
/*
 * Checks TCP Segment Offloading capability for a given connection
 * and interface pair.
 */
void
tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp)
{
    struct inpcb *inp;
    int isipv6;
    struct ifnet *tunnel_ifp = NULL;
#define IFNET_TSO_MASK (IFNET_TSO_IPV6 | IFNET_TSO_IPV4)

    tp->t_flags &= ~TF_TSO;

    /*
     * Bail if there's a non-TSO-capable filter on the interface.
     */
    if (ifp == NULL || ifp->if_flt_no_tso_count > 0) {
        return;
    }

    inp = tp->t_inpcb;
    isipv6 = (inp->inp_vflag & INP_IPV6) != 0;

#if MPTCP
    /*
     * We can't use TSO if this tcpcb belongs to an MPTCP session.
     */
    if (inp->inp_socket->so_flags & SOF_MP_SUBFLOW) {
        return;
    }
#endif
    /*
     * We can't use TSO if the TSO capability of the tunnel interface does
     * not match the capability of another interface known by TCP
     */
    if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) {
        u_int tunnel_if_index = inp->inp_policyresult.results.result_parameter.tunnel_interface_index;

        if (tunnel_if_index != 0) {
            ifnet_head_lock_shared();
            tunnel_ifp = ifindex2ifnet[tunnel_if_index];
            ifnet_head_done();
        }

        if (tunnel_ifp == NULL) {
            return;
        }

        if ((ifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
            if (tso_debug > 0) {
                os_log(OS_LOG_DEFAULT,
                    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with ifp %s",
                    __func__,
                    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
                    tunnel_ifp->if_xname, ifp->if_xname);
            }
            return;
        }
        if (inp->inp_last_outifp != NULL &&
            (inp->inp_last_outifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
            if (tso_debug > 0) {
                os_log(OS_LOG_DEFAULT,
                    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_last_outifp %s",
                    __func__,
                    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
                    tunnel_ifp->if_xname, inp->inp_last_outifp->if_xname);
            }
            return;
        }
        if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp != NULL &&
            (inp->inp_boundifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
            if (tso_debug > 0) {
                os_log(OS_LOG_DEFAULT,
                    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_boundifp %s",
                    __func__,
                    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
                    tunnel_ifp->if_xname, inp->inp_boundifp->if_xname);
            }
            return;
        }
    }

    if (isipv6) {
        if (ifp->if_hwassist & IFNET_TSO_IPV6) {
            tp->t_flags |= TF_TSO;
            if (ifp->if_tso_v6_mtu != 0) {
                tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
            } else {
                tp->tso_max_segment_size = TCP_MAXWIN;
            }
        }
    } else {
        if (ifp->if_hwassist & IFNET_TSO_IPV4) {
            tp->t_flags |= TF_TSO;
            if (ifp->if_tso_v4_mtu != 0) {
                tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
            } else {
                tp->tso_max_segment_size = TCP_MAXWIN;
            }
            if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
                tp->tso_max_segment_size -=
                    CLAT46_HDR_EXPANSION_OVERHD;
            }
        }
    }

    if (tso_debug > 1) {
        os_log(OS_LOG_DEFAULT, "%s: %u > %u TSO %d ifp %s",
            __func__,
            ntohs(tp->t_inpcb->inp_lport),
            ntohs(tp->t_inpcb->inp_fport),
            (tp->t_flags & TF_TSO) != 0,
            ifp != NULL ? ifp->if_xname : "<NULL>");
    }
}

#define TIMEVAL_TO_TCPHZ(_tv_) ((uint32_t)((_tv_).tv_sec * TCP_RETRANSHZ + \
    (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC))
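/*
 * Worked example for the macro above, assuming TCP_RETRANSHZ is 1000
 * (1ms ticks) and TCP_RETRANSHZ_TO_USEC is 1000: a timeval of
 * {2s, 3500us} maps to 2 * 1000 + 3500 / 1000 = 2003 ticks, with the
 * leftover 500us carried via tcp_now_remainder_us in
 * calculate_tcp_clock() below.
 */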
3579
3580 /*
3581 * Function to calculate the tcp clock. The tcp clock will get updated
3582 * at the boundaries of the tcp layer. This is done at 3 places:
3583 * 1. Right before processing an input tcp packet
3584 * 2. Whenever a connection wants to access the network using tcp_usrreqs
3585 * 3. When a tcp timer fires or before tcp slow timeout
3586 *
3587 */
3588
3589 void
calculate_tcp_clock(void)3590 calculate_tcp_clock(void)
3591 {
3592 struct timeval tv = tcp_uptime;
3593 struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC};
3594 struct timeval now, hold_now;
3595 uint32_t incr = 0;
3596
3597 microuptime(&now);
3598
3599 /*
3600 * Update coarse-grained networking timestamp (in sec.); the idea
3601 * is to update the counter returnable via net_uptime() when
3602 * we read time.
3603 */
3604 net_update_uptime_with_time(&now);
3605
3606 timevaladd(&tv, &interval);
3607 if (timevalcmp(&now, &tv, >)) {
3608 /* time to update the clock */
3609 lck_spin_lock(&tcp_uptime_lock);
3610 if (timevalcmp(&tcp_uptime, &now, >=)) {
3611 /* clock got updated while waiting for the lock */
3612 lck_spin_unlock(&tcp_uptime_lock);
3613 return;
3614 }
3615
3616 microuptime(&now);
3617 hold_now = now;
3618 tv = tcp_uptime;
3619 timevalsub(&now, &tv);
3620
3621 incr = TIMEVAL_TO_TCPHZ(now);
3622
3623 /* Account for the previous remainder */
3624 uint32_t remaining_us = (now.tv_usec % TCP_RETRANSHZ_TO_USEC) +
3625 tcp_now_remainder_us;
3626 if (remaining_us >= TCP_RETRANSHZ_TO_USEC) {
3627 incr += (remaining_us / TCP_RETRANSHZ_TO_USEC);
3628 }
3629
3630 if (incr > 0) {
3631 tcp_uptime = hold_now;
3632 tcp_now_remainder_us = remaining_us % TCP_RETRANSHZ_TO_USEC;
3633 tcp_now += incr;
3634 }
3635
3636 lck_spin_unlock(&tcp_uptime_lock);
3637 }
3638 }
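/*
 * Illustration of the remainder accounting above (1 ms tick assumed):
 * three successive updates arriving 1.4 ms apart credit 1, 1 and 2
 * ticks. Each update adds a 400 us residue to tcp_now_remainder_us,
 * and on the third the accumulated 1200 us crosses a tick boundary,
 * crediting the extra tick, so tcp_now stays within one tick of real
 * time instead of losing 0.4 ms per update.
 */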
3639
3640 uint64_t
3641 microuptime_ns(void)
3642 {
3643 uint64_t abstime = mach_absolute_time();
3644 uint64_t ns = 0;
3645 absolutetime_to_nanoseconds(abstime, &ns);
3646
3647 return ns;
3648 }
3649
3650 #define MAX_BURST_INTERVAL_KERNEL_PACING_NSEC \
3651 (10 * NSEC_PER_MSEC) // Don't delay more than 10ms between two bursts
3652 static uint64_t
3653 tcp_pacer_get_packet_interval(struct tcpcb *tp, uint32_t size)
3654 {
3655 if (tp->t_pacer.rate == 0) {
3656 os_log_error(OS_LOG_DEFAULT,
3657 "pacer rate shouldn't be 0, CCA is %s (cwnd=%u, smoothed rtt=%u ms)",
3658 CC_ALGO(tp)->name, tp->snd_cwnd, tp->t_srtt >> TCP_RTT_SHIFT);
3659
3660 return MAX_BURST_INTERVAL_KERNEL_PACING_NSEC;
3661 }
3662
3663 uint64_t interval = (uint64_t)size * NSEC_PER_SEC / tp->t_pacer.rate;
3664 if (interval > MAX_BURST_INTERVAL_KERNEL_PACING_NSEC) {
3665 interval = MAX_BURST_INTERVAL_KERNEL_PACING_NSEC;
3666 }
3667
3668 return interval;
3669 }
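/*
 * Rough numbers for the computation above, assuming t_pacer.rate is in
 * bytes per second (which is what the arithmetic implies): a
 * 65536-byte burst at 125000000 B/s (~1 Gbit/s) yields
 * 65536 * NSEC_PER_SEC / 125000000 == 524288 ns, i.e. roughly 524 usec
 * between bursts, well under the 10 ms cap.
 */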
3670
3671 /* Return packet tx_time in nanoseconds (absolute as well as continuous) */
3672 uint64_t
3673 tcp_pacer_get_packet_tx_time(struct tcpcb *tp, uint16_t pkt_len)
3674 {
3675 /*
3676 	 * This function is called once per MSS-sized packet; at high
3677 	 * speeds we want to send multiple packets, adding up to
3678 	 * burst_size, with the same tx_time.
3679 */
3680 uint64_t now = microuptime_ns();
3681
3682 if (pkt_len == 0 || now == 0) {
3683 return now;
3684 }
3685
3686 if (tp->t_pacer.packet_tx_time == 0) {
3687 tp->t_pacer.packet_tx_time = now;
3688 tp->t_pacer.current_size = pkt_len;
3689 } else {
3690 tp->t_pacer.current_size += pkt_len;
3691 if (tp->t_pacer.current_size > tp->t_pacer.tso_burst_size) {
3692 /*
3693 				 * Increment tx_time by the pacing interval for the
3694 				 * accumulated burst and reset the accumulated size
3695 */
3696 tp->t_pacer.packet_tx_time +=
3697 tcp_pacer_get_packet_interval(tp, tp->t_pacer.current_size);
3698 tp->t_pacer.current_size = 0;
3699 if (now > tp->t_pacer.packet_tx_time) {
3700 /*
3701 				 * If the current time is later, the application
3702 				 * has already paced the packet; we also can't
3703 				 * set tx_time in the past.
3704 */
3705 tp->t_pacer.packet_tx_time = now;
3706 }
3707 }
3708 }
3709
3710 return tp->t_pacer.packet_tx_time;
3711 }
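/*
 * Net effect of the logic above: successive packets share the same
 * tx_time until their cumulative length exceeds tso_burst_size, at
 * which point tx_time advances by one pacing interval. For example,
 * with a hypothetical 1448-byte MSS and a 64 KB burst size, roughly
 * 45 packets get the same timestamp before it jumps.
 */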
3712
3713 void
3714 tcp_set_mbuf_tx_time(struct mbuf *m, uint64_t tx_time)
3715 {
3716 struct m_tag *tag = NULL;
3717 tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
3718 sizeof(uint64_t), M_WAITOK, m);
3719 if (tag != NULL) {
3720 m_tag_prepend(m, tag);
3721 *(uint64_t *)tag->m_tag_data = tx_time;
3722 }
3723 }
3724
3725 /*
3726  * Compute the receive window scaling that we are going to request
3727  * for this connection, based on sb_hiwat. Try to leave some
3728  * room to potentially increase the window size up to a maximum
3729  * defined by the constant tcp_autorcvbuf_max.
3730 */
3731 void
3732 tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so)
3733 {
3734 uint32_t maxsockbufsize;
3735
3736 tp->request_r_scale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale);
3737 maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
3738 so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
3739
3740 /*
3741 * Window scale should not exceed what is needed
3742 * to send the max receive window size; adding 1 to TCP_MAXWIN
3743 * ensures that.
3744 */
3745 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3746 ((TCP_MAXWIN + 1) << tp->request_r_scale) < maxsockbufsize) {
3747 tp->request_r_scale++;
3748 }
3749 tp->request_r_scale = MIN(tp->request_r_scale, TCP_MAX_WINSHIFT);
3750 }
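/*
 * Example: with an effective maxsockbufsize of 4 MB the loop above
 * stops at request_r_scale == 6, because (TCP_MAXWIN + 1) << 6 ==
 * 65536 << 6 == 4194304 is no longer smaller than the buffer size;
 * a scale of 6 is exactly enough to advertise a 4 MB window.
 */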
3751
3752 int
3753 tcp_notsent_lowat_check(struct socket *so)
3754 {
3755 struct inpcb *inp = sotoinpcb(so);
3756 struct tcpcb *tp = NULL;
3757 int notsent = 0;
3758
3759 if (inp != NULL) {
3760 tp = intotcpcb(inp);
3761 }
3762
3763 if (tp == NULL) {
3764 return 0;
3765 }
3766
3767 notsent = so->so_snd.sb_cc -
3768 (tp->snd_nxt - tp->snd_una);
3769
3770 /*
3771 	 * When we send a FIN or SYN, notsent can be negative.
3772 	 * In that case, too, we need to send a write event to the
3773 	 * process if it is waiting. In the FIN case, it will
3774 	 * get an error from send because cantsendmore will be set.
3775 */
3776 if (notsent <= tp->t_notsent_lowat) {
3777 return 1;
3778 }
3779
3780 /*
3781 	 * When Nagle's algorithm is not disabled, it is better
3782 	 * to wake up the client until there is at least one
3783 	 * maxseg of data to write.
3784 */
3785 if ((tp->t_flags & TF_NODELAY) == 0 &&
3786 notsent > 0 && notsent < tp->t_maxseg) {
3787 return 1;
3788 }
3789 return 0;
3790 }
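/*
 * Example of the low-water check above: with sb_cc == 40000 bytes in
 * the send buffer and snd_nxt - snd_una == 30000 bytes in flight,
 * notsent is 10000; with a t_notsent_lowat of 16384 that is at or
 * below the mark, so the socket is reported writable.
 */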
3791
3792 void
3793 tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3794 {
3795 struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL;
3796 uint16_t rxcount = 0;
3797
3798 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3799 tp->t_dsack_lastuna = tp->snd_una;
3800 }
3801 /*
3802 * First check if there is a segment already existing for this
3803 * sequence space.
3804 */
3805
3806 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3807 if (SEQ_GT(rxseg->rx_start, start)) {
3808 break;
3809 }
3810 prev = rxseg;
3811 }
3812 next = rxseg;
3813
3814 /* check if prev seg is for this sequence */
3815 if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3816 SEQ_GEQ(prev->rx_end, end)) {
3817 prev->rx_count++;
3818 return;
3819 }
3820
3821 /*
3822 * There are a couple of possibilities at this point.
3823 * 1. prev overlaps with the beginning of this sequence
3824 * 2. next overlaps with the end of this sequence
3825 * 3. there is no overlap.
3826 */
3827
3828 if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3829 if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3830 start = prev->rx_end + 1;
3831 prev->rx_count++;
3832 } else {
3833 prev->rx_end = (start - 1);
3834 rxcount = prev->rx_count;
3835 }
3836 }
3837
3838 if (next != NULL && SEQ_LT(next->rx_start, end)) {
3839 if (SEQ_LEQ(next->rx_end, end)) {
3840 end = next->rx_start - 1;
3841 next->rx_count++;
3842 } else {
3843 next->rx_start = end + 1;
3844 rxcount = next->rx_count;
3845 }
3846 }
3847 if (!SEQ_LT(start, end)) {
3848 return;
3849 }
3850
3851 if (tcp_rxt_seg_max > 0 && tp->t_rxt_seg_count >= tcp_rxt_seg_max) {
3852 rxseg = SLIST_FIRST(&tp->t_rxt_segments);
3853 if (prev == rxseg) {
3854 prev = NULL;
3855 }
3856 SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3857 tcp_rxt_seg, rx_link);
3858
3859 tcp_rxt_seg_drop++;
3860 tp->t_rxt_seg_drop++;
3861 zfree(tcp_rxt_seg_zone, rxseg);
3862
3863 tp->t_rxt_seg_count -= 1;
3864 }
3865
3866 rxseg = zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
3867 rxseg->rx_start = start;
3868 rxseg->rx_end = end;
3869 rxseg->rx_count = rxcount + 1;
3870
3871 if (prev != NULL) {
3872 SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3873 } else {
3874 SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3875 }
3876 tp->t_rxt_seg_count += 1;
3877 }
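/*
 * Example of the overlap handling above: with an existing record
 * [100, 199] and a new retransmission [150, 250], prev ([100, 199])
 * overlaps the start of the new range, so prev is trimmed to
 * [100, 149] and a new record [150, 250] is inserted after it with
 * rx_count set to prev's count plus one.
 */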
3878
3879 struct tcp_rxt_seg *
3880 tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3881 {
3882 struct tcp_rxt_seg *rxseg;
3883
3884 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3885 return NULL;
3886 }
3887
3888 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3889 if (SEQ_LEQ(rxseg->rx_start, start) &&
3890 SEQ_GEQ(rxseg->rx_end, end)) {
3891 return rxseg;
3892 }
3893 if (SEQ_GT(rxseg->rx_start, start)) {
3894 break;
3895 }
3896 }
3897 return NULL;
3898 }
3899
3900 void
3901 tcp_rxtseg_set_spurious(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3902 {
3903 struct tcp_rxt_seg *rxseg;
3904
3905 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3906 return;
3907 }
3908
3909 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3910 if (SEQ_GEQ(rxseg->rx_start, start) &&
3911 SEQ_LEQ(rxseg->rx_end, end)) {
3912 /*
3913 * If the segment was retransmitted only once, mark it as
3914 * spurious.
3915 */
3916 if (rxseg->rx_count == 1) {
3917 rxseg->rx_flags |= TCP_RXT_SPURIOUS;
3918 }
3919 }
3920
3921 if (SEQ_GEQ(rxseg->rx_start, end)) {
3922 break;
3923 }
3924 }
3925 return;
3926 }
3927
3928 void
3929 tcp_rxtseg_clean(struct tcpcb *tp)
3930 {
3931 struct tcp_rxt_seg *rxseg, *next;
3932
3933 SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3934 SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3935 tcp_rxt_seg, rx_link);
3936 zfree(tcp_rxt_seg_zone, rxseg);
3937 }
3938 tp->t_rxt_seg_count = 0;
3939 tp->t_dsack_lastuna = tp->snd_max;
3940 }
3941
3942 boolean_t
3943 tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3944 {
3945 boolean_t bad_rexmt;
3946 struct tcp_rxt_seg *rxseg;
3947
3948 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3949 return FALSE;
3950 }
3951
3952 /*
3953 * If all of the segments in this window are not cumulatively
3954 * acknowledged, then there can still be undetected packet loss.
3955 * Do not restore congestion window in that case.
3956 */
3957 if (SEQ_LT(th_ack, tp->snd_recover)) {
3958 return FALSE;
3959 }
3960
3961 bad_rexmt = TRUE;
3962 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3963 if (!(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3964 bad_rexmt = FALSE;
3965 break;
3966 }
3967 }
3968 return bad_rexmt;
3969 }
3970
3971 u_int32_t
3972 tcp_rxtseg_total_size(struct tcpcb *tp)
3973 {
3974 struct tcp_rxt_seg *rxseg;
3975 u_int32_t total_size = 0;
3976
3977 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3978 total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
3979 }
3980 return total_size;
3981 }
3982
3983 int
3984 tcp_seg_cmp(const struct tcp_seg_sent *seg1, const struct tcp_seg_sent *seg2)
3985 {
3986 return (int)(seg1->end_seq - seg2->end_seq);
3987 }
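/*
 * The signed 32-bit subtraction above yields a wraparound-safe
 * ordering as long as the two end sequence numbers are within 2^31 of
 * each other: comparing end_seq 0x00000010 against 0xfffffff0 gives
 * (int)0x20 == 32 > 0, correctly ordering 0x00000010 after the wrap.
 */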
3988
3989 RB_GENERATE(tcp_seg_sent_tree_head, tcp_seg_sent, seg_link, tcp_seg_cmp)
3990
3991 uint32_t
3992 tcp_seg_len(struct tcp_seg_sent *seg)
3993 {
3994 if (SEQ_LT(seg->end_seq, seg->start_seq)) {
3995 os_log_error(OS_LOG_DEFAULT, "segment end(%u) can't be smaller "
3996 "than segment start(%u)", seg->end_seq, seg->start_seq);
3997 }
3998
3999 return seg->end_seq - seg->start_seq;
4000 }
4001
4002 static struct tcp_seg_sent *
4003 tcp_seg_alloc_init(struct tcpcb *tp)
4004 {
4005 struct tcp_seg_sent *seg = TAILQ_FIRST(&tp->seg_pool.free_segs);
4006 if (seg != NULL) {
4007 TAILQ_REMOVE(&tp->seg_pool.free_segs, seg, free_link);
4008 tp->seg_pool.free_segs_count--;
4009 } else {
4010 // TODO: remove Z_WAITOK and Z_NOFAIL?
4011 seg = zalloc_flags(tcp_seg_sent_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
4012 if (seg == NULL) {
4013 return NULL;
4014 }
4015 }
4016 bzero(seg, sizeof(*seg));
4017
4018 return seg;
4019 }
4020
4021 static void
4022 tcp_update_seg_after_rto(struct tcpcb *tp, struct tcp_seg_sent *found_seg,
4023 uint32_t xmit_ts, uint8_t flags)
4024 {
4025 tcp_rack_transmit_seg(tp, found_seg, found_seg->start_seq, found_seg->end_seq,
4026 xmit_ts, flags);
4027 struct tcp_seg_sent *seg = TAILQ_FIRST(&tp->t_segs_sent);
4028 if (found_seg == seg) {
4029 // Move this segment to the end of time-ordered list.
4030 TAILQ_REMOVE(&tp->t_segs_sent, seg, tx_link);
4031 TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4032 }
4033 }
4034
4035 static void
4036 tcp_process_rxmt_segs_after_rto(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start,
4037 uint32_t xmit_ts, uint8_t flags)
4038 {
4039 struct tcp_seg_sent segment = {};
4040
4041 while (seg != NULL) {
4042 if (SEQ_LEQ(seg->start_seq, start)) {
4043 tcp_update_seg_after_rto(tp, seg, xmit_ts, flags);
4044 break;
4045 } else {
4046 /* The segment is a part of the total RTO retransmission */
4047 tcp_update_seg_after_rto(tp, seg, xmit_ts, flags);
4048
4049 /* Find the next segment ending at the start of current segment */
4050 segment.end_seq = seg->start_seq;
4051 seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &segment);
4052 }
4053 }
4054 }
4055
4056 static struct tcp_seg_sent *
4057 tcp_seg_sent_insert_before(struct tcpcb *tp, struct tcp_seg_sent *before, tcp_seq start, tcp_seq end,
4058 uint32_t xmit_ts, uint8_t flags)
4059 {
4060 struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp);
4061 /* segment MUST be allocated, there is no other fail-safe here */
4062 tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags);
4063 struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4064 if (not_inserted) {
4065 os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4066 not_inserted->start_seq, not_inserted->end_seq);
4067 }
4068 TAILQ_INSERT_BEFORE(before, seg, tx_link);
4069
4070 return seg;
4071 }
4072
4073 static struct tcp_seg_sent *
4074 tcp_seg_rto_insert_end(struct tcpcb *tp, tcp_seq start, tcp_seq end,
4075 uint32_t xmit_ts, uint8_t flags)
4076 {
4077 struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp);
4078 /* segment MUST be allocated, there is no other fail-safe here */
4079 tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags);
4080 struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4081 if (not_inserted) {
4082 os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4083 not_inserted->start_seq, not_inserted->end_seq);
4084 }
4085 TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4086
4087 return seg;
4088 }
4089
4090 void
4091 tcp_seg_sent_insert(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
4092 uint32_t xmit_ts, uint8_t flags)
4093 {
4094 if (seg != NULL) {
4095 uint8_t seg_flags = seg->flags | flags;
4096 if (seg->end_seq == end) {
4097 			/* Entire seg retransmitted in RACK recovery; start and end sequences don't change */
4098 if (seg->start_seq != start) {
4099 os_log_error(OS_LOG_DEFAULT, "Segment start (%u) is not same as retransmitted "
4100 "start sequence number (%u)", seg->start_seq, start);
4101 }
4102 tcp_rack_transmit_seg(tp, seg, seg->start_seq, seg->end_seq, xmit_ts, seg_flags);
4103 TAILQ_REMOVE(&tp->t_segs_sent, seg, tx_link);
4104 TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4105 } else {
4106 /*
4107 			 * The original segment is partially retransmitted: advance its start_seq
4108 			 * past the retransmitted bytes and create a new segment for that part
4109 */
4110 struct tcp_seg_sent *partial_seg = tcp_seg_alloc_init(tp);
4111 if (partial_seg == NULL) {
4112 return;
4113 }
4114 seg->start_seq += (end - start);
4115 tcp_rack_transmit_seg(tp, partial_seg, start, end, xmit_ts, seg_flags);
4116 struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head,
4117 &tp->t_segs_sent_tree, partial_seg);
4118 if (not_inserted) {
4119 os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4120 not_inserted->start_seq, not_inserted->end_seq);
4121 }
4122 TAILQ_INSERT_TAIL(&tp->t_segs_sent, partial_seg, tx_link);
4123 }
4124
4125 return;
4126 }
4127
4128 if ((flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) == 0) {
4129 /* This is a new segment */
4130 seg = tcp_seg_alloc_init(tp);
4131 if (seg == NULL) {
4132 return;
4133 }
4134
4135 tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags);
4136 struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4137 if (not_inserted) {
4138 os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4139 not_inserted->start_seq, not_inserted->end_seq);
4140 }
4141 TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4142
4143 return;
4144 }
4145 /*
4146 	 * Either retransmitted after an RTO or a PTO.
4147 	 * During RTO, the time-ordered list may lose its order.
4148 	 * If retransmitted after an RTO, check whether the segment
4149 	 * already exists in the RB tree and update its xmit_ts. Also,
4150 	 * if this seg is at the head of the ordered list, move it
4151 	 * to the end.
4152 */
4153 struct tcp_seg_sent segment = {};
4154 struct tcp_seg_sent *found_seg = NULL, *rxmt_seg = NULL;
4155
4156 /* Set the end sequence to search for existing segment */
4157 segment.end_seq = end;
4158 found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &segment);
4159 if (found_seg != NULL) {
4160 /* Found an exact match for retransmitted end sequence */
4161 tcp_process_rxmt_segs_after_rto(tp, found_seg, start, xmit_ts, flags);
4162 return;
4163 }
4164 /*
4165 * We come here when we don't find an exact match and end of segment
4166 * retransmitted after RTO lies within a segment.
4167 */
4168 RB_FOREACH(found_seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
4169 if (SEQ_LT(end, found_seg->end_seq) && SEQ_GT(end, found_seg->start_seq)) {
4170 /*
4171 * This segment is partially retransmitted. We split this segment at the boundary of end
4172 * sequence. First insert the part being retransmitted at the end of time-ordered list.
4173 */
4174 tcp_seg_rto_insert_end(tp, found_seg->start_seq, end, xmit_ts,
4175 found_seg->flags | flags);
4176
4177 if (SEQ_LEQ(found_seg->start_seq, start)) {
4178 /*
4179 * We are done with the retransmitted part.
4180 * Move the start of existing segment
4181 */
4182 found_seg->start_seq = end;
4183 } else {
4184 /*
4185 * This retransmitted sequence covers more than one segment
4186 * Look for segments covered by this retransmission below this segment
4187 */
4188 segment.end_seq = found_seg->start_seq;
4189 rxmt_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &segment);
4190
4191 if (rxmt_seg != NULL) {
4192 /* rxmt_seg is just before the current segment */
4193 tcp_process_rxmt_segs_after_rto(tp, rxmt_seg, start, xmit_ts, flags);
4194 }
4195
4196 /* Move the start of existing segment */
4197 found_seg->start_seq = end;
4198 }
4199 return;
4200 }
4201 }
4202 }
4203
4204 static void
4205 tcp_seg_collect_acked_subtree(struct tcpcb *tp, struct tcp_seg_sent *seg,
4206 uint32_t acked_xmit_ts, uint32_t tsecr)
4207 {
4208 if (seg != NULL) {
4209 tcp_seg_collect_acked_subtree(tp, RB_LEFT(seg, seg_link), acked_xmit_ts, tsecr);
4210 tcp_seg_collect_acked_subtree(tp, RB_RIGHT(seg, seg_link), acked_xmit_ts, tsecr);
4211 TAILQ_INSERT_TAIL(&tp->t_segs_acked, seg, ack_link);
4212 }
4213 }
4214
4215 /* Call this function with root of the rb tree */
4216 static void
4217 tcp_seg_collect_acked(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq th_ack,
4218 uint32_t acked_xmit_ts, uint32_t tsecr)
4219 {
4220 if (seg == NULL) {
4221 return;
4222 }
4223
4224 if (SEQ_GEQ(th_ack, seg->end_seq)) {
4225 /* Delete the entire left sub-tree */
4226 tcp_seg_collect_acked_subtree(tp, RB_LEFT(seg, seg_link), acked_xmit_ts, tsecr);
4227 /* Evaluate the right sub-tree */
4228 tcp_seg_collect_acked(tp, RB_RIGHT(seg, seg_link), th_ack, acked_xmit_ts, tsecr);
4229 TAILQ_INSERT_TAIL(&tp->t_segs_acked, seg, ack_link);
4230 } else {
4231 /*
4232 * This ACK doesn't acknowledge the current root and its right sub-tree.
4233 * Evaluate the left sub-tree
4234 */
4235 tcp_seg_collect_acked(tp, RB_LEFT(seg, seg_link), th_ack, acked_xmit_ts, tsecr);
4236 }
4237 }
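/*
 * The recursion above relies on the cumulative-ACK property: when the
 * current node's end_seq is covered by th_ack, every node in its left
 * subtree (all with smaller end_seq) is covered as well and can be
 * collected wholesale; only the right subtree needs further
 * comparisons.
 */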
4238
4239 static void
4240 tcp_seg_delete_acked(struct tcpcb *tp, uint32_t acked_xmit_ts, uint32_t tsecr)
4241 {
4242 struct tcp_seg_sent *acked_seg = NULL, *next = NULL;
4243
4244 TAILQ_FOREACH_SAFE(acked_seg, &tp->t_segs_acked, ack_link, next) {
4245 /* Advance RACK state if applicable */
4246 if (acked_seg->xmit_ts > acked_xmit_ts) {
4247 tcp_rack_update_segment_acked(tp, tsecr, acked_seg->xmit_ts, acked_seg->end_seq,
4248 !!(acked_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4249 }
4250 /* Check for reordering */
4251 tcp_rack_detect_reordering_acked(tp, acked_seg);
4252
4253 const uint32_t seg_len = tcp_seg_len(acked_seg);
4254 if (acked_seg->flags & TCP_SEGMENT_LOST) {
4255 if (tp->bytes_lost < seg_len) {
4256 os_log_error(OS_LOG_DEFAULT, "bytes_lost (%u) can't be smaller than already "
4257 "lost segment length (%u)", tp->bytes_lost, seg_len);
4258 }
4259 tp->bytes_lost -= seg_len;
4260 }
4261 if (acked_seg->flags & TCP_RACK_RETRANSMITTED) {
4262 if (tp->bytes_retransmitted < seg_len) {
4263 os_log_error(OS_LOG_DEFAULT, "bytes_retransmitted (%u) can't be smaller "
4264 			    "than already retransmitted segment length (%u)",
4265 tp->bytes_retransmitted, seg_len);
4266 }
4267 tp->bytes_retransmitted -= seg_len;
4268 }
4269 if (acked_seg->flags & TCP_SEGMENT_SACKED) {
4270 if (tp->bytes_sacked < seg_len) {
4271 os_log_error(OS_LOG_DEFAULT, "bytes_sacked (%u) can't be smaller than already "
4272 "SACKed segment length (%u)", tp->bytes_sacked, seg_len);
4273 }
4274 tp->bytes_sacked -= seg_len;
4275 }
4276 TAILQ_REMOVE(&tp->t_segs_acked, acked_seg, ack_link);
4277 TAILQ_REMOVE(&tp->t_segs_sent, acked_seg, tx_link);
4278 RB_REMOVE(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, acked_seg);
4279 tcp_seg_delete(tp, acked_seg);
4280 }
4281 }
4282
4283 void
4284 tcp_segs_doack(struct tcpcb *tp, tcp_seq th_ack, struct tcpopt *to)
4285 {
4286 uint32_t tsecr = 0, acked_xmit_ts = 0;
4287 tcp_seq acked_seq = th_ack;
4288 bool was_retransmitted = false;
4289
4290 if (TAILQ_EMPTY(&tp->t_segs_sent)) {
4291 return;
4292 }
4293
4294 if (((to->to_flags & TOF_TS) != 0) && (to->to_tsecr != 0)) {
4295 tsecr = to->to_tsecr;
4296 }
4297
4298 struct tcp_seg_sent seg = {};
4299 struct tcp_seg_sent *found_seg = NULL, *next = NULL;
4300
4301 found_seg = TAILQ_LAST(&tp->t_segs_sent, tcp_seg_sent_head);
4302
4303 if (tp->rack.segs_retransmitted == false) {
4304 if (SEQ_GEQ(th_ack, found_seg->end_seq)) {
4305 /*
4306 * ACK acknowledges the last sent segment completely (snd_max),
4307 * we can remove all segments from time ordered list.
4308 */
4309 acked_seq = found_seg->end_seq;
4310 acked_xmit_ts = found_seg->xmit_ts;
4311 was_retransmitted = !!(found_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE);
4312 tcp_segs_sent_clean(tp, false);
4313
4314 /* Advance RACK state */
4315 tcp_rack_update_segment_acked(tp, tsecr, acked_xmit_ts, acked_seq, was_retransmitted);
4316 return;
4317 }
4318 }
4319 /*
4320 * If either not all segments are ACKed OR the time-ordered list contains retransmitted
4321 * segments, do a RB tree search for largest (completely) ACKed segment and remove the ACKed
4322 * segment and all segments left of it from both RB tree and time-ordered list.
4323 *
4324 * Set the end sequence to search for ACKed segment.
4325 */
4326 seg.end_seq = th_ack;
4327
4328 if ((found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg)) != NULL) {
4329 acked_seq = found_seg->end_seq;
4330 acked_xmit_ts = found_seg->xmit_ts;
4331 was_retransmitted = !!(found_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE);
4332
4333 /*
4334 * Remove all segments that are ACKed by this ACK.
4335 * We defer self-balancing of RB tree to the end
4336 * by calling RB_REMOVE after collecting all ACKed segments.
4337 */
4338 tcp_seg_collect_acked(tp, RB_ROOT(&tp->t_segs_sent_tree), th_ack, acked_xmit_ts, tsecr);
4339 tcp_seg_delete_acked(tp, acked_xmit_ts, tsecr);
4340
4341 /* Advance RACK state */
4342 tcp_rack_update_segment_acked(tp, tsecr, acked_xmit_ts, acked_seq, was_retransmitted);
4343
4344 return;
4345 }
4346 /*
4347 * When TSO is enabled, it is possible that th_ack is less
4348 * than segment->end, hence we search the tree
4349 * until we find the largest (partially) ACKed segment.
4350 */
4351 RB_FOREACH_SAFE(found_seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, next) {
4352 if (SEQ_LT(th_ack, found_seg->end_seq) && SEQ_GT(th_ack, found_seg->start_seq)) {
4353 acked_seq = th_ack;
4354 acked_xmit_ts = found_seg->xmit_ts;
4355 was_retransmitted = !!(found_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE);
4356
4357 /* Remove all segments completely ACKed by this ack */
4358 tcp_seg_collect_acked(tp, RB_ROOT(&tp->t_segs_sent_tree), th_ack, acked_xmit_ts, tsecr);
4359 tcp_seg_delete_acked(tp, acked_xmit_ts, tsecr);
4360 found_seg->start_seq = th_ack;
4361
4362 /* Advance RACK state */
4363 tcp_rack_update_segment_acked(tp, tsecr, acked_xmit_ts, acked_seq, was_retransmitted);
4364 break;
4365 }
4366 }
4367 }
4368
4369 static bool
4370 tcp_seg_mark_sacked(struct tcpcb *tp, struct tcp_seg_sent *seg, uint32_t *newbytes_sacked)
4371 {
4372 if (seg->flags & TCP_SEGMENT_SACKED) {
4373 return false;
4374 }
4375
4376 const uint32_t seg_len = tcp_seg_len(seg);
4377
4378 /* Check for reordering */
4379 tcp_rack_detect_reordering_acked(tp, seg);
4380
4381 if (seg->flags & TCP_RACK_RETRANSMITTED) {
4382 if (seg->flags & TCP_SEGMENT_LOST) {
4383 /*
4384 			 * If the segment is not considered lost, we don't clear
4385 			 * retransmitted as it might still be in flight. The ONLY time
4386 			 * this can happen is when an RTO occurs and the segment is
4387 			 * retransmitted and SACKed before RACK detects that it was lost.
4388 */
4389 seg->flags &= ~(TCP_SEGMENT_LOST | TCP_RACK_RETRANSMITTED);
4390 if (tp->bytes_lost < seg_len || tp->bytes_retransmitted < seg_len) {
4391 os_log_error(OS_LOG_DEFAULT, "bytes_lost (%u) and/or bytes_retransmitted (%u) "
4392 "can't be smaller than already lost/retransmitted segment length (%u)", tp->bytes_lost,
4393 tp->bytes_retransmitted, seg_len);
4394 }
4395 tp->bytes_lost -= seg_len;
4396 tp->bytes_retransmitted -= seg_len;
4397 }
4398 } else {
4399 if (seg->flags & TCP_SEGMENT_LOST) {
4400 seg->flags &= ~(TCP_SEGMENT_LOST);
4401 if (tp->bytes_lost < seg_len) {
4402 os_log_error(OS_LOG_DEFAULT, "bytes_lost (%u) can't be smaller "
4403 "than already lost segment length (%u)", tp->bytes_lost, seg_len);
4404 }
4405 tp->bytes_lost -= seg_len;
4406 }
4407 }
4408 *newbytes_sacked += seg_len;
4409 seg->flags |= TCP_SEGMENT_SACKED;
4410 tp->bytes_sacked += seg_len;
4411
4412 return true;
4413 }
4414
4415 static void
4416 tcp_segs_dosack_matched(struct tcpcb *tp, struct tcp_seg_sent *found_seg,
4417 tcp_seq sblk_start, uint32_t tsecr,
4418 uint32_t *newbytes_sacked)
4419 {
4420 struct tcp_seg_sent seg = {};
4421
4422 while (found_seg != NULL) {
4423 if (sblk_start == found_seg->start_seq) {
4424 /*
4425 * Covered the entire SACK block.
4426 * Record segment flags before they get erased.
4427 */
4428 uint8_t seg_flags = found_seg->flags;
4429 bool newly_marked = tcp_seg_mark_sacked(tp, found_seg, newbytes_sacked);
4430 if (newly_marked) {
4431 /* Advance RACK state */
4432 tcp_rack_update_segment_acked(tp, tsecr, found_seg->xmit_ts,
4433 found_seg->end_seq,
4434 !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4435 }
4436 break;
4437 } else if (SEQ_GT(sblk_start, found_seg->start_seq)) {
4438 if ((found_seg->flags & TCP_SEGMENT_SACKED) != 0) {
4439 /* No need to process an already SACKED segment */
4440 break;
4441 }
4442 /*
4443 * This segment is partially ACKed by SACK block
4444 * as sblk_start > segment start. Since it is
4445 * partially SACKed, we should split the unSACKed and
4446 * SACKed parts.
4447 */
4448 /* First create a new segment for unSACKed part */
4449 tcp_seg_sent_insert_before(tp, found_seg, found_seg->start_seq, sblk_start,
4450 found_seg->xmit_ts, found_seg->flags);
4451 /* Now, update the SACKed part */
4452 found_seg->start_seq = sblk_start;
4453 /* Record seg flags before they get erased. */
4454 uint8_t seg_flags = found_seg->flags;
4455 bool newly_marked = tcp_seg_mark_sacked(tp, found_seg, newbytes_sacked);
4456 if (newly_marked) {
4457 /* Advance RACK state */
4458 tcp_rack_update_segment_acked(tp, tsecr, found_seg->xmit_ts,
4459 found_seg->end_seq,
4460 !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4461 }
4462 break;
4463 } else {
4464 /*
4465 * This segment lies within the SACK block
4466 * Record segment flags before they get erased.
4467 */
4468 uint8_t seg_flags = found_seg->flags;
4469 bool newly_marked = tcp_seg_mark_sacked(tp, found_seg, newbytes_sacked);
4470 if (newly_marked) {
4471 /* Advance RACK state */
4472 tcp_rack_update_segment_acked(tp, tsecr, found_seg->xmit_ts,
4473 found_seg->end_seq,
4474 !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4475 }
4476 /* Find the next segment ending at the start of current segment */
4477 seg.end_seq = found_seg->start_seq;
4478 found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg);
4479 }
4480 }
4481 }
4482
4483 void
4484 tcp_segs_dosack(struct tcpcb *tp, tcp_seq sblk_start, tcp_seq sblk_end,
4485 uint32_t tsecr, uint32_t *newbytes_sacked)
4486 {
4487 /*
4488  * When we receive a SACK, min RTT is computed after SACK processing, which
4489  * means we use the min RTT from the previous ACK to advance the RACK state.
4490  * This is ok as we track a windowed, min-filtered estimate over a period.
4491 */
4492 struct tcp_seg_sent seg = {};
4493 struct tcp_seg_sent *found_seg = NULL, *sacked_seg = NULL;
4494
4495 /* Set the end sequence to search for SACKed segment */
4496 seg.end_seq = sblk_end;
4497 found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg);
4498
4499 if (found_seg != NULL) {
4500 /* We found an exact match for sblk_end */
4501 tcp_segs_dosack_matched(tp, found_seg, sblk_start, tsecr, newbytes_sacked);
4502 return;
4503 }
4504 /*
4505 * We come here when we don't find an exact match and sblk_end
4506 * lies within a segment. This would happen only when TSO is used.
4507 */
4508 RB_FOREACH(found_seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
4509 if (SEQ_LT(sblk_end, found_seg->end_seq) && SEQ_GT(sblk_end, found_seg->start_seq)) {
4510 /*
4511 * This segment is partially SACKed. We split this segment at the boundary
4512 * of SACK block. First insert the newly SACKed part
4513 */
4514 tcp_seq start = SEQ_LEQ(sblk_start, found_seg->start_seq) ? found_seg->start_seq : sblk_start;
4515 struct tcp_seg_sent *inserted = tcp_seg_sent_insert_before(tp, found_seg, start,
4516 sblk_end, found_seg->xmit_ts, found_seg->flags);
4517 /* Record seg flags before they get erased. */
4518 uint8_t seg_flags = inserted->flags;
4519 /* Mark the SACKed segment */
4520 tcp_seg_mark_sacked(tp, inserted, newbytes_sacked);
4521
4522 /* Advance RACK state */
4523 tcp_rack_update_segment_acked(tp, tsecr, inserted->xmit_ts,
4524 inserted->end_seq, !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4525
4526 if (sblk_start == found_seg->start_seq) {
4527 /*
4528 * We are done with this SACK block.
4529 * Move the start of existing segment
4530 */
4531 found_seg->start_seq = sblk_end;
4532 break;
4533 }
4534
4535 if (SEQ_GT(sblk_start, found_seg->start_seq)) {
4536 /* Insert the remaining unSACKed part before the SACKED segment inserted above */
4537 tcp_seg_sent_insert_before(tp, inserted, found_seg->start_seq,
4538 sblk_start, found_seg->xmit_ts, found_seg->flags);
4539 /* Move the start of existing segment */
4540 found_seg->start_seq = sblk_end;
4541 break;
4542 } else {
4543 /*
4544 				 * This SACK block covers more than one segment.
4545 				 * Look for segments SACKed below this segment.
4546 */
4547 seg.end_seq = found_seg->start_seq;
4548 sacked_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg);
4549
4550 if (sacked_seg != NULL) {
4551 					/* Found the segment that ends at the start of the current one */
4552 tcp_segs_dosack_matched(tp, sacked_seg, sblk_start, tsecr, newbytes_sacked);
4553 }
4554
4555 /* Move the start of existing segment */
4556 found_seg->start_seq = sblk_end;
4557 }
4558 break;
4559 }
4560 }
4561 }
4562
4563 void
4564 tcp_segs_clear_sacked(struct tcpcb *tp)
4565 {
4566 struct tcp_seg_sent *seg = NULL;
4567
4568 TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link)
4569 {
4570 const uint32_t seg_len = tcp_seg_len(seg);
4571
4572 if (seg->flags & TCP_SEGMENT_SACKED) {
4573 seg->flags &= ~(TCP_SEGMENT_SACKED);
4574 if (tp->bytes_sacked < seg_len) {
4575 os_log_error(OS_LOG_DEFAULT, "bytes_sacked (%u) can't be smaller "
4576 "than already SACKed segment length (%u)", tp->bytes_sacked, seg_len);
4577 }
4578 tp->bytes_sacked -= seg_len;
4579 }
4580 }
4581 }
4582
4583 void
4584 tcp_mark_seg_lost(struct tcpcb *tp, struct tcp_seg_sent *seg)
4585 {
4586 const uint32_t seg_len = tcp_seg_len(seg);
4587
4588 if (seg->flags & TCP_SEGMENT_LOST) {
4589 if (seg->flags & TCP_RACK_RETRANSMITTED) {
4590 /* Retransmission was lost */
4591 seg->flags &= ~TCP_RACK_RETRANSMITTED;
4592 if (tp->bytes_retransmitted < seg_len) {
4593 os_log_error(OS_LOG_DEFAULT, "bytes_retransmitted (%u) can't be "
4594 				    "smaller than retransmitted segment length (%u)",
4595 tp->bytes_retransmitted, seg_len);
4596 return;
4597 }
4598 tp->bytes_retransmitted -= seg_len;
4599 }
4600 } else {
4601 seg->flags |= TCP_SEGMENT_LOST;
4602 tp->bytes_lost += seg_len;
4603 }
4604 }
4605
4606 void
4607 tcp_seg_delete(struct tcpcb *tp, struct tcp_seg_sent *seg)
4608 {
4609 if (tp->seg_pool.free_segs_count >= TCP_SEG_POOL_MAX_ITEM_COUNT) {
4610 zfree(tcp_seg_sent_zone, seg);
4611 } else {
4612 bzero(seg, sizeof(*seg));
4613 TAILQ_INSERT_TAIL(&tp->seg_pool.free_segs, seg, free_link);
4614 tp->seg_pool.free_segs_count++;
4615 }
4616 }
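/*
 * Design note: instead of returning every segment to the zone
 * allocator, up to TCP_SEG_POOL_MAX_ITEM_COUNT freed segments are
 * parked on a per-connection free list, which keeps the frequent
 * alloc/free churn in tcp_seg_alloc_init() cheap during recovery.
 */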
4617
4618 void
4619 tcp_segs_sent_clean(struct tcpcb *tp, bool free_segs)
4620 {
4621 struct tcp_seg_sent *seg = NULL, *next = NULL;
4622
4623 TAILQ_FOREACH_SAFE(seg, &tp->t_segs_sent, tx_link, next) {
4624 /* Check for reordering */
4625 tcp_rack_detect_reordering_acked(tp, seg);
4626
4627 TAILQ_REMOVE(&tp->t_segs_sent, seg, tx_link);
4628 RB_REMOVE(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4629 tcp_seg_delete(tp, seg);
4630 }
4631 if (__improbable(!RB_EMPTY(&tp->t_segs_sent_tree))) {
4632 os_log_error(OS_LOG_DEFAULT, "RB tree still contains segments while "
4633 "time ordered list is already empty");
4634 }
4635 if (__improbable(!TAILQ_EMPTY(&tp->t_segs_acked))) {
4636 os_log_error(OS_LOG_DEFAULT, "Segment ACKed list shouldn't contain "
4637 "any segments as they are removed immediately after being ACKed");
4638 }
4639 /* Reset seg_retransmitted as we emptied the list */
4640 tcp_rack_reset_segs_retransmitted(tp);
4641 tp->bytes_lost = tp->bytes_sacked = tp->bytes_retransmitted = 0;
4642
4643 /* Empty the free segments pool */
4644 if (free_segs) {
4645 TAILQ_FOREACH_SAFE(seg, &tp->seg_pool.free_segs, free_link, next) {
4646 TAILQ_REMOVE(&tp->seg_pool.free_segs, seg, free_link);
4647 zfree(tcp_seg_sent_zone, seg);
4648 }
4649 tp->seg_pool.free_segs_count = 0;
4650 }
4651 }
4652
4653 void
4654 tcp_get_connectivity_status(struct tcpcb *tp,
4655 struct tcp_conn_status *connstatus)
4656 {
4657 if (tp == NULL || connstatus == NULL) {
4658 return;
4659 }
4660 bzero(connstatus, sizeof(*connstatus));
4661 if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
4662 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
4663 connstatus->write_probe_failed = 1;
4664 } else {
4665 connstatus->conn_probe_failed = 1;
4666 }
4667 }
4668 if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX) {
4669 connstatus->read_probe_failed = 1;
4670 }
4671 if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
4672 (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) {
4673 connstatus->probe_activated = 1;
4674 }
4675 }
4676
4677 void
4678 tcp_disable_tfo(struct tcpcb *tp)
4679 {
4680 tp->t_flagsext &= ~TF_FASTOPEN;
4681 }
4682
4683 static struct mbuf *
4684 tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp,
4685 boolean_t is_probe)
4686 {
4687 struct inpcb *inp = tp->t_inpcb;
4688 struct tcphdr *th;
4689 caddr_t data;
4690 int win = 0;
4691 struct mbuf *m;
4692
4693 /*
4694 * The code assumes the IP + TCP headers fit in an mbuf packet header
4695 */
4696 _CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
4697 _CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
4698
4699 MGETHDR(m, M_WAIT, MT_HEADER);
4700 if (m == NULL) {
4701 return NULL;
4702 }
4703 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
4704
4705 data = m_mtod_lower_bound(m);
4706
4707 if (inp->inp_vflag & INP_IPV4) {
4708 bzero(data, sizeof(struct ip) + sizeof(struct tcphdr));
4709 th = (struct tcphdr *)(void *) (data + sizeof(struct ip));
4710 m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
4711 m->m_pkthdr.len = m->m_len;
4712 } else {
4713 VERIFY(inp->inp_vflag & INP_IPV6);
4714
4715 bzero(data, sizeof(struct ip6_hdr)
4716 + sizeof(struct tcphdr));
4717 th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr));
4718 m->m_len = sizeof(struct ip6_hdr) +
4719 sizeof(struct tcphdr);
4720 m->m_pkthdr.len = m->m_len;
4721 }
4722
4723 tcp_fillheaders(m, tp, data, th);
4724
4725 if (inp->inp_vflag & INP_IPV4) {
4726 struct ip *ip;
4727
4728 ip = (__typeof__(ip))(void *)data;
4729
4730 ip->ip_id = rfc6864 ? 0 : ip_randomid((uint64_t)m);
4731 ip->ip_off = htons(IP_DF);
4732 ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
4733 ip->ip_ttl = inp->inp_ip_ttl;
4734 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
4735 ip->ip_sum = in_cksum_hdr(ip);
4736 } else {
4737 struct ip6_hdr *ip6;
4738
4739 ip6 = (__typeof__(ip6))(void *)data;
4740
4741 ip6->ip6_plen = htons(sizeof(struct tcphdr));
4742 ip6->ip6_hlim = in6_selecthlim(inp, ifp);
4743 ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
4744
4745 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
4746 ip6->ip6_src.s6_addr16[1] = 0;
4747 }
4748 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
4749 ip6->ip6_dst.s6_addr16[1] = 0;
4750 }
4751 }
4752 th->th_flags = TH_ACK;
4753
4754 win = tcp_sbspace(tp);
4755 if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale)) {
4756 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
4757 }
4758 th->th_win = htons((u_short) (win >> tp->rcv_scale));
4759
4760 if (is_probe) {
4761 th->th_seq = htonl(tp->snd_una - 1);
4762 } else {
4763 th->th_seq = htonl(tp->snd_una);
4764 }
4765 th->th_ack = htonl(tp->rcv_nxt);
4766
4767 /* Force recompute TCP checksum to be the final value */
4768 th->th_sum = 0;
4769 if (inp->inp_vflag & INP_IPV4) {
4770 th->th_sum = inet_cksum(m, IPPROTO_TCP,
4771 sizeof(struct ip), sizeof(struct tcphdr));
4772 } else {
4773 th->th_sum = inet6_cksum(m, IPPROTO_TCP,
4774 sizeof(struct ip6_hdr), sizeof(struct tcphdr));
4775 }
4776
4777 return m;
4778 }
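/*
 * A note on the sequence numbers chosen above: the probe frame uses
 * snd_una - 1, the classic keepalive trick of sending an
 * already-acknowledged sequence number so the peer is forced to answer
 * with an ACK, while the reply template uses snd_una so it simply
 * restates the current state when answering the peer's probes.
 */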
4779
4780 void
4781 tcp_fill_keepalive_offload_frames(ifnet_t ifp,
4782 struct ifnet_keepalive_offload_frame *frames_array __counted_by(frames_array_count),
4783 u_int32_t frames_array_count, size_t frame_data_offset,
4784 u_int32_t *used_frames_count)
4785 {
4786 struct inpcb *inp;
4787 inp_gen_t gencnt;
4788 u_int32_t frame_index = *used_frames_count;
4789
4790 /* Validation of the parameters */
4791 if (ifp == NULL || frames_array == NULL ||
4792 frames_array_count == 0 ||
4793 frame_index >= frames_array_count ||
4794 frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4795 return;
4796 }
4797
4798 /* Fast exit when no process is using the socket option TCP_KEEPALIVE_OFFLOAD */
4799 if (ifp->if_tcp_kao_cnt == 0) {
4800 return;
4801 }
4802
4803 /*
4804 * This function is called outside the regular TCP processing
4805 * so we need to update the TCP clock.
4806 */
4807 calculate_tcp_clock();
4808
4809 lck_rw_lock_shared(&tcbinfo.ipi_lock);
4810 gencnt = tcbinfo.ipi_gencnt;
4811 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4812 struct socket *so;
4813 struct ifnet_keepalive_offload_frame *frame;
4814 struct mbuf *m = NULL;
4815 struct tcpcb *tp = intotcpcb(inp);
4816
4817 if (frame_index >= frames_array_count) {
4818 break;
4819 }
4820
4821 if (inp->inp_gencnt > gencnt ||
4822 inp->inp_state == INPCB_STATE_DEAD) {
4823 continue;
4824 }
4825
4826 if ((so = inp->inp_socket) == NULL ||
4827 (so->so_state & SS_DEFUNCT)) {
4828 continue;
4829 }
4830 /*
4831 * check for keepalive offload flag without socket
4832 * lock to avoid a deadlock
4833 */
4834 if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4835 continue;
4836 }
4837
4838 if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
4839 continue;
4840 }
4841 if (inp->inp_ppcb == NULL ||
4842 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
4843 continue;
4844 }
4845 socket_lock(so, 1);
4846 /* Release the want count */
4847 if (inp->inp_ppcb == NULL ||
4848 (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
4849 socket_unlock(so, 1);
4850 continue;
4851 }
4852 if ((inp->inp_vflag & INP_IPV4) &&
4853 (inp->inp_laddr.s_addr == INADDR_ANY ||
4854 inp->inp_faddr.s_addr == INADDR_ANY)) {
4855 socket_unlock(so, 1);
4856 continue;
4857 }
4858 if ((inp->inp_vflag & INP_IPV6) &&
4859 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
4860 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
4861 socket_unlock(so, 1);
4862 continue;
4863 }
4864 if (inp->inp_lport == 0 || inp->inp_fport == 0) {
4865 socket_unlock(so, 1);
4866 continue;
4867 }
4868 if (inp->inp_last_outifp == NULL ||
4869 inp->inp_last_outifp->if_index != ifp->if_index) {
4870 socket_unlock(so, 1);
4871 continue;
4872 }
4873 if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
4874 sizeof(struct ip) + sizeof(struct tcphdr) >
4875 IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4876 socket_unlock(so, 1);
4877 continue;
4878 } else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
4879 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
4880 IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4881 socket_unlock(so, 1);
4882 continue;
4883 }
4884 /*
4885 * There is no point in waking up the device for connections
4886 		 * that are not established. Long-lived connections are meant
4887 		 * for processes that will send and receive data.
4888 */
4889 if (tp->t_state != TCPS_ESTABLISHED) {
4890 socket_unlock(so, 1);
4891 continue;
4892 }
4893 /*
4894 * This inp has all the information that is needed to
4895 * generate an offload frame.
4896 */
4897 frame = &frames_array[frame_index];
4898 frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
4899 frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
4900 IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
4901 IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
4902 frame->interval = (uint16_t)(tp->t_keepidle > 0 ? tp->t_keepidle :
4903 tcp_keepidle);
4904 frame->keep_cnt = (uint8_t)TCP_CONN_KEEPCNT(tp);
4905 frame->keep_retry = (uint16_t)TCP_CONN_KEEPINTVL(tp);
4906 if (so->so_options & SO_NOWAKEFROMSLEEP) {
4907 frame->flags |=
4908 IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP;
4909 }
4910 frame->local_port = ntohs(inp->inp_lport);
4911 frame->remote_port = ntohs(inp->inp_fport);
4912 frame->local_seq = tp->snd_nxt;
4913 frame->remote_seq = tp->rcv_nxt;
4914 if (inp->inp_vflag & INP_IPV4) {
4915 ASSERT(frame_data_offset + sizeof(struct ip) + sizeof(struct tcphdr) <= UINT8_MAX);
4916 frame->length = (uint8_t)(frame_data_offset +
4917 sizeof(struct ip) + sizeof(struct tcphdr));
4918 frame->reply_length = frame->length;
4919
4920 frame->addr_length = sizeof(struct in_addr);
4921 bcopy(&inp->inp_laddr, frame->local_addr,
4922 sizeof(struct in_addr));
4923 bcopy(&inp->inp_faddr, frame->remote_addr,
4924 sizeof(struct in_addr));
4925 } else {
4926 struct in6_addr *ip6;
4927
4928 ASSERT(frame_data_offset + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= UINT8_MAX);
4929 frame->length = (uint8_t)(frame_data_offset +
4930 sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
4931 frame->reply_length = frame->length;
4932
4933 frame->addr_length = sizeof(struct in6_addr);
4934 ip6 = (struct in6_addr *)(void *)frame->local_addr;
4935 bcopy(&inp->in6p_laddr, ip6, sizeof(struct in6_addr));
4936 if (IN6_IS_SCOPE_EMBED(ip6)) {
4937 ip6->s6_addr16[1] = 0;
4938 }
4939
4940 ip6 = (struct in6_addr *)(void *)frame->remote_addr;
4941 bcopy(&inp->in6p_faddr, ip6, sizeof(struct in6_addr));
4942 if (IN6_IS_SCOPE_EMBED(ip6)) {
4943 ip6->s6_addr16[1] = 0;
4944 }
4945 }
4946
4947 /*
4948 * First the probe
4949 */
4950 m = tcp_make_keepalive_frame(tp, ifp, TRUE);
4951 if (m == NULL) {
4952 socket_unlock(so, 1);
4953 continue;
4954 }
4955 bcopy(m_mtod_current(m), frame->data + frame_data_offset, m->m_len);
4956 m_freem(m);
4957
4958 /*
4959 * Now the response packet to incoming probes
4960 */
4961 m = tcp_make_keepalive_frame(tp, ifp, FALSE);
4962 if (m == NULL) {
4963 socket_unlock(so, 1);
4964 continue;
4965 }
4966 bcopy(m_mtod_current(m), frame->reply_data + frame_data_offset,
4967 m->m_len);
4968 m_freem(m);
4969
4970 frame_index++;
4971 socket_unlock(so, 1);
4972 }
4973 lck_rw_done(&tcbinfo.ipi_lock);
4974 *used_frames_count = frame_index;
4975 }
4976
4977 static bool
4978 inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame,
4979 struct inpcb *inp)
4980 {
4981 if (inp->inp_ppcb == NULL) {
4982 return false;
4983 }
4984 /* Release the want count */
4985 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
4986 return false;
4987 }
4988 if (inp->inp_last_outifp == NULL ||
4989 inp->inp_last_outifp->if_index != ifp->if_index) {
4990 return false;
4991 }
4992 if (frame->local_port != ntohs(inp->inp_lport) ||
4993 frame->remote_port != ntohs(inp->inp_fport)) {
4994 return false;
4995 }
4996 if (inp->inp_vflag & INP_IPV4) {
4997 if (memcmp(&inp->inp_laddr, frame->local_addr,
4998 sizeof(struct in_addr)) != 0 ||
4999 memcmp(&inp->inp_faddr, frame->remote_addr,
5000 sizeof(struct in_addr)) != 0) {
5001 return false;
5002 }
5003 } else if (inp->inp_vflag & INP_IPV6) {
5004 if (memcmp(&inp->inp_laddr, frame->local_addr,
5005 sizeof(struct in6_addr)) != 0 ||
5006 memcmp(&inp->inp_faddr, frame->remote_addr,
5007 sizeof(struct in6_addr)) != 0) {
5008 return false;
5009 }
5010 } else {
5011 return false;
5012 }
5013 return true;
5014 }
5015
5016 int
5017 tcp_notify_kao_timeout(ifnet_t ifp,
5018 struct ifnet_keepalive_offload_frame *frame)
5019 {
5020 struct inpcb *inp = NULL;
5021 struct socket *so = NULL;
5022 bool found = false;
5023
5024 /*
5025 	 * The list lock is dropped before the event is posted on the matching socket
5026 */
5027 lck_rw_lock_shared(&tcbinfo.ipi_lock);
5028
5029 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
5030 if ((so = inp->inp_socket) == NULL ||
5031 (so->so_state & SS_DEFUNCT)) {
5032 continue;
5033 }
5034 if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
5035 continue;
5036 }
5037 if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
5038 continue;
5039 }
5040 if (inp->inp_ppcb == NULL ||
5041 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
5042 continue;
5043 }
5044 socket_lock(so, 1);
5045 if (inp_matches_kao_frame(ifp, frame, inp)) {
5046 /*
5047 * Keep the matching socket locked
5048 */
5049 found = true;
5050 break;
5051 }
5052 socket_unlock(so, 1);
5053 }
5054 lck_rw_done(&tcbinfo.ipi_lock);
5055
5056 if (found) {
5057 ASSERT(inp != NULL);
5058 ASSERT(so != NULL);
5059 ASSERT(so == inp->inp_socket);
5060 /*
5061 * Drop the TCP connection like tcptimers() does
5062 */
5063 tcpcb_ref_t tp = inp->inp_ppcb;
5064
5065 tcpstat.tcps_keepdrops++;
5066 soevent(so,
5067 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
5068 tp = tcp_drop(tp, ETIMEDOUT);
5069
5070 tcpstat.tcps_ka_offload_drops++;
5071 os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n",
5072 __func__, frame->local_port, frame->remote_port);
5073
5074 socket_unlock(so, 1);
5075 }
5076
5077 return 0;
5078 }
5079
5080 errno_t
5081 tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
5082 u_int32_t notify_id)
5083 {
5084 struct tcp_notify_ack_marker *elm;
5085
5086 if (so->so_snd.sb_cc == 0) {
5087 return ENOBUFS;
5088 }
5089
5090 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
5091 /* Duplicate id is not allowed */
5092 if (elm->notify_id == notify_id) {
5093 return EINVAL;
5094 }
5095 /* Duplicate position is not allowed */
5096 if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc) {
5097 return EINVAL;
5098 }
5099 }
5100 return 0;
5101 }
5102
5103 errno_t
5104 tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
5105 {
5106 struct tcp_notify_ack_marker *nm, *elm = NULL;
5107 struct socket *so = tp->t_inpcb->inp_socket;
5108
5109 nm = kalloc_type(struct tcp_notify_ack_marker, M_WAIT | Z_ZERO);
5110 if (nm == NULL) {
5111 return ENOMEM;
5112 }
5113 nm->notify_id = notify_id;
5114 nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;
5115
5116 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
5117 if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una)) {
5118 break;
5119 }
5120 }
5121
5122 if (elm == NULL) {
5123 VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
5124 SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
5125 } else {
5126 SLIST_INSERT_AFTER(elm, nm, notify_next);
5127 }
5128 tp->t_notify_ack_count++;
5129 return 0;
5130 }
5131
5132 void
5133 tcp_notify_ack_free(struct tcpcb *tp)
5134 {
5135 struct tcp_notify_ack_marker *elm, *next;
5136 if (SLIST_EMPTY(&tp->t_notify_ack)) {
5137 return;
5138 }
5139
5140 SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
5141 SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
5142 notify_next);
5143 kfree_type(struct tcp_notify_ack_marker, elm);
5144 }
5145 SLIST_INIT(&tp->t_notify_ack);
5146 tp->t_notify_ack_count = 0;
5147 }
5148
5149 inline void
5150 tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so)
5151 {
5152 struct tcp_notify_ack_marker *elm;
5153
5154 elm = SLIST_FIRST(&tp->t_notify_ack);
5155 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5156 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK);
5157 }
5158 }
5159
5160 void
5161 tcp_get_notify_ack_count(struct tcpcb *tp,
5162 struct tcp_notify_ack_complete *retid)
5163 {
5164 struct tcp_notify_ack_marker *elm;
5165 uint32_t complete = 0;
5166
5167 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
5168 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5169 ASSERT(complete < UINT32_MAX);
5170 complete++;
5171 } else {
5172 break;
5173 }
5174 }
5175 retid->notify_pending = tp->t_notify_ack_count - complete;
5176 retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, complete);
5177 }
5178
5179 void
5180 tcp_get_notify_ack_ids(struct tcpcb *tp,
5181 struct tcp_notify_ack_complete *retid)
5182 {
5183 size_t i = 0;
5184 struct tcp_notify_ack_marker *elm, *next;
5185
5186 SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
5187 if (i >= retid->notify_complete_count) {
5188 break;
5189 }
5190 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5191 retid->notify_complete_id[i++] = elm->notify_id;
5192 SLIST_REMOVE(&tp->t_notify_ack, elm,
5193 tcp_notify_ack_marker, notify_next);
5194 kfree_type(struct tcp_notify_ack_marker, elm);
5195 tp->t_notify_ack_count--;
5196 } else {
5197 break;
5198 }
5199 }
5200 }
5201
5202 bool
5203 tcp_notify_ack_active(struct socket *so)
5204 {
5205 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5206 SOCK_TYPE(so) == SOCK_STREAM) {
5207 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5208
5209 if (!SLIST_EMPTY(&tp->t_notify_ack)) {
5210 struct tcp_notify_ack_marker *elm;
5211 elm = SLIST_FIRST(&tp->t_notify_ack);
5212 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5213 return true;
5214 }
5215 }
5216 }
5217 return false;
5218 }
5219
5220 inline int32_t
5221 inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
5222 {
5223 struct inpcb *inp = sotoinpcb(so);
5224 struct tcpcb *tp = intotcpcb(inp);
5225
5226 if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
5227 so->so_snd.sb_cc > 0) {
5228 int32_t unsent, sent;
5229 sent = tp->snd_max - th_ack;
5230 if (tp->t_flags & TF_SENTFIN) {
5231 sent--;
5232 }
5233 unsent = so->so_snd.sb_cc - sent;
5234 return unsent;
5235 }
5236 return 0;
5237 }
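/*
 * Worked example for the arithmetic above: if snd_max - th_ack == 3000
 * bytes are sent but not yet covered by this ACK and the send buffer
 * holds sb_cc == 10000 bytes, then 7000 bytes remain unsent. When a
 * FIN has been sent it occupies one sequence number that is not buffer
 * data, hence the sent-- adjustment.
 */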
5238
5239 uint8_t
5240 tcp_get_ace(struct tcphdr *th)
5241 {
5242 uint8_t ace = 0;
5243 if (th->th_flags & TH_ECE) {
5244 ace += 1;
5245 }
5246 if (th->th_flags & TH_CWR) {
5247 ace += 2;
5248 }
5249 if (th->th_x2 & (TH_AE >> 8)) {
5250 ace += 4;
5251 }
5252
5253 return ace;
5254 }
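/*
 * The mapping above matches the Accurate ECN (AccECN) ACE counter
 * layout: ECE contributes bit 0, CWR bit 1 and AE bit 2. For example,
 * a header with CWR and ECE set but AE clear decodes to ace == 3.
 */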
5255
5256 #define IFP_PER_FLOW_STAT(_ipv4_, _stat_) { \
5257 if (_ipv4_) { \
5258 ifp->if_ipv4_stat->_stat_++; \
5259 } else { \
5260 ifp->if_ipv6_stat->_stat_++; \
5261 } \
5262 }
5263
5264 #define FLOW_ECN_ENABLED(_flags_) \
5265 ((_flags_ & (TE_ECN_ON)) == (TE_ECN_ON))
5266
5267 void
5268 tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
5269 struct ifnet *ifp)
5270 {
5271 if (ifp == NULL || !IF_FULLY_ATTACHED(ifp)) {
5272 return;
5273 }
5274
5275 ifnet_lock_shared(ifp);
5276 if (ifs->ecn_flags & TE_SETUPSENT) {
5277 if (ifs->ecn_flags & TE_CLIENT_SETUP) {
5278 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
5279 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5280 IFP_PER_FLOW_STAT(ifs->ipv4,
5281 ecn_client_success);
5282 } else if (ifs->ecn_flags & TE_LOST_SYN) {
5283 IFP_PER_FLOW_STAT(ifs->ipv4,
5284 ecn_syn_lost);
5285 } else {
5286 IFP_PER_FLOW_STAT(ifs->ipv4,
5287 ecn_peer_nosupport);
5288 }
5289 } else {
5290 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
5291 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5292 IFP_PER_FLOW_STAT(ifs->ipv4,
5293 ecn_server_success);
5294 } else if (ifs->ecn_flags & TE_LOST_SYN) {
5295 IFP_PER_FLOW_STAT(ifs->ipv4,
5296 ecn_synack_lost);
5297 } else {
5298 IFP_PER_FLOW_STAT(ifs->ipv4,
5299 ecn_peer_nosupport);
5300 }
5301 }
5302 } else {
5303 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
5304 }
    if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
        if (ifs->ecn_flags & TE_RECV_ECN_CE) {
            tcpstat.tcps_ecn_conn_recv_ce++;
            IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
        }
        if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
            tcpstat.tcps_ecn_conn_recv_ece++;
            IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
        }
        if (ifs->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) {
            if (ifs->txretransmitbytes > 0 ||
                ifs->rxoutoforderbytes > 0) {
                tcpstat.tcps_ecn_conn_pl_ce++;
                IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
            } else {
                tcpstat.tcps_ecn_conn_nopl_ce++;
                IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
            }
        } else {
            if (ifs->txretransmitbytes > 0 ||
                ifs->rxoutoforderbytes > 0) {
                tcpstat.tcps_ecn_conn_plnoce++;
                IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
            }
        }
    }

    /* Other stats are interesting for non-local connections only */
    if (ifs->local) {
        ifnet_lock_done(ifp);
        return;
    }

    if (ifs->ipv4) {
        ifp->if_ipv4_stat->timestamp = net_uptime();
        if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
            tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_on);
        } else {
            tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_off);
        }
    } else {
        ifp->if_ipv6_stat->timestamp = net_uptime();
        if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
            tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_on);
        } else {
            tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_off);
        }
    }

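    /* Per-flow drop and ECN fallback counters, plus received CE/ECE marks */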
    if (ifs->rxmit_drop) {
        if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
            IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
        } else {
            IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
        }
    }
    if (ifs->ecn_fallback_synloss) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
    }
    if (ifs->ecn_fallback_droprst) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
    }
    if (ifs->ecn_fallback_droprxmt) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
    }
    if (ifs->ecn_fallback_ce) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
    }
    if (ifs->ecn_fallback_reorder) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
    }
    if (ifs->ecn_recv_ce > 0) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
    }
    if (ifs->ecn_recv_ece > 0) {
        IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);
    }

    tcp_flow_lim_stats(ifs, &ifp->if_lim_stat);

    /*
     * Link heuristics are updated here only for NECP client flows, when
     * they close; socket flows are updated live.
     */
    os_atomic_add(&ifp->if_tcp_stat->linkheur_noackpri, ifs->linkheur_noackpri, relaxed);
    os_atomic_add(&ifp->if_tcp_stat->linkheur_comprxmt, ifs->linkheur_comprxmt, relaxed);
    os_atomic_add(&ifp->if_tcp_stat->linkheur_synrxmt, ifs->linkheur_synrxmt, relaxed);
    os_atomic_add(&ifp->if_tcp_stat->linkheur_rxmtfloor, ifs->linkheur_rxmtfloor, relaxed);

    ifnet_lock_done(ifp);
}

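/*
 * Zone allocator wrappers for TCP reassembly, retransmit-segment and
 * sent-segment queue entries. All of them use Z_WAITOK | Z_NOFAIL, so they
 * may block but never return NULL; the latter two also zero the allocation
 * (Z_ZERO).
 */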
struct tseg_qent *
tcp_reass_qent_alloc(void)
{
    return zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);
}

void
tcp_reass_qent_free(struct tseg_qent *te)
{
    zfree(tcp_reass_zone, te);
}

struct tcp_rxt_seg *
tcp_rxt_seg_qent_alloc(void)
{
    return zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
}

void
tcp_rxt_seg_qent_free(struct tcp_rxt_seg *te)
{
    zfree(tcp_rxt_seg_zone, te);
}

struct tcp_seg_sent *
tcp_seg_sent_qent_alloc(void)
{
    return zalloc_flags(tcp_seg_sent_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
}

void
tcp_seg_sent_qent_free(struct tcp_seg_sent *te)
{
    zfree(tcp_seg_sent_zone, te);
}

#if SKYWALK

#include <skywalk/core/skywalk_var.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>

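/*
 * Register an established TCP connection as a flow with the interface's
 * Skywalk flowswitch, so that received segments can be aggregated for it
 * (only when NX_FSW_TCP_RX_AGG_ENABLED() says RX aggregation is on). On
 * success, the flowswitch and flow UUIDs are saved in the tcpcb so that
 * tcp_del_fsw_flow() can tear the flow down later.
 */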
void
tcp_add_fsw_flow(struct tcpcb *tp, struct ifnet *ifp)
{
    struct inpcb *inp = tp->t_inpcb;
    struct socket *so = inp->inp_socket;
    uuid_t fsw_uuid;
    struct nx_flow_req nfr;
    int err;

    if (!NX_FSW_TCP_RX_AGG_ENABLED()) {
        return;
    }

    if (ifp == NULL || kern_nexus_get_flowswitch_instance(ifp, fsw_uuid)) {
        TCP_LOG_FSW_FLOW(tp, "skip ifp no fsw");
        return;
    }

    memset(&nfr, 0, sizeof(nfr));

    if (inp->inp_vflag & INP_IPV4) {
        ASSERT(!(inp->inp_laddr.s_addr == INADDR_ANY ||
            inp->inp_faddr.s_addr == INADDR_ANY ||
            IN_MULTICAST(ntohl(inp->inp_laddr.s_addr)) ||
            IN_MULTICAST(ntohl(inp->inp_faddr.s_addr))));
        nfr.nfr_saddr.sin.sin_len = sizeof(struct sockaddr_in);
        nfr.nfr_saddr.sin.sin_family = AF_INET;
        nfr.nfr_saddr.sin.sin_port = inp->inp_lport;
        memcpy(&nfr.nfr_saddr.sin.sin_addr, &inp->inp_laddr,
            sizeof(struct in_addr));
        nfr.nfr_daddr.sin.sin_len = sizeof(struct sockaddr_in);
        nfr.nfr_daddr.sin.sin_family = AF_INET;
        nfr.nfr_daddr.sin.sin_port = inp->inp_fport;
        memcpy(&nfr.nfr_daddr.sin.sin_addr, &inp->inp_faddr,
            sizeof(struct in_addr));
    } else {
        ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
            IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
            IN6_IS_ADDR_MULTICAST(&inp->in6p_laddr) ||
            IN6_IS_ADDR_MULTICAST(&inp->in6p_faddr)));
        nfr.nfr_saddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
        nfr.nfr_saddr.sin6.sin6_family = AF_INET6;
        nfr.nfr_saddr.sin6.sin6_port = inp->inp_lport;
        memcpy(&nfr.nfr_saddr.sin6.sin6_addr, &inp->in6p_laddr,
            sizeof(struct in6_addr));
        nfr.nfr_daddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
        nfr.nfr_daddr.sin6.sin6_family = AF_INET6;
        nfr.nfr_daddr.sin6.sin6_port = inp->inp_fport;
        memcpy(&nfr.nfr_daddr.sin6.sin6_addr, &inp->in6p_faddr,
            sizeof(struct in6_addr));
        /* clear embedded scope ID */
        if (IN6_IS_SCOPE_EMBED(&nfr.nfr_saddr.sin6.sin6_addr)) {
            nfr.nfr_saddr.sin6.sin6_addr.s6_addr16[1] = 0;
        }
        if (IN6_IS_SCOPE_EMBED(&nfr.nfr_daddr.sin6.sin6_addr)) {
            nfr.nfr_daddr.sin6.sin6_addr.s6_addr16[1] = 0;
        }
    }

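    /* Fill in the remaining flow request parameters and add the flow */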
    nfr.nfr_nx_port = 1;
    nfr.nfr_ip_protocol = IPPROTO_TCP;
    nfr.nfr_transport_protocol = IPPROTO_TCP;
    nfr.nfr_flags = NXFLOWREQF_ASIS;
    nfr.nfr_epid = (so != NULL ? so->last_pid : 0);
    if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
        nfr.nfr_port_reservation = inp->inp_netns_token;
        nfr.nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
    }
    ASSERT(inp->inp_flowhash != 0);
    nfr.nfr_inp_flowhash = inp->inp_flowhash;

    uuid_generate_random(nfr.nfr_flow_uuid);
    err = kern_nexus_flow_add(kern_nexus_shared_controller(), fsw_uuid,
        &nfr, sizeof(nfr));

    if (err == 0) {
        uuid_copy(tp->t_fsw_uuid, fsw_uuid);
        uuid_copy(tp->t_flow_uuid, nfr.nfr_flow_uuid);
    }

    TCP_LOG_FSW_FLOW(tp, "add err %d\n", err);
}

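/*
 * Tear down the flowswitch flow added by tcp_add_fsw_flow(), if one was
 * recorded, and clear the saved UUIDs. Deletion may legitimately fail if
 * the nexus has already detached.
 */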
void
tcp_del_fsw_flow(struct tcpcb *tp)
{
    if (uuid_is_null(tp->t_fsw_uuid) || uuid_is_null(tp->t_flow_uuid)) {
        return;
    }

    struct nx_flow_req nfr;
    uuid_copy(nfr.nfr_flow_uuid, tp->t_flow_uuid);

    /* It's possible for this call to fail if the nexus has detached */
    int err = kern_nexus_flow_del(kern_nexus_shared_controller(),
        tp->t_fsw_uuid, &nfr, sizeof(nfr));
    VERIFY(err == 0 || err == ENOENT || err == ENXIO);

    uuid_clear(tp->t_fsw_uuid);
    uuid_clear(tp->t_flow_uuid);

    TCP_LOG_FSW_FLOW(tp, "del err %d\n", err);
}

#endif /* SKYWALK */