1 /*
2  * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
61  */
62 /*
63  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64  * support for mandatory and extensible security protections.  This notice
65  * is included in support of clause 2.2 (b) of the Apple Public License,
66  * Version 2.0.
67  */
68 
69 #include "tcp_includes.h"
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/sysctl.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/domain.h>
78 #include <sys/proc.h>
79 #include <sys/kauth.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/protosw.h>
83 #include <sys/random.h>
84 #include <sys/syslog.h>
85 #include <sys/mcache.h>
86 #include <kern/locks.h>
87 #include <kern/zalloc.h>
88 
89 #include <dev/random/randomdev.h>
90 
91 #include <net/route.h>
92 #include <net/if.h>
93 #include <net/content_filter.h>
94 #include <net/ntstat.h>
95 #include <net/multi_layer_pkt_log.h>
96 
97 #define tcp_minmssoverload fring
98 #define _IP_VHL
99 #include <netinet/in.h>
100 #include <netinet/in_systm.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip_icmp.h>
103 #include <netinet/ip6.h>
104 #include <netinet/icmp6.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet6/in6_pcb.h>
107 #include <netinet/in_var.h>
108 #include <netinet/ip_var.h>
109 #include <netinet/icmp_var.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/mptcp_var.h>
112 #include <netinet/tcp.h>
113 #include <netinet/tcp_fsm.h>
114 #include <netinet/tcp_seq.h>
115 #include <netinet/tcp_timer.h>
116 #include <netinet/tcp_var.h>
117 #include <netinet/tcp_cc.h>
118 #include <netinet/tcp_cache.h>
119 #include <kern/thread_call.h>
120 
121 #include <netinet6/tcp6_var.h>
122 #include <netinet/tcpip.h>
123 #include <netinet/tcp_log.h>
124 
125 #include <netinet6/ip6protosw.h>
126 #include <netinet6/esp.h>
127 
128 #if IPSEC
129 #include <netinet6/ipsec.h>
130 #include <netinet6/ipsec6.h>
131 #endif /* IPSEC */
132 
133 #if NECP
134 #include <net/necp.h>
135 #endif /* NECP */
136 
137 #undef tcp_minmssoverload
138 
139 #include <net/sockaddr_utils.h>
140 
141 #include <corecrypto/ccaes.h>
142 #include <libkern/crypto/aes.h>
143 #include <libkern/crypto/md5.h>
144 #include <sys/kdebug.h>
145 #include <mach/sdt.h>
146 #include <pexpert/pexpert.h>
147 #include <mach/mach_time.h>
148 
149 #define DBG_FNC_TCP_CLOSE       NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
150 
151 static tcp_cc tcp_ccgen;
152 
153 extern struct tcptimerlist tcp_timer_list;
154 extern struct tcptailq tcp_tw_tailq;
155 
156 extern int tcp_awdl_rtobase;
157 
158 SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
159     int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");
160 
161 SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
162     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
163     "Default TCP Maximum Segment Size for IPv6");
164 
165 int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int,
166     struct sysctl_req *);
167 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR,
168     0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
169 
170 /* Current count of half-open TFO connections */
171 int     tcp_tfo_halfcnt = 0;
172 
173 /* Maximum backlog of half-open TFO connections */
174 SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
175     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_tfo_backlog, 10,
176     "Backlog queue for half-open TFO connections");
177 
178 SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED,
179     int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER,
180     "Enable TCP Fastopen (RFC 7413)");
181 
182 SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED,
183     uint32_t, tcp_now_init, 0, "Initial tcp now value");
184 
185 SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED,
186     uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in microseconds");
187 
188 /*
189  * Minimum MSS we accept and use. This prevents DoS attacks where
190  * we are forced to a ridiculously low MSS like 20 and send hundreds
191  * of packets instead of one. The effect scales with the available
192  * bandwidth and quickly saturates the CPU and network interface
193  * with packet generation and sending. Set to zero to disable MINMSS
194  * checking. This setting prevents us from sending too small packets.
195  */
196 SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
197     int, tcp_minmss, TCP_MINMSS, "Minimum TCP Maximum Segment Size");
198 
199 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
200     &tcbinfo.ipi_count, 0, "Number of active PCBs");
201 
202 SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED,
203     static int, icmp_may_rst, 1,
204     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
205 
206 int             tcp_do_timestamps = 1;
207 #if (DEVELOPMENT || DEBUG)
208 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_timestamps,
209     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_timestamps, 0, "enable TCP timestamps");
210 #endif /* (DEVELOPMENT || DEBUG) */
211 
212 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
213     int, tcp_TCPTV_MIN, 100, "min rtt value allowed");
214 
215 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
216     int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");
217 
218 SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
219     __private_extern__ int, tcp_use_randomport, 0,
220     "Randomize TCP port numbers");
221 
222 SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
223     __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");
224 
225 #if (DEVELOPMENT || DEBUG)
226 SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
227     CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
228     "Initalize RTT from route cache");
229 #else
230 SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
231     CTLFLAG_RD | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
232     "Initalize RTT from route cache");
233 #endif /* (DEVELOPMENT || DEBUG) */
234 
235 static int tso_debug = 0;
236 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
237     &tso_debug, 0, "TSO verbosity");
238 
239 static int tcp_rxt_seg_max = 1024;
240 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rxt_seg_max, CTLFLAG_RW | CTLFLAG_LOCKED,
241     &tcp_rxt_seg_max, 0, "");
242 
243 static unsigned long tcp_rxt_seg_drop = 0;
244 SYSCTL_ULONG(_net_inet_tcp, OID_AUTO, rxt_seg_drop, CTLFLAG_RD | CTLFLAG_LOCKED,
245     &tcp_rxt_seg_drop, "");
246 
247 static void     tcp_notify(struct inpcb *, int);
248 
249 static KALLOC_TYPE_DEFINE(tcp_bwmeas_zone, struct bwmeas, NET_KT_DEFAULT);
250 KALLOC_TYPE_DEFINE(tcp_reass_zone, struct tseg_qent, NET_KT_DEFAULT);
251 KALLOC_TYPE_DEFINE(tcp_rxt_seg_zone, struct tcp_rxt_seg, NET_KT_DEFAULT);
252 KALLOC_TYPE_DEFINE(tcp_seg_sent_zone, struct tcp_seg_sent, NET_KT_DEFAULT);
253 
254 extern int slowlink_wsize;      /* window correction for slow links */
255 extern int path_mtu_discovery;
256 
257 uint32_t tcp_now_remainder_us = 0;  /* remaining microseconds for tcp_now */
258 
259 static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);
260 
261 #define TCP_BWMEAS_BURST_MINSIZE 6
262 #define TCP_BWMEAS_BURST_MAXSIZE 25
263 
264 /*
265  * Target size of TCP PCB hash tables. Must be a power of two.
266  *
267  * Note that this can be overridden by the kernel environment
268  * variable net.inet.tcp.tcbhashsize
269  */
270 #ifndef TCBHASHSIZE
271 #define TCBHASHSIZE     CONFIG_TCBHASHSIZE
272 #endif
273 
274 __private_extern__ int  tcp_tcbhashsize = TCBHASHSIZE;
275 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
276     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
277 
278 /*
279  * This is the actual shape of what we allocate using the zone
280  * allocator.  Doing it this way allows us to protect both structures
281  * using the same generation count, and also eliminates the overhead
282  * of allocating tcpcbs separately.  By hiding the structure here,
283  * we avoid changing most of the rest of the code (although it needs
284  * to be changed, eventually, for greater efficiency).
285  */
286 #define ALIGNMENT       32
287 struct  inp_tp {
288 	struct  inpcb   inp;
289 	struct  tcpcb   tcb __attribute__((aligned(ALIGNMENT)));
290 };
291 #undef ALIGNMENT
292 
293 static KALLOC_TYPE_DEFINE(tcpcbzone, struct inp_tp, NET_KT_DEFAULT);
294 
295 int  get_inpcb_str_size(void);
296 int  get_tcp_str_size(void);
297 
298 os_log_t tcp_mpkl_log_object = NULL;
299 
300 static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
301 
302 int tcp_notsent_lowat_check(struct socket *so);
303 static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
304     struct if_lim_perf_stat *stat);
305 static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
306     struct if_tcp_ecn_perf_stat *stat);
307 
308 static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */
309 
310 void
311 tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out __sized_by(blk_size), size_t blk_size)
312 {
313 	u_char in[CCAES_BLOCK_SIZE];
314 	int isipv6 = inp->inp_vflag & INP_IPV6;
315 
316 	VERIFY(blk_size == CCAES_BLOCK_SIZE);
317 
318 	bzero(&in[0], CCAES_BLOCK_SIZE);
319 	bzero(&out[0], CCAES_BLOCK_SIZE);
320 
321 	if (isipv6) {
322 		memcpy(in, &inp->in6p_faddr, sizeof(struct in6_addr));
323 	} else {
324 		memcpy(in, &inp->inp_faddr, sizeof(struct in_addr));
325 	}
326 
327 	aes_encrypt_cbc(in, NULL, 1, out, &tfo_ctx);
328 }
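/*
 * Illustrative note (not in the original source): the TFO cookie is a
 * single AES-128 block computed over the peer address, i.e. roughly
 *     cookie = AES128-CBC(tfo_key, pad16(faddr))
 * where pad16() zero-pads the 4-byte IPv4 (or copies the 16-byte IPv6)
 * address to CCAES_BLOCK_SIZE, and the NULL iv argument above is assumed
 * to mean an all-zero IV.
 */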
329 
330 __private_extern__ int
331 tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1,
332     __unused int arg2, struct sysctl_req *req)
333 {
334 	int error = 0;
335 	/*
336 	 * TFO-key is expressed as a string in hex format
337 	 *  +1 to account for the \0 char
338 	 *  +1 because sysctl_io_string() expects a string length but the sysctl command
339 	 *     now includes the terminating \0 in newlen -- see rdar://77205344
340 	 */
341 	char keystring[TCP_FASTOPEN_KEYLEN * 2 + 2];
342 	u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
343 	int i;
344 	size_t ks_len;
345 
346 	/*
347 	 * sysctl_io_string copies keystring into the oldptr of the sysctl_req.
348 	 * Make sure everything is zero, to avoid putting garbage in there or
349 	 * leaking the stack.
350 	 */
351 	bzero(keystring, sizeof(keystring));
352 
353 	error = sysctl_io_string(req, keystring, sizeof(keystring), 0, NULL);
354 	if (error) {
355 		os_log(OS_LOG_DEFAULT,
356 		    "%s: sysctl_io_string() error %d, req->newlen %lu, sizeof(keystring) %lu",
357 		    __func__, error, req->newlen, sizeof(keystring));
358 		goto exit;
359 	}
360 	if (req->newptr == USER_ADDR_NULL) {
361 		goto exit;
362 	}
363 
364 	ks_len = strbuflen(keystring, sizeof(keystring));
365 	if (ks_len != TCP_FASTOPEN_KEYLEN * 2) {
366 		os_log(OS_LOG_DEFAULT,
367 		    "%s: strlen(keystring) %lu != TCP_FASTOPEN_KEYLEN * 2 %u, newlen %lu",
368 		    __func__, ks_len, TCP_FASTOPEN_KEYLEN * 2, req->newlen);
369 		error = EINVAL;
370 		goto exit;
371 	}
372 
373 	for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
374 		/*
375 		 * We jump over the keystring in 8-character (4 byte in hex)
376 		 * steps
377 		 */
378 		if (sscanf(__unsafe_null_terminated_from_indexable(&keystring[i * 8]), "%8x", &key[i]) != 1) {
379 			error = EINVAL;
380 			os_log(OS_LOG_DEFAULT,
381 			    "%s: sscanf() != 1, error EINVAL", __func__);
382 			goto exit;
383 		}
384 	}
385 
386 	aes_encrypt_key128((u_char *)key, &tfo_ctx);
387 
388 exit:
389 	return error;
390 }
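/*
 * Usage sketch (illustrative, example key only): the key must be written
 * as exactly TCP_FASTOPEN_KEYLEN * 2 hex characters, e.g.
 *     sysctl net.inet.tcp.fastopen_key=000102030405060708090a0b0c0d0e0f
 * The OID is write-only (CTLTYPE_STRING | CTLFLAG_WR), so the current key
 * cannot be read back.
 */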
391 
392 int
393 get_inpcb_str_size(void)
394 {
395 	return sizeof(struct inpcb);
396 }
397 
398 int
399 get_tcp_str_size(void)
400 {
401 	return sizeof(struct tcpcb);
402 }
403 
404 static int scale_to_powerof2(int size);
405 
406 /*
407  * This helper routine returns one of the following scaled values of size:
408  * 1. The rounded-down power of two, if size is not a power of two and
409  *    rounding up would overflow.
410  * OR
411  * 2. The rounded-up power of two, if size is not a power of two and
412  *    rounding up does not overflow.
413  * OR
414  * 3. The same value, if size is already a power of two.
415  */
416 static int
417 scale_to_powerof2(int size)
418 {
419 	/* Handle special case of size = 0 */
420 	int ret = size ? size : 1;
421 
422 	if (!powerof2(ret)) {
423 		while (!powerof2(size)) {
424 			/*
425 			 * Clear out least significant
426 			 * set bit till size is left with
427 			 * its highest set bit at which point
428 			 * it is rounded down power of two.
429 			 */
430 			size = size & (size - 1);
431 		}
432 
433 		/* Check for overflow when rounding up */
434 		if (0 == (size << 1)) {
435 			ret = size;
436 		} else {
437 			ret = size << 1;
438 		}
439 	}
440 
441 	return ret;
442 }
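/*
 * Worked examples (illustrative): scale_to_powerof2(1000) rounds down to
 * 512 by clearing low-order bits, then doubles to 1024 since that does
 * not overflow; scale_to_powerof2(512) returns 512 unchanged; and
 * scale_to_powerof2(0) is treated as 1.
 */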
443 
444 /*
445  * Round a floating point value up to the next integer.
446  * E.g. 1.3 rounds up to 2.
447  */
448 uint32_t
449 tcp_ceil(double a)
450 {
451 	double res = (uint32_t) a;
452 	return (uint32_t)(res + (res < a));
453 }
454 
455 uint32_t
456 tcp_round_to(uint32_t val, uint32_t round)
457 {
458 	/*
459 	 * Round up or down based on the midpoint: when rounding to a
460 	 * multiple of 10, 16 rounds to 20 and 14 rounds to 10.
461 	 */
462 	return ((val + (round / 2)) / round) * round;
463 }
464 
465 /*
466  * Round up to the next multiple of base.
467  * E.g. for a base of 64, 65 will become 128,
468  * 2896 will become 2944.
469  */
470 uint32_t
471 tcp_round_up(uint32_t val, uint32_t base)
472 {
473 	if (base == 1 || val % base == 0) {
474 		return val;
475 	}
476 
477 	return ((val + base) / base) * base;
478 }
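/*
 * Worked examples (illustrative): tcp_ceil(1.3) == 2 and tcp_ceil(2.0) == 2;
 * tcp_round_to(16, 10) == 20 but tcp_round_to(14, 10) == 10; and
 * tcp_round_up(65, 64) == 128, tcp_round_up(2896, 64) == 2944, while
 * tcp_round_up(128, 64) == 128 because 128 is already a multiple of 64.
 */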
479 
480 uint32_t
481 ntoh24(u_char *p __sized_by(3))
482 {
483 	uint32_t v;
484 
485 	v  = (uint32_t)(p[0] << 16);
486 	v |= (uint32_t)(p[1] << 8);
487 	v |= (uint32_t)(p[2] << 0);
488 	return v;
489 }
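/*
 * Illustrative example: for p = {0x01, 0x02, 0x03}, ntoh24() returns
 * 0x010203, i.e. the 3 bytes are read in network (big-endian) order.
 */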
490 
491 uint32_t
492 tcp_packets_this_ack(struct tcpcb *tp, uint32_t acked)
493 {
494 	return acked / tp->t_maxseg +
495 	       (((acked % tp->t_maxseg) != 0) ? 1 : 0);
496 }
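/*
 * Illustrative example: this is a ceiling division by t_maxseg, e.g.
 * acked = 3000 bytes with t_maxseg = 1448 counts as 3 packets.
 */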
497 
498 static void
499 tcp_tfo_init(void)
500 {
501 	u_char key[TCP_FASTOPEN_KEYLEN];
502 
503 	read_frandom(key, sizeof(key));
504 	aes_encrypt_key128(key, &tfo_ctx);
505 }
506 
507 static u_char isn_secret[32];
508 
509 /*
510  * Tcp initialization
511  */
512 void
513 tcp_init(struct protosw *pp, struct domain *dp)
514 {
515 #pragma unused(dp)
516 	static int tcp_initialized = 0;
517 	struct inpcbinfo *pcbinfo;
518 
519 	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
520 
521 	if (tcp_initialized) {
522 		return;
523 	}
524 	tcp_initialized = 1;
525 
526 #if DEBUG || DEVELOPMENT
527 	(void) PE_parse_boot_argn("tcp_rxt_seg_max", &tcp_rxt_seg_max,
528 	    sizeof(tcp_rxt_seg_max));
529 #endif /* DEBUG || DEVELOPMENT */
530 
531 	tcp_ccgen = 1;
532 	tcp_keepinit = TCPTV_KEEP_INIT;
533 	tcp_keepidle = TCPTV_KEEP_IDLE;
534 	tcp_keepintvl = TCPTV_KEEPINTVL;
535 	tcp_keepcnt = TCPTV_KEEPCNT;
536 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
537 	tcp_msl = TCPTV_MSL;
538 
539 	microuptime(&tcp_uptime);
540 	read_frandom(&tcp_now, sizeof(tcp_now));
541 
542 	/* Starts tcp internal clock at a random value */
543 	tcp_now = tcp_now & 0x3fffffff;
544 
545 	/* expose initial uptime/now via sysctl for utcp to keep time sync */
546 	tcp_now_init = tcp_now;
547 	tcp_microuptime_init =
548 	    (uint32_t)(tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC));
549 	SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
550 	SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);
551 
552 	tcp_tfo_init();
553 
554 	LIST_INIT(&tcb);
555 	tcbinfo.ipi_listhead = &tcb;
556 
557 	pcbinfo = &tcbinfo;
558 
559 	/*
560 	 * allocate group, lock attributes and lock for tcp pcb mutexes
561 	 */
562 	pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb",
563 	    LCK_GRP_ATTR_NULL);
564 	lck_attr_setdefault(&pcbinfo->ipi_lock_attr);
565 	lck_rw_init(&pcbinfo->ipi_lock, pcbinfo->ipi_lock_grp,
566 	    &pcbinfo->ipi_lock_attr);
567 
568 	if (tcp_tcbhashsize == 0) {
569 		/* Set to default */
570 		tcp_tcbhashsize = 512;
571 	}
572 
573 	if (!powerof2(tcp_tcbhashsize)) {
574 		int old_hash_size = tcp_tcbhashsize;
575 		tcp_tcbhashsize = scale_to_powerof2(tcp_tcbhashsize);
576 		/* Lower limit of 16  */
577 		if (tcp_tcbhashsize < 16) {
578 			tcp_tcbhashsize = 16;
579 		}
580 		printf("WARNING: TCB hash size not a power of 2, "
581 		    "scaled from %d to %d.\n",
582 		    old_hash_size,
583 		    tcp_tcbhashsize);
584 	}
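	/*
	 * Illustrative example: a tcbhashsize of 1000 set via the kernel
	 * environment would be scaled to 1024 here, and any value that
	 * scales below 16 is clamped to 16.
	 */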
585 
586 	hashinit_counted_by(tcp_tcbhashsize, tcbinfo.ipi_hashbase,
587 	    tcbinfo.ipi_hashbase_count);
588 	tcbinfo.ipi_hashmask = tcbinfo.ipi_hashbase_count - 1;
589 	hashinit_counted_by(tcp_tcbhashsize, tcbinfo.ipi_porthashbase,
590 	    tcbinfo.ipi_porthashbase_count);
591 	tcbinfo.ipi_porthashmask = tcbinfo.ipi_porthashbase_count - 1;
592 	tcbinfo.ipi_zone = tcpcbzone;
593 
594 	tcbinfo.ipi_gc = tcp_gc;
595 	tcbinfo.ipi_timer = tcp_itimer;
596 	in_pcbinfo_attach(&tcbinfo);
597 
598 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
599 	if (max_protohdr < TCP_MINPROTOHDR) {
600 		max_protohdr = (int)P2ROUNDUP(TCP_MINPROTOHDR, sizeof(uint32_t));
601 	}
602 	if (max_linkhdr + max_protohdr > MCLBYTES) {
603 		panic("tcp_init");
604 	}
605 #undef TCP_MINPROTOHDR
606 
607 	/* Initialize time wait and timer lists */
608 	TAILQ_INIT(&tcp_tw_tailq);
609 
610 	bzero(&tcp_timer_list, sizeof(tcp_timer_list));
611 	LIST_INIT(&tcp_timer_list.lhead);
612 	/*
613 	 * allocate group and attribute for the tcp timer list
614 	 */
615 	tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist",
616 	    LCK_GRP_ATTR_NULL);
617 	lck_mtx_init(&tcp_timer_list.mtx, tcp_timer_list.mtx_grp,
618 	    LCK_ATTR_NULL);
619 
620 	tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL);
621 	if (tcp_timer_list.call == NULL) {
622 		panic("failed to allocate call entry 1 in tcp_init");
623 	}
624 
625 	/* Initialize TCP Cache */
626 	tcp_cache_init();
627 
628 	tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp");
629 	if (tcp_mpkl_log_object == NULL) {
630 		panic("MPKL_CREATE_LOGOBJECT failed");
631 	}
632 
633 	if (PE_parse_boot_argn("tcp_log", &tcp_log_enable_flags, sizeof(tcp_log_enable_flags))) {
634 		os_log(OS_LOG_DEFAULT, "tcp_init: set tcp_log_enable_flags to 0x%x", tcp_log_enable_flags);
635 	}
636 
637 	if (PE_parse_boot_argn("tcp_link_heuristics", &tcp_link_heuristics_flags, sizeof(tcp_link_heuristics_flags))) {
638 		os_log(OS_LOG_DEFAULT, "tcp_init: set tcp_link_heuristics_flags to 0x%x", tcp_link_heuristics_flags);
639 	}
640 
641 	/*
642 	 * If more than 4GB of actual memory is available, increase the
643 	 * maximum allowed receive and send socket buffer size.
644 	 */
645 	if (mem_actual >= (1ULL << (GBSHIFT + 2))) {
646 		if (serverperfmode) {
647 			tcp_autorcvbuf_max = 8 * 1024 * 1024;
648 			tcp_autosndbuf_max = 8 * 1024 * 1024;
649 		} else {
650 			tcp_autorcvbuf_max = 4 * 1024 * 1024;
651 			tcp_autosndbuf_max = 4 * 1024 * 1024;
652 		}
653 
654 		SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
655 		SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
656 	}
657 
658 	/* Initialize the TCP CCA array */
659 	tcp_cc_init();
660 
661 	read_frandom(&isn_secret, sizeof(isn_secret));
662 }
663 
664 /*
665  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
666  * tcp_template used to store this data in mbufs, but we now recopy it out
667  * of the tcpcb each time to conserve mbufs.
668  */
669 void
670 tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr)
671 {
672 	struct inpcb *inp = tp->t_inpcb;
673 	struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
674 
675 	if ((inp->inp_vflag & INP_IPV6) != 0) {
676 		struct ip6_hdr *ip6;
677 
678 		ip6 = (struct ip6_hdr *)ip_ptr;
679 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
680 		    (inp->inp_flow & IPV6_FLOWINFO_MASK);
681 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
682 		    (IPV6_VERSION & IPV6_VERSION_MASK);
683 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
684 		ip6->ip6_nxt = IPPROTO_TCP;
685 		ip6->ip6_hlim = 0;
686 		ip6->ip6_src = inp->in6p_laddr;
687 		ip6->ip6_dst = inp->in6p_faddr;
688 		if (m->m_flags & M_PKTHDR) {
689 			uint32_t lifscope = inp->inp_lifscope != 0 ? inp->inp_lifscope : inp->inp_fifscope;
690 			uint32_t fifscope = inp->inp_fifscope != 0 ? inp->inp_fifscope : inp->inp_lifscope;
691 			ip6_output_setsrcifscope(m, lifscope, NULL);
692 			ip6_output_setdstifscope(m, fifscope, NULL);
693 		}
694 		tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
695 		    htonl(sizeof(struct tcphdr) + IPPROTO_TCP));
696 	} else {
697 		struct ip *ip = (struct ip *) ip_ptr;
698 
699 		ip->ip_vhl = IP_VHL_BORING;
700 		ip->ip_tos = 0;
701 		ip->ip_len = 0;
702 		ip->ip_id = 0;
703 		ip->ip_off = 0;
704 		ip->ip_ttl = 0;
705 		ip->ip_sum = 0;
706 		ip->ip_p = IPPROTO_TCP;
707 		ip->ip_src = inp->inp_laddr;
708 		ip->ip_dst = inp->inp_faddr;
709 		tcp_hdr->th_sum =
710 		    in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
711 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
712 	}
713 
714 	tcp_hdr->th_sport = inp->inp_lport;
715 	tcp_hdr->th_dport = inp->inp_fport;
716 	tcp_hdr->th_seq = 0;
717 	tcp_hdr->th_ack = 0;
718 	tcp_hdr->th_x2 = 0;
719 	tcp_hdr->th_off = 5;
720 	tcp_hdr->th_flags = 0;
721 	tcp_hdr->th_win = 0;
722 	tcp_hdr->th_urp = 0;
723 }
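/*
 * Illustrative note (not in the original source): th_sum above is seeded
 * with the pseudo-header checksum only (addresses, protocol, and a fixed
 * TCP length of sizeof(struct tcphdr)), so a later checksum pass over the
 * actual segment can finish the computation.
 */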
724 
725 /*
726  * Create template to be used to send tcp packets on a connection.
727  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
728  * use for this function is in keepalives, which use tcp_respond.
729  */
730 struct tcptemp *
731 tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp)
732 {
733 	struct mbuf *m;
734 	struct tcptemp *n;
735 
736 	*mp = m = m_get(M_DONTWAIT, MT_HEADER);
737 	if (m == NULL) {
738 		return NULL;
739 	}
740 	m->m_len = sizeof(struct tcptemp);
741 	n = mtod(m, struct tcptemp *);
742 
743 	tcp_fillheaders(m, tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
744 	return n;
745 }
746 
747 /*
748  * Send a single message to the TCP at address specified by
749  * the given TCP/IP header.  If m == 0, then we make a copy
750  * of the tcpiphdr at ti and send directly to the addressed host.
751  * This is used to force keep alive messages out using the TCP
752  * template for a connection.  If flags are given then we send
753  * a message back to the TCP which originated the segment ti,
754  * and discard the mbuf containing it and any other attached mbufs.
755  *
756  * In any case the ack and sequence number of the transmitted
757  * segment are as specified by the parameters.
758  *
759  * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
760  */
761 void
762 tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_size __unused, struct tcphdr *th, struct mbuf *m,
763     tcp_seq ack, tcp_seq seq, uint8_t flags, struct tcp_respond_args *tra)
764 {
765 	uint16_t tlen;
766 	int win = 0;
767 	struct route *ro = 0;
768 	struct route sro;
769 	struct ip *ip;
770 	struct tcphdr *nth;
771 	struct route_in6 *ro6 = 0;
772 	struct route_in6 sro6;
773 	struct ip6_hdr *ip6;
774 	int isipv6;
775 	struct ifnet *outif;
776 	int sotc = SO_TC_UNSPEC;
777 	bool check_qos_marking_again = FALSE;
778 	uint32_t sifscope = IFSCOPE_NONE, fifscope = IFSCOPE_NONE;
779 
780 	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
781 	ip6 = ipgen;
782 	ip = ipgen;
783 
784 	if (tp) {
785 		check_qos_marking_again = tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE ? FALSE : TRUE;
786 		sifscope = tp->t_inpcb->inp_lifscope;
787 		fifscope = tp->t_inpcb->inp_fifscope;
788 		if (!(flags & TH_RST)) {
789 			win = tcp_sbspace(tp);
790 			if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) {
791 				win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
792 			}
793 		}
794 		if (isipv6) {
795 			ro6 = &tp->t_inpcb->in6p_route;
796 		} else {
797 			ro = &tp->t_inpcb->inp_route;
798 		}
799 	} else {
800 		if (isipv6) {
801 			ro6 = &sro6;
802 			bzero(ro6, sizeof(*ro6));
803 		} else {
804 			ro = &sro;
805 			bzero(ro, sizeof(*ro));
806 		}
807 	}
808 	if (m == 0) {
809 		m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
810 		if (m == NULL) {
811 			return;
812 		}
813 		tlen = 0;
814 		m->m_data += max_linkhdr;
815 		if (isipv6) {
816 			VERIFY((MHLEN - max_linkhdr) >=
817 			    (sizeof(*ip6) + sizeof(*nth)));
818 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
819 			    sizeof(struct ip6_hdr));
820 			ip6 = mtod(m, struct ip6_hdr *);
821 			nth = (struct tcphdr *)(void *)(ip6 + 1);
822 		} else {
823 			VERIFY((MHLEN - max_linkhdr) >=
824 			    (sizeof(*ip) + sizeof(*nth)));
825 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
826 			ip = mtod(m, struct ip *);
827 			nth = (struct tcphdr *)(void *)(ip + 1);
828 		}
829 		bcopy(th, nth, sizeof(struct tcphdr));
830 #if MPTCP
831 		if ((tp) && (tp->t_mpflags & TMPF_RESET)) {
832 			flags = (TH_RST | TH_ACK);
833 		} else
834 #endif
835 		flags = TH_ACK;
836 	} else {
837 		m_freem(m->m_next);
838 		m->m_next = 0;
839 		m->m_data = (uintptr_t)ipgen;
840 		/* m_len is set later */
841 		tlen = 0;
842 #define xchg(a, b, type) { type t; t = a; a = b; b = t; }
843 		if (isipv6) {
844 			ip6_getsrcifaddr_info(m, &sifscope, NULL);
845 			ip6_getdstifaddr_info(m, &fifscope, NULL);
846 			if (!in6_embedded_scope) {
847 				m->m_pkthdr.pkt_flags &= ~PKTF_IFAINFO;
848 			}
849 			/* Expect 32-bit aligned IP on strict-align platforms */
850 			IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
851 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
852 			nth = (struct tcphdr *)(void *)(ip6 + 1);
853 		} else {
854 			/* Expect 32-bit aligned IP on strict-align platforms */
855 			IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
856 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
857 			nth = (struct tcphdr *)(void *)(ip + 1);
858 		}
859 		if (th != nth) {
860 			/*
861 			 * this is usually the case when an extension header
862 			 * exists between the IPv6 header and the
863 			 * TCP header.
864 			 */
865 			nth->th_sport = th->th_sport;
866 			nth->th_dport = th->th_dport;
867 		}
868 		xchg(nth->th_dport, nth->th_sport, n_short);
869 #undef xchg
870 	}
871 	if (isipv6) {
872 		ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
873 		    tlen));
874 		tlen += sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
875 		ip6_output_setsrcifscope(m, sifscope, NULL);
876 		ip6_output_setdstifscope(m, fifscope, NULL);
877 	} else {
878 		tlen += sizeof(struct tcpiphdr);
879 		ip->ip_len = tlen;
880 		ip->ip_ttl = (uint8_t)ip_defttl;
881 	}
882 	m->m_len = tlen;
883 	m->m_pkthdr.len = tlen;
884 	m->m_pkthdr.rcvif = 0;
885 	if (tra->keep_alive) {
886 		m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE;
887 	}
888 
889 	nth->th_seq = htonl(seq);
890 	nth->th_ack = htonl(ack);
891 	nth->th_x2 = 0;
892 	nth->th_off = sizeof(struct tcphdr) >> 2;
893 	nth->th_flags = flags;
894 	if (tp) {
895 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
896 	} else {
897 		nth->th_win = htons((u_short)win);
898 	}
899 	nth->th_urp = 0;
900 	if (isipv6) {
901 		nth->th_sum = 0;
902 		nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
903 		    htonl((tlen - sizeof(struct ip6_hdr)) + IPPROTO_TCP));
904 		m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
905 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
906 		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
907 		    ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
908 	} else {
909 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
910 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
911 		m->m_pkthdr.csum_flags = CSUM_TCP;
912 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
913 	}
914 #if NECP
915 	necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0, 0, 0);
916 #endif /* NECP */
917 
918 #if IPSEC
919 	if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
920 	    ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
921 		m_freem(m);
922 		return;
923 	}
924 #endif
925 
926 	if (tp != NULL) {
927 		u_int32_t svc_flags = 0;
928 		if (isipv6) {
929 			svc_flags |= PKT_SCF_IPV6;
930 		}
931 		sotc = tp->t_inpcb->inp_socket->so_traffic_class;
932 		if ((flags & TH_RST) == 0) {
933 			set_packet_service_class(m, tp->t_inpcb->inp_socket,
934 			    sotc, svc_flags);
935 		} else {
936 			m_set_service_class(m, MBUF_SC_BK_SYS);
937 		}
938 
939 		/* Embed flowhash and flow control flags */
940 		m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
941 		m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
942 		m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
943 		m->m_pkthdr.pkt_proto = IPPROTO_TCP;
944 		m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
945 		m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;
946 
947 		if (flags & TH_RST) {
948 			m->m_pkthdr.comp_gencnt = tp->t_comp_ack_gencnt;
949 		}
950 	} else {
951 		if (flags & TH_RST) {
952 			m->m_pkthdr.comp_gencnt = TCP_ACK_COMPRESSION_DUMMY;
953 			m_set_service_class(m, MBUF_SC_BK_SYS);
954 		}
955 	}
956 
957 	if (isipv6) {
958 		struct ip6_out_args ip6oa;
959 		bzero(&ip6oa, sizeof(ip6oa));
960 		ip6oa.ip6oa_boundif = tra->ifscope;
961 		ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
962 		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
963 		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
964 
965 		if (tra->ifscope != IFSCOPE_NONE) {
966 			ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
967 		}
968 		if (tra->nocell) {
969 			ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
970 		}
971 		if (tra->noexpensive) {
972 			ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
973 		}
974 		if (tra->noconstrained) {
975 			ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
976 		}
977 		if (tra->awdl_unrestricted) {
978 			ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
979 		}
980 		if (tra->intcoproc_allowed) {
981 			ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
982 		}
983 		if (tra->management_allowed) {
984 			ip6oa.ip6oa_flags |= IP6OAF_MANAGEMENT_ALLOWED;
985 		}
986 		if (tra->ultra_constrained_allowed) {
987 			ip6oa.ip6oa_flags |= IP6OAF_ULTRA_CONSTRAINED_ALLOWED;
988 		}
989 		ip6oa.ip6oa_sotc = sotc;
990 		if (tp != NULL) {
991 			if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
992 				ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
993 			}
994 			ip6oa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
995 			if (check_qos_marking_again) {
996 				ip6oa.ip6oa_flags |= IP6OAF_REDO_QOSMARKING_POLICY;
997 			}
998 			ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
999 		}
1000 		(void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
1001 		    NULL, &ip6oa);
1002 
1003 		if (check_qos_marking_again) {
1004 			struct inpcb *inp = tp->t_inpcb;
1005 			inp->inp_policyresult.results.qos_marking_gencount = ip6oa.qos_marking_gencount;
1006 			if (ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED) {
1007 				inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
1008 			} else {
1009 				inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
1010 			}
1011 		}
1012 
1013 		if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
1014 		    (outif = ro6->ro_rt->rt_ifp) !=
1015 		    tp->t_inpcb->in6p_last_outifp) {
1016 			tp->t_inpcb->in6p_last_outifp = outif;
1017 #if SKYWALK
1018 			if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
1019 				netns_set_ifnet(&tp->t_inpcb->inp_netns_token,
1020 				    tp->t_inpcb->in6p_last_outifp);
1021 			}
1022 #endif /* SKYWALK */
1023 		}
1024 
1025 		if (ro6 == &sro6) {
1026 			ROUTE_RELEASE(ro6);
1027 		}
1028 	} else {
1029 		struct ip_out_args ipoa;
1030 		bzero(&ipoa, sizeof(ipoa));
1031 		ipoa.ipoa_boundif = tra->ifscope;
1032 		ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
1033 		ipoa.ipoa_sotc = SO_TC_UNSPEC;
1034 		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1035 
1036 		if (tra->ifscope != IFSCOPE_NONE) {
1037 			ipoa.ipoa_flags |= IPOAF_BOUND_IF;
1038 		}
1039 		if (tra->nocell) {
1040 			ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
1041 		}
1042 		if (tra->noexpensive) {
1043 			ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
1044 		}
1045 		if (tra->noconstrained) {
1046 			ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
1047 		}
1048 		if (tra->awdl_unrestricted) {
1049 			ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
1050 		}
1051 		if (tra->management_allowed) {
1052 			ipoa.ipoa_flags |= IPOAF_MANAGEMENT_ALLOWED;
1053 		}
1054 		ipoa.ipoa_sotc = sotc;
1055 		if (tp != NULL) {
1056 			if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
1057 				ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1058 			}
1059 			if (!(tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE)) {
1060 				ipoa.ipoa_flags |= IPOAF_REDO_QOSMARKING_POLICY;
1061 			}
1062 			ipoa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
1063 			ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
1064 		}
1065 		if (ro != &sro) {
1066 			/* Copy the cached route and take an extra reference */
1067 			inp_route_copyout(tp->t_inpcb, &sro);
1068 		}
1069 		/*
1070 		 * For consistency, pass a local route copy.
1071 		 */
1072 		(void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
1073 
1074 		if (check_qos_marking_again) {
1075 			struct inpcb *inp = tp->t_inpcb;
1076 			inp->inp_policyresult.results.qos_marking_gencount = ipoa.qos_marking_gencount;
1077 			if (ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED) {
1078 				inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
1079 			} else {
1080 				inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
1081 			}
1082 		}
1083 		if (tp != NULL && sro.ro_rt != NULL &&
1084 		    (outif = sro.ro_rt->rt_ifp) !=
1085 		    tp->t_inpcb->inp_last_outifp) {
1086 			tp->t_inpcb->inp_last_outifp = outif;
1087 #if SKYWALK
1088 			if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
1089 				netns_set_ifnet(&tp->t_inpcb->inp_netns_token, outif);
1090 			}
1091 #endif /* SKYWALK */
1092 		}
1093 		if (ro != &sro) {
1094 			/* Synchronize cached PCB route */
1095 			inp_route_copyin(tp->t_inpcb, &sro);
1096 		} else {
1097 			ROUTE_RELEASE(&sro);
1098 		}
1099 	}
1100 }
1101 
1102 /*
1103  * Create a new TCP control block, making an
1104  * empty reassembly queue and hooking it to the argument
1105  * protocol control block.  The `inp' parameter must have
1106  * come from the zone allocator set up in tcp_init().
1107  */
1108 struct tcpcb *
1109 tcp_newtcpcb(struct inpcb *inp)
1110 {
1111 	struct inp_tp *it;
1112 	struct tcpcb *tp;
1113 	struct socket *so = inp->inp_socket;
1114 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1115 	uint32_t random_32;
1116 
1117 	calculate_tcp_clock();
1118 
1119 	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
1120 		it = (struct inp_tp *)(void *)inp;
1121 		tp = &it->tcb;
1122 	} else {
1123 		tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
1124 	}
1125 
1126 	bzero((char *) tp, sizeof(struct tcpcb));
1127 	LIST_INIT(&tp->t_segq);
1128 	tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
1129 
1130 	tp->t_flags = TF_REQ_SCALE | (tcp_do_timestamps ? TF_REQ_TSTMP : 0);
1131 	tp->t_flagsext |= TF_SACK_ENABLE;
1132 
1133 	if (tcp_rack) {
1134 		tp->t_flagsext |= TF_RACK_ENABLED;
1135 	}
1136 
1137 	TAILQ_INIT(&tp->snd_holes);
1138 	SLIST_INIT(&tp->t_rxt_segments);
1139 	TAILQ_INIT(&tp->t_segs_sent);
1140 	RB_INIT(&tp->t_segs_sent_tree);
1141 	TAILQ_INIT(&tp->t_segs_acked);
1142 	TAILQ_INIT(&tp->seg_pool.free_segs);
1143 	SLIST_INIT(&tp->t_notify_ack);
1144 	tp->t_inpcb = inp;
1145 	/*
1146 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
1147 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
1148 	 * reasonable initial retransmit time.
1149 	 * a reasonable initial retransmit time.
1150 	tp->t_srtt = TCPTV_SRTTBASE;
1151 	tp->t_rttvar =
1152 	    ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1153 	tp->t_rttmin = tcp_TCPTV_MIN;
1154 	tp->t_rxtcur = TCPTV_RTOBASE;
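	/*
	 * Illustrative arithmetic: with t_srtt == TCPTV_SRTTBASE == 0, the
	 * rttvar value above is chosen so that the classic formula
	 *     rto = srtt + 4 * rttvar
	 * evaluates to TCPTV_RTOBASE, matching the t_rxtcur just set.
	 */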
1155 
1156 	if (tcp_use_newreno) {
1157 		/* use newreno by default */
1158 		tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
1159 #if (DEVELOPMENT || DEBUG)
1160 	} else if (tcp_use_ledbat) {
1161 		/* use ledbat for testing */
1162 		tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX;
1163 #endif
1164 	} else {
1165 		if (TCP_L4S_ENABLED(tp)) {
1166 			tp->tcp_cc_index = TCP_CC_ALGO_PRAGUE_INDEX;
1167 		} else {
1168 			tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
1169 		}
1170 	}
1171 
1172 	tcp_cc_allocate_state(tp);
1173 
1174 	if (CC_ALGO(tp)->init != NULL) {
1175 		CC_ALGO(tp)->init(tp);
1176 	}
1177 
1178 	/* Initialize rledbat if we are using recv_bg */
1179 	if (tcp_rledbat == 1 && TCP_RECV_BG(inp->inp_socket) &&
1180 	    tcp_cc_rledbat.init != NULL) {
1181 		tcp_cc_rledbat.init(tp);
1182 	}
1183 
1184 	tp->snd_cwnd = tcp_initial_cwnd(tp);
1185 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1186 	tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1187 	tp->t_rcvtime = tcp_now;
1188 	tp->tentry.timer_start = tcp_now;
1189 	tp->rcv_unackwin = tcp_now;
1190 	tp->t_persist_timeout = tcp_max_persist_timeout;
1191 	tp->t_persist_stop = 0;
1192 	tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1193 	tp->t_rexmtthresh = (uint8_t)tcprexmtthresh;
1194 	tp->rack.reo_wnd_multi = 1;
1195 	tp->rfbuf_ts = tcp_now;
1196 	tp->rfbuf_space = tcp_initial_cwnd(tp);
1197 	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
1198 	tp->bytes_lost = tp->bytes_sacked = tp->bytes_retransmitted = 0;
1199 
1200 	/* Enable bandwidth measurement on this connection */
1201 	tp->t_flagsext |= TF_MEASURESNDBW;
1202 	if (tp->t_bwmeas == NULL) {
1203 		tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1204 		if (tp->t_bwmeas == NULL) {
1205 			tp->t_flagsext &= ~TF_MEASURESNDBW;
1206 		}
1207 	}
1208 
1209 	/* Clear time wait tailq entry */
1210 	tp->t_twentry.tqe_next = NULL;
1211 	tp->t_twentry.tqe_prev = NULL;
1212 
1213 	read_frandom(&random_32, sizeof(random_32));
1214 	tp->t_comp_ack_gencnt = random_32;
1215 	if (tp->t_comp_ack_gencnt <= TCP_ACK_COMPRESSION_DUMMY ||
1216 	    tp->t_comp_ack_gencnt > INT_MAX) {
1217 		tp->t_comp_ack_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1;
1218 	}
1219 	tp->t_comp_ack_lastinc = tcp_now;
1220 
1221 	/* Initialize Accurate ECN state */
1222 	tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_disabled;
1223 	tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_disabled;
1224 
1225 	/*
1226 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
1227 	 * because the socket may be bound to an IPv6 wildcard address,
1228 	 * which may match an IPv4-mapped IPv6 address.
1229 	 */
1230 	inp->inp_ip_ttl = (uint8_t)ip_defttl;
1231 	inp->inp_ppcb = (caddr_t)tp;
1232 	return tp;            /* XXX */
1233 }
1234 
1235 /*
1236  * Drop a TCP connection, reporting
1237  * the specified error.  If connection is synchronized,
1238  * then send a RST to peer.
1239  */
1240 struct tcpcb *
1241 tcp_drop(struct tcpcb *tp, int errno)
1242 {
1243 	struct socket *so = tp->t_inpcb->inp_socket;
1244 #if CONFIG_DTRACE
1245 	struct inpcb *inp = tp->t_inpcb;
1246 #endif
1247 
1248 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
1249 		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1250 		    struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1251 		TCP_LOG_STATE(tp, TCPS_CLOSED);
1252 		tp->t_state = TCPS_CLOSED;
1253 		(void) tcp_output(tp);
1254 		tcpstat.tcps_drops++;
1255 	} else {
1256 		tcpstat.tcps_conndrops++;
1257 	}
1258 	if (errno == ETIMEDOUT && tp->t_softerror) {
1259 		errno = tp->t_softerror;
1260 	}
1261 	so->so_error = (u_short)errno;
1262 
1263 	TCP_LOG_CONNECTION_SUMMARY(tp);
1264 
1265 	return tcp_close(tp);
1266 }
1267 
1268 void
1269 tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
1270 {
1271 	uint32_t rtt = rt->rt_rmx.rmx_rtt;
1272 
1273 	TCP_LOG_RTM_RTT(tp, rt);
1274 
1275 	if (rtt != 0 && tcp_init_rtt_from_cache != 0) {
1276 		/*
1277 		 * XXX the lock bit for RTT indicates that the value
1278 		 * is also a minimum value; this is subject to time.
1279 		 */
1280 		if (rt->rt_rmx.rmx_locks & RTV_RTT) {
1281 			tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
1282 		} else {
1283 			tp->t_rttmin = TCPTV_REXMTMIN;
1284 		}
1285 
1286 		tp->t_srtt =
1287 		    rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1288 		tcpstat.tcps_usedrtt++;
1289 
1290 		if (rt->rt_rmx.rmx_rttvar) {
1291 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1292 			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1293 			tcpstat.tcps_usedrttvar++;
1294 		} else {
1295 			/* default variation is +- 1 rtt */
1296 			tp->t_rttvar =
1297 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1298 		}
1299 
1300 		/*
1301 		 * The RTO formula in the route metric case is based on:
1302 		 *     srtt + 4 * rttvar
1303 		 * modulo the min, max and slop
1304 		 */
1305 		TCPT_RANGESET(tp->t_rxtcur,
1306 		    TCP_REXMTVAL(tp),
1307 		    tp->t_rttmin, TCPTV_REXMTMAX,
1308 		    TCP_ADD_REXMTSLOP(tp));
1309 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_srtt == 0 &&
1310 	    tp->t_rxtshift == 0) {
1311 		struct ifnet *ifp = rt->rt_ifp;
1312 
1313 		if (ifp != NULL && (ifp->if_eflags & IFEF_AWDL) != 0) {
1314 			/*
1315 			 * AWDL needs a special value for the default initial retransmission timeout
1316 			 */
1317 			if (tcp_awdl_rtobase > tcp_TCPTV_MIN) {
1318 				tp->t_rttvar = ((tcp_awdl_rtobase - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1319 			} else {
1320 				tp->t_rttvar = ((tcp_TCPTV_MIN - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1321 			}
1322 			TCPT_RANGESET(tp->t_rxtcur,
1323 			    TCP_REXMTVAL(tp),
1324 			    tp->t_rttmin, TCPTV_REXMTMAX,
1325 			    TCP_ADD_REXMTSLOP(tp));
1326 		}
1327 	}
1328 
1329 	TCP_LOG_RTT_INFO(tp);
1330 }
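/*
 * Illustrative example (assuming RTM_RTTUNIT == 1000000, i.e. route RTTs
 * are stored in microseconds, and TCP_RETRANSHZ == 1000): a cached
 * rmx_rtt of 50000 is 50 ms, and the divisors above convert it into the
 * scaled t_srtt / t_rttvar fixed-point representation.
 */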
1331 
1332 static inline void
1333 tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
1334     struct ifnet_stats_per_flow *ifs)
1335 {
1336 	struct inpcb *inp;
1337 	struct socket *so;
1338 	if (tp == NULL || ifs == NULL) {
1339 		return;
1340 	}
1341 
1342 	bzero(ifs, sizeof(*ifs));
1343 	inp = tp->t_inpcb;
1344 	so = inp->inp_socket;
1345 
1346 	ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? 0 : 1;
1347 	ifs->local = (tp->t_flags & TF_LOCAL) ? 1 : 0;
1348 	ifs->connreset = (so->so_error == ECONNRESET) ? 1 : 0;
1349 	ifs->conntimeout = (so->so_error == ETIMEDOUT) ? 1 : 0;
1350 	ifs->ecn_flags = tp->ecn_flags;
1351 	ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
1352 	ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1353 	ifs->rxmitpkts = tp->t_stat.rxmitpkts;
1354 	ifs->rcvoopack = tp->t_rcvoopack;
1355 	ifs->pawsdrop = tp->t_pawsdrop;
1356 	ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
1357 	ifs->reordered_pkts = tp->t_reordered_pkts;
1358 	ifs->dsack_sent = tp->t_dsack_sent;
1359 	ifs->dsack_recvd = tp->t_dsack_recvd;
1360 	ifs->srtt = tp->t_srtt;
1361 	ifs->rttupdated = tp->t_rttupdated;
1362 	ifs->rttvar = tp->t_rttvar;
1363 	ifs->rttmin = get_base_rtt(tp);
1364 	if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > 0) {
1365 		ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
1366 	} else {
1367 		ifs->bw_sndbw_max = 0;
1368 	}
1369 	if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > 0) {
1370 		ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
1371 	} else {
1372 		ifs->bw_rcvbw_max = 0;
1373 	}
1374 	ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
1375 	ifs->txpackets = inp->inp_stat->txpackets;
1376 	ifs->rxpackets = inp->inp_stat->rxpackets;
1377 }
1378 
1379 static inline void
1380 tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
1381     struct if_tcp_ecn_perf_stat *stat)
1382 {
1383 	u_int64_t curval, oldval;
1384 	stat->total_txpkts += ifs->txpackets;
1385 	stat->total_rxpkts += ifs->rxpackets;
1386 	stat->total_rxmitpkts += ifs->rxmitpkts;
1387 	stat->total_oopkts += ifs->rcvoopack;
1388 	stat->total_reorderpkts += (ifs->reordered_pkts +
1389 	    ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);
1390 
1391 	/* Average RTT */
1392 	curval = ifs->srtt >> TCP_RTT_SHIFT;
1393 	if (curval > 0 && ifs->rttupdated >= 16) {
1394 		if (stat->rtt_avg == 0) {
1395 			stat->rtt_avg = curval;
1396 		} else {
1397 			oldval = stat->rtt_avg;
1398 			stat->rtt_avg = ((oldval << 4) - oldval + curval) >> 4;
1399 		}
1400 	}
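	/*
	 * The update above is an EWMA with weight 1/16:
	 * new = (15 * old + cur) / 16, computed as ((old << 4) - old + cur) >> 4.
	 * The same filter is used for the variance below and in
	 * tcp_flow_lim_stats().
	 */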
1401 
1402 	/* RTT variance */
1403 	curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1404 	if (curval > 0 && ifs->rttupdated >= 16) {
1405 		if (stat->rtt_var == 0) {
1406 			stat->rtt_var = curval;
1407 		} else {
1408 			oldval = stat->rtt_var;
1409 			stat->rtt_var =
1410 			    ((oldval << 4) - oldval + curval) >> 4;
1411 		}
1412 	}
1413 
1414 	/* SACK episodes */
1415 	stat->sack_episodes += ifs->sack_recovery_episodes;
1416 	if (ifs->connreset) {
1417 		stat->rst_drop++;
1418 	}
1419 }
1420 
1421 static inline void
1422 tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
1423     struct if_lim_perf_stat *stat)
1424 {
1425 	u_int64_t curval, oldval;
1426 
1427 	stat->lim_total_txpkts += ifs->txpackets;
1428 	stat->lim_total_rxpkts += ifs->rxpackets;
1429 	stat->lim_total_retxpkts += ifs->rxmitpkts;
1430 	stat->lim_total_oopkts += ifs->rcvoopack;
1431 
1432 	if (ifs->bw_sndbw_max > 0) {
1433 		/* convert from bytes per ms to bits per second */
1434 		ifs->bw_sndbw_max *= 8000;
1435 		stat->lim_ul_max_bandwidth = MAX(stat->lim_ul_max_bandwidth,
1436 		    ifs->bw_sndbw_max);
1437 	}
1438 
1439 	if (ifs->bw_rcvbw_max > 0) {
1440 		/* convert from bytes per ms to bits per second */
1441 		ifs->bw_rcvbw_max *= 8000;
1442 		stat->lim_dl_max_bandwidth = MAX(stat->lim_dl_max_bandwidth,
1443 		    ifs->bw_rcvbw_max);
1444 	}
1445 
1446 	/* Average RTT */
1447 	curval = ifs->srtt >> TCP_RTT_SHIFT;
1448 	if (curval > 0 && ifs->rttupdated >= 16) {
1449 		if (stat->lim_rtt_average == 0) {
1450 			stat->lim_rtt_average = curval;
1451 		} else {
1452 			oldval = stat->lim_rtt_average;
1453 			stat->lim_rtt_average =
1454 			    ((oldval << 4) - oldval + curval) >> 4;
1455 		}
1456 	}
1457 
1458 	/* RTT variance */
1459 	curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1460 	if (curval > 0 && ifs->rttupdated >= 16) {
1461 		if (stat->lim_rtt_variance == 0) {
1462 			stat->lim_rtt_variance = curval;
1463 		} else {
1464 			oldval = stat->lim_rtt_variance;
1465 			stat->lim_rtt_variance =
1466 			    ((oldval << 4) - oldval + curval) >> 4;
1467 		}
1468 	}
1469 
1470 	if (stat->lim_rtt_min == 0) {
1471 		stat->lim_rtt_min = ifs->rttmin;
1472 	} else {
1473 		stat->lim_rtt_min = MIN(stat->lim_rtt_min, ifs->rttmin);
1474 	}
1475 
1476 	/* connection timeouts */
1477 	stat->lim_conn_attempts++;
1478 	if (ifs->conntimeout) {
1479 		stat->lim_conn_timeouts++;
1480 	}
1481 
1482 	/* bytes sent using background delay-based algorithms */
1483 	stat->lim_bk_txpkts += ifs->bk_txpackets;
1484 }
1485 
1486 /*
1487  * Close a TCP control block:
1488  *	discard all space held by the tcp
1489  *	discard internet protocol block
1490  *	wake up any sleepers
1491  */
1492 struct tcpcb *
1493 tcp_close(struct tcpcb *tp)
1494 {
1495 	struct inpcb *inp = tp->t_inpcb;
1496 	struct socket *so = inp->inp_socket;
1497 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1498 	struct route *ro;
1499 	struct rtentry *rt;
1500 	int dosavessthresh;
1501 	struct ifnet_stats_per_flow ifs;
1502 
1503 	/* tcp_close was called previously, bail */
1504 	if (inp->inp_ppcb == NULL) {
1505 		return NULL;
1506 	}
1507 
1508 	tcp_del_fsw_flow(tp);
1509 
1510 	tcp_canceltimers(tp);
1511 	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);
1512 
1513 	/*
1514 	 * If another thread for this tcp is currently in ip (indicated by
1515 	 * the TF_SENDINPROG flag), defer the cleanup until after it returns
1516 	 * back to tcp.  This is done to serialize the close until after all
1517 	 * pending output is finished, in order to avoid having the PCB be
1518 	 * detached and the cached route cleaned, only for ip to cache the
1519 	 * route back into the PCB again.  Note that we've cleared all the
1520 	 * timers at this point.  Set TF_CLOSING to indicate to tcp_output()
1521 	 * that it should call us again once it returns from ip; at that
1522 	 * point both flags should be cleared and we can proceed further
1523 	 * with the cleanup.
1524 	 */
1525 	if ((tp->t_flags & TF_CLOSING) ||
1526 	    inp->inp_sndinprog_cnt > 0) {
1527 		tp->t_flags |= TF_CLOSING;
1528 		return NULL;
1529 	}
1530 
1531 	TCP_LOG_CONNECTION_SUMMARY(tp);
1532 
1533 	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1534 	    struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1535 
1536 	ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
1537 	rt = ro->ro_rt;
1538 	if (rt != NULL) {
1539 		RT_LOCK_SPIN(rt);
1540 	}
1541 
1542 	/*
1543 	 * If we got enough samples through the srtt filter,
1544 	 * save the rtt and rttvar in the routing entry.
1545 	 * 'Enough' is arbitrarily defined as 16 samples.
1546 	 * 16 samples is enough for the srtt filter to converge
1547 	 * to within 5% of the correct value; fewer samples and
1548 	 * we could save a very bogus rtt.
1549 	 *
1550 	 * Don't update the default route's characteristics and don't
1551 	 * update anything that the user "locked".
1552 	 */
1553 	if (tp->t_rttupdated >= 16) {
1554 		u_int32_t i = 0;
1555 		bool log_rtt = false;
1556 
1557 		if (isipv6) {
1558 			struct sockaddr_in6 *sin6;
1559 
1560 			if (rt == NULL) {
1561 				goto no_valid_rt;
1562 			}
1563 			sin6 = SIN6(rt_key(rt));
1564 			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1565 				goto no_valid_rt;
1566 			}
1567 		} else if (ROUTE_UNUSABLE(ro) ||
1568 		    SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
1569 			DTRACE_TCP4(state__change, void, NULL,
1570 			    struct inpcb *, inp, struct tcpcb *, tp,
1571 			    int32_t, TCPS_CLOSED);
1572 			TCP_LOG_STATE(tp, TCPS_CLOSED);
1573 			tp->t_state = TCPS_CLOSED;
1574 			goto no_valid_rt;
1575 		}
1576 
1577 		RT_LOCK_ASSERT_HELD(rt);
1578 		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1579 			i = tp->t_srtt *
1580 			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1581 			if (rt->rt_rmx.rmx_rtt && i) {
1582 				/*
1583 				 * filter this update to half the old & half
1584 				 * the new values, converting scale.
1585 				 * See route.h and tcp_var.h for a
1586 				 * description of the scaling constants.
1587 				 */
1588 				rt->rt_rmx.rmx_rtt =
1589 				    (rt->rt_rmx.rmx_rtt + i) / 2;
1590 			} else {
1591 				rt->rt_rmx.rmx_rtt = i;
1592 			}
1593 			tcpstat.tcps_cachedrtt++;
1594 			log_rtt = true;
1595 		}
1596 		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1597 			i = tp->t_rttvar *
1598 			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1599 			if (rt->rt_rmx.rmx_rttvar && i) {
1600 				rt->rt_rmx.rmx_rttvar =
1601 				    (rt->rt_rmx.rmx_rttvar + i) / 2;
1602 			} else {
1603 				rt->rt_rmx.rmx_rttvar = i;
1604 			}
1605 			tcpstat.tcps_cachedrttvar++;
1606 			log_rtt = true;
1607 		}
1608 		if (log_rtt) {
1609 			TCP_LOG_RTM_RTT(tp, rt);
1610 			TCP_LOG_RTT_INFO(tp);
1611 		}
1612 		/*
1613 		 * The old comment here said:
1614 		 * update the pipelimit (ssthresh) if it has been updated
1615 		 * already or if a pipesize was specified & the threshold
1616 		 * got below half the pipesize.  I.e., wait for bad news
1617 		 * before we start updating, then update on both good
1618 		 * and bad news.
1619 		 *
1620 		 * But we want to save the ssthresh even if no pipesize is
1621 		 * specified explicitly in the route, because such
1622 		 * connections still have an implicit pipesize specified
1623 		 * by the global tcp_sendspace.  In the absence of a reliable
1624 		 * way to calculate the pipesize, it will have to do.
1625 		 */
1626 		i = tp->snd_ssthresh;
1627 		if (rt->rt_rmx.rmx_sendpipe != 0) {
1628 			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
1629 		} else {
1630 			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
1631 		}
1632 		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1633 		    i != 0 && rt->rt_rmx.rmx_ssthresh != 0) ||
1634 		    dosavessthresh) {
1635 			/*
1636 			 * convert the limit from user data bytes to
1637 			 * packets then to packet data bytes.
1638 			 */
1639 			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
1640 			if (i < 2) {
1641 				i = 2;
1642 			}
1643 			i *= (u_int32_t)(tp->t_maxseg +
1644 			    (isipv6 ? sizeof(struct ip6_hdr) +
1645 			    sizeof(struct tcphdr) :
1646 			    sizeof(struct tcpiphdr)));
1647 			if (rt->rt_rmx.rmx_ssthresh) {
1648 				rt->rt_rmx.rmx_ssthresh =
1649 				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
1650 			} else {
1651 				rt->rt_rmx.rmx_ssthresh = i;
1652 			}
1653 			tcpstat.tcps_cachedssthresh++;
1654 		}
1655 	}
1656 
1657 	/*
1658 	 * Mark route for deletion if no information is cached.
1659 	 */
1660 	if (rt != NULL && (so->so_flags & SOF_OVERFLOW)) {
1661 		if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1662 		    rt->rt_rmx.rmx_rtt == 0) {
1663 			rt->rt_flags |= RTF_DELCLONE;
1664 		}
1665 	}
1666 
1667 no_valid_rt:
1668 	if (rt != NULL) {
1669 		RT_UNLOCK(rt);
1670 	}
1671 
1672 	/* free the reassembly queue, if any */
1673 	(void) tcp_freeq(tp);
1674 
1675 	/* performance stats per interface */
1676 	tcp_create_ifnet_stats_per_flow(tp, &ifs);
1677 	tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);
1678 
1679 	tcp_free_sackholes(tp);
1680 	tcp_notify_ack_free(tp);
1681 
1682 	inp_decr_sndbytes_allunsent(so, tp->snd_una);
1683 
1684 	if (tp->t_bwmeas != NULL) {
1685 		tcp_bwmeas_free(tp);
1686 	}
1687 	tcp_rxtseg_clean(tp);
1688 	tcp_segs_sent_clean(tp, true);
1689 
1690 	/* Free the packet list */
1691 	if (tp->t_pktlist_head != NULL) {
1692 		m_freem_list(tp->t_pktlist_head);
1693 	}
1694 	TCP_PKTLIST_CLEAR(tp);
1695 
1696 	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1697 		inp->inp_saved_ppcb = (caddr_t) tp;
1698 	}
1699 
1700 	TCP_LOG_STATE(tp, TCPS_CLOSED);
1701 	tp->t_state = TCPS_CLOSED;
1702 
1703 	/*
1704 	 * Issue a wakeup before detaching so that the
1705 	 * wakeup is not missed
1706 	 */
1707 	sodisconnectwakeup(so);
1708 
1709 	/*
1710 	 * Make sure to clear the TCP Keep Alive Offload as it is
1711 	 * ref counted on the interface
1712 	 */
1713 	tcp_clear_keep_alive_offload(so);
1714 
1715 	/*
1716 	 * If this is a socket that does not want to wake up the device
1717 	 * for its traffic, the application might need to know that the
1718 	 * socket is closed, so send a notification.
1719 	 */
1720 	if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
1721 	    inp->inp_state != INPCB_STATE_DEAD &&
1722 	    !(inp->inp_flags2 & INP2_TIMEWAIT)) {
1723 		socket_post_kev_msg_closed(so);
1724 	}
1725 
1726 	if (CC_ALGO(tp)->cleanup != NULL) {
1727 		CC_ALGO(tp)->cleanup(tp);
1728 	}
1729 
1730 	tp->tcp_cc_index = TCP_CC_ALGO_NONE;
1731 
1732 	if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.cleanup != NULL) {
1733 		tcp_cc_rledbat.cleanup(tp);
1734 	}
1735 
1736 	/* Can happen if we close the socket before receiving the third ACK */
1737 	if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
1738 		OSDecrementAtomic(&tcp_tfo_halfcnt);
1739 
1740 		/* Panic if something has gone terribly wrong. */
1741 		VERIFY(tcp_tfo_halfcnt >= 0);
1742 
1743 		tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
1744 	}
1745 
1746 	if (SOCK_CHECK_DOM(so, PF_INET6)) {
1747 		in6_pcbdetach(inp);
1748 	} else {
1749 		in_pcbdetach(inp);
1750 	}
1751 
1752 	/*
1753 	 * Call soisdisconnected after detach because it might unlock the socket
1754 	 */
1755 	soisdisconnected(so);
1756 	tcpstat.tcps_closed++;
1757 	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
1758 	    tcpstat.tcps_closed, 0, 0, 0, 0);
1759 	return NULL;
1760 }
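
/*
 * Worked example for the metric caching above (an illustrative sketch;
 * it assumes the usual definitions RTM_RTTUNIT == 1000000, TCP_RETRANSHZ
 * == 1000 and TCP_RTT_SCALE == 32 -- check route.h and tcp_var.h for the
 * build at hand).  A smoothed RTT of 50ms is kept in t_srtt as
 * 50 * 32 == 1600 ticks; the conversion factor is
 * RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE) == 1000000 / 32000 == 31,
 * so i == 1600 * 31 == 49600 route units (usec).  If the route already
 * caches rmx_rtt == 60000, the half-old/half-new filter above stores
 * (60000 + 49600) / 2 == 54800.
 */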
1761 
1762 int
1763 tcp_freeq(struct tcpcb *tp)
1764 {
1765 	struct tseg_qent *q;
1766 	int rv = 0;
1767 	int count = 0;
1768 
1769 	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1770 		LIST_REMOVE(q, tqe_q);
1771 		tp->t_reassq_mbcnt -= _MSIZE + ((q->tqe_m->m_flags & M_EXT) ?
1772 		    q->tqe_m->m_ext.ext_size : 0);
1773 		m_freem(q->tqe_m);
1774 		zfree(tcp_reass_zone, q);
1775 		rv = 1;
1776 		count++;
1777 	}
1778 	tp->t_reassqlen = 0;
1779 	if (count > 0) {
1780 		OSAddAtomic(-count, &tcp_reass_total_qlen);
1781 	}
1782 	return rv;
1783 }
1784 
1785 
1786 void
1787 tcp_drain(void)
1788 {
1789 	struct inpcb *inp;
1790 	struct tcpcb *tp;
1791 
1792 	if (!lck_rw_try_lock_exclusive(&tcbinfo.ipi_lock)) {
1793 		return;
1794 	}
1795 
1796 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1797 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
1798 		    WNT_STOPUSING) {
1799 			socket_lock(inp->inp_socket, 1);
1800 			if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
1801 			    == WNT_STOPUSING) {
1802 				/* lost a race, try the next one */
1803 				socket_unlock(inp->inp_socket, 1);
1804 				continue;
1805 			}
1806 			tp = intotcpcb(inp);
1807 
1808 			so_drain_extended_bk_idle(inp->inp_socket);
1809 
1810 			socket_unlock(inp->inp_socket, 1);
1811 		}
1812 	}
1813 	lck_rw_done(&tcbinfo.ipi_lock);
1814 }
1815 
1816 /*
1817  * Notify a tcp user of an asynchronous error:
1818  * store the error as a soft error for later retrieval
1819  * (there is no way to select for a soft error today).
1820  *
1821  * Do not wake up the user, since there currently is no mechanism for
1822  * reporting soft errors (yet - a kqueue filter may be added).
1823  */
1824 static void
1825 tcp_notify(struct inpcb *inp, int error)
1826 {
1827 	struct tcpcb *tp;
1828 
1829 	if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) {
1830 		return; /* pcb is gone already */
1831 	}
1832 	tp = (struct tcpcb *)inp->inp_ppcb;
1833 
1834 	VERIFY(tp != NULL);
1835 	/*
1836 	 * Ignore some errors if we are hooked up.
1837 	 * If the connection hasn't completed, has retransmitted several times,
1838 	 * and receives a second error, give up now.  This is better
1839 	 * than waiting a long time to establish a connection that
1840 	 * can never complete.
1841 	 */
1842 	if (tp->t_state == TCPS_ESTABLISHED &&
1843 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
1844 	    error == EHOSTDOWN)) {
1845 		if (inp->inp_route.ro_rt) {
1846 			rtfree(inp->inp_route.ro_rt);
1847 			inp->inp_route.ro_rt = (struct rtentry *)NULL;
1848 		}
1849 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1850 	    tp->t_softerror) {
1851 		tcp_drop(tp, error);
1852 	} else {
1853 		tp->t_softerror = error;
1854 	}
1855 }
1856 
1857 struct bwmeas *
1858 tcp_bwmeas_alloc(struct tcpcb *tp)
1859 {
1860 	struct bwmeas *elm;
1861 	elm = zalloc_flags(tcp_bwmeas_zone, Z_ZERO | Z_WAITOK);
1862 	elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1863 	elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1864 	return elm;
1865 }
1866 
1867 void
1868 tcp_bwmeas_free(struct tcpcb *tp)
1869 {
1870 	zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1871 	tp->t_bwmeas = NULL;
1872 	tp->t_flagsext &= ~(TF_MEASURESNDBW);
1873 }
1874 
1875 int
1876 get_tcp_inp_list(struct inpcb * __single *inp_list __counted_by(n), size_t n, inp_gen_t gencnt)
1877 {
1878 	struct tcpcb *tp;
1879 	struct inpcb *inp;
1880 	int i = 0;
1881 
1882 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1883 		if (i >= n) {
1884 			break;
1885 		}
1886 		if (inp->inp_gencnt <= gencnt &&
1887 		    inp->inp_state != INPCB_STATE_DEAD) {
1888 			inp_list[i++] = inp;
1889 		}
1890 	}
1891 
1892 	TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
1893 		if (i >= n) {
1894 			break;
1895 		}
1896 		inp = tp->t_inpcb;
1897 		if (inp->inp_gencnt <= gencnt &&
1898 		    inp->inp_state != INPCB_STATE_DEAD) {
1899 			inp_list[i++] = inp;
1900 		}
1901 	}
1902 	return i;
1903 }
1904 
1905 /*
1906  * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1907  * tcpcb_to_otcpcb copies specific bits of a tcpcb into the otcpcb format.
1908  */
1909 static void
1910 tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1911 {
1912 	otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
1913 	otp->t_dupacks = tp->t_dupacks;
1914 	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1915 	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1916 	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1917 	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1918 	otp->t_inpcb =
1919 	    (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRHASH(tp->t_inpcb);
1920 	otp->t_state = tp->t_state;
1921 	otp->t_flags = tp->t_flags;
1922 	otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1923 	otp->snd_una = tp->snd_una;
1924 	otp->snd_max = tp->snd_max;
1925 	otp->snd_nxt = tp->snd_nxt;
1926 	otp->snd_up = tp->snd_up;
1927 	otp->snd_wl1 = tp->snd_wl1;
1928 	otp->snd_wl2 = tp->snd_wl2;
1929 	otp->iss = tp->iss;
1930 	otp->irs = tp->irs;
1931 	otp->rcv_nxt = tp->rcv_nxt;
1932 	otp->rcv_adv = tp->rcv_adv;
1933 	otp->rcv_wnd = tp->rcv_wnd;
1934 	otp->rcv_up = tp->rcv_up;
1935 	otp->snd_wnd = tp->snd_wnd;
1936 	otp->snd_cwnd = tp->snd_cwnd;
1937 	otp->snd_ssthresh = tp->snd_ssthresh;
1938 	otp->t_maxopd = tp->t_maxopd;
1939 	otp->t_rcvtime = tp->t_rcvtime;
1940 	otp->t_starttime = tp->t_starttime;
1941 	otp->t_rtttime = tp->t_rtttime;
1942 	otp->t_rtseq = tp->t_rtseq;
1943 	otp->t_rxtcur = tp->t_rxtcur;
1944 	otp->t_maxseg = tp->t_maxseg;
1945 	otp->t_srtt = tp->t_srtt;
1946 	otp->t_rttvar = tp->t_rttvar;
1947 	otp->t_rxtshift = tp->t_rxtshift;
1948 	otp->t_rttmin = tp->t_rttmin;
1949 	otp->t_rttupdated = tp->t_rttupdated;
1950 	otp->max_sndwnd = tp->max_sndwnd;
1951 	otp->t_softerror = tp->t_softerror;
1952 	otp->t_oobflags = tp->t_oobflags;
1953 	otp->t_iobc = tp->t_iobc;
1954 	otp->snd_scale = tp->snd_scale;
1955 	otp->rcv_scale = tp->rcv_scale;
1956 	otp->request_r_scale = tp->request_r_scale;
1957 	otp->requested_s_scale = tp->requested_s_scale;
1958 	otp->ts_recent = tp->ts_recent;
1959 	otp->ts_recent_age = tp->ts_recent_age;
1960 	otp->last_ack_sent = tp->last_ack_sent;
1961 	otp->cc_send = 0;
1962 	otp->cc_recv = 0;
1963 	otp->snd_recover = tp->snd_recover;
1964 	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1965 	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1966 	otp->t_badrxtwin = 0;
1967 }
1968 
1969 static int
1970 tcp_pcblist SYSCTL_HANDLER_ARGS
1971 {
1972 #pragma unused(oidp, arg1, arg2)
1973 	int error, i = 0, n, sz;
1974 	struct inpcb **inp_list;
1975 	inp_gen_t gencnt;
1976 	struct xinpgen xig;
1977 
1978 	/*
1979 	 * The process of preparing the TCB list is too time-consuming and
1980 	 * resource-intensive to perform twice on every request.
1981 	 */
1982 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
1983 	if (req->oldptr == USER_ADDR_NULL) {
1984 		n = tcbinfo.ipi_count;
1985 		req->oldidx = 2 * (sizeof(xig))
1986 		    + (n + n / 8) * sizeof(struct xtcpcb);
1987 		lck_rw_done(&tcbinfo.ipi_lock);
1988 		return 0;
1989 	}
1990 
1991 	if (req->newptr != USER_ADDR_NULL) {
1992 		lck_rw_done(&tcbinfo.ipi_lock);
1993 		return EPERM;
1994 	}
1995 
1996 	/*
1997 	 * OK, now we're committed to doing something.
1998 	 */
1999 	gencnt = tcbinfo.ipi_gencnt;
2000 	sz = n = tcbinfo.ipi_count;
2001 
2002 	bzero(&xig, sizeof(xig));
2003 	xig.xig_len = sizeof(xig);
2004 	xig.xig_count = n;
2005 	xig.xig_gen = gencnt;
2006 	xig.xig_sogen = so_gencnt;
2007 	error = SYSCTL_OUT(req, &xig, sizeof(xig));
2008 	if (error) {
2009 		lck_rw_done(&tcbinfo.ipi_lock);
2010 		return error;
2011 	}
2012 	/*
2013 	 * We are done if there is no pcb
2014 	 */
2015 	if (n == 0) {
2016 		lck_rw_done(&tcbinfo.ipi_lock);
2017 		return 0;
2018 	}
2019 
2020 	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
2021 	if (inp_list == NULL) {
2022 		lck_rw_done(&tcbinfo.ipi_lock);
2023 		return ENOMEM;
2024 	}
2025 
2026 	n = get_tcp_inp_list(inp_list, n, gencnt);
2027 
2028 	error = 0;
2029 	for (i = 0; i < n; i++) {
2030 		struct xtcpcb xt;
2031 		caddr_t inp_ppcb __single;
2032 		struct inpcb *inp;
2033 
2034 		inp = inp_list[i];
2035 
2036 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
2037 			continue;
2038 		}
2039 		socket_lock(inp->inp_socket, 1);
2040 		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2041 			socket_unlock(inp->inp_socket, 1);
2042 			continue;
2043 		}
2044 		if (inp->inp_gencnt > gencnt) {
2045 			socket_unlock(inp->inp_socket, 1);
2046 			continue;
2047 		}
2048 
2049 		bzero(&xt, sizeof(xt));
2050 		xt.xt_len = sizeof(xt);
2051 		/* XXX should avoid extra copy */
2052 		inpcb_to_compat(inp, &xt.xt_inp);
2053 		inp_ppcb = inp->inp_ppcb;
2054 		if (inp_ppcb != NULL) {
2055 			tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb,
2056 			    &xt.xt_tp);
2057 		} else {
2058 			bzero((char *) &xt.xt_tp, sizeof(xt.xt_tp));
2059 		}
2060 		if (inp->inp_socket) {
2061 			sotoxsocket(inp->inp_socket, &xt.xt_socket);
2062 		}
2063 
2064 		socket_unlock(inp->inp_socket, 1);
2065 
2066 		error = SYSCTL_OUT(req, &xt, sizeof(xt));
2067 	}
2068 	if (!error) {
2069 		/*
2070 		 * Give the user an updated idea of our state.
2071 		 * If the generation differs from what we told
2072 		 * her before, she knows that something happened
2073 		 * while we were processing this request, and it
2074 		 * might be necessary to retry.
2075 		 */
2076 		bzero(&xig, sizeof(xig));
2077 		xig.xig_len = sizeof(xig);
2078 		xig.xig_gen = tcbinfo.ipi_gencnt;
2079 		xig.xig_sogen = so_gencnt;
2080 		xig.xig_count = tcbinfo.ipi_count;
2081 		error = SYSCTL_OUT(req, &xig, sizeof(xig));
2082 	}
2083 
2084 	lck_rw_done(&tcbinfo.ipi_lock);
2085 	kfree_type(struct inpcb *, sz, inp_list);
2086 	return error;
2087 }
2088 
2089 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
2090     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2091     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
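
/*
 * Hedged userspace sketch of how this sysctl is typically consumed
 * (compiled outside the kernel; only public interfaces are used, and the
 * helper name is ours).  The first sysctlbyname() call sizes the buffer
 * -- the handler pads its estimate above -- and the second fills it with
 * an xinpgen header, one xtcpcb per connection, and a trailing xinpgen
 * whose xig_gen can be compared with the first to detect races.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int
dump_tcp_pcblist(void)
{
	size_t len = 0;
	char *buf;

	/* Probe for the required buffer size. */
	if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1) {
		return -1;
	}
	if ((buf = malloc(len)) == NULL) {
		return -1;
	}
	/* Fetch the snapshot; xinpgen/xtcpcb records follow in buf. */
	if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == -1) {
		free(buf);
		return -1;
	}
	printf("pcblist snapshot: %zu bytes\n", len);
	free(buf);
	return 0;
}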
2092 
2093 #if XNU_TARGET_OS_OSX
2094 
2095 static void
2096 tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
2097 {
2098 	otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
2099 	otp->t_dupacks = tp->t_dupacks;
2100 	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
2101 	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
2102 	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
2103 	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
2104 	otp->t_state = tp->t_state;
2105 	otp->t_flags = tp->t_flags;
2106 	otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
2107 	otp->snd_una = tp->snd_una;
2108 	otp->snd_max = tp->snd_max;
2109 	otp->snd_nxt = tp->snd_nxt;
2110 	otp->snd_up = tp->snd_up;
2111 	otp->snd_wl1 = tp->snd_wl1;
2112 	otp->snd_wl2 = tp->snd_wl2;
2113 	otp->iss = tp->iss;
2114 	otp->irs = tp->irs;
2115 	otp->rcv_nxt = tp->rcv_nxt;
2116 	otp->rcv_adv = tp->rcv_adv;
2117 	otp->rcv_wnd = tp->rcv_wnd;
2118 	otp->rcv_up = tp->rcv_up;
2119 	otp->snd_wnd = tp->snd_wnd;
2120 	otp->snd_cwnd = tp->snd_cwnd;
2121 	otp->snd_ssthresh = tp->snd_ssthresh;
2122 	otp->t_maxopd = tp->t_maxopd;
2123 	otp->t_rcvtime = tp->t_rcvtime;
2124 	otp->t_starttime = tp->t_starttime;
2125 	otp->t_rtttime = tp->t_rtttime;
2126 	otp->t_rtseq = tp->t_rtseq;
2127 	otp->t_rxtcur = tp->t_rxtcur;
2128 	otp->t_maxseg = tp->t_maxseg;
2129 	otp->t_srtt = tp->t_srtt;
2130 	otp->t_rttvar = tp->t_rttvar;
2131 	otp->t_rxtshift = tp->t_rxtshift;
2132 	otp->t_rttmin = tp->t_rttmin;
2133 	otp->t_rttupdated = tp->t_rttupdated;
2134 	otp->max_sndwnd = tp->max_sndwnd;
2135 	otp->t_softerror = tp->t_softerror;
2136 	otp->t_oobflags = tp->t_oobflags;
2137 	otp->t_iobc = tp->t_iobc;
2138 	otp->snd_scale = tp->snd_scale;
2139 	otp->rcv_scale = tp->rcv_scale;
2140 	otp->request_r_scale = tp->request_r_scale;
2141 	otp->requested_s_scale = tp->requested_s_scale;
2142 	otp->ts_recent = tp->ts_recent;
2143 	otp->ts_recent_age = tp->ts_recent_age;
2144 	otp->last_ack_sent = tp->last_ack_sent;
2145 	otp->cc_send = 0;
2146 	otp->cc_recv = 0;
2147 	otp->snd_recover = tp->snd_recover;
2148 	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
2149 	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
2150 	otp->t_badrxtwin = 0;
2151 }
2152 
2153 
2154 static int
2155 tcp_pcblist64 SYSCTL_HANDLER_ARGS
2156 {
2157 #pragma unused(oidp, arg1, arg2)
2158 	int error, i = 0, n, sz;
2159 	struct inpcb **inp_list;
2160 	inp_gen_t gencnt;
2161 	struct xinpgen xig;
2162 
2163 	/*
2164 	 * The process of preparing the TCB list is too time-consuming and
2165 	 * resource-intensive to perform twice on every request.
2166 	 */
2167 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
2168 	if (req->oldptr == USER_ADDR_NULL) {
2169 		n = tcbinfo.ipi_count;
2170 		req->oldidx = 2 * (sizeof(xig))
2171 		    + (n + n / 8) * sizeof(struct xtcpcb64);
2172 		lck_rw_done(&tcbinfo.ipi_lock);
2173 		return 0;
2174 	}
2175 
2176 	if (req->newptr != USER_ADDR_NULL) {
2177 		lck_rw_done(&tcbinfo.ipi_lock);
2178 		return EPERM;
2179 	}
2180 
2181 	/*
2182 	 * OK, now we're committed to doing something.
2183 	 */
2184 	gencnt = tcbinfo.ipi_gencnt;
2185 	sz = n = tcbinfo.ipi_count;
2186 
2187 	bzero(&xig, sizeof(xig));
2188 	xig.xig_len = sizeof(xig);
2189 	xig.xig_count = n;
2190 	xig.xig_gen = gencnt;
2191 	xig.xig_sogen = so_gencnt;
2192 	error = SYSCTL_OUT(req, &xig, sizeof(xig));
2193 	if (error) {
2194 		lck_rw_done(&tcbinfo.ipi_lock);
2195 		return error;
2196 	}
2197 	/*
2198 	 * We are done if there is no pcb
2199 	 */
2200 	if (n == 0) {
2201 		lck_rw_done(&tcbinfo.ipi_lock);
2202 		return 0;
2203 	}
2204 
2205 	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
2206 	if (inp_list == NULL) {
2207 		lck_rw_done(&tcbinfo.ipi_lock);
2208 		return ENOMEM;
2209 	}
2210 
2211 	n = get_tcp_inp_list(inp_list, n, gencnt);
2212 
2213 	error = 0;
2214 	for (i = 0; i < n; i++) {
2215 		struct xtcpcb64 xt;
2216 		struct inpcb *inp;
2217 
2218 		inp = inp_list[i];
2219 
2220 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
2221 			continue;
2222 		}
2223 		socket_lock(inp->inp_socket, 1);
2224 		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2225 			socket_unlock(inp->inp_socket, 1);
2226 			continue;
2227 		}
2228 		if (inp->inp_gencnt > gencnt) {
2229 			socket_unlock(inp->inp_socket, 1);
2230 			continue;
2231 		}
2232 
2233 		bzero(&xt, sizeof(xt));
2234 		xt.xt_len = sizeof(xt);
2235 		inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
2236 		xt.xt_inpcb.inp_ppcb =
2237 		    (uint64_t)VM_KERNEL_ADDRHASH(inp->inp_ppcb);
2238 		if (inp->inp_ppcb != NULL) {
2239 			tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb,
2240 			    &xt);
2241 		}
2242 		if (inp->inp_socket) {
2243 			sotoxsocket64(inp->inp_socket,
2244 			    &xt.xt_inpcb.xi_socket);
2245 		}
2246 
2247 		socket_unlock(inp->inp_socket, 1);
2248 
2249 		error = SYSCTL_OUT(req, &xt, sizeof(xt));
2250 	}
2251 	if (!error) {
2252 		/*
2253 		 * Give the user an updated idea of our state.
2254 		 * If the generation differs from what we told
2255 		 * her before, she knows that something happened
2256 		 * while we were processing this request, and it
2257 		 * might be necessary to retry.
2258 		 */
2259 		bzero(&xig, sizeof(xig));
2260 		xig.xig_len = sizeof(xig);
2261 		xig.xig_gen = tcbinfo.ipi_gencnt;
2262 		xig.xig_sogen = so_gencnt;
2263 		xig.xig_count = tcbinfo.ipi_count;
2264 		error = SYSCTL_OUT(req, &xig, sizeof(xig));
2265 	}
2266 
2267 	lck_rw_done(&tcbinfo.ipi_lock);
2268 	kfree_type(struct inpcb *, sz, inp_list);
2269 	return error;
2270 }
2271 
2272 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
2273     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2274     tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
2275 
2276 #endif /* XNU_TARGET_OS_OSX */
2277 
2278 static int
2279 tcp_pcblist_n SYSCTL_HANDLER_ARGS
2280 {
2281 #pragma unused(oidp, arg1, arg2)
2282 	int error = 0;
2283 
2284 	error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
2285 
2286 	return error;
2287 }
2288 
2289 
2290 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
2291     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2292     tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
2293 
2294 static int
2295 tcp_progress_probe_enable SYSCTL_HANDLER_ARGS
2296 {
2297 #pragma unused(oidp, arg1, arg2)
2298 
2299 	return ntstat_tcp_progress_enable(req);
2300 }
2301 
2302 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress_enable,
2303     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
2304     tcp_progress_probe_enable, "S", "Enable/disable TCP keepalive probing on the specified link(s)");
2305 
2306 
2307 __private_extern__ void
2308 tcp_get_ports_used(ifnet_t ifp, int protocol, uint32_t flags,
2309     bitstr_t *__counted_by(bitstr_size(IP_PORTRANGE_SIZE)) bitfield)
2310 {
2311 	inpcb_get_ports_used(ifp, protocol, flags, bitfield,
2312 	    &tcbinfo);
2313 }
2314 
2315 __private_extern__ uint32_t
2316 tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
2317 {
2318 	return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
2319 }
2320 
2321 __private_extern__ uint32_t
2322 tcp_find_anypcb_byaddr(struct ifaddr *ifa)
2323 {
2324 #if SKYWALK
2325 	if (netns_is_enabled()) {
2326 		return netns_find_anyres_byaddr(ifa, IPPROTO_TCP);
2327 	} else
2328 #endif /* SKYWALK */
2329 	return inpcb_find_anypcb_byaddr(ifa, &tcbinfo);
2330 }
2331 
2332 static void
2333 tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
2334 {
2335 	struct rtentry *rt = NULL;
2336 	u_short ifscope = IFSCOPE_NONE;
2337 	int mtu;
2338 	struct sockaddr_in icmpsrc = {
2339 		.sin_len = sizeof(struct sockaddr_in),
2340 		.sin_family = AF_INET, .sin_port = 0, .sin_addr = { .s_addr = 0 },
2341 		.sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
2342 	};
2343 	struct icmp *icp = NULL;
2344 
2345 	icp = __container_of(ip, struct icmp, icmp_ip);
2346 	icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
2347 
2348 	/*
2349 	 * MTU discovery:
2350 	 * If we got a needfrag and there is a host route to the
2351 	 * original destination, and the MTU is not locked, then
2352 	 * set the MTU in the route to the suggested new value
2353 	 * (if given) and then notify as usual.  The ULPs will
2354 	 * notice that the MTU has changed and adapt accordingly.
2355 	 * If no new MTU was suggested, then we guess a new one
2356 	 * less than the current value.  If the new MTU is
2357 	 * unreasonably small (defined by sysctl tcp_minmss), then
2358 	 * we reset the MTU to the interface value and enable the
2359 	 * lock bit, indicating that we are no longer doing MTU
2360 	 * discovery.
2361 	 */
2362 	if (ROUTE_UNUSABLE(&(inp->inp_route)) == false) {
2363 		rt = inp->inp_route.ro_rt;
2364 	}
2365 
2366 	/*
2367 	 * icmp6_mtudisc_update scopes the routing lookup
2368 	 * to the incoming interface (delivered in the mbuf
2369 	 * packet header).
2370 	 * That is mostly OK, but for asymmetric networks
2371 	 * it may be an issue.
2372 	 * "Frag needed" or "Packet too big" really communicates
2373 	 * the MTU for the outbound data path.
2374 	 * Take the interface scope from the cached route or
2375 	 * from the last outgoing interface of the inp.
2376 	 */
2377 	if (rt != NULL) {
2378 		ifscope = (rt->rt_ifp != NULL) ?
2379 		    rt->rt_ifp->if_index : IFSCOPE_NONE;
2380 	} else {
2381 		ifscope = (inp->inp_last_outifp != NULL) ?
2382 		    inp->inp_last_outifp->if_index : IFSCOPE_NONE;
2383 	}
2384 
2385 	if ((rt == NULL) ||
2386 	    !(rt->rt_flags & RTF_HOST) ||
2387 	    (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) {
2388 		rt = rtalloc1_scoped(SA(&icmpsrc), 0, RTF_CLONING | RTF_PRCLONING, ifscope);
2389 	} else {
2390 		RT_LOCK(rt);
2391 		rtref(rt);
2392 		RT_UNLOCK(rt);
2393 	}
2394 
2395 	if (rt != NULL) {
2396 		RT_LOCK(rt);
2397 		if ((rt->rt_flags & RTF_HOST) &&
2398 		    !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
2399 			mtu = ntohs(icp->icmp_nextmtu);
2400 			/*
2401 			 * XXX Stock BSD has changed the following
2402 			 * to compare with icp->icmp_ip.ip_len
2403 			 * to converge faster when sent packet
2404 			 * < route's MTU. We may want to adopt
2405 			 * that change.
2406 			 */
2407 			if (mtu == 0) {
2408 				mtu = ip_next_mtu(rt->rt_rmx.
2409 				    rmx_mtu, 1);
2410 			}
2411 #if DEBUG_MTUDISC
2412 			printf("MTU for %s reduced to %d\n",
2413 			    inet_ntop(AF_INET,
2414 			    &icmpsrc.sin_addr, ipv4str,
2415 			    sizeof(ipv4str)), mtu);
2416 #endif
2417 			if (mtu < max(296, (tcp_minmss +
2418 			    sizeof(struct tcpiphdr)))) {
2419 				rt->rt_rmx.rmx_locks |= RTV_MTU;
2420 			} else if (rt->rt_rmx.rmx_mtu > mtu) {
2421 				rt->rt_rmx.rmx_mtu = mtu;
2422 			}
2423 		}
2424 		RT_UNLOCK(rt);
2425 		rtfree(rt);
2426 	}
2427 }
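
/*
 * Illustrative numbers for the clamping above (a sketch, not normative):
 * an ICMP "fragmentation needed" quoting icmp_nextmtu == 1400 against a
 * host route whose rmx_mtu is 1500 simply lowers rmx_mtu to 1400, and
 * tcp_mtudisc() then recomputes the MSS from it.  Had the router proposed
 * something absurd such as 200 bytes, that falls below
 * max(296, tcp_minmss + sizeof(struct tcpiphdr)), so RTV_MTU is locked
 * and path MTU discovery stops for this destination instead of shrinking
 * the segment size indefinitely.
 */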
2428 
2429 void
2430 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip, __unused struct ifnet *ifp)
2431 {
2432 	tcp_seq icmp_tcp_seq;
2433 	struct ipctlparam *ctl_param __single = vip;
2434 	struct ip *ip = NULL;
2435 	struct mbuf *m = NULL;
2436 	struct in_addr faddr;
2437 	struct inpcb *inp;
2438 	struct tcpcb *tp;
2439 	struct tcphdr *th;
2440 	struct icmp *icp;
2441 	size_t off;
2442 #if SKYWALK
2443 	union sockaddr_in_4_6 sock_laddr;
2444 	struct protoctl_ev_val prctl_ev_val;
2445 #endif /* SKYWALK */
2446 	void (*notify)(struct inpcb *, int) = tcp_notify;
2447 
2448 	if (ctl_param != NULL) {
2449 		ip = ctl_param->ipc_icmp_ip;
2450 		icp = ctl_param->ipc_icmp;
2451 		m = ctl_param->ipc_m;
2452 		off = ctl_param->ipc_off;
2453 	} else {
2454 		ip = NULL;
2455 		icp = NULL;
2456 		m = NULL;
2457 		off = 0;
2458 	}
2459 
2460 	faddr = SIN(sa)->sin_addr;
2461 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) {
2462 		return;
2463 	}
2464 
2465 	if ((unsigned)cmd >= PRC_NCMDS) {
2466 		return;
2467 	}
2468 
2469 	/* Source quench is deprecated */
2470 	if (cmd == PRC_QUENCH) {
2471 		return;
2472 	}
2473 
2474 	if (cmd == PRC_MSGSIZE) {
2475 		notify = tcp_mtudisc;
2476 	} else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2477 	    cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
2478 	    cmd == PRC_TIMXCEED_INTRANS) && ip) {
2479 		notify = tcp_drop_syn_sent;
2480 	}
2481 	/*
2482 	 * Hostdead is ugly because it goes linearly through all PCBs.
2483 	 * XXX: We never get this from ICMP, otherwise it makes an
2484 	 * excellent DoS attack on machines with many connections.
2485 	 */
2486 	else if (cmd == PRC_HOSTDEAD) {
2487 		ip = NULL;
2488 	} else if (inetctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
2489 		return;
2490 	}
2491 
2492 #if SKYWALK
2493 	bzero(&prctl_ev_val, sizeof(prctl_ev_val));
2494 	bzero(&sock_laddr, sizeof(sock_laddr));
2495 #endif /* SKYWALK */
2496 
2497 	if (ip == NULL) {
2498 		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
2499 #if SKYWALK
2500 		protoctl_event_enqueue_nwk_wq_entry(ifp, NULL,
2501 		    sa, 0, 0, IPPROTO_TCP, cmd, NULL);
2502 #endif /* SKYWALK */
2503 		return;
2504 	}
2505 
2506 	/* Check if we can safely get the sport, dport and the sequence number from the tcp header. */
2507 	if (m == NULL ||
2508 	    (m->m_len < off + (sizeof(unsigned short) + sizeof(unsigned short) + sizeof(tcp_seq)))) {
2509 		/* Insufficient length */
2510 		return;
2511 	}
2512 
2513 	th = (struct tcphdr*)(void*)(mtod(m, uint8_t*) + off);
2514 	icmp_tcp_seq = ntohl(th->th_seq);
2515 
2516 	inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
2517 	    ip->ip_src, th->th_sport, 0, NULL);
2518 
2519 	if (inp == NULL ||
2520 	    inp->inp_socket == NULL) {
2521 #if SKYWALK
2522 		if (cmd == PRC_MSGSIZE) {
2523 			prctl_ev_val.val = ntohs(icp->icmp_nextmtu);
2524 		}
2525 		prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2526 
2527 		sock_laddr.sin.sin_family = AF_INET;
2528 		sock_laddr.sin.sin_len = sizeof(sock_laddr.sin);
2529 		sock_laddr.sin.sin_addr = ip->ip_src;
2530 
2531 		protoctl_event_enqueue_nwk_wq_entry(ifp,
2532 		    SA(&sock_laddr), sa,
2533 		    th->th_sport, th->th_dport, IPPROTO_TCP,
2534 		    cmd, &prctl_ev_val);
2535 #endif /* SKYWALK */
2536 		return;
2537 	}
2538 
2539 	socket_lock(inp->inp_socket, 1);
2540 	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2541 	    WNT_STOPUSING) {
2542 		socket_unlock(inp->inp_socket, 1);
2543 		return;
2544 	}
2545 
2546 	if (PRC_IS_REDIRECT(cmd)) {
2547 		/* signal EHOSTDOWN, as it flushes the cached route */
2548 		(*notify)(inp, EHOSTDOWN);
2549 	} else {
2550 		tp = intotcpcb(inp);
2551 		if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2552 		    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2553 			if (cmd == PRC_MSGSIZE) {
2554 				tcp_handle_msgsize(ip, inp);
2555 			}
2556 
2557 			(*notify)(inp, inetctlerrmap[cmd]);
2558 		}
2559 	}
2560 	socket_unlock(inp->inp_socket, 1);
2561 }
2562 
2563 void
2564 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
2565 {
2566 	tcp_seq icmp_tcp_seq;
2567 	struct in6_addr *dst;
2568 	void (*notify)(struct inpcb *, int) = tcp_notify;
2569 	struct ip6_hdr *ip6;
2570 	struct mbuf *m;
2571 	struct inpcb *inp;
2572 	struct tcpcb *tp;
2573 	struct icmp6_hdr *icmp6;
2574 	struct ip6ctlparam *ip6cp = NULL;
2575 	const struct sockaddr_in6 *sa6_src = NULL;
2576 	unsigned int mtu;
2577 	unsigned int off;
2578 
2579 	struct tcp_ports {
2580 		uint16_t th_sport;
2581 		uint16_t th_dport;
2582 	} t_ports;
2583 #if SKYWALK
2584 	union sockaddr_in_4_6 sock_laddr;
2585 	struct protoctl_ev_val prctl_ev_val;
2586 #endif /* SKYWALK */
2587 
2588 	if (sa->sa_family != AF_INET6 ||
2589 	    sa->sa_len != sizeof(struct sockaddr_in6)) {
2590 		return;
2591 	}
2592 
2593 	/* Source quench is deprecated */
2594 	if (cmd == PRC_QUENCH) {
2595 		return;
2596 	}
2597 
2598 	if ((unsigned)cmd >= PRC_NCMDS) {
2599 		return;
2600 	}
2601 
2602 	/* if the parameter is from icmp6, decode it. */
2603 	if (d != NULL) {
2604 		ip6cp = (struct ip6ctlparam *)d;
2605 		icmp6 = ip6cp->ip6c_icmp6;
2606 		m = ip6cp->ip6c_m;
2607 		ip6 = ip6cp->ip6c_ip6;
2608 		off = ip6cp->ip6c_off;
2609 		sa6_src = ip6cp->ip6c_src;
2610 		dst = ip6cp->ip6c_finaldst;
2611 	} else {
2612 		m = NULL;
2613 		ip6 = NULL;
2614 		off = 0;        /* fool gcc */
2615 		sa6_src = &sa6_any;
2616 		dst = NULL;
2617 	}
2618 
2619 	if (cmd == PRC_MSGSIZE) {
2620 		notify = tcp_mtudisc;
2621 	} else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2622 	    cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
2623 	    ip6 != NULL) {
2624 		notify = tcp_drop_syn_sent;
2625 	}
2626 	/*
2627 	 * Hostdead is ugly because it goes linearly through all PCBs.
2628 	 * XXX: We never get this from ICMP, otherwise it makes an
2629 	 * excellent DoS attack on machines with many connections.
2630 	 */
2631 	else if (cmd == PRC_HOSTDEAD) {
2632 		ip6 = NULL;
2633 	} else if (inet6ctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
2634 		return;
2635 	}
2636 
2637 #if SKYWALK
2638 	bzero(&prctl_ev_val, sizeof(prctl_ev_val));
2639 	bzero(&sock_laddr, sizeof(sock_laddr));
2640 #endif /* SKYWALK */
2641 
2642 	if (ip6 == NULL) {
2643 		in6_pcbnotify(&tcbinfo, sa, 0, SA(sa6_src), 0, cmd, NULL, notify);
2644 #if SKYWALK
2645 		protoctl_event_enqueue_nwk_wq_entry(ifp, NULL, sa,
2646 		    0, 0, IPPROTO_TCP, cmd, NULL);
2647 #endif /* SKYWALK */
2648 		return;
2649 	}
2650 
2651 	/* Check if we can safely get the ports from the tcp hdr */
2652 	if (m == NULL ||
2653 	    (m->m_pkthdr.len <
2654 	    (int32_t) (off + sizeof(struct tcp_ports)))) {
2655 		return;
2656 	}
2657 	bzero(&t_ports, sizeof(struct tcp_ports));
2658 	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
2659 
2660 	off += sizeof(struct tcp_ports);
2661 	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
2662 		return;
2663 	}
2664 	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
2665 	icmp_tcp_seq = ntohl(icmp_tcp_seq);
2666 
2667 	if (cmd == PRC_MSGSIZE) {
2668 		mtu = ntohl(icmp6->icmp6_mtu);
2669 		/*
2670 		 * If no alternative MTU was proposed, or the proposed
2671 		 * MTU was too small, set to the min.
2672 		 */
2673 		if (mtu < IPV6_MMTU) {
2674 			mtu = IPV6_MMTU - 8;
2675 		}
2676 	}
2677 
2678 	inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, t_ports.th_dport, ip6_input_getdstifscope(m),
2679 	    &ip6->ip6_src, t_ports.th_sport, ip6_input_getsrcifscope(m), 0, NULL);
2680 
2681 	if (inp == NULL ||
2682 	    inp->inp_socket == NULL) {
2683 #if SKYWALK
2684 		if (cmd == PRC_MSGSIZE) {
2685 			prctl_ev_val.val = mtu;
2686 		}
2687 		prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2688 
2689 		sock_laddr.sin6.sin6_family = AF_INET6;
2690 		sock_laddr.sin6.sin6_len = sizeof(sock_laddr.sin6);
2691 		sock_laddr.sin6.sin6_addr = ip6->ip6_src;
2692 
2693 		protoctl_event_enqueue_nwk_wq_entry(ifp,
2694 		    SA(&sock_laddr), sa,
2695 		    t_ports.th_sport, t_ports.th_dport, IPPROTO_TCP,
2696 		    cmd, &prctl_ev_val);
2697 #endif /* SKYWALK */
2698 		return;
2699 	}
2700 
2701 	socket_lock(inp->inp_socket, 1);
2702 	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2703 	    WNT_STOPUSING) {
2704 		socket_unlock(inp->inp_socket, 1);
2705 		return;
2706 	}
2707 
2708 	if (PRC_IS_REDIRECT(cmd)) {
2709 		/* signal EHOSTDOWN, as it flushes the cached route */
2710 		(*notify)(inp, EHOSTDOWN);
2711 	} else {
2712 		tp = intotcpcb(inp);
2713 		if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2714 		    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2715 			if (cmd == PRC_MSGSIZE) {
2716 				/*
2717 				 * Only process the offered MTU if it
2718 				 * is smaller than the current one.
2719 				 */
2720 				if (mtu < tp->t_maxseg +
2721 				    (sizeof(struct tcphdr) + sizeof(struct ip6_hdr))) {
2722 					(*notify)(inp, inetctlerrmap[cmd]);
2723 				}
2724 			} else {
2725 				(*notify)(inp, inetctlerrmap[cmd]);
2726 			}
2727 		}
2728 	}
2729 	socket_unlock(inp->inp_socket, 1);
2730 }
2731 
2732 
2733 /*
2734  * Following is where TCP initial sequence number generation occurs.
2735  *
2736  * There are two places where we must use initial sequence numbers:
2737  * 1.  In SYN-ACK packets.
2738  * 2.  In SYN packets.
2739  *
2740  * The ISNs in SYN-ACK packets have no monotonicity requirement,
2741  * and should be as unpredictable as possible to avoid the possibility
2742  * of spoofing and/or connection hijacking.  To satisfy this
2743  * requirement, SYN-ACK ISNs are generated via the arc4random()
2744  * function.  If exact RFC 1948 compliance is requested via sysctl,
2745  * these ISNs will be generated just like those in SYN packets.
2746  *
2747  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
2748  * depends on this property.  In addition, these ISNs should be
2749  * unguessable so as to prevent connection hijacking.  To satisfy
2750  * the requirements of this situation, the algorithm outlined in
2751  * RFC 9293 is used to generate sequence numbers.
2752  *
2753  * For more information on the theory of operation, please see
2754  * RFC 9293.
2755  *
2756  * Implementation details:
2757  *
2758  * Time is based on the system timer, and is corrected so that it
2759  * increases by one megabyte per second.  This allows for proper
2760  * recycling on high speed LANs while still leaving over an hour
2761  * before rollover.
2762  *
2763  */
2764 
2765 #define ISN_BYTES_PER_SECOND 1048576
2766 
2767 tcp_seq
2768 tcp_new_isn(struct tcpcb *tp)
2769 {
2770 	uint32_t md5_buffer[4];
2771 	tcp_seq new_isn;
2772 	struct timespec timenow;
2773 	MD5_CTX isn_ctx;
2774 
2775 	nanouptime(&timenow);
2776 
2777 	/* Compute the md5 hash and return the ISN. */
2778 	MD5Init(&isn_ctx);
2779 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
2780 	    sizeof(u_short));
2781 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
2782 	    sizeof(u_short));
2783 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
2784 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
2785 		    sizeof(struct in6_addr));
2786 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
2787 		    sizeof(struct in6_addr));
2788 	} else {
2789 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
2790 		    sizeof(struct in_addr));
2791 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
2792 		    sizeof(struct in_addr));
2793 	}
2794 	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
2795 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
2796 
2797 	new_isn = (tcp_seq) md5_buffer[0];
2798 
2799 	/*
2800 	 * We use a 128ns clock, which is equivalent to 600 Mbps and wraps at
2801 	 * 549 seconds, thus safe for 2 MSL lifetime of TIME-WAIT-state.
2802 	 */
2803 	new_isn += (timenow.tv_sec * NSEC_PER_SEC + timenow.tv_nsec) >> 7;
2804 
2805 	if (__probable(tcp_randomize_timestamps)) {
2806 		tp->t_ts_offset = md5_buffer[1];
2807 	}
2808 
2809 	return new_isn;
2810 }
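
/*
 * Arithmetic check on the clock comment above (a sketch, not normative):
 * shifting nanoseconds right by 7 yields one tick per 128ns, so the
 * additive component wraps after 2^32 * 128ns ~= 549.8 seconds --
 * comfortably longer than twice any common MSL (30 to 120 seconds),
 * which is what safe TIME-WAIT recycling requires.
 */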
2811 
2812 
2813 /*
2814  * When a specific ICMP unreachable message is received and the
2815  * connection state is SYN-SENT, drop the connection.  This behavior
2816  * is controlled by the icmp_may_rst sysctl.
2817  */
2818 void
2819 tcp_drop_syn_sent(struct inpcb *inp, int errno)
2820 {
2821 	struct tcpcb *tp = intotcpcb(inp);
2822 
2823 	if (tp && tp->t_state == TCPS_SYN_SENT) {
2824 		tcp_drop(tp, errno);
2825 	}
2826 }
2827 
2828 /*
2829  * Get effective MTU for redirect virtual interface. Redirect
2830  * virtual interface switches between multiple delegated interfaces.
2831  * For cases where redirect forwards packets to an ipsec interface,
2832  * the MTU should be adjusted to account for ESP encapsulation overhead.
2833  */
2834 uint32_t
2835 tcp_get_effective_mtu(struct rtentry *rt, uint32_t current_mtu)
2836 {
2837 	ifnet_t ifp = NULL;
2838 	ifnet_t delegated_ifp = NULL;
2839 	ifnet_t outgoing_ifp = NULL;
2840 	uint32_t min_mtu = 0;
2841 	uint32_t outgoing_mtu = 0;
2842 	uint32_t tunnel_overhead = 0;
2843 
2844 	if (rt == NULL || rt->rt_ifp == NULL) {
2845 		return current_mtu;
2846 	}
2847 
2848 	ifp = rt->rt_ifp;
2849 	if (ifp->if_subfamily != IFNET_SUBFAMILY_REDIRECT) {
2850 		return current_mtu;
2851 	}
2852 
2853 	delegated_ifp = ifp->if_delegated.ifp;
2854 	if (delegated_ifp == NULL || delegated_ifp->if_family != IFNET_FAMILY_IPSEC) {
2855 		return current_mtu;
2856 	}
2857 
2858 	min_mtu = MIN(delegated_ifp->if_mtu, current_mtu);
2859 
2860 	outgoing_ifp = delegated_ifp->if_delegated.ifp;
2861 	if (outgoing_ifp == NULL) {
2862 		return min_mtu;
2863 	}
2864 
2865 	outgoing_mtu = outgoing_ifp->if_mtu;
2866 	if (outgoing_mtu > 0) {
2867 		tunnel_overhead = (u_int32_t)(esp_hdrsiz(NULL) + sizeof(struct ip6_hdr));
2868 		if (outgoing_mtu > tunnel_overhead) {
2869 			outgoing_mtu -= tunnel_overhead;
2870 		}
2871 		if (outgoing_mtu < min_mtu) {
2872 			return outgoing_mtu;
2873 		}
2874 	}
2875 
2876 	return min_mtu;
2877 }
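
/*
 * Illustrative numbers for the redirect/ipsec case above (a sketch; the
 * real ESP overhead depends on the negotiated transforms, so the figure
 * here is hypothetical): with the delegated ipsec interface and the
 * outgoing interface both at 1500, an esp_hdrsiz() estimate of 73 bytes
 * plus the 40-byte outer IPv6 header gives tunnel_overhead == 113 and an
 * outgoing MTU of 1500 - 113 == 1387.  Since 1387 < min(1500,
 * current_mtu), the connection ends up using 1387.
 */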
2878 
2879 /*
2880  * When `need fragmentation' ICMP is received, update our idea of the MSS
2881  * based on the new value in the route.  Also nudge TCP to send something,
2882  * since we know the packet we just sent was dropped.
2883  * This duplicates some code in the tcp_mss() function in tcp_input.c.
2884  */
2885 void
2886 tcp_mtudisc(struct inpcb *inp, __unused int errno)
2887 {
2888 	struct tcpcb *tp = intotcpcb(inp);
2889 	struct rtentry *rt;
2890 	struct socket *so = inp->inp_socket;
2891 	int mss;
2892 	u_int32_t mtu;
2893 	u_int32_t protoHdrOverhead = sizeof(struct tcpiphdr);
2894 	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
2895 
2896 	/*
2897 	 * Nothing left to send after the socket is defunct or TCP is in the closed state
2898 	 */
2899 	if ((so->so_state & SS_DEFUNCT) || (tp != NULL && tp->t_state == TCPS_CLOSED)) {
2900 		return;
2901 	}
2902 
2903 	if (isipv6) {
2904 		protoHdrOverhead = sizeof(struct ip6_hdr) +
2905 		    sizeof(struct tcphdr);
2906 	}
2907 
2908 	if (tp != NULL) {
2909 		if (isipv6) {
2910 			rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2911 		} else {
2912 			rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2913 		}
2914 		if (!rt || !rt->rt_rmx.rmx_mtu) {
2915 			tp->t_maxopd = tp->t_maxseg =
2916 			    isipv6 ? tcp_v6mssdflt :
2917 			    tcp_mssdflt;
2918 
2919 			/* Route locked during lookup above */
2920 			if (rt != NULL) {
2921 				RT_UNLOCK(rt);
2922 			}
2923 			return;
2924 		}
2925 		mtu = rt->rt_rmx.rmx_mtu;
2926 
2927 		mtu = tcp_get_effective_mtu(rt, mtu);
2928 
2929 		/* Route locked during lookup above */
2930 		RT_UNLOCK(rt);
2931 
2932 #if NECP
2933 		// Adjust MTU if necessary.
2934 		mtu = necp_socket_get_effective_mtu(inp, mtu);
2935 #endif /* NECP */
2936 		mss = mtu - protoHdrOverhead;
2937 
2938 		if (tp->t_maxopd) {
2939 			mss = min(mss, tp->t_maxopd);
2940 		}
2941 		/*
2942 		 * XXX - The above conditional probably violates the TCP
2943 		 * spec.  The problem is that, since we don't know the
2944 		 * other end's MSS, we are supposed to use a conservative
2945 		 * default.  But, if we do that, then MTU discovery will
2946 		 * never actually take place, because the conservative
2947 		 * default is much less than the MTUs typically seen
2948 		 * on the Internet today.  For the moment, we'll sweep
2949 		 * this under the carpet.
2950 		 *
2951 		 * The conservative default might not actually be a problem
2952 		 * if the only case this occurs is when sending an initial
2953 		 * SYN with options and data to a host we've never talked
2954 		 * to before.  Then, they will reply with an MSS value which
2955 		 * will get recorded and the new parameters should get
2956 		 * recomputed.  For Further Study.
2957 		 */
2958 		if (tp->t_maxopd <= mss) {
2959 			return;
2960 		}
2961 		tp->t_maxopd = mss;
2962 
2963 		if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
2964 		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) {
2965 			mss -= TCPOLEN_TSTAMP_APPA;
2966 		}
2967 
2968 #if MPTCP
2969 		mss -= mptcp_adj_mss(tp, TRUE);
2970 #endif
2971 		if (so->so_snd.sb_hiwat < mss) {
2972 			mss = so->so_snd.sb_hiwat;
2973 		}
2974 
2975 		tp->t_maxseg = mss;
2976 
2977 		ASSERT(tp->t_maxseg);
2978 
2979 		/*
2980 		 * Reset the slow-start flight size, as it may depend on the
2981 		 * new MSS
2982 		 */
2983 		if (CC_ALGO(tp)->cwnd_init != NULL) {
2984 			CC_ALGO(tp)->cwnd_init(tp);
2985 		}
2986 
2987 		if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) {
2988 			tcp_cc_rledbat.rwnd_init(tp);
2989 		}
2990 
2991 		tcpstat.tcps_mturesent++;
2992 		tp->t_rtttime = 0;
2993 		tp->snd_nxt = tp->snd_una;
2994 		tcp_output(tp);
2995 	}
2996 }
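
/*
 * Minimal sketch of the MSS derivation above (illustrative only; the
 * real path also consults NECP, MPTCP and t_maxopd, and this helper is
 * not part of the original file).
 */
static inline int
sketch_mss_for_mtu(uint32_t mtu, int isipv6, int tstamps)
{
	/* Fixed IP + TCP header overhead: 40 + 20 for v6, 20 + 20 for v4 */
	int mss = (int)mtu - (isipv6 ? 60 : 40);

	if (tstamps) {
		mss -= 12;      /* TCPOLEN_TSTAMP_APPA */
	}
	/* An IPv4 MTU of 1500 yields 1460, or 1448 with timestamps;
	 * IPv6 yields 1440 and 1428 respectively. */
	return mss;
}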
2997 
2998 /*
2999  * Look up the routing entry to the peer of this inpcb.  If no route
3000  * is found and one cannot be allocated, return NULL.  This routine
3001  * is called by TCP routines that access the rmx structure and by tcp_mss
3002  * to get the interface MTU.  If a route is found, this routine will
3003  * hold the rtentry lock; the caller is responsible for unlocking.
3004  */
3005 struct rtentry *
3006 tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope)
3007 {
3008 	struct route *ro;
3009 	struct rtentry *rt;
3010 	struct tcpcb *tp;
3011 
3012 	LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
3013 
3014 	ro = &inp->inp_route;
3015 	if ((rt = ro->ro_rt) != NULL) {
3016 		RT_LOCK(rt);
3017 	}
3018 
3019 	if (ROUTE_UNUSABLE(ro)) {
3020 		if (rt != NULL) {
3021 			RT_UNLOCK(rt);
3022 			rt = NULL;
3023 		}
3024 		ROUTE_RELEASE(ro);
3025 		/* No route yet, so try to acquire one */
3026 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
3027 			unsigned int ifscope;
3028 
3029 			ro->ro_dst.sa_family = AF_INET;
3030 			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
3031 			SIN(&ro->ro_dst)->sin_addr = inp->inp_faddr;
3032 
3033 			/*
3034 			 * If the socket was bound to an interface, then
3035 			 * the bound-to-interface takes precedence over
3036 			 * the inbound interface passed in by the caller
3037 			 * (if we get here as part of the output path then
3038 			 * input_ifscope is IFSCOPE_NONE).
3039 			 */
3040 			ifscope = (inp->inp_flags & INP_BOUND_IF) ?
3041 			    inp->inp_boundifp->if_index : input_ifscope;
3042 
3043 			rtalloc_scoped(ro, ifscope);
3044 			if ((rt = ro->ro_rt) != NULL) {
3045 				RT_LOCK(rt);
3046 			}
3047 		}
3048 	}
3049 	if (rt != NULL) {
3050 		RT_LOCK_ASSERT_HELD(rt);
3051 	}
3052 
3053 	/*
3054 	 * Update MTU discovery determination. Don't do it if:
3055 	 *	1) it is disabled via the sysctl
3056 	 *	2) the route isn't up
3057 	 *	3) the MTU is locked (if it is, then discovery has been
3058 	 *	   disabled)
3059 	 */
3060 
3061 	tp = intotcpcb(inp);
3062 
3063 	if (!path_mtu_discovery || ((rt != NULL) &&
3064 	    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3065 		tp->t_flags &= ~TF_PMTUD;
3066 	} else {
3067 		tp->t_flags |= TF_PMTUD;
3068 	}
3069 
3070 	if (rt != NULL && rt->rt_ifp != NULL) {
3071 		somultipages(inp->inp_socket,
3072 		    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3073 		tcp_set_tso(tp, rt->rt_ifp);
3074 		soif2kcl(inp->inp_socket,
3075 		    (rt->rt_ifp->if_eflags & IFEF_2KCL));
3076 		tcp_set_ecn(tp, rt->rt_ifp);
3077 		if (inp->inp_last_outifp == NULL) {
3078 			inp->inp_last_outifp = rt->rt_ifp;
3079 #if SKYWALK
3080 			if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3081 				netns_set_ifnet(&inp->inp_netns_token,
3082 				    inp->inp_last_outifp);
3083 			}
3084 #endif /* SKYWALK */
3085 		}
3086 	}
3087 
3088 	/* Note if the peer is local */
3089 	if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3090 	    (rt->rt_gateway->sa_family == AF_LINK ||
3091 	    rt->rt_ifp->if_flags & IFF_LOOPBACK ||
3092 	    in_localaddr(inp->inp_faddr))) {
3093 		tp->t_flags |= TF_LOCAL;
3094 	}
3095 
3096 	/*
3097 	 * Caller needs to call RT_UNLOCK(rt).
3098 	 */
3099 	return rt;
3100 }
3101 
3102 struct rtentry *
3103 tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
3104 {
3105 	struct route_in6 *ro6;
3106 	struct rtentry *rt;
3107 	struct tcpcb *tp;
3108 
3109 	LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
3110 
3111 	ro6 = &inp->in6p_route;
3112 	if ((rt = ro6->ro_rt) != NULL) {
3113 		RT_LOCK(rt);
3114 	}
3115 
3116 	if (ROUTE_UNUSABLE(ro6)) {
3117 		if (rt != NULL) {
3118 			RT_UNLOCK(rt);
3119 			rt = NULL;
3120 		}
3121 		ROUTE_RELEASE(ro6);
3122 		/* No route yet, so try to acquire one */
3123 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
3124 			struct sockaddr_in6 *dst6;
3125 			unsigned int ifscope;
3126 
3127 			dst6 = SIN6(&ro6->ro_dst);
3128 			dst6->sin6_family = AF_INET6;
3129 			dst6->sin6_len = sizeof(*dst6);
3130 			dst6->sin6_addr = inp->in6p_faddr;
3131 
3132 			/*
3133 			 * If the socket was bound to an interface, then
3134 			 * the bound-to-interface takes precedence over
3135 			 * the inbound interface passed in by the caller
3136 			 * (if we get here as part of the output path then
3137 			 * input_ifscope is IFSCOPE_NONE).
3138 			 */
3139 			ifscope = (inp->inp_flags & INP_BOUND_IF) ?
3140 			    inp->inp_boundifp->if_index : input_ifscope;
3141 
3142 			rtalloc_scoped((struct route *)ro6, ifscope);
3143 			if ((rt = ro6->ro_rt) != NULL) {
3144 				RT_LOCK(rt);
3145 			}
3146 		}
3147 	}
3148 	if (rt != NULL) {
3149 		RT_LOCK_ASSERT_HELD(rt);
3150 	}
3151 
3160 
3161 	tp = intotcpcb(inp);
3162 
3163 	/*
3164 	 * Update MTU discovery determination. Don't do it if:
3165 	 *	1) it is disabled via the sysctl
3166 	 *	2) the route isn't up
3167 	 *	3) the MTU is locked (if it is, then discovery has been
3168 	 *	   disabled)
3169 	 */
3170 
3171 	if (!path_mtu_discovery || ((rt != NULL) &&
3172 	    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3173 		tp->t_flags &= ~TF_PMTUD;
3174 	} else {
3175 		tp->t_flags |= TF_PMTUD;
3176 	}
3177 
3178 	if (rt != NULL && rt->rt_ifp != NULL) {
3179 		somultipages(inp->inp_socket,
3180 		    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3181 		tcp_set_tso(tp, rt->rt_ifp);
3182 		soif2kcl(inp->inp_socket,
3183 		    (rt->rt_ifp->if_eflags & IFEF_2KCL));
3184 		tcp_set_ecn(tp, rt->rt_ifp);
3185 		if (inp->inp_last_outifp == NULL) {
3186 			inp->inp_last_outifp = rt->rt_ifp;
3187 #if SKYWALK
3188 			if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3189 				netns_set_ifnet(&inp->inp_netns_token,
3190 				    inp->inp_last_outifp);
3191 			}
3192 #endif /* SKYWALK */
3193 		}
3194 
3195 		/* Note if the peer is local */
3196 		if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3197 		    (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
3198 		    IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
3199 		    rt->rt_gateway->sa_family == AF_LINK ||
3200 		    in6_localaddr(&inp->in6p_faddr))) {
3201 			tp->t_flags |= TF_LOCAL;
3202 		}
3203 	}
3204 
3205 	/*
3206 	 * Caller needs to call RT_UNLOCK(rt).
3207 	 */
3208 	return rt;
3209 }
3210 
3211 #if IPSEC
3212 /* compute ESP/AH header size for TCP, including outer IP header. */
3213 size_t
3214 ipsec_hdrsiz_tcp(struct tcpcb *tp)
3215 {
3216 	struct inpcb *inp;
3217 	struct mbuf *m;
3218 	size_t hdrsiz;
3219 	struct ip *ip;
3220 	struct ip6_hdr *ip6 = NULL;
3221 	struct tcphdr *th;
3222 
3223 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) {
3224 		return 0;
3225 	}
3226 	MGETHDR(m, M_DONTWAIT, MT_DATA);        /* MAC-OK */
3227 	if (!m) {
3228 		return 0;
3229 	}
3230 
3231 	if ((inp->inp_vflag & INP_IPV6) != 0) {
3232 		ip6 = mtod(m, struct ip6_hdr *);
3233 		th = (struct tcphdr *)(void *)(ip6 + 1);
3234 		m->m_pkthdr.len = m->m_len =
3235 		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
3236 		tcp_fillheaders(m, tp, ip6, th);
3237 		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
3238 	} else {
3239 		ip = mtod(m, struct ip *);
3240 		th = (struct tcphdr *)(ip + 1);
3241 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
3242 		tcp_fillheaders(m, tp, ip, th);
3243 		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
3244 	}
3245 	m_free(m);
3246 	return hdrsiz;
3247 }
3248 #endif /* IPSEC */
3249 
3250 int
3251 tcp_lock(struct socket *so, int refcount, void *lr)
3252 {
3253 	lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);
3254 
3255 retry:
3256 	if (so->so_pcb != NULL) {
3257 		if (so->so_flags & SOF_MP_SUBFLOW) {
3258 			struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3259 			struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
3260 
3261 			socket_lock(mp_so, refcount);
3262 
3263 			/*
3264 			 * Check if we became non-MPTCP while waiting for the lock.
3265 			 * If yes, we have to retry to grab the right lock.
3266 			 */
3267 			if (!(so->so_flags & SOF_MP_SUBFLOW)) {
3268 				socket_unlock(mp_so, refcount);
3269 				goto retry;
3270 			}
3271 		} else {
3272 			lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3273 
3274 			if (so->so_flags & SOF_MP_SUBFLOW) {
3275 				/*
3276 				 * While waiting for the lock, we might have
3277 				 * become MPTCP-enabled (see mptcp_subflow_socreate).
3278 				 */
3279 				lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3280 				goto retry;
3281 			}
3282 		}
3283 	} else {
3284 		panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s",
3285 		    so, lr_saved, solockhistory_nr(so));
3286 		/* NOTREACHED */
3287 	}
3288 
3289 	if (so->so_usecount < 0) {
3290 		panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s",
3291 		    so, so->so_pcb, lr_saved, so->so_usecount,
3292 		    solockhistory_nr(so));
3293 		/* NOTREACHED */
3294 	}
3295 	if (refcount) {
3296 		so->so_usecount++;
3297 	}
3298 	so->lock_lr[so->next_lock_lr] = lr_saved;
3299 	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3300 	return 0;
3301 }
3302 
3303 int
3304 tcp_unlock(struct socket *so, int refcount, void *lr)
3305 {
3306 	lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);
3307 
3308 
3309 #ifdef MORE_TCPLOCK_DEBUG
3310 	printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
3311 	    "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
3312 	    (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
3313 	    (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
3314 	    so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
3315 #endif
3316 	if (refcount) {
3317 		so->so_usecount--;
3318 	}
3319 
3320 	if (so->so_usecount < 0) {
3321 		panic("tcp_unlock: so=%p usecount=%x lrh= %s",
3322 		    so, so->so_usecount, solockhistory_nr(so));
3323 		/* NOTREACHED */
3324 	}
3325 	if (so->so_pcb == NULL) {
3326 		panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s",
3327 		    so, so->so_usecount, lr_saved, solockhistory_nr(so));
3328 		/* NOTREACHED */
3329 	} else {
3330 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
3331 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3332 
3333 		if (so->so_flags & SOF_MP_SUBFLOW) {
3334 			struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3335 			struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
3336 
3337 			socket_lock_assert_owned(mp_so);
3338 
3339 			socket_unlock(mp_so, refcount);
3340 		} else {
3341 			LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3342 			    LCK_MTX_ASSERT_OWNED);
3343 			lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3344 		}
3345 	}
3346 	return 0;
3347 }
3348 
3349 lck_mtx_t *
3350 tcp_getlock(struct socket *so, int flags)
3351 {
3352 	struct inpcb *inp = sotoinpcb(so);
3353 
3354 	if (so->so_pcb) {
3355 		if (so->so_usecount < 0) {
3356 			panic("tcp_getlock: so=%p usecount=%x lrh= %s",
3357 			    so, so->so_usecount, solockhistory_nr(so));
3358 		}
3359 
3360 		if (so->so_flags & SOF_MP_SUBFLOW) {
3361 			struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3362 			struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
3363 
3364 			return mp_so->so_proto->pr_getlock(mp_so, flags);
3365 		} else {
3366 			return &inp->inpcb_mtx;
3367 		}
3368 	} else {
3369 		panic("tcp_getlock: so=%p NULL so_pcb %s",
3370 		    so, solockhistory_nr(so));
3371 		return so->so_proto->pr_domain->dom_mtx;
3372 	}
3373 }
3374 
3375 /*
3376  * Determine if we can grow the recieve socket buffer to avoid sending
3377  * Determine if we can grow the receive socket buffer to avoid sending
3378  * have fixed size (set by the application) to grow if the resource
3379  * constraints are met. They will also be trimmed after the application
3380  * reads data.
3381  */
3382 static void
3383 tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb)
3384 {
3385 	u_int32_t rcvbufinc = tp->t_maxseg << 4;
3386 	u_int32_t rcvbuf = sb->sb_hiwat;
3387 	struct socket *so = tp->t_inpcb->inp_socket;
3388 
3389 	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
3390 		return;
3391 	}
3392 
3393 	if (tcp_do_autorcvbuf == 1 &&
3394 	    (tp->t_flags & TF_SLOWLINK) == 0 &&
3395 	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
3396 	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
3397 	    rcvbuf < tcp_autorcvbuf_max &&
3398 	    (sb->sb_idealsize > 0 &&
3399 	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
3400 		sbreserve(sb,
3401 		    min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
3402 	}
3403 }
3404 
3405 int32_t
3406 tcp_sbspace(struct tcpcb *tp)
3407 {
3408 	struct socket *so = tp->t_inpcb->inp_socket;
3409 	struct sockbuf *sb = &so->so_rcv;
3410 	u_int32_t rcvbuf;
3411 	int32_t space;
3412 	int32_t pending = 0;
3413 
3414 	if (so->so_flags & SOF_MP_SUBFLOW) {
3415 		/* We still need to grow TCP's buffer to have a BDP-estimate */
3416 		tcp_sbrcv_grow_rwin(tp, sb);
3417 
3418 		return mptcp_sbspace(tptomptp(tp));
3419 	}
3420 
3421 	tcp_sbrcv_grow_rwin(tp, sb);
3422 
3423 	/* hiwat might have changed */
3424 	rcvbuf = sb->sb_hiwat;
3425 
3426 	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
3427 	    (sb->sb_mbmax - sb->sb_mbcnt)));
3428 	if (space < 0) {
3429 		space = 0;
3430 	}
3431 
3432 #if CONTENT_FILTER
3433 	/* Compensate for data being processed by content filters */
3434 	pending = cfil_sock_data_space(sb);
3435 #endif /* CONTENT_FILTER */
3436 	if (pending > space) {
3437 		space = 0;
3438 	} else {
3439 		space -= pending;
3440 	}
3441 
3442 	/*
3443 	 * Avoid increasing window size if the current window
3444 	 * is already very low, we could be in "persist" mode and
3445 	 * we could break some apps (see rdar://5409343)
3446 	 */
3447 
3448 	if (space < tp->t_maxseg) {
3449 		return space;
3450 	}
3451 
3452 	/* Clip window size for slower link */
3453 
3454 	if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0) {
3455 		return imin(space, slowlink_wsize);
3456 	}
3457 
3458 	return space;
3459 }
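
/*
 * Example of the content-filter compensation above (illustrative
 * values): with 32 KiB of free space and 8 KiB still held by a filter,
 * only 24 KiB is advertised; if the filter holds more than the free
 * space, the window is clamped to 0 rather than going negative.
 */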
3460 /*
3461  * Checks TCP Segment Offloading capability for a given connection
3462  * and interface pair.
3463  */
3464 void
3465 tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp)
3466 {
3467 	struct inpcb *inp;
3468 	int isipv6;
3469 	struct ifnet *tunnel_ifp = NULL;
3470 #define IFNET_TSO_MASK (IFNET_TSO_IPV6 | IFNET_TSO_IPV4)
3471 
3472 	tp->t_flags &= ~TF_TSO;
3473 
3474 	/*
3475 	 * Bail if there's a non-TSO-capable filter on the interface.
3476 	 */
3477 	if (ifp == NULL || ifp->if_flt_no_tso_count > 0) {
3478 		return;
3479 	}
3480 
3481 	inp = tp->t_inpcb;
3482 	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
3483 
3484 #if MPTCP
3485 	/*
3486 	 * We can't use TSO if this tcpcb belongs to an MPTCP session.
3487 	 */
3488 	if (inp->inp_socket->so_flags & SOF_MP_SUBFLOW) {
3489 		return;
3490 	}
3491 #endif
3492 	/*
3493 	 * We can't use TSO if the TSO capability of the tunnel interface does
3494 	 * not match the capability of another interface known by TCP
3495 	 */
3496 	if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) {
3497 		u_int tunnel_if_index = inp->inp_policyresult.results.result_parameter.tunnel_interface_index;
3498 
3499 		if (tunnel_if_index != 0) {
3500 			ifnet_head_lock_shared();
3501 			tunnel_ifp = ifindex2ifnet[tunnel_if_index];
3502 			ifnet_head_done();
3503 		}
3504 
3505 		if (tunnel_ifp == NULL) {
3506 			return;
3507 		}
3508 
3509 		if ((ifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3510 			if (tso_debug > 0) {
3511 				os_log(OS_LOG_DEFAULT,
3512 				    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with ifp %s",
3513 				    __func__,
3514 				    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3515 				    tunnel_ifp->if_xname, ifp->if_xname);
3516 			}
3517 			return;
3518 		}
3519 		if (inp->inp_last_outifp != NULL &&
3520 		    (inp->inp_last_outifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3521 			if (tso_debug > 0) {
3522 				os_log(OS_LOG_DEFAULT,
3523 				    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_last_outifp %s",
3524 				    __func__,
3525 				    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3526 				    tunnel_ifp->if_xname, inp->inp_last_outifp->if_xname);
3527 			}
3528 			return;
3529 		}
3530 		if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp != NULL &&
3531 		    (inp->inp_boundifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3532 			if (tso_debug > 0) {
3533 				os_log(OS_LOG_DEFAULT,
3534 				    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_boundifp %s",
3535 				    __func__,
3536 				    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3537 				    tunnel_ifp->if_xname, inp->inp_boundifp->if_xname);
3538 			}
3539 			return;
3540 		}
3541 	}
3542 
3543 	if (isipv6) {
3544 		if (ifp->if_hwassist & IFNET_TSO_IPV6) {
3545 			tp->t_flags |= TF_TSO;
3546 			if (ifp->if_tso_v6_mtu != 0) {
3547 				tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
3548 			} else {
3549 				tp->tso_max_segment_size = TCP_MAXWIN;
3550 			}
3551 		}
3552 	} else {
3553 		if (ifp->if_hwassist & IFNET_TSO_IPV4) {
3554 			tp->t_flags |= TF_TSO;
3555 			if (ifp->if_tso_v4_mtu != 0) {
3556 				tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
3557 			} else {
3558 				tp->tso_max_segment_size = TCP_MAXWIN;
3559 			}
3560 			if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3561 				tp->tso_max_segment_size -=
3562 				    CLAT46_HDR_EXPANSION_OVERHD;
3563 			}
3564 		}
3565 	}
3566 
3567 	if (tso_debug > 1) {
3568 		os_log(OS_LOG_DEFAULT, "%s: %u > %u TSO %d ifp %s",
3569 		    __func__,
3570 		    ntohs(tp->t_inpcb->inp_lport),
3571 		    ntohs(tp->t_inpcb->inp_fport),
3572 		    (tp->t_flags & TF_TSO) != 0,
3573 		    ifp != NULL ? ifp->if_xname : "<NULL>");
3574 	}
3575 }
3576 
3577 #define TIMEVAL_TO_TCPHZ(_tv_) ((uint32_t)((_tv_).tv_sec * TCP_RETRANSHZ + \
3578 	(_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC))
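
/*
 * Worked example, assuming TCP_RETRANSHZ == 100 (10 ms ticks) and
 * TCP_RETRANSHZ_TO_USEC == 10000: a timeval of 1 s + 25000 us maps to
 * 1 * 100 + 25000 / 10000 == 102 ticks. The 5000 us that the integer
 * division drops is what calculate_tcp_clock() below accumulates in
 * tcp_now_remainder_us.
 */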
3579 
3580 /*
3581  * Function to calculate the tcp clock. The tcp clock will get updated
3582  * at the boundaries of the tcp layer. This is done at 3 places:
3583  * 1. Right before processing an input tcp packet
3584  * 2. Whenever a connection wants to access the network using tcp_usrreqs
3585  * 3. When a tcp timer fires or before tcp slow timeout
3586  *
3587  */
3588 
3589 void
3590 calculate_tcp_clock(void)
3591 {
3592 	struct timeval tv = tcp_uptime;
3593 	struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC};
3594 	struct timeval now, hold_now;
3595 	uint32_t incr = 0;
3596 
3597 	microuptime(&now);
3598 
3599 	/*
3600 	 * Update coarse-grained networking timestamp (in sec.); the idea
3601 	 * is to update the counter returnable via net_uptime() when
3602 	 * we read time.
3603 	 */
3604 	net_update_uptime_with_time(&now);
3605 
3606 	timevaladd(&tv, &interval);
3607 	if (timevalcmp(&now, &tv, >)) {
3608 		/* time to update the clock */
3609 		lck_spin_lock(&tcp_uptime_lock);
3610 		if (timevalcmp(&tcp_uptime, &now, >=)) {
3611 			/* clock got updated while waiting for the lock */
3612 			lck_spin_unlock(&tcp_uptime_lock);
3613 			return;
3614 		}
3615 
3616 		microuptime(&now);
3617 		hold_now = now;
3618 		tv = tcp_uptime;
3619 		timevalsub(&now, &tv);
3620 
3621 		incr = TIMEVAL_TO_TCPHZ(now);
3622 
3623 		/* Account for the previous remainder */
3624 		uint32_t remaining_us = (now.tv_usec % TCP_RETRANSHZ_TO_USEC) +
3625 		    tcp_now_remainder_us;
3626 		if (remaining_us >= TCP_RETRANSHZ_TO_USEC) {
3627 			incr += (remaining_us / TCP_RETRANSHZ_TO_USEC);
3628 		}
3629 
3630 		if (incr > 0) {
3631 			tcp_uptime = hold_now;
3632 			tcp_now_remainder_us = remaining_us % TCP_RETRANSHZ_TO_USEC;
3633 			tcp_now += incr;
3634 		}
3635 
3636 		lck_spin_unlock(&tcp_uptime_lock);
3637 	}
3638 }
3639 
3640 uint64_t
3641 microuptime_ns(void)
3642 {
3643 	uint64_t abstime = mach_absolute_time();
3644 	uint64_t ns = 0;
3645 	absolutetime_to_nanoseconds(abstime, &ns);
3646 
3647 	return ns;
3648 }
3649 
3650 #define MAX_BURST_INTERVAL_KERNEL_PACING_NSEC                                  \
3651 	(10 * NSEC_PER_MSEC) // Don't delay more than 10ms between two bursts
3652 static uint64_t
3653 tcp_pacer_get_packet_interval(struct tcpcb *tp, uint32_t size)
3654 {
3655 	if (tp->t_pacer.rate == 0) {
3656 		os_log_error(OS_LOG_DEFAULT,
3657 		    "pacer rate shouldn't be 0, CCA is %s (cwnd=%u, smoothed rtt=%u ms)",
3658 		    CC_ALGO(tp)->name, tp->snd_cwnd, tp->t_srtt >> TCP_RTT_SHIFT);
3659 
3660 		return MAX_BURST_INTERVAL_KERNEL_PACING_NSEC;
3661 	}
3662 
3663 	uint64_t interval = (uint64_t)size * NSEC_PER_SEC / tp->t_pacer.rate;
3664 	if (interval > MAX_BURST_INTERVAL_KERNEL_PACING_NSEC) {
3665 		interval = MAX_BURST_INTERVAL_KERNEL_PACING_NSEC;
3666 	}
3667 
3668 	return interval;
3669 }
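
/*
 * Illustrative arithmetic, assuming t_pacer.rate is in bytes per
 * second: a 65535-byte burst at 12.5 MB/s (100 Mbit/s) yields
 * 65535 * NSEC_PER_SEC / 12500000 ~= 5.2 ms between bursts, while at
 * 125 KB/s the raw interval (~524 ms) is clamped to the 10 ms cap
 * above.
 */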
3670 
3671 /* Return packet tx_time in nanoseconds (absolute as well as continuous) */
3672 uint64_t
3673 tcp_pacer_get_packet_tx_time(struct tcpcb *tp, uint16_t pkt_len)
3674 {
3675 	/*
3676 	 * This function is called multiple times for mss-sized packets.
3677 	 * At high speeds, we want to send multiple packets that add up
3678 	 * to burst_size at the same time.
3679 	 */
3680 	uint64_t now = microuptime_ns();
3681 
3682 	if (pkt_len == 0 || now == 0) {
3683 		return now;
3684 	}
3685 
3686 	if (tp->t_pacer.packet_tx_time == 0) {
3687 		tp->t_pacer.packet_tx_time = now;
3688 		tp->t_pacer.current_size = pkt_len;
3689 	} else {
3690 		tp->t_pacer.current_size += pkt_len;
3691 		if (tp->t_pacer.current_size > tp->t_pacer.tso_burst_size) {
3692 			/*
3693 			 * Increment tx_time by the packet interval and
3694 			 * reset the accumulated burst size
3695 			 */
3696 			tp->t_pacer.packet_tx_time +=
3697 			    tcp_pacer_get_packet_interval(tp, tp->t_pacer.current_size);
3698 			tp->t_pacer.current_size = 0;
3699 			if (now > tp->t_pacer.packet_tx_time) {
3700 				/*
3701 				 * If current time is bigger, then application
3702 				 * has already paced the packet. Also, we can't
3703 				 * set tx_time in the past.
3704 				 */
3705 				tp->t_pacer.packet_tx_time = now;
3706 			}
3707 		}
3708 	}
3709 
3710 	return tp->t_pacer.packet_tx_time;
3711 }
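
/*
 * Sketch of the resulting schedule (illustrative): with a 64 KiB
 * tso_burst_size and ~1448-byte calls, roughly 45 consecutive packets
 * share one packet_tx_time; the call that pushes current_size past the
 * burst size moves packet_tx_time forward by one packet interval, so
 * AQM releases the flow as evenly spaced bursts instead of a
 * line-rate blast.
 */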
3712 
3713 void
3714 tcp_set_mbuf_tx_time(struct mbuf *m, uint64_t tx_time)
3715 {
3716 	struct m_tag *tag = NULL;
3717 	tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
3718 	    sizeof(uint64_t), M_WAITOK, m);
3719 	if (tag != NULL) {
3720 		m_tag_prepend(m, tag);
3721 		*(uint64_t *)tag->m_tag_data = tx_time;
3722 	}
3723 }
3724 
3725 /*
3726  * Compute receive window scaling that we are going to request
3727  * for this connection based on sb_hiwat. Try to leave some
3728  * room to potentially increase the window size up to a maximum
3729  * defined by the constant tcp_autorcvbuf_max.
3730  */
3731 void
3732 tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so)
3733 {
3734 	uint32_t maxsockbufsize;
3735 
3736 	tp->request_r_scale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale);
3737 	maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
3738 	    so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
3739 
3740 	/*
3741 	 * Window scale should not exceed what is needed
3742 	 * to send the max receive window size; adding 1 to TCP_MAXWIN
3743 	 * ensures that.
3744 	 */
3745 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3746 	    ((TCP_MAXWIN + 1) << tp->request_r_scale) < maxsockbufsize) {
3747 		tp->request_r_scale++;
3748 	}
3749 	tp->request_r_scale = MIN(tp->request_r_scale, TCP_MAX_WINSHIFT);
3750 }
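
/*
 * Worked example: for a 4 MiB maxsockbufsize the loop settles on
 * request_r_scale == 6, since (TCP_MAXWIN + 1) << 6 ~= 4.19 MiB covers
 * the buffer while shift 5 (~2.1 MiB) does not.
 */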
3751 
3752 int
3753 tcp_notsent_lowat_check(struct socket *so)
3754 {
3755 	struct inpcb *inp = sotoinpcb(so);
3756 	struct tcpcb *tp = NULL;
3757 	int notsent = 0;
3758 
3759 	if (inp != NULL) {
3760 		tp = intotcpcb(inp);
3761 	}
3762 
3763 	if (tp == NULL) {
3764 		return 0;
3765 	}
3766 
3767 	notsent = so->so_snd.sb_cc -
3768 	    (tp->snd_nxt - tp->snd_una);
3769 
3770 	/*
3771 	 * When we send a FIN or SYN, not_sent can be negative.
3772 	 * In that case, we also need to send a write event to the
3773 	 * process if it is waiting. In the FIN case, it will
3774 	 * get an error from send because cantsendmore will be set.
3775 	 */
3776 	if (notsent <= tp->t_notsent_lowat) {
3777 		return 1;
3778 	}
3779 
3780 	/*
3781 	 * When Nagle's algorithm is not disabled, it is better
3782 	 * to wake up the client until there is at least one
3783 	 * maxseg of data to write.
3784 	 */
3785 	if ((tp->t_flags & TF_NODELAY) == 0 &&
3786 	    notsent > 0 && notsent < tp->t_maxseg) {
3787 		return 1;
3788 	}
3789 	return 0;
3790 }
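
/*
 * Example (illustrative values): with 10000 bytes in so_snd and
 * snd_nxt - snd_una == 8000 already in flight, notsent == 2000; a
 * t_notsent_lowat of 2048 then makes this return 1 and wakes the
 * writer even though the send buffer is far from drained.
 */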
3791 
3792 void
3793 tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3794 {
3795 	struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL;
3796 	uint16_t rxcount = 0;
3797 
3798 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3799 		tp->t_dsack_lastuna = tp->snd_una;
3800 	}
3801 	/*
3802 	 * First check if there is a segment already existing for this
3803 	 * sequence space.
3804 	 */
3805 
3806 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3807 		if (SEQ_GT(rxseg->rx_start, start)) {
3808 			break;
3809 		}
3810 		prev = rxseg;
3811 	}
3812 	next = rxseg;
3813 
3814 	/* check if prev seg is for this sequence */
3815 	if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3816 	    SEQ_GEQ(prev->rx_end, end)) {
3817 		prev->rx_count++;
3818 		return;
3819 	}
3820 
3821 	/*
3822 	 * There are a couple of possibilities at this point.
3823 	 * 1. prev overlaps with the beginning of this sequence
3824 	 * 2. next overlaps with the end of this sequence
3825 	 * 3. there is no overlap.
3826 	 */
3827 
3828 	if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3829 		if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3830 			start = prev->rx_end + 1;
3831 			prev->rx_count++;
3832 		} else {
3833 			prev->rx_end = (start - 1);
3834 			rxcount = prev->rx_count;
3835 		}
3836 	}
3837 
3838 	if (next != NULL && SEQ_LT(next->rx_start, end)) {
3839 		if (SEQ_LEQ(next->rx_end, end)) {
3840 			end = next->rx_start - 1;
3841 			next->rx_count++;
3842 		} else {
3843 			next->rx_start = end + 1;
3844 			rxcount = next->rx_count;
3845 		}
3846 	}
3847 	if (!SEQ_LT(start, end)) {
3848 		return;
3849 	}
3850 
3851 	if (tcp_rxt_seg_max > 0 && tp->t_rxt_seg_count >= tcp_rxt_seg_max) {
3852 		rxseg = SLIST_FIRST(&tp->t_rxt_segments);
3853 		if (prev == rxseg) {
3854 			prev = NULL;
3855 		}
3856 		SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3857 		    tcp_rxt_seg, rx_link);
3858 
3859 		tcp_rxt_seg_drop++;
3860 		tp->t_rxt_seg_drop++;
3861 		zfree(tcp_rxt_seg_zone, rxseg);
3862 
3863 		tp->t_rxt_seg_count -= 1;
3864 	}
3865 
3866 	rxseg = zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
3867 	rxseg->rx_start = start;
3868 	rxseg->rx_end = end;
3869 	rxseg->rx_count = rxcount + 1;
3870 
3871 	if (prev != NULL) {
3872 		SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3873 	} else {
3874 		SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3875 	}
3876 	tp->t_rxt_seg_count += 1;
3877 }
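
/*
 * Overlap example: with an existing record [100, 199] and a new
 * retransmission [150, 299], prev is trimmed to [100, 149] and the new
 * entry [150, 299] starts at prev's rx_count + 1, so a sequence range
 * is never counted twice for DSACK purposes.
 */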
3878 
3879 struct tcp_rxt_seg *
3880 tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3881 {
3882 	struct tcp_rxt_seg *rxseg;
3883 
3884 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3885 		return NULL;
3886 	}
3887 
3888 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3889 		if (SEQ_LEQ(rxseg->rx_start, start) &&
3890 		    SEQ_GEQ(rxseg->rx_end, end)) {
3891 			return rxseg;
3892 		}
3893 		if (SEQ_GT(rxseg->rx_start, start)) {
3894 			break;
3895 		}
3896 	}
3897 	return NULL;
3898 }
3899 
3900 void
3901 tcp_rxtseg_set_spurious(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3902 {
3903 	struct tcp_rxt_seg *rxseg;
3904 
3905 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3906 		return;
3907 	}
3908 
3909 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3910 		if (SEQ_GEQ(rxseg->rx_start, start) &&
3911 		    SEQ_LEQ(rxseg->rx_end, end)) {
3912 			/*
3913 			 * If the segment was retransmitted only once, mark it as
3914 			 * spurious.
3915 			 */
3916 			if (rxseg->rx_count == 1) {
3917 				rxseg->rx_flags |= TCP_RXT_SPURIOUS;
3918 			}
3919 		}
3920 
3921 		if (SEQ_GEQ(rxseg->rx_start, end)) {
3922 			break;
3923 		}
3924 	}
3925 	return;
3926 }
3927 
3928 void
3929 tcp_rxtseg_clean(struct tcpcb *tp)
3930 {
3931 	struct tcp_rxt_seg *rxseg, *next;
3932 
3933 	SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3934 		SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3935 		    tcp_rxt_seg, rx_link);
3936 		zfree(tcp_rxt_seg_zone, rxseg);
3937 	}
3938 	tp->t_rxt_seg_count = 0;
3939 	tp->t_dsack_lastuna = tp->snd_max;
3940 }
3941 
3942 boolean_t
3943 tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3944 {
3945 	boolean_t bad_rexmt;
3946 	struct tcp_rxt_seg *rxseg;
3947 
3948 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3949 		return FALSE;
3950 	}
3951 
3952 	/*
3953 	 * If not all of the segments in this window have been cumulatively
3954 	 * acknowledged, then there can still be undetected packet loss.
3955 	 * Do not restore congestion window in that case.
3956 	 */
3957 	if (SEQ_LT(th_ack, tp->snd_recover)) {
3958 		return FALSE;
3959 	}
3960 
3961 	bad_rexmt = TRUE;
3962 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3963 		if (!(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3964 			bad_rexmt = FALSE;
3965 			break;
3966 		}
3967 	}
3968 	return bad_rexmt;
3969 }
3970 
3971 u_int32_t
3972 tcp_rxtseg_total_size(struct tcpcb *tp)
3973 {
3974 	struct tcp_rxt_seg *rxseg;
3975 	u_int32_t total_size = 0;
3976 
3977 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3978 		total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
3979 	}
3980 	return total_size;
3981 }
3982 
3983 int
3984 tcp_seg_cmp(const struct tcp_seg_sent *seg1, const struct tcp_seg_sent *seg2)
3985 {
3986 	return (int)(seg1->end_seq - seg2->end_seq);
3987 }
3988 
3989 RB_GENERATE(tcp_seg_sent_tree_head, tcp_seg_sent, seg_link, tcp_seg_cmp)
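
/*
 * The tree is ordered on end_seq alone (tcp_seg_cmp above), so a
 * lookup of the form
 *
 *	struct tcp_seg_sent key = { .end_seq = th_ack };
 *	seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &key);
 *
 * matches only a segment whose last byte the cumulative ACK covers
 * exactly; ACKs that land inside a segment miss and fall back to the
 * RB_FOREACH walks used below.
 */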
3990 
3991 uint32_t
3992 tcp_seg_len(struct tcp_seg_sent *seg)
3993 {
3994 	if (SEQ_LT(seg->end_seq, seg->start_seq)) {
3995 		os_log_error(OS_LOG_DEFAULT, "segment end(%u) can't be smaller "
3996 		    "than segment start(%u)", seg->end_seq, seg->start_seq);
3997 	}
3998 
3999 	return seg->end_seq - seg->start_seq;
4000 }
4001 
4002 static struct tcp_seg_sent *
4003 tcp_seg_alloc_init(struct tcpcb *tp)
4004 {
4005 	struct tcp_seg_sent *seg = TAILQ_FIRST(&tp->seg_pool.free_segs);
4006 	if (seg != NULL) {
4007 		TAILQ_REMOVE(&tp->seg_pool.free_segs, seg, free_link);
4008 		tp->seg_pool.free_segs_count--;
4009 	} else {
4010 		// TODO: remove Z_WAITOK and Z_NOFAIL?
4011 		seg = zalloc_flags(tcp_seg_sent_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
4012 		if (seg == NULL) {
4013 			return NULL;
4014 		}
4015 	}
4016 	bzero(seg, sizeof(*seg));
4017 
4018 	return seg;
4019 }
4020 
4021 static void
4022 tcp_update_seg_after_rto(struct tcpcb *tp, struct tcp_seg_sent *found_seg,
4023     uint32_t xmit_ts, uint8_t flags)
4024 {
4025 	tcp_rack_transmit_seg(tp, found_seg, found_seg->start_seq, found_seg->end_seq,
4026 	    xmit_ts, flags);
4027 	struct tcp_seg_sent *seg = TAILQ_FIRST(&tp->t_segs_sent);
4028 	if (found_seg == seg) {
4029 		// Move this segment to the end of time-ordered list.
4030 		TAILQ_REMOVE(&tp->t_segs_sent, seg, tx_link);
4031 		TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4032 	}
4033 }
4034 
4035 static void
4036 tcp_process_rxmt_segs_after_rto(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start,
4037     uint32_t xmit_ts, uint8_t flags)
4038 {
4039 	struct tcp_seg_sent segment = {};
4040 
4041 	while (seg != NULL) {
4042 		if (SEQ_LEQ(seg->start_seq, start)) {
4043 			tcp_update_seg_after_rto(tp, seg, xmit_ts, flags);
4044 			break;
4045 		} else {
4046 			/* The segment is a part of the total RTO retransmission */
4047 			tcp_update_seg_after_rto(tp, seg, xmit_ts, flags);
4048 
4049 			/* Find the next segment ending at the start of current segment */
4050 			segment.end_seq = seg->start_seq;
4051 			seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &segment);
4052 		}
4053 	}
4054 }
4055 
4056 static struct tcp_seg_sent *
4057 tcp_seg_sent_insert_before(struct tcpcb *tp, struct tcp_seg_sent *before, tcp_seq start, tcp_seq end,
4058     uint32_t xmit_ts, uint8_t flags)
4059 {
4060 	struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp);
4061 	/* segment MUST be allocated, there is no other fail-safe here */
4062 	tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags);
4063 	struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4064 	if (not_inserted) {
4065 		os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4066 		    not_inserted->start_seq, not_inserted->end_seq);
4067 	}
4068 	TAILQ_INSERT_BEFORE(before, seg, tx_link);
4069 
4070 	return seg;
4071 }
4072 
4073 static struct tcp_seg_sent *
4074 tcp_seg_rto_insert_end(struct tcpcb *tp, tcp_seq start, tcp_seq end,
4075     uint32_t xmit_ts, uint8_t flags)
4076 {
4077 	struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp);
4078 	/* segment MUST be allocated, there is no other fail-safe here */
4079 	tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags);
4080 	struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4081 	if (not_inserted) {
4082 		os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4083 		    not_inserted->start_seq, not_inserted->end_seq);
4084 	}
4085 	TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4086 
4087 	return seg;
4088 }
4089 
4090 void
4091 tcp_seg_sent_insert(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
4092     uint32_t xmit_ts, uint8_t flags)
4093 {
4094 	if (seg != NULL) {
4095 		uint8_t seg_flags = seg->flags | flags;
4096 		if (seg->end_seq == end) {
4097 			/* Entire seg retransmitted in RACK recovery, start and end sequence doesn't change */
4098 			if (seg->start_seq != start) {
4099 				os_log_error(OS_LOG_DEFAULT, "Segment start (%u) is not same as retransmitted "
4100 				    "start sequence number (%u)", seg->start_seq, start);
4101 			}
4102 			tcp_rack_transmit_seg(tp, seg, seg->start_seq, seg->end_seq, xmit_ts, seg_flags);
4103 			TAILQ_REMOVE(&tp->t_segs_sent, seg, tx_link);
4104 			TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4105 		} else {
4106 			/*
4107 			 * The original segment is partially retransmitted: advance start_seq by
4108 			 * the retransmitted length and create a new segment for the retransmitted part
4109 			 */
4110 			struct tcp_seg_sent *partial_seg = tcp_seg_alloc_init(tp);
4111 			if (partial_seg == NULL) {
4112 				return;
4113 			}
4114 			seg->start_seq += (end - start);
4115 			tcp_rack_transmit_seg(tp, partial_seg, start, end, xmit_ts, seg_flags);
4116 			struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head,
4117 			    &tp->t_segs_sent_tree, partial_seg);
4118 			if (not_inserted) {
4119 				os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4120 				    not_inserted->start_seq, not_inserted->end_seq);
4121 			}
4122 			TAILQ_INSERT_TAIL(&tp->t_segs_sent, partial_seg, tx_link);
4123 		}
4124 
4125 		return;
4126 	}
4127 
4128 	if ((flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) == 0) {
4129 		/* This is a new segment */
4130 		seg = tcp_seg_alloc_init(tp);
4131 		if (seg == NULL) {
4132 			return;
4133 		}
4134 
4135 		tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags);
4136 		struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4137 		if (not_inserted) {
4138 			os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted,
4139 			    not_inserted->start_seq, not_inserted->end_seq);
4140 		}
4141 		TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link);
4142 
4143 		return;
4144 	}
4145 	/*
4146 	 * Either retransmitted after an RTO or PTO.
4147 	 * During RTO, time-ordered list may lose its order.
4148 	 * If retransmitted after RTO, check if the segment
4149 	 * already exists in RB tree and update its xmit_ts. Also,
4150 	 * if this seg is at the top of ordered list, then move it
4151 	 * to the end.
4152 	 */
4153 	struct tcp_seg_sent segment = {};
4154 	struct tcp_seg_sent *found_seg = NULL, *rxmt_seg = NULL;
4155 
4156 	/* Set the end sequence to search for existing segment */
4157 	segment.end_seq = end;
4158 	found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &segment);
4159 	if (found_seg != NULL) {
4160 		/* Found an exact match for retransmitted end sequence */
4161 		tcp_process_rxmt_segs_after_rto(tp, found_seg, start, xmit_ts, flags);
4162 		return;
4163 	}
4164 	/*
4165 	 * We come here when we don't find an exact match and the end of the
4166 	 * segment retransmitted after RTO lies within an existing segment.
4167 	 */
4168 	RB_FOREACH(found_seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
4169 		if (SEQ_LT(end, found_seg->end_seq) && SEQ_GT(end, found_seg->start_seq)) {
4170 			/*
4171 			 * This segment is partially retransmitted. We split this segment at the boundary of end
4172 			 * sequence. First insert the part being retransmitted at the end of time-ordered list.
4173 			 */
4174 			tcp_seg_rto_insert_end(tp, found_seg->start_seq, end, xmit_ts,
4175 			    found_seg->flags | flags);
4176 
4177 			if (SEQ_LEQ(found_seg->start_seq, start)) {
4178 				/*
4179 				 * We are done with the retransmitted part.
4180 				 * Move the start of existing segment
4181 				 */
4182 				found_seg->start_seq = end;
4183 			} else {
4184 				/*
4185 				 * This retransmitted sequence covers more than one segment.
4186 				 * Look for segments covered by this retransmission below this segment.
4187 				 */
4188 				segment.end_seq = found_seg->start_seq;
4189 				rxmt_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &segment);
4190 
4191 				if (rxmt_seg != NULL) {
4192 					/* rxmt_seg is just before the current segment */
4193 					tcp_process_rxmt_segs_after_rto(tp, rxmt_seg, start, xmit_ts, flags);
4194 				}
4195 
4196 				/* Move the start of existing segment */
4197 				found_seg->start_seq = end;
4198 			}
4199 			return;
4200 		}
4201 	}
4202 }
4203 
4204 static void
4205 tcp_seg_collect_acked_subtree(struct tcpcb *tp, struct tcp_seg_sent *seg,
4206     uint32_t acked_xmit_ts, uint32_t tsecr)
4207 {
4208 	if (seg != NULL) {
4209 		tcp_seg_collect_acked_subtree(tp, RB_LEFT(seg, seg_link), acked_xmit_ts, tsecr);
4210 		tcp_seg_collect_acked_subtree(tp, RB_RIGHT(seg, seg_link), acked_xmit_ts, tsecr);
4211 		TAILQ_INSERT_TAIL(&tp->t_segs_acked, seg, ack_link);
4212 	}
4213 }
4214 
4215 /* Call this function with root of the rb tree */
4216 static void
4217 tcp_seg_collect_acked(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq th_ack,
4218     uint32_t acked_xmit_ts, uint32_t tsecr)
4219 {
4220 	if (seg == NULL) {
4221 		return;
4222 	}
4223 
4224 	if (SEQ_GEQ(th_ack, seg->end_seq)) {
4225 		/* Delete the entire left sub-tree */
4226 		tcp_seg_collect_acked_subtree(tp, RB_LEFT(seg, seg_link), acked_xmit_ts, tsecr);
4227 		/* Evaluate the right sub-tree */
4228 		tcp_seg_collect_acked(tp, RB_RIGHT(seg, seg_link), th_ack, acked_xmit_ts, tsecr);
4229 		TAILQ_INSERT_TAIL(&tp->t_segs_acked, seg, ack_link);
4230 	} else {
4231 		/*
4232 		 * This ACK doesn't acknowledge the current root and its right sub-tree.
4233 		 * Evaluate the left sub-tree
4234 		 */
4235 		tcp_seg_collect_acked(tp, RB_LEFT(seg, seg_link), th_ack, acked_xmit_ts, tsecr);
4236 	}
4237 }
4238 
4239 static void
4240 tcp_seg_delete_acked(struct tcpcb *tp, uint32_t acked_xmit_ts, uint32_t tsecr)
4241 {
4242 	struct tcp_seg_sent *acked_seg = NULL, *next = NULL;
4243 
4244 	TAILQ_FOREACH_SAFE(acked_seg, &tp->t_segs_acked, ack_link, next) {
4245 		/* Advance RACK state if applicable */
4246 		if (acked_seg->xmit_ts > acked_xmit_ts) {
4247 			tcp_rack_update_segment_acked(tp, tsecr, acked_seg->xmit_ts, acked_seg->end_seq,
4248 			    !!(acked_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4249 		}
4250 		/* Check for reordering */
4251 		tcp_rack_detect_reordering_acked(tp, acked_seg);
4252 
4253 		const uint32_t seg_len = tcp_seg_len(acked_seg);
4254 		if (acked_seg->flags & TCP_SEGMENT_LOST) {
4255 			if (tp->bytes_lost < seg_len) {
4256 				os_log_error(OS_LOG_DEFAULT, "bytes_lost (%u) can't be smaller than already "
4257 				    "lost segment length (%u)", tp->bytes_lost, seg_len);
4258 			}
4259 			tp->bytes_lost -= seg_len;
4260 		}
4261 		if (acked_seg->flags & TCP_RACK_RETRANSMITTED) {
4262 			if (tp->bytes_retransmitted < seg_len) {
4263 				os_log_error(OS_LOG_DEFAULT, "bytes_retransmitted (%u) can't be smaller "
4264 				    "than already retransmited segment length (%u)",
4265 				    tp->bytes_retransmitted, seg_len);
4266 			}
4267 			tp->bytes_retransmitted -= seg_len;
4268 		}
4269 		if (acked_seg->flags & TCP_SEGMENT_SACKED) {
4270 			if (tp->bytes_sacked < seg_len) {
4271 				os_log_error(OS_LOG_DEFAULT, "bytes_sacked (%u) can't be smaller than already "
4272 				    "SACKed segment length (%u)", tp->bytes_sacked, seg_len);
4273 			}
4274 			tp->bytes_sacked -= seg_len;
4275 		}
4276 		TAILQ_REMOVE(&tp->t_segs_acked, acked_seg, ack_link);
4277 		TAILQ_REMOVE(&tp->t_segs_sent, acked_seg, tx_link);
4278 		RB_REMOVE(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, acked_seg);
4279 		tcp_seg_delete(tp, acked_seg);
4280 	}
4281 }
4282 
4283 void
4284 tcp_segs_doack(struct tcpcb *tp, tcp_seq th_ack, struct tcpopt *to)
4285 {
4286 	uint32_t tsecr = 0, acked_xmit_ts = 0;
4287 	tcp_seq acked_seq = th_ack;
4288 	bool was_retransmitted = false;
4289 
4290 	if (TAILQ_EMPTY(&tp->t_segs_sent)) {
4291 		return;
4292 	}
4293 
4294 	if (((to->to_flags & TOF_TS) != 0) && (to->to_tsecr != 0)) {
4295 		tsecr = to->to_tsecr;
4296 	}
4297 
4298 	struct tcp_seg_sent seg = {};
4299 	struct tcp_seg_sent *found_seg = NULL, *next = NULL;
4300 
4301 	found_seg = TAILQ_LAST(&tp->t_segs_sent, tcp_seg_sent_head);
4302 
4303 	if (tp->rack.segs_retransmitted == false) {
4304 		if (SEQ_GEQ(th_ack, found_seg->end_seq)) {
4305 			/*
4306 			 * ACK acknowledges the last sent segment completely (snd_max),
4307 			 * we can remove all segments from time ordered list.
4308 			 */
4309 			acked_seq = found_seg->end_seq;
4310 			acked_xmit_ts = found_seg->xmit_ts;
4311 			was_retransmitted = !!(found_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE);
4312 			tcp_segs_sent_clean(tp, false);
4313 
4314 			/* Advance RACK state */
4315 			tcp_rack_update_segment_acked(tp, tsecr, acked_xmit_ts, acked_seq, was_retransmitted);
4316 			return;
4317 		}
4318 	}
4319 	/*
4320 	 * If either not all segments are ACKed OR the time-ordered list contains retransmitted
4321 	 * segments, do a RB tree search for largest (completely) ACKed segment and remove the ACKed
4322 	 * segment and all segments left of it from both RB tree and time-ordered list.
4323 	 *
4324 	 * Set the end sequence to search for ACKed segment.
4325 	 */
4326 	seg.end_seq = th_ack;
4327 
4328 	if ((found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg)) != NULL) {
4329 		acked_seq = found_seg->end_seq;
4330 		acked_xmit_ts = found_seg->xmit_ts;
4331 		was_retransmitted = !!(found_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE);
4332 
4333 		/*
4334 		 * Remove all segments that are ACKed by this ACK.
4335 		 * We defer self-balancing of RB tree to the end
4336 		 * by calling RB_REMOVE after collecting all ACKed segments.
4337 		 */
4338 		tcp_seg_collect_acked(tp, RB_ROOT(&tp->t_segs_sent_tree), th_ack, acked_xmit_ts, tsecr);
4339 		tcp_seg_delete_acked(tp, acked_xmit_ts, tsecr);
4340 
4341 		/* Advance RACK state */
4342 		tcp_rack_update_segment_acked(tp, tsecr, acked_xmit_ts, acked_seq, was_retransmitted);
4343 
4344 		return;
4345 	}
4346 	/*
4347 	 * When TSO is enabled, it is possible that th_ack is less
4348 	 * than segment->end, hence we search the tree
4349 	 * until we find the largest (partially) ACKed segment.
4350 	 */
4351 	RB_FOREACH_SAFE(found_seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, next) {
4352 		if (SEQ_LT(th_ack, found_seg->end_seq) && SEQ_GT(th_ack, found_seg->start_seq)) {
4353 			acked_seq = th_ack;
4354 			acked_xmit_ts = found_seg->xmit_ts;
4355 			was_retransmitted = !!(found_seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE);
4356 
4357 			/* Remove all segments completely ACKed by this ack */
4358 			tcp_seg_collect_acked(tp, RB_ROOT(&tp->t_segs_sent_tree), th_ack, acked_xmit_ts, tsecr);
4359 			tcp_seg_delete_acked(tp, acked_xmit_ts, tsecr);
4360 			found_seg->start_seq = th_ack;
4361 
4362 			/* Advance RACK state */
4363 			tcp_rack_update_segment_acked(tp, tsecr, acked_xmit_ts, acked_seq, was_retransmitted);
4364 			break;
4365 		}
4366 	}
4367 }
4368 
4369 static bool
4370 tcp_seg_mark_sacked(struct tcpcb *tp, struct tcp_seg_sent *seg, uint32_t *newbytes_sacked)
4371 {
4372 	if (seg->flags & TCP_SEGMENT_SACKED) {
4373 		return false;
4374 	}
4375 
4376 	const uint32_t seg_len = tcp_seg_len(seg);
4377 
4378 	/* Check for reordering */
4379 	tcp_rack_detect_reordering_acked(tp, seg);
4380 
4381 	if (seg->flags & TCP_RACK_RETRANSMITTED) {
4382 		if (seg->flags & TCP_SEGMENT_LOST) {
4383 			/*
4384 			 * If the segment is not considered lost, we don't clear
4385 			 * retransmitted as it might still be in flight. The ONLY time
4386 			 * this can happen is when RTO happens and segment is retransmitted
4387 			 * and SACKed before RACK detects segment was lost.
4388 			 */
4389 			seg->flags &= ~(TCP_SEGMENT_LOST | TCP_RACK_RETRANSMITTED);
4390 			if (tp->bytes_lost < seg_len || tp->bytes_retransmitted < seg_len) {
4391 				os_log_error(OS_LOG_DEFAULT, "bytes_lost (%u) and/or bytes_retransmitted (%u) "
4392 				    "can't be smaller than already lost/retransmitted segment length (%u)", tp->bytes_lost,
4393 				    tp->bytes_retransmitted, seg_len);
4394 			}
4395 			tp->bytes_lost -= seg_len;
4396 			tp->bytes_retransmitted -= seg_len;
4397 		}
4398 	} else {
4399 		if (seg->flags & TCP_SEGMENT_LOST) {
4400 			seg->flags &= ~(TCP_SEGMENT_LOST);
4401 			if (tp->bytes_lost < seg_len) {
4402 				os_log_error(OS_LOG_DEFAULT, "bytes_lost (%u) can't be smaller "
4403 				    "than already lost segment length (%u)", tp->bytes_lost, seg_len);
4404 			}
4405 			tp->bytes_lost -= seg_len;
4406 		}
4407 	}
4408 	*newbytes_sacked += seg_len;
4409 	seg->flags |= TCP_SEGMENT_SACKED;
4410 	tp->bytes_sacked += seg_len;
4411 
4412 	return true;
4413 }
4414 
4415 static void
4416 tcp_segs_dosack_matched(struct tcpcb *tp, struct tcp_seg_sent *found_seg,
4417     tcp_seq sblk_start, uint32_t tsecr,
4418     uint32_t *newbytes_sacked)
4419 {
4420 	struct tcp_seg_sent seg = {};
4421 
4422 	while (found_seg != NULL) {
4423 		if (sblk_start == found_seg->start_seq) {
4424 			/*
4425 			 * Covered the entire SACK block.
4426 			 * Record segment flags before they get erased.
4427 			 */
4428 			uint8_t seg_flags = found_seg->flags;
4429 			bool newly_marked = tcp_seg_mark_sacked(tp, found_seg, newbytes_sacked);
4430 			if (newly_marked) {
4431 				/* Advance RACK state */
4432 				tcp_rack_update_segment_acked(tp, tsecr, found_seg->xmit_ts,
4433 				    found_seg->end_seq,
4434 				    !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4435 			}
4436 			break;
4437 		} else if (SEQ_GT(sblk_start, found_seg->start_seq)) {
4438 			if ((found_seg->flags & TCP_SEGMENT_SACKED) != 0) {
4439 				/* No need to process an already SACKED segment */
4440 				break;
4441 			}
4442 			/*
4443 			 * This segment is partially ACKed by SACK block
4444 			 * as sblk_start > segment start. Since it is
4445 			 * partially SACKed, we should split the unSACKed and
4446 			 * SACKed parts.
4447 			 */
4448 			/* First create a new segment for unSACKed part */
4449 			tcp_seg_sent_insert_before(tp, found_seg, found_seg->start_seq, sblk_start,
4450 			    found_seg->xmit_ts, found_seg->flags);
4451 			/* Now, update the SACKed part */
4452 			found_seg->start_seq = sblk_start;
4453 			/* Record seg flags before they get erased. */
4454 			uint8_t seg_flags = found_seg->flags;
4455 			bool newly_marked = tcp_seg_mark_sacked(tp, found_seg, newbytes_sacked);
4456 			if (newly_marked) {
4457 				/* Advance RACK state */
4458 				tcp_rack_update_segment_acked(tp, tsecr, found_seg->xmit_ts,
4459 				    found_seg->end_seq,
4460 				    !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4461 			}
4462 			break;
4463 		} else {
4464 			/*
4465 			 * This segment lies within the SACK block.
4466 			 * Record segment flags before they get erased.
4467 			 */
4468 			uint8_t seg_flags = found_seg->flags;
4469 			bool newly_marked = tcp_seg_mark_sacked(tp, found_seg, newbytes_sacked);
4470 			if (newly_marked) {
4471 				/* Advance RACK state */
4472 				tcp_rack_update_segment_acked(tp, tsecr, found_seg->xmit_ts,
4473 				    found_seg->end_seq,
4474 				    !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4475 			}
4476 			/* Find the next segment ending at the start of current segment */
4477 			seg.end_seq = found_seg->start_seq;
4478 			found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg);
4479 		}
4480 	}
4481 }
4482 
4483 void
4484 tcp_segs_dosack(struct tcpcb *tp, tcp_seq sblk_start, tcp_seq sblk_end,
4485     uint32_t tsecr, uint32_t *newbytes_sacked)
4486 {
4487 	/*
4488 	 * When we receive a SACK, min RTT is computed after SACK processing, which
4489 	 * means we are using the min RTT from the previous ACK to advance RACK state.
4490 	 * This is OK as we track a windowed min-filtered estimate over a period.
4491 	 */
4492 	struct tcp_seg_sent seg = {};
4493 	struct tcp_seg_sent *found_seg = NULL, *sacked_seg = NULL;
4494 
4495 	/* Set the end sequence to search for SACKed segment */
4496 	seg.end_seq = sblk_end;
4497 	found_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg);
4498 
4499 	if (found_seg != NULL) {
4500 		/* We found an exact match for sblk_end */
4501 		tcp_segs_dosack_matched(tp, found_seg, sblk_start, tsecr, newbytes_sacked);
4502 		return;
4503 	}
4504 	/*
4505 	 * We come here when we don't find an exact match and sblk_end
4506 	 * lies within a segment. This would happen only when TSO is used.
4507 	 */
4508 	RB_FOREACH(found_seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
4509 		if (SEQ_LT(sblk_end, found_seg->end_seq) && SEQ_GT(sblk_end, found_seg->start_seq)) {
4510 			/*
4511 			 * This segment is partially SACKed. We split this segment at the boundary
4512 			 * of SACK block. First insert the newly SACKed part
4513 			 */
4514 			tcp_seq start = SEQ_LEQ(sblk_start, found_seg->start_seq) ? found_seg->start_seq : sblk_start;
4515 			struct tcp_seg_sent *inserted = tcp_seg_sent_insert_before(tp, found_seg, start,
4516 			    sblk_end, found_seg->xmit_ts, found_seg->flags);
4517 			/* Record seg flags before they get erased. */
4518 			uint8_t seg_flags = inserted->flags;
4519 			/* Mark the SACKed segment */
4520 			tcp_seg_mark_sacked(tp, inserted, newbytes_sacked);
4521 
4522 			/* Advance RACK state */
4523 			tcp_rack_update_segment_acked(tp, tsecr, inserted->xmit_ts,
4524 			    inserted->end_seq, !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE));
4525 
4526 			if (sblk_start == found_seg->start_seq) {
4527 				/*
4528 				 * We are done with this SACK block.
4529 				 * Move the start of existing segment
4530 				 */
4531 				found_seg->start_seq = sblk_end;
4532 				break;
4533 			}
4534 
4535 			if (SEQ_GT(sblk_start, found_seg->start_seq)) {
4536 				/* Insert the remaining unSACKed part before the SACKED segment inserted above */
4537 				tcp_seg_sent_insert_before(tp, inserted, found_seg->start_seq,
4538 				    sblk_start, found_seg->xmit_ts, found_seg->flags);
4539 				/* Move the start of existing segment */
4540 				found_seg->start_seq = sblk_end;
4541 				break;
4542 			} else {
4543 				/*
4544 				 * This SACK block covers more than one segment.
4545 				 * Look for segments SACKed below this segment.
4546 				 */
4547 				seg.end_seq = found_seg->start_seq;
4548 				sacked_seg = RB_FIND(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, &seg);
4549 
4550 				if (sacked_seg != NULL) {
4551 					/* We found an exact match for sblk_end */
4552 					tcp_segs_dosack_matched(tp, sacked_seg, sblk_start, tsecr, newbytes_sacked);
4553 				}
4554 
4555 				/* Move the start of existing segment */
4556 				found_seg->start_seq = sblk_end;
4557 			}
4558 			break;
4559 		}
4560 	}
4561 }
4562 
4563 void
4564 tcp_segs_clear_sacked(struct tcpcb *tp)
4565 {
4566 	struct tcp_seg_sent *seg = NULL;
4567 
4568 	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link)
4569 	{
4570 		const uint32_t seg_len = tcp_seg_len(seg);
4571 
4572 		if (seg->flags & TCP_SEGMENT_SACKED) {
4573 			seg->flags &= ~(TCP_SEGMENT_SACKED);
4574 			if (tp->bytes_sacked < seg_len) {
4575 				os_log_error(OS_LOG_DEFAULT, "bytes_sacked (%u) can't be smaller "
4576 				    "than already SACKed segment length (%u)", tp->bytes_sacked, seg_len);
4577 			}
4578 			tp->bytes_sacked -= seg_len;
4579 		}
4580 	}
4581 }
4582 
4583 void
4584 tcp_mark_seg_lost(struct tcpcb *tp, struct tcp_seg_sent *seg)
4585 {
4586 	const uint32_t seg_len = tcp_seg_len(seg);
4587 
4588 	if (seg->flags & TCP_SEGMENT_LOST) {
4589 		if (seg->flags & TCP_RACK_RETRANSMITTED) {
4590 			/* Retransmission was lost */
4591 			seg->flags &= ~TCP_RACK_RETRANSMITTED;
4592 			if (tp->bytes_retransmitted < seg_len) {
4593 				os_log_error(OS_LOG_DEFAULT, "bytes_retransmitted (%u) can't be "
4594 				    "smaller than retransmited segment length (%u)",
4595 				    tp->bytes_retransmitted, seg_len);
4596 				return;
4597 			}
4598 			tp->bytes_retransmitted -= seg_len;
4599 		}
4600 	} else {
4601 		seg->flags |= TCP_SEGMENT_LOST;
4602 		tp->bytes_lost += seg_len;
4603 	}
4604 }
4605 
4606 void
4607 tcp_seg_delete(struct tcpcb *tp, struct tcp_seg_sent *seg)
4608 {
4609 	if (tp->seg_pool.free_segs_count >= TCP_SEG_POOL_MAX_ITEM_COUNT) {
4610 		zfree(tcp_seg_sent_zone, seg);
4611 	} else {
4612 		bzero(seg, sizeof(*seg));
4613 		TAILQ_INSERT_TAIL(&tp->seg_pool.free_segs, seg, free_link);
4614 		tp->seg_pool.free_segs_count++;
4615 	}
4616 }
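
/*
 * Design note: freed segments are parked on a small per-connection
 * free list (up to TCP_SEG_POOL_MAX_ITEM_COUNT entries) so the common
 * send/ACK cycle can recycle them through tcp_seg_alloc_init() instead
 * of round-tripping every segment through the zone allocator.
 */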
4617 
4618 void
4619 tcp_segs_sent_clean(struct tcpcb *tp, bool free_segs)
4620 {
4621 	struct tcp_seg_sent *seg = NULL, *next = NULL;
4622 
4623 	TAILQ_FOREACH_SAFE(seg, &tp->t_segs_sent, tx_link, next) {
4624 		/* Check for reordering */
4625 		tcp_rack_detect_reordering_acked(tp, seg);
4626 
4627 		TAILQ_REMOVE(&tp->t_segs_sent, seg, tx_link);
4628 		RB_REMOVE(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg);
4629 		tcp_seg_delete(tp, seg);
4630 	}
4631 	if (__improbable(!RB_EMPTY(&tp->t_segs_sent_tree))) {
4632 		os_log_error(OS_LOG_DEFAULT, "RB tree still contains segments while "
4633 		    "time ordered list is already empty");
4634 	}
4635 	if (__improbable(!TAILQ_EMPTY(&tp->t_segs_acked))) {
4636 		os_log_error(OS_LOG_DEFAULT, "Segment ACKed list shouldn't contain "
4637 		    "any segments as they are removed immediately after being ACKed");
4638 	}
4639 	/* Reset seg_retransmitted as we emptied the list */
4640 	tcp_rack_reset_segs_retransmitted(tp);
4641 	tp->bytes_lost = tp->bytes_sacked = tp->bytes_retransmitted = 0;
4642 
4643 	/* Empty the free segments pool */
4644 	if (free_segs) {
4645 		TAILQ_FOREACH_SAFE(seg, &tp->seg_pool.free_segs, free_link, next) {
4646 			TAILQ_REMOVE(&tp->seg_pool.free_segs, seg, free_link);
4647 			zfree(tcp_seg_sent_zone, seg);
4648 		}
4649 		tp->seg_pool.free_segs_count = 0;
4650 	}
4651 }
4652 
4653 void
4654 tcp_get_connectivity_status(struct tcpcb *tp,
4655     struct tcp_conn_status *connstatus)
4656 {
4657 	if (tp == NULL || connstatus == NULL) {
4658 		return;
4659 	}
4660 	bzero(connstatus, sizeof(*connstatus));
4661 	if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
4662 		if (TCPS_HAVEESTABLISHED(tp->t_state)) {
4663 			connstatus->write_probe_failed = 1;
4664 		} else {
4665 			connstatus->conn_probe_failed = 1;
4666 		}
4667 	}
4668 	if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX) {
4669 		connstatus->read_probe_failed = 1;
4670 	}
4671 	if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
4672 	    (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) {
4673 		connstatus->probe_activated = 1;
4674 	}
4675 }
4676 
4677 void
4678 tcp_disable_tfo(struct tcpcb *tp)
4679 {
4680 	tp->t_flagsext &= ~TF_FASTOPEN;
4681 }
4682 
4683 static struct mbuf *
4684 tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp,
4685     boolean_t is_probe)
4686 {
4687 	struct inpcb *inp = tp->t_inpcb;
4688 	struct tcphdr *th;
4689 	caddr_t data;
4690 	int win = 0;
4691 	struct mbuf *m;
4692 
4693 	/*
4694 	 * The code assumes the IP + TCP headers fit in an mbuf packet header
4695 	 */
4696 	_CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
4697 	_CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
4698 
4699 	MGETHDR(m, M_WAIT, MT_HEADER);
4700 	if (m == NULL) {
4701 		return NULL;
4702 	}
4703 	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
4704 
4705 	data = m_mtod_lower_bound(m);
4706 
4707 	if (inp->inp_vflag & INP_IPV4) {
4708 		bzero(data, sizeof(struct ip) + sizeof(struct tcphdr));
4709 		th = (struct tcphdr *)(void *) (data + sizeof(struct ip));
4710 		m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
4711 		m->m_pkthdr.len = m->m_len;
4712 	} else {
4713 		VERIFY(inp->inp_vflag & INP_IPV6);
4714 
4715 		bzero(data, sizeof(struct ip6_hdr)
4716 		    + sizeof(struct tcphdr));
4717 		th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr));
4718 		m->m_len = sizeof(struct ip6_hdr) +
4719 		    sizeof(struct tcphdr);
4720 		m->m_pkthdr.len = m->m_len;
4721 	}
4722 
4723 	tcp_fillheaders(m, tp, data, th);
4724 
4725 	if (inp->inp_vflag & INP_IPV4) {
4726 		struct ip *ip;
4727 
4728 		ip = (__typeof__(ip))(void *)data;
4729 
4730 		ip->ip_id = rfc6864 ? 0 : ip_randomid((uint64_t)m);
4731 		ip->ip_off = htons(IP_DF);
4732 		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
4733 		ip->ip_ttl = inp->inp_ip_ttl;
4734 		ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
4735 		ip->ip_sum = in_cksum_hdr(ip);
4736 	} else {
4737 		struct ip6_hdr *ip6;
4738 
4739 		ip6 = (__typeof__(ip6))(void *)data;
4740 
4741 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
4742 		ip6->ip6_hlim = in6_selecthlim(inp, ifp);
4743 		ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
4744 
4745 		if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
4746 			ip6->ip6_src.s6_addr16[1] = 0;
4747 		}
4748 		if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
4749 			ip6->ip6_dst.s6_addr16[1] = 0;
4750 		}
4751 	}
4752 	th->th_flags = TH_ACK;
4753 
4754 	win = tcp_sbspace(tp);
4755 	if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale)) {
4756 		win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
4757 	}
4758 	th->th_win = htons((u_short) (win >> tp->rcv_scale));
4759 
4760 	if (is_probe) {
4761 		th->th_seq = htonl(tp->snd_una - 1);
4762 	} else {
4763 		th->th_seq = htonl(tp->snd_una);
4764 	}
4765 	th->th_ack = htonl(tp->rcv_nxt);
4766 
4767 	/* Force recompute TCP checksum to be the final value */
4768 	th->th_sum = 0;
4769 	if (inp->inp_vflag & INP_IPV4) {
4770 		th->th_sum = inet_cksum(m, IPPROTO_TCP,
4771 		    sizeof(struct ip), sizeof(struct tcphdr));
4772 	} else {
4773 		th->th_sum = inet6_cksum(m, IPPROTO_TCP,
4774 		    sizeof(struct ip6_hdr), sizeof(struct tcphdr));
4775 	}
4776 
4777 	return m;
4778 }
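
/*
 * Note on the probe above: sending seq == snd_una - 1 is the classic
 * keepalive trick; that byte is already ACKed, so the peer replies
 * with a duplicate ACK without ever treating the probe as new data.
 */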
4779 
4780 void
4781 tcp_fill_keepalive_offload_frames(ifnet_t ifp,
4782     struct ifnet_keepalive_offload_frame *frames_array __counted_by(frames_array_count),
4783     u_int32_t frames_array_count, size_t frame_data_offset,
4784     u_int32_t *used_frames_count)
4785 {
4786 	struct inpcb *inp;
4787 	inp_gen_t gencnt;
4788 	u_int32_t frame_index = *used_frames_count;
4789 
4790 	/* Validation of the parameters */
4791 	if (ifp == NULL || frames_array == NULL ||
4792 	    frames_array_count == 0 ||
4793 	    frame_index >= frames_array_count ||
4794 	    frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4795 		return;
4796 	}
4797 
4798 	/* Fast exit when no process is using the socket option TCP_KEEPALIVE_OFFLOAD */
4799 	if (ifp->if_tcp_kao_cnt == 0) {
4800 		return;
4801 	}
4802 
4803 	/*
4804 	 * This function is called outside the regular TCP processing
4805 	 * so we need to update the TCP clock.
4806 	 */
4807 	calculate_tcp_clock();
4808 
4809 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
4810 	gencnt = tcbinfo.ipi_gencnt;
4811 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4812 		struct socket *so;
4813 		struct ifnet_keepalive_offload_frame *frame;
4814 		struct mbuf *m = NULL;
4815 		struct tcpcb *tp = intotcpcb(inp);
4816 
4817 		if (frame_index >= frames_array_count) {
4818 			break;
4819 		}
4820 
4821 		if (inp->inp_gencnt > gencnt ||
4822 		    inp->inp_state == INPCB_STATE_DEAD) {
4823 			continue;
4824 		}
4825 
4826 		if ((so = inp->inp_socket) == NULL ||
4827 		    (so->so_state & SS_DEFUNCT)) {
4828 			continue;
4829 		}
4830 		/*
4831 		 * check for keepalive offload flag without socket
4832 		 * lock to avoid a deadlock
4833 		 */
4834 		if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4835 			continue;
4836 		}
4837 
4838 		if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
4839 			continue;
4840 		}
4841 		if (inp->inp_ppcb == NULL ||
4842 		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
4843 			continue;
4844 		}
4845 		socket_lock(so, 1);
4846 		/* Release the want count */
4847 		if (inp->inp_ppcb == NULL ||
4848 		    (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
4849 			socket_unlock(so, 1);
4850 			continue;
4851 		}
4852 		if ((inp->inp_vflag & INP_IPV4) &&
4853 		    (inp->inp_laddr.s_addr == INADDR_ANY ||
4854 		    inp->inp_faddr.s_addr == INADDR_ANY)) {
4855 			socket_unlock(so, 1);
4856 			continue;
4857 		}
4858 		if ((inp->inp_vflag & INP_IPV6) &&
4859 		    (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
4860 		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
4861 			socket_unlock(so, 1);
4862 			continue;
4863 		}
4864 		if (inp->inp_lport == 0 || inp->inp_fport == 0) {
4865 			socket_unlock(so, 1);
4866 			continue;
4867 		}
4868 		if (inp->inp_last_outifp == NULL ||
4869 		    inp->inp_last_outifp->if_index != ifp->if_index) {
4870 			socket_unlock(so, 1);
4871 			continue;
4872 		}
4873 		if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
4874 		    sizeof(struct ip) + sizeof(struct tcphdr) >
4875 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4876 			socket_unlock(so, 1);
4877 			continue;
4878 		} else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
4879 		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
4880 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4881 			socket_unlock(so, 1);
4882 			continue;
4883 		}
4884 		/*
4885 		 * There is no point in waking up the device for connections
4886 		 * that are not established. Long-lived connections are meant
4887 		 * for processes that will send and receive data.
4888 		 */
4889 		if (tp->t_state != TCPS_ESTABLISHED) {
4890 			socket_unlock(so, 1);
4891 			continue;
4892 		}
4893 		/*
4894 		 * This inp has all the information that is needed to
4895 		 * generate an offload frame.
4896 		 */
4897 		frame = &frames_array[frame_index];
4898 		frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
4899 		frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
4900 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
4901 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
4902 		frame->interval = (uint16_t)(tp->t_keepidle > 0 ? tp->t_keepidle :
4903 		    tcp_keepidle);
4904 		frame->keep_cnt = (uint8_t)TCP_CONN_KEEPCNT(tp);
4905 		frame->keep_retry = (uint16_t)TCP_CONN_KEEPINTVL(tp);
4906 		if (so->so_options & SO_NOWAKEFROMSLEEP) {
4907 			frame->flags |=
4908 			    IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP;
4909 		}
4910 		frame->local_port = ntohs(inp->inp_lport);
4911 		frame->remote_port = ntohs(inp->inp_fport);
4912 		frame->local_seq = tp->snd_nxt;
4913 		frame->remote_seq = tp->rcv_nxt;
4914 		if (inp->inp_vflag & INP_IPV4) {
4915 			ASSERT(frame_data_offset + sizeof(struct ip) + sizeof(struct tcphdr) <= UINT8_MAX);
4916 			frame->length = (uint8_t)(frame_data_offset +
4917 			    sizeof(struct ip) + sizeof(struct tcphdr));
4918 			frame->reply_length = frame->length;
4919 
4920 			frame->addr_length = sizeof(struct in_addr);
4921 			bcopy(&inp->inp_laddr, frame->local_addr,
4922 			    sizeof(struct in_addr));
4923 			bcopy(&inp->inp_faddr, frame->remote_addr,
4924 			    sizeof(struct in_addr));
4925 		} else {
4926 			struct in6_addr *ip6;
4927 
4928 			ASSERT(frame_data_offset + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= UINT8_MAX);
4929 			frame->length = (uint8_t)(frame_data_offset +
4930 			    sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
4931 			frame->reply_length = frame->length;
4932 
4933 			frame->addr_length = sizeof(struct in6_addr);
4934 			ip6 = (struct in6_addr *)(void *)frame->local_addr;
4935 			bcopy(&inp->in6p_laddr, ip6, sizeof(struct in6_addr));
4936 			if (IN6_IS_SCOPE_EMBED(ip6)) {
4937 				ip6->s6_addr16[1] = 0;
4938 			}
4939 
4940 			ip6 = (struct in6_addr *)(void *)frame->remote_addr;
4941 			bcopy(&inp->in6p_faddr, ip6, sizeof(struct in6_addr));
4942 			if (IN6_IS_SCOPE_EMBED(ip6)) {
4943 				ip6->s6_addr16[1] = 0;
4944 			}
4945 		}
4946 
4947 		/*
4948 		 * First the probe
4949 		 */
4950 		m = tcp_make_keepalive_frame(tp, ifp, TRUE);
4951 		if (m == NULL) {
4952 			socket_unlock(so, 1);
4953 			continue;
4954 		}
4955 		bcopy(m_mtod_current(m), frame->data + frame_data_offset, m->m_len);
4956 		m_freem(m);
4957 
4958 		/*
4959 		 * Now the response packet to incoming probes
4960 		 */
4961 		m = tcp_make_keepalive_frame(tp, ifp, FALSE);
4962 		if (m == NULL) {
4963 			socket_unlock(so, 1);
4964 			continue;
4965 		}
4966 		bcopy(m_mtod_current(m), frame->reply_data + frame_data_offset,
4967 		    m->m_len);
4968 		m_freem(m);
4969 
4970 		frame_index++;
4971 		socket_unlock(so, 1);
4972 	}
4973 	lck_rw_done(&tcbinfo.ipi_lock);
4974 	*used_frames_count = frame_index;
4975 }
4976 
4977 static bool
4978 inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame,
4979     struct inpcb *inp)
4980 {
4981 	if (inp->inp_ppcb == NULL) {
4982 		return false;
4983 	}
4984 	/* Release the want count */
4985 	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
4986 		return false;
4987 	}
4988 	if (inp->inp_last_outifp == NULL ||
4989 	    inp->inp_last_outifp->if_index != ifp->if_index) {
4990 		return false;
4991 	}
4992 	if (frame->local_port != ntohs(inp->inp_lport) ||
4993 	    frame->remote_port != ntohs(inp->inp_fport)) {
4994 		return false;
4995 	}
4996 	if (inp->inp_vflag & INP_IPV4) {
4997 		if (memcmp(&inp->inp_laddr, frame->local_addr,
4998 		    sizeof(struct in_addr)) != 0 ||
4999 		    memcmp(&inp->inp_faddr, frame->remote_addr,
5000 		    sizeof(struct in_addr)) != 0) {
5001 			return false;
5002 		}
5003 	} else if (inp->inp_vflag & INP_IPV6) {
5004 		if (memcmp(&inp->inp_laddr, frame->local_addr,
5005 		    sizeof(struct in6_addr)) != 0 ||
5006 		    memcmp(&inp->inp_faddr, frame->remote_addr,
5007 		    sizeof(struct in6_addr)) != 0) {
5008 			return false;
5009 		}
5010 	} else {
5011 		return false;
5012 	}
5013 	return true;
5014 }
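
/*
 * [Editorial example] Note the byte orders in the comparison above: the
 * offload frame stores ports in host order (they went through ntohs()
 * when the frame was filled in), while the inpcb keeps them in network
 * order, so the matcher converts again before comparing.  A
 * self-contained round-trip:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

static void
port_byte_order_roundtrip(void)
{
	uint16_t host_port = 443;               /* host order, as in the frame */
	uint16_t wire_port = htons(host_port);  /* network order, as in the inpcb */

	/* compare like with like, exactly as inp_matches_kao_frame() does */
	assert(host_port == ntohs(wire_port));
}
#endif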
5015 
5016 int
5017 tcp_notify_kao_timeout(ifnet_t ifp,
5018     struct ifnet_keepalive_offload_frame *frame)
5019 {
5020 	struct inpcb *inp = NULL;
5021 	struct socket *so = NULL;
5022 	bool found = false;
5023 
5024 	/*
5025 	 * Unlock the list before posting the event on the matching socket
5026 	 */
5027 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
5028 
5029 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
5030 		if ((so = inp->inp_socket) == NULL ||
5031 		    (so->so_state & SS_DEFUNCT)) {
5032 			continue;
5033 		}
5034 		if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
5035 			continue;
5036 		}
5037 		if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
5038 			continue;
5039 		}
5040 		if (inp->inp_ppcb == NULL ||
5041 		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
5042 			continue;
5043 		}
5044 		socket_lock(so, 1);
5045 		if (inp_matches_kao_frame(ifp, frame, inp)) {
5046 			/*
5047 			 * Keep the matching socket locked
5048 			 */
5049 			found = true;
5050 			break;
5051 		}
5052 		socket_unlock(so, 1);
5053 	}
5054 	lck_rw_done(&tcbinfo.ipi_lock);
5055 
5056 	if (found) {
5057 		ASSERT(inp != NULL);
5058 		ASSERT(so != NULL);
5059 		ASSERT(so == inp->inp_socket);
5060 		/*
5061 		 * Drop the TCP connection like tcptimers() does
5062 		 */
5063 		tcpcb_ref_t tp = inp->inp_ppcb;
5064 
5065 		tcpstat.tcps_keepdrops++;
5066 		soevent(so,
5067 		    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
5068 		tp = tcp_drop(tp, ETIMEDOUT);
5069 
5070 		tcpstat.tcps_ka_offload_drops++;
5071 		os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n",
5072 		    __func__, frame->local_port, frame->remote_port);
5073 
5074 		socket_unlock(so, 1);
5075 	}
5076 
5077 	return 0;
5078 }
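
/*
 * [Editorial example] tcp_notify_kao_timeout() uses a common kernel
 * pattern: walk the global list under a shared lock, take a reference
 * and the per-socket lock on a match, then drop the list lock before
 * acting on the socket.  A hypothetical userspace rendition with
 * pthreads (items are assumed never freed, standing in for the
 * want-count that keeps the inpcb alive above):
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <pthread.h>
#include <stddef.h>

struct item {
	pthread_mutex_t lock;
	struct item *next;
	int key;
};

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct item *list_head;

static struct item *
find_and_lock(int key)
{
	struct item *it;

	pthread_rwlock_rdlock(&list_lock);
	for (it = list_head; it != NULL; it = it->next) {
		pthread_mutex_lock(&it->lock);
		if (it->key == key) {
			break;                  /* keep the match locked */
		}
		pthread_mutex_unlock(&it->lock);
	}
	pthread_rwlock_unlock(&list_lock);      /* drop the list lock first */
	return it;                              /* caller unlocks the item */
}
#endif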
5079 
5080 errno_t
5081 tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
5082     u_int32_t notify_id)
5083 {
5084 	struct tcp_notify_ack_marker *elm;
5085 
5086 	if (so->so_snd.sb_cc == 0) {
5087 		return ENOBUFS;
5088 	}
5089 
5090 	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
5091 		/* Duplicate id is not allowed */
5092 		if (elm->notify_id == notify_id) {
5093 			return EINVAL;
5094 		}
5095 		/* Duplicate position is not allowed */
5096 		if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc) {
5097 			return EINVAL;
5098 		}
5099 	}
5100 	return 0;
5101 }
5102 
5103 errno_t
5104 tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
5105 {
5106 	struct tcp_notify_ack_marker *nm, *elm = NULL;
5107 	struct socket *so = tp->t_inpcb->inp_socket;
5108 
5109 	nm = kalloc_type(struct tcp_notify_ack_marker, Z_WAITOK | Z_ZERO);
5110 	if (nm == NULL) {
5111 		return ENOMEM;
5112 	}
5113 	nm->notify_id = notify_id;
5114 	nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;
5115 
5116 	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
5117 		if (SLIST_NEXT(elm, notify_next) == NULL) {	/* stop at the tail */
5118 			break;
5119 		}
5120 	}
5121 
5122 	if (elm == NULL) {
5123 		VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
5124 		SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
5125 	} else {
5126 		SLIST_INSERT_AFTER(elm, nm, notify_next);
5127 	}
5128 	tp->t_notify_ack_count++;
5129 	return 0;
5130 }
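
/*
 * [Editorial example] SLIST has no O(1) tail pointer, so appending a
 * marker - the list is kept in ascending notify_snd_una order and a
 * new marker always lands at the highest position - means walking to
 * the last element first, as the loop above does.  A self-contained
 * version of the same append:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <sys/queue.h>
#include <stddef.h>

struct node {
	SLIST_ENTRY(node) link;
	int value;
};
SLIST_HEAD(node_list, node);

static void
slist_append(struct node_list *head, struct node *n)
{
	struct node *tail = NULL, *it;

	SLIST_FOREACH(it, head, link) {
		tail = it;                      /* remember the last element */
	}
	if (tail == NULL) {
		SLIST_INSERT_HEAD(head, n, link);
	} else {
		SLIST_INSERT_AFTER(tail, n, link);
	}
}
#endif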
5131 
5132 void
5133 tcp_notify_ack_free(struct tcpcb *tp)
5134 {
5135 	struct tcp_notify_ack_marker *elm, *next;
5136 	if (SLIST_EMPTY(&tp->t_notify_ack)) {
5137 		return;
5138 	}
5139 
5140 	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
5141 		SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
5142 		    notify_next);
5143 		kfree_type(struct tcp_notify_ack_marker, elm);
5144 	}
5145 	SLIST_INIT(&tp->t_notify_ack);
5146 	tp->t_notify_ack_count = 0;
5147 }
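
/*
 * [Editorial example] Freeing the element you are standing on would
 * invalidate SLIST_FOREACH's iterator, so the teardown above uses the
 * _SAFE variant, which caches the next pointer before the loop body
 * runs.  A self-contained equivalent:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <sys/queue.h>
#include <stdlib.h>

struct node {
	SLIST_ENTRY(node) link;
};
SLIST_HEAD(node_list, node);

static void
free_all(struct node_list *head)
{
	struct node *n, *tmp;

	SLIST_FOREACH_SAFE(n, head, link, tmp) {
		SLIST_REMOVE(head, n, node, link);
		free(n);        /* safe: tmp was saved before this free */
	}
	SLIST_INIT(head);
}
#endif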
5148 
5149 inline void
5150 tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so)
5151 {
5152 	struct tcp_notify_ack_marker *elm;
5153 
5154 	elm = SLIST_FIRST(&tp->t_notify_ack);
5155 	if (elm != NULL && SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5156 		soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK);
5157 	}
5158 }
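
/*
 * [Editorial example] SEQ_GEQ() and friends compare 32-bit TCP sequence
 * numbers in modular arithmetic, so the notify-ack checks above stay
 * correct when snd_una wraps past 2^32.  The classic definition and a
 * wraparound check:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <assert.h>
#include <stdint.h>

#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

static void
seq_wrap_demo(void)
{
	uint32_t before_wrap = 0xfffffff0U;
	uint32_t after_wrap = 0x00000010U;      /* 0x20 bytes later, post-wrap */

	assert(SEQ_GEQ(after_wrap, before_wrap));  /* later despite smaller value */
	assert(!SEQ_GEQ(before_wrap, after_wrap));
}
#endif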
5159 
5160 void
5161 tcp_get_notify_ack_count(struct tcpcb *tp,
5162     struct tcp_notify_ack_complete *retid)
5163 {
5164 	struct tcp_notify_ack_marker *elm;
5165 	uint32_t  complete = 0;
5166 
5167 	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
5168 		if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5169 			ASSERT(complete < UINT32_MAX);
5170 			complete++;
5171 		} else {
5172 			break;
5173 		}
5174 	}
5175 	retid->notify_pending = tp->t_notify_ack_count - complete;
5176 	retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, complete);
5177 }
5178 
5179 void
5180 tcp_get_notify_ack_ids(struct tcpcb *tp,
5181     struct tcp_notify_ack_complete *retid)
5182 {
5183 	size_t i = 0;
5184 	struct tcp_notify_ack_marker *elm, *next;
5185 
5186 	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
5187 		if (i >= retid->notify_complete_count) {
5188 			break;
5189 		}
5190 		if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5191 			retid->notify_complete_id[i++] = elm->notify_id;
5192 			SLIST_REMOVE(&tp->t_notify_ack, elm,
5193 			    tcp_notify_ack_marker, notify_next);
5194 			kfree_type(struct tcp_notify_ack_marker, elm);
5195 			tp->t_notify_ack_count--;
5196 		} else {
5197 			break;
5198 		}
5199 	}
5200 }
5201 
5202 bool
5203 tcp_notify_ack_active(struct socket *so)
5204 {
5205 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5206 	    SOCK_TYPE(so) == SOCK_STREAM) {
5207 		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5208 
5209 		if (!SLIST_EMPTY(&tp->t_notify_ack)) {
5210 			struct tcp_notify_ack_marker *elm;
5211 			elm = SLIST_FIRST(&tp->t_notify_ack);
5212 			if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
5213 				return true;
5214 			}
5215 		}
5216 	}
5217 	return false;
5218 }
5219 
5220 inline int32_t
5221 inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
5222 {
5223 	struct inpcb *inp = sotoinpcb(so);
5224 	struct tcpcb *tp = intotcpcb(inp);
5225 
5226 	if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
5227 	    so->so_snd.sb_cc > 0) {
5228 		int32_t unsent, sent;
5229 		sent = tp->snd_max - th_ack;
5230 		if (tp->t_flags & TF_SENTFIN) {
5231 			sent--;
5232 		}
5233 		unsent = so->so_snd.sb_cc - sent;
5234 		return unsent;
5235 	}
5236 	return 0;
5237 }
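
/*
 * [Editorial example] Worked numbers for the computation above: with
 * snd_max = 1500, th_ack = 1000 and 800 bytes in the send buffer, 500
 * bytes are in flight and 300 remain unsent; a sent FIN consumes one
 * sequence number but no buffer space, hence the sent-- adjustment.
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <assert.h>
#include <stdint.h>

static void
unsent_demo(void)
{
	uint32_t snd_max = 1500, th_ack = 1000, sb_cc = 800;
	int32_t sent = (int32_t)(snd_max - th_ack);     /* 500 in flight */
	int32_t unsent = (int32_t)sb_cc - sent;         /* 300 not yet sent */

	assert(unsent == 300);
}
#endif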
5238 
5239 uint8_t
5240 tcp_get_ace(struct tcphdr *th)
5241 {
5242 	uint8_t ace = 0;
5243 	if (th->th_flags & TH_ECE) {
5244 		ace += 1;
5245 	}
5246 	if (th->th_flags & TH_CWR) {
5247 		ace += 2;
5248 	}
5249 	if (th->th_x2 & (TH_AE >> 8)) {
5250 		ace += 4;
5251 	}
5252 
5253 	return ace;
5254 }
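
/*
 * [Editorial example] Accurate ECN (the AccECN specification,
 * draft-ietf-tcpm-accurate-ecn) reinterprets the ECE, CWR and AE header
 * bits as a 3-bit counter, ACE = ECE + 2*CWR + 4*AE; AE sits above the
 * classic 8 flag bits, hence the th_x2 shift above.  A flag-only
 * rendition:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <assert.h>
#include <stdint.h>

#define TH_ECE  0x40
#define TH_CWR  0x80
#define TH_AE   0x100   /* first bit above the classic 8 flag bits */

static uint8_t
ace_from_flags(uint16_t flags)
{
	uint8_t ace = 0;

	if (flags & TH_ECE) {
		ace += 1;
	}
	if (flags & TH_CWR) {
		ace += 2;
	}
	if (flags & TH_AE) {
		ace += 4;
	}
	return ace;
}

static void
ace_demo(void)
{
	assert(ace_from_flags(TH_ECE | TH_AE) == 5);
	assert(ace_from_flags(TH_ECE | TH_CWR) == 3);
}
#endif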
5255 
5256 #define IFP_PER_FLOW_STAT(_ipv4_, _stat_) { \
5257 	if (_ipv4_) { \
5258 	        ifp->if_ipv4_stat->_stat_++; \
5259 	} else { \
5260 	        ifp->if_ipv6_stat->_stat_++; \
5261 	} \
5262 }
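
/*
 * [Editorial note] IFP_PER_FLOW_STAT() expands to a bare braced block
 * rather than the usual do { ... } while (0); the invocation's trailing
 * semicolon then lands after the block, which would break an unbraced
 * if/else around a call site.  Every use in this file sits inside its
 * own braces, so the pattern is safe here; the conventional hardened
 * shape would be:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#define IFP_PER_FLOW_STAT_SAFE(_ipv4_, _stat_) do { \
	if (_ipv4_) { \
	        ifp->if_ipv4_stat->_stat_++; \
	} else { \
	        ifp->if_ipv6_stat->_stat_++; \
	} \
} while (0)
#endif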
5263 
5264 #define FLOW_ECN_ENABLED(_flags_) \
5265     ((_flags_ & (TE_ECN_ON)) == (TE_ECN_ON))
5266 
5267 void
5268 tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
5269     struct ifnet *ifp)
5270 {
5271 	if (ifp == NULL || !IF_FULLY_ATTACHED(ifp)) {
5272 		return;
5273 	}
5274 
5275 	ifnet_lock_shared(ifp);
5276 	if (ifs->ecn_flags & TE_SETUPSENT) {
5277 		if (ifs->ecn_flags & TE_CLIENT_SETUP) {
5278 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
5279 			if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5280 				IFP_PER_FLOW_STAT(ifs->ipv4,
5281 				    ecn_client_success);
5282 			} else if (ifs->ecn_flags & TE_LOST_SYN) {
5283 				IFP_PER_FLOW_STAT(ifs->ipv4,
5284 				    ecn_syn_lost);
5285 			} else {
5286 				IFP_PER_FLOW_STAT(ifs->ipv4,
5287 				    ecn_peer_nosupport);
5288 			}
5289 		} else {
5290 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
5291 			if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5292 				IFP_PER_FLOW_STAT(ifs->ipv4,
5293 				    ecn_server_success);
5294 			} else if (ifs->ecn_flags & TE_LOST_SYN) {
5295 				IFP_PER_FLOW_STAT(ifs->ipv4,
5296 				    ecn_synack_lost);
5297 			} else {
5298 				IFP_PER_FLOW_STAT(ifs->ipv4,
5299 				    ecn_peer_nosupport);
5300 			}
5301 		}
5302 	} else {
5303 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
5304 	}
5305 	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5306 		if (ifs->ecn_flags & TE_RECV_ECN_CE) {
5307 			tcpstat.tcps_ecn_conn_recv_ce++;
5308 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
5309 		}
5310 		if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
5311 			tcpstat.tcps_ecn_conn_recv_ece++;
5312 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
5313 		}
5314 		if (ifs->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) {
5315 			if (ifs->txretransmitbytes > 0 ||
5316 			    ifs->rxoutoforderbytes > 0) {
5317 				tcpstat.tcps_ecn_conn_pl_ce++;
5318 				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
5319 			} else {
5320 				tcpstat.tcps_ecn_conn_nopl_ce++;
5321 				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
5322 			}
5323 		} else {
5324 			if (ifs->txretransmitbytes > 0 ||
5325 			    ifs->rxoutoforderbytes > 0) {
5326 				tcpstat.tcps_ecn_conn_plnoce++;
5327 				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
5328 			}
5329 		}
5330 	}
5331 
5332 	/* Other stats are interesting for non-local connections only */
5333 	if (ifs->local) {
5334 		ifnet_lock_done(ifp);
5335 		return;
5336 	}
5337 
5338 	if (ifs->ipv4) {
5339 		ifp->if_ipv4_stat->timestamp = net_uptime();
5340 		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5341 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_on);
5342 		} else {
5343 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_off);
5344 		}
5345 	} else {
5346 		ifp->if_ipv6_stat->timestamp = net_uptime();
5347 		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5348 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_on);
5349 		} else {
5350 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_off);
5351 		}
5352 	}
5353 
5354 	if (ifs->rxmit_drop) {
5355 		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
5356 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
5357 		} else {
5358 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
5359 		}
5360 	}
5361 	if (ifs->ecn_fallback_synloss) {
5362 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
5363 	}
5364 	if (ifs->ecn_fallback_droprst) {
5365 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
5366 	}
5367 	if (ifs->ecn_fallback_droprxmt) {
5368 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
5369 	}
5370 	if (ifs->ecn_fallback_ce) {
5371 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
5372 	}
5373 	if (ifs->ecn_fallback_reorder) {
5374 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
5375 	}
5376 	if (ifs->ecn_recv_ce > 0) {
5377 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
5378 	}
5379 	if (ifs->ecn_recv_ece > 0) {
5380 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);
5381 	}
5382 
5383 	tcp_flow_lim_stats(ifs, &ifp->if_lim_stat);
5384 
5385 	/*
5386 	 * Link heuristics are updated here only for NECP client flows, when they
5387 	 * close; socket flows are updated live.
5388 	 */
5389 	os_atomic_add(&ifp->if_tcp_stat->linkheur_noackpri, ifs->linkheur_noackpri, relaxed);
5390 	os_atomic_add(&ifp->if_tcp_stat->linkheur_comprxmt, ifs->linkheur_comprxmt, relaxed);
5391 	os_atomic_add(&ifp->if_tcp_stat->linkheur_synrxmt, ifs->linkheur_synrxmt, relaxed);
5392 	os_atomic_add(&ifp->if_tcp_stat->linkheur_rxmtfloor, ifs->linkheur_rxmtfloor, relaxed);
5393 
5394 	ifnet_lock_done(ifp);
5395 }
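
/*
 * [Editorial example] The link-heuristics counters above need
 * atomicity but no ordering against surrounding memory operations,
 * which is why they use relaxed atomic adds.  The C11 equivalent of
 * one such update:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t linkheur_counter;

static void
bump_counter(uint32_t delta)
{
	/* atomic increment; imposes no ordering on other loads/stores */
	atomic_fetch_add_explicit(&linkheur_counter, delta,
	    memory_order_relaxed);
}
#endif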
5396 
5397 struct tseg_qent *
5398 tcp_reass_qent_alloc(void)
5399 {
5400 	return zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);
5401 }
5402 
5403 void
5404 tcp_reass_qent_free(struct tseg_qent *te)
5405 {
5406 	zfree(tcp_reass_zone, te);
5407 }
5408 
5409 struct tcp_rxt_seg *
5410 tcp_rxt_seg_qent_alloc(void)
5411 {
5412 	return zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
5413 }
5414 
5415 void
5416 tcp_rxt_seg_qent_free(struct tcp_rxt_seg *te)
5417 {
5418 	zfree(tcp_rxt_seg_zone, te);
5419 }
5420 
5421 
5422 struct tcp_seg_sent *
5423 tcp_seg_sent_qent_alloc(void)
5424 {
5425 	return zalloc_flags(tcp_seg_sent_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
5426 }
5427 
5428 void
5429 tcp_seg_sent_qent_free(struct tcp_seg_sent *te)
5430 {
5431 	zfree(tcp_seg_sent_zone, te);
5432 }
5433 
5434 #if SKYWALK
5435 
5436 #include <skywalk/core/skywalk_var.h>
5437 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
5438 
5439 void
5440 tcp_add_fsw_flow(struct tcpcb *tp, struct ifnet *ifp)
5441 {
5442 	struct inpcb *inp = tp->t_inpcb;
5443 	struct socket *so = inp->inp_socket;
5444 	uuid_t fsw_uuid;
5445 	struct nx_flow_req nfr;
5446 	int err;
5447 
5448 	if (!NX_FSW_TCP_RX_AGG_ENABLED()) {
5449 		return;
5450 	}
5451 
5452 	if (ifp == NULL || kern_nexus_get_flowswitch_instance(ifp, fsw_uuid)) {
5453 		TCP_LOG_FSW_FLOW(tp, "skip ifp no fsw");
5454 		return;
5455 	}
5456 
5457 	memset(&nfr, 0, sizeof(nfr));
5458 
5459 	if (inp->inp_vflag & INP_IPV4) {
5460 		ASSERT(!(inp->inp_laddr.s_addr == INADDR_ANY ||
5461 		    inp->inp_faddr.s_addr == INADDR_ANY ||
5462 		    IN_MULTICAST(ntohl(inp->inp_laddr.s_addr)) ||
5463 		    IN_MULTICAST(ntohl(inp->inp_faddr.s_addr))));
5464 		nfr.nfr_saddr.sin.sin_len = sizeof(struct sockaddr_in);
5465 		nfr.nfr_saddr.sin.sin_family = AF_INET;
5466 		nfr.nfr_saddr.sin.sin_port = inp->inp_lport;
5467 		memcpy(&nfr.nfr_saddr.sin.sin_addr, &inp->inp_laddr,
5468 		    sizeof(struct in_addr));
5469 		nfr.nfr_daddr.sin.sin_len = sizeof(struct sockaddr_in);
5470 		nfr.nfr_daddr.sin.sin_family = AF_INET;
5471 		nfr.nfr_daddr.sin.sin_port = inp->inp_fport;
5472 		memcpy(&nfr.nfr_daddr.sin.sin_addr, &inp->inp_faddr,
5473 		    sizeof(struct in_addr));
5474 	} else {
5475 		ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
5476 		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
5477 		    IN6_IS_ADDR_MULTICAST(&inp->in6p_laddr) ||
5478 		    IN6_IS_ADDR_MULTICAST(&inp->in6p_faddr)));
5479 		nfr.nfr_saddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
5480 		nfr.nfr_saddr.sin6.sin6_family = AF_INET6;
5481 		nfr.nfr_saddr.sin6.sin6_port = inp->inp_lport;
5482 		memcpy(&nfr.nfr_saddr.sin6.sin6_addr, &inp->in6p_laddr,
5483 		    sizeof(struct in6_addr));
5484 		nfr.nfr_daddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
5485 		nfr.nfr_daddr.sin6.sin6_family = AF_INET6;
5486 		nfr.nfr_daddr.sin6.sin6_port = inp->inp_fport;
5487 		memcpy(&nfr.nfr_daddr.sin6.sin6_addr, &inp->in6p_faddr,
5488 		    sizeof(struct in6_addr));
5489 		/* clear embedded scope ID */
5490 		if (IN6_IS_SCOPE_EMBED(&nfr.nfr_saddr.sin6.sin6_addr)) {
5491 			nfr.nfr_saddr.sin6.sin6_addr.s6_addr16[1] = 0;
5492 		}
5493 		if (IN6_IS_SCOPE_EMBED(&nfr.nfr_daddr.sin6.sin6_addr)) {
5494 			nfr.nfr_daddr.sin6.sin6_addr.s6_addr16[1] = 0;
5495 		}
5496 	}
5497 
5498 	nfr.nfr_nx_port = 1;
5499 	nfr.nfr_ip_protocol = IPPROTO_TCP;
5500 	nfr.nfr_transport_protocol = IPPROTO_TCP;
5501 	nfr.nfr_flags = NXFLOWREQF_ASIS;
5502 	nfr.nfr_epid = (so != NULL ? so->last_pid : 0);
5503 	if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
5504 		nfr.nfr_port_reservation = inp->inp_netns_token;
5505 		nfr.nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
5506 	}
5507 	ASSERT(inp->inp_flowhash != 0);
5508 	nfr.nfr_inp_flowhash = inp->inp_flowhash;
5509 
5510 	uuid_generate_random(nfr.nfr_flow_uuid);
5511 	err = kern_nexus_flow_add(kern_nexus_shared_controller(), fsw_uuid,
5512 	    &nfr, sizeof(nfr));
5513 
5514 	if (err == 0) {
5515 		uuid_copy(tp->t_fsw_uuid, fsw_uuid);
5516 		uuid_copy(tp->t_flow_uuid, nfr.nfr_flow_uuid);
5517 	}
5518 
5519 	TCP_LOG_FSW_FLOW(tp, "add err %d\n", err);
5520 }
5521 
5522 void
5523 tcp_del_fsw_flow(struct tcpcb *tp)
5524 {
5525 	if (uuid_is_null(tp->t_fsw_uuid) || uuid_is_null(tp->t_flow_uuid)) {
5526 		return;
5527 	}
5528 
5529 	struct nx_flow_req nfr;
5530 	uuid_copy(nfr.nfr_flow_uuid, tp->t_flow_uuid);
5531 
5532 	/* It's possible for this call to fail if the nexus has detached */
5533 	int err = kern_nexus_flow_del(kern_nexus_shared_controller(),
5534 	    tp->t_fsw_uuid, &nfr, sizeof(nfr));
5535 	VERIFY(err == 0 || err == ENOENT || err == ENXIO);
5536 
5537 	uuid_clear(tp->t_fsw_uuid);
5538 	uuid_clear(tp->t_flow_uuid);
5539 
5540 	TCP_LOG_FSW_FLOW(tp, "del err %d\n", err);
5541 }
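
/*
 * [Editorial example] The flow bookkeeping above leans on the uuid(3)
 * API: uuid_generate_random() names a new flow, uuid_copy() stashes the
 * identifiers on the tcpcb, and uuid_is_null()/uuid_clear() encode the
 * "no flow" state.  A self-contained userspace round-trip:
 */
#if 0 /* illustrative sketch, not compiled as part of this file */
#include <uuid/uuid.h>
#include <assert.h>

static void
uuid_demo(void)
{
	uuid_t flow, saved;

	uuid_clear(flow);
	assert(uuid_is_null(flow));     /* cleared == the null UUID */

	uuid_generate_random(flow);
	assert(!uuid_is_null(flow));

	uuid_copy(saved, flow);         /* dst first, like tp->t_flow_uuid */
	assert(uuid_compare(saved, flow) == 0);

	uuid_clear(flow);               /* back to the "no flow" state */
}
#endif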
5542 
5543 #endif /* SKYWALK */
5544