xref: /xnu-8020.140.41/bsd/netinet/tcp_subr.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
61  */
62 /*
63  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64  * support for mandatory and extensible security protections.  This notice
65  * is included in support of clause 2.2 (b) of the Apple Public License,
66  * Version 2.0.
67  */
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/kernel.h>
72 #include <sys/sysctl.h>
73 #include <sys/malloc.h>
74 #include <sys/mbuf.h>
75 #include <sys/domain.h>
76 #include <sys/proc.h>
77 #include <sys/kauth.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/protosw.h>
81 #include <sys/random.h>
82 #include <sys/syslog.h>
83 #include <sys/mcache.h>
84 #include <kern/locks.h>
85 #include <kern/zalloc.h>
86 
87 #include <dev/random/randomdev.h>
88 
89 #include <net/route.h>
90 #include <net/if.h>
91 #include <net/content_filter.h>
92 #include <net/ntstat.h>
93 #include <net/multi_layer_pkt_log.h>
94 
95 #define tcp_minmssoverload fring
96 #define _IP_VHL
97 #include <netinet/in.h>
98 #include <netinet/in_systm.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/ip6.h>
102 #include <netinet/icmp6.h>
103 #include <netinet/in_pcb.h>
104 #include <netinet6/in6_pcb.h>
105 #include <netinet/in_var.h>
106 #include <netinet/ip_var.h>
107 #include <netinet/icmp_var.h>
108 #include <netinet6/ip6_var.h>
109 #include <netinet/mptcp_var.h>
110 #include <netinet/tcp.h>
111 #include <netinet/tcp_fsm.h>
112 #include <netinet/tcp_seq.h>
113 #include <netinet/tcp_timer.h>
114 #include <netinet/tcp_var.h>
115 #include <netinet/tcp_cc.h>
116 #include <netinet/tcp_cache.h>
117 #include <kern/thread_call.h>
118 
119 #include <netinet6/tcp6_var.h>
120 #include <netinet/tcpip.h>
121 #if TCPDEBUG
122 #include <netinet/tcp_debug.h>
123 #endif
124 #include <netinet/tcp_log.h>
125 
126 #include <netinet6/ip6protosw.h>
127 
128 #if IPSEC
129 #include <netinet6/ipsec.h>
130 #include <netinet6/ipsec6.h>
131 #endif /* IPSEC */
132 
133 #if NECP
134 #include <net/necp.h>
135 #endif /* NECP */
136 
137 #undef tcp_minmssoverload
138 
139 #include <corecrypto/ccaes.h>
140 #include <libkern/crypto/aes.h>
141 #include <libkern/crypto/md5.h>
142 #include <sys/kdebug.h>
143 #include <mach/sdt.h>
144 #include <atm/atm_internal.h>
145 #include <pexpert/pexpert.h>
146 
147 #define DBG_FNC_TCP_CLOSE       NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
148 
149 static tcp_cc tcp_ccgen;
150 
151 extern struct tcptimerlist tcp_timer_list;
152 extern struct tcptailq tcp_tw_tailq;
153 
154 SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
155     int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");
156 
157 SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
158     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
159     "Default TCP Maximum Segment Size for IPv6");
160 
161 int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int,
162     struct sysctl_req *);
163 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR,
164     0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
165 
166 /* Current count of half-open TFO connections */
167 int     tcp_tfo_halfcnt = 0;
168 
169 /* Maximum of half-open TFO connection backlog */
170 SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
171     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_tfo_backlog, 10,
172     "Backlog queue for half-open TFO connections");
173 
174 SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED,
175     int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER,
176     "Enable TCP Fastopen (RFC 7413)");
177 
178 SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED,
179     uint32_t, tcp_now_init, 0, "Initial tcp now value");
180 
181 SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED,
182     uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in micro seconds");
183 
184 /*
185  * Minimum MSS we accept and use. This prevents DoS attacks where
186  * we are forced to a ridiculous low MSS like 20 and send hundreds
187  * of packets instead of one. The effect scales with the available
188  * bandwidth and quickly saturates the CPU and network interface
189  * with packet generation and sending. Set to zero to disable MINMSS
190  * checking. This setting prevents us from sending too small packets.
191  */
192 SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
193     int, tcp_minmss, TCP_MINMSS, "Minmum TCP Maximum Segment Size");
194 
195 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
196     &tcbinfo.ipi_count, 0, "Number of active PCBs");
197 
198 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
199     &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state");
200 
201 SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED,
202     static int, icmp_may_rst, 1,
203     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
204 
205 static int      tcp_strict_rfc1948 = 0;
206 static int      tcp_isn_reseed_interval = 0;
207 #if (DEVELOPMENT || DEBUG)
208 SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED,
209     &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
210 
211 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval,
212     CTLFLAG_RW | CTLFLAG_LOCKED,
213     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
214 #endif /* (DEVELOPMENT || DEBUG) */
215 
216 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
217     int, tcp_TCPTV_MIN, 100, "min rtt value allowed");
218 
219 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
220     int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");
221 
222 SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
223     __private_extern__ int, tcp_use_randomport, 0,
224     "Randomize TCP port numbers");
225 
226 SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
227     __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");
228 
229 #if (DEVELOPMENT || DEBUG)
230 SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
231     CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
232     "Initalize RTT from route cache");
233 #else
234 SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
235     CTLFLAG_RD | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
236     "Initalize RTT from route cache");
237 #endif /* (DEVELOPMENT || DEBUG) */
238 
239 static int tso_debug = 0;
240 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
241     &tso_debug, 0, "TSO verbosity");
242 
243 static int tcp_rxt_seg_max = 1024;
244 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rxt_seg_max, CTLFLAG_RW | CTLFLAG_LOCKED,
245     &tcp_rxt_seg_max, 0, "");
246 
247 static unsigned long tcp_rxt_seg_drop = 0;
248 SYSCTL_ULONG(_net_inet_tcp, OID_AUTO, rxt_seg_drop, CTLFLAG_RD | CTLFLAG_LOCKED,
249     &tcp_rxt_seg_drop, "");
250 
251 static void     tcp_notify(struct inpcb *, int);
252 
253 struct zone     *sack_hole_zone;
254 struct zone     *tcp_reass_zone;
255 struct zone     *tcp_bwmeas_zone;
256 struct zone     *tcp_rxt_seg_zone;
257 
258 extern int slowlink_wsize;      /* window correction for slow links */
259 extern int path_mtu_discovery;
260 
261 uint32_t tcp_now_remainder_us = 0;  /* remaining micro seconds for tcp_now */
262 
263 static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);
264 
265 #define TCP_BWMEAS_BURST_MINSIZE 6
266 #define TCP_BWMEAS_BURST_MAXSIZE 25
267 
268 /*
269  * Target size of TCP PCB hash tables. Must be a power of two.
270  *
271  * Note that this can be overridden by the kernel environment
272  * variable net.inet.tcp.tcbhashsize
273  */
274 #ifndef TCBHASHSIZE
275 #define TCBHASHSIZE     CONFIG_TCBHASHSIZE
276 #endif
277 
278 __private_extern__ int  tcp_tcbhashsize = TCBHASHSIZE;
279 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
280     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
281 
282 /*
283  * This is the actual shape of what we allocate using the zone
284  * allocator.  Doing it this way allows us to protect both structures
285  * using the same generation count, and also eliminates the overhead
286  * of allocating tcpcbs separately.  By hiding the structure here,
287  * we avoid changing most of the rest of the code (although it needs
288  * to be changed, eventually, for greater efficiency).
289  */
290 #define ALIGNMENT       32
291 struct  inp_tp {
292 	struct  inpcb   inp;
293 	struct  tcpcb   tcb __attribute__((aligned(ALIGNMENT)));
294 };
295 #undef ALIGNMENT
296 
297 int  get_inpcb_str_size(void);
298 int  get_tcp_str_size(void);
299 
300 os_log_t tcp_mpkl_log_object = NULL;
301 
302 static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
303 
304 int tcp_notsent_lowat_check(struct socket *so);
305 static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
306     struct if_lim_perf_stat *stat);
307 static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
308     struct if_tcp_ecn_perf_stat *stat);
309 
310 static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */
311 
312 void
tcp_tfo_gen_cookie(struct inpcb * inp,u_char * out,size_t blk_size)313 tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out, size_t blk_size)
314 {
315 	u_char in[CCAES_BLOCK_SIZE];
316 	int isipv6 = inp->inp_vflag & INP_IPV6;
317 
318 	VERIFY(blk_size == CCAES_BLOCK_SIZE);
319 
320 	bzero(&in[0], CCAES_BLOCK_SIZE);
321 	bzero(&out[0], CCAES_BLOCK_SIZE);
322 
323 	if (isipv6) {
324 		memcpy(in, &inp->in6p_faddr, sizeof(struct in6_addr));
325 	} else {
326 		memcpy(in, &inp->inp_faddr, sizeof(struct in_addr));
327 	}
328 
329 	aes_encrypt_cbc(in, NULL, 1, out, &tfo_ctx);
330 }
331 
332 __private_extern__ int
tcp_sysctl_fastopenkey(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)333 tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1,
334     __unused int arg2, struct sysctl_req *req)
335 {
336 	int error = 0;
337 	/*
338 	 * TFO-key is expressed as a string in hex format
339 	 *  +1 to account for the \0 char
340 	 *  +1 because sysctl_io_string() expects a string length but the sysctl command
341 	 *     now includes the terminating \0 in newlen -- see rdar://77205344
342 	 */
343 	char keystring[TCP_FASTOPEN_KEYLEN * 2 + 2];
344 	u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
345 	int i;
346 
347 	/*
348 	 * sysctl_io_string copies keystring into the oldptr of the sysctl_req.
349 	 * Make sure everything is zero, to avoid putting garbage in there or
350 	 * leaking the stack.
351 	 */
352 	bzero(keystring, sizeof(keystring));
353 
354 	error = sysctl_io_string(req, keystring, sizeof(keystring), 0, NULL);
355 	if (error) {
356 		os_log(OS_LOG_DEFAULT,
357 		    "%s: sysctl_io_string() error %d, req->newlen %lu, sizeof(keystring) %lu",
358 		    __func__, error, req->newlen, sizeof(keystring));
359 		goto exit;
360 	}
361 	if (req->newptr == USER_ADDR_NULL) {
362 		goto exit;
363 	}
364 
365 	if (strlen(keystring) != TCP_FASTOPEN_KEYLEN * 2) {
366 		os_log(OS_LOG_DEFAULT,
367 		    "%s: strlen(keystring) %lu != TCP_FASTOPEN_KEYLEN * 2 %u, newlen %lu",
368 		    __func__, strlen(keystring), TCP_FASTOPEN_KEYLEN * 2, req->newlen);
369 		error = EINVAL;
370 		goto exit;
371 	}
372 
373 	for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
374 		/*
375 		 * We jump over the keystring in 8-character (4 byte in hex)
376 		 * steps
377 		 */
378 		if (sscanf(&keystring[i * 8], "%8x", &key[i]) != 1) {
379 			error = EINVAL;
380 			os_log(OS_LOG_DEFAULT,
381 			    "%s: sscanf() != 1, error EINVAL", __func__);
382 			goto exit;
383 		}
384 	}
385 
386 	aes_encrypt_key128((u_char *)key, &tfo_ctx);
387 
388 exit:
389 	return error;
390 }
391 
392 int
get_inpcb_str_size(void)393 get_inpcb_str_size(void)
394 {
395 	return sizeof(struct inpcb);
396 }
397 
398 int
get_tcp_str_size(void)399 get_tcp_str_size(void)
400 {
401 	return sizeof(struct tcpcb);
402 }
403 
static int scale_to_powerof2(int size);

/*
 * Scale `size' to a power of two.  The result is:
 * 1. `size' itself if it is already a power of two;
 * 2. the next power of two above `size' if that value fits in an int;
 * 3. the power of two just below `size' if rounding up would overflow;
 * 4. 1 if `size' is zero or negative.
 *
 * All bit manipulation is done on an unsigned copy: left-shifting a signed
 * int into the sign bit is undefined, so a test of the form
 * "(size << 1) == 0" cannot be used to detect overflow.
 */
static int
scale_to_powerof2(int size)
{
	/* Largest power of two an int can hold (the bit below the sign bit). */
	const unsigned int max_pow2 = (~0U >> 1) ^ (~0U >> 2);
	unsigned int rounded;

	if (size <= 0) {
		return 1;
	}

	rounded = (unsigned int)size;
	if ((rounded & (rounded - 1)) == 0) {
		/* Already a power of two. */
		return size;
	}

	/*
	 * Clear the least significant set bit until only the most
	 * significant one is left: that is `size' rounded down.
	 */
	while ((rounded & (rounded - 1)) != 0) {
		rounded &= rounded - 1;
	}

	/* Round up unless the doubled value no longer fits in an int. */
	if (rounded >= max_pow2) {
		return (int)rounded;
	}
	return (int)(rounded << 1);
}
443 
/*
 * Round a floating point value up to the next integer,
 * e.g. 1.3 becomes 2 and 2.0 stays 2.
 *
 * The result is clamped to what a uint32_t can hold: NaN and values that
 * are not greater than zero yield 0, values at or above the largest
 * uint32_t yield that largest value.  Converting such inputs directly
 * would be undefined behaviour.
 */
uint32_t
tcp_ceil(double a)
{
	const uint32_t u32_max = ~(uint32_t)0;
	uint32_t whole;

	/* Written as !(a > 0) so that NaN takes this branch as well. */
	if (!(a > 0.0)) {
		return 0;
	}
	if (a >= (double)u32_max) {
		return u32_max;
	}

	/* a is now in (0, u32_max), so whole + 1 cannot wrap. */
	whole = (uint32_t)a;
	if ((double)whole < a) {
		whole++;
	}
	return whole;
}
454 
/*
 * Round `val' to the nearest multiple of `round'; exact midpoints go up.
 * With round == 10: 16 becomes 20, 15 becomes 20 and 14 becomes 10.
 *
 * A `round' of zero has no multiples to round to, so `val' is returned
 * unchanged instead of dividing by zero.
 *
 * NOTE(review): val + round / 2 is computed in 32 bits and wraps for
 * values close to the top of the uint32_t range.
 */
uint32_t
tcp_round_to(uint32_t val, uint32_t round)
{
	if (round == 0) {
		return val;
	}

	return ((val + (round / 2)) / round) * round;
}
464 
/*
 * Round `val' up to a multiple of `base'.  Values that already are a
 * multiple are returned as they are, anything else goes to the next
 * multiple: with a base of 64, 64 stays 64, 65 becomes 128 and 2896
 * becomes 2944.
 *
 * A `base' of zero is treated as "no rounding" and returns `val', rather
 * than evaluating val % 0.
 *
 * NOTE(review): val + base is computed in 32 bits and wraps for values
 * close to the top of the uint32_t range.
 */
uint32_t
tcp_round_up(uint32_t val, uint32_t base)
{
	/* base == 1 is covered by the remainder test: val % 1 is always 0. */
	if (base == 0 || val % base == 0) {
		return val;
	}

	return ((val + base) / base) * base;
}
479 
480 static void
tcp_tfo_init(void)481 tcp_tfo_init(void)
482 {
483 	u_char key[TCP_FASTOPEN_KEYLEN];
484 
485 	read_frandom(key, sizeof(key));
486 	aes_encrypt_key128(key, &tfo_ctx);
487 }
488 
489 /*
490  * Tcp initialization
491  */
492 void
tcp_init(struct protosw * pp,struct domain * dp)493 tcp_init(struct protosw *pp, struct domain *dp)
494 {
495 #pragma unused(dp)
496 	static int tcp_initialized = 0;
497 	vm_size_t str_size;
498 	struct inpcbinfo *pcbinfo;
499 	uint32_t logging_config;
500 
501 	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
502 
503 	if (tcp_initialized) {
504 		return;
505 	}
506 	tcp_initialized = 1;
507 
508 #if DEBUG || DEVELOPMENT
509 	(void) PE_parse_boot_argn("tcp_rxt_seg_max", &tcp_rxt_seg_max,
510 	    sizeof(tcp_rxt_seg_max));
511 #endif /* DEBUG || DEVELOPMENT */
512 
513 	tcp_ccgen = 1;
514 	tcp_keepinit = TCPTV_KEEP_INIT;
515 	tcp_keepidle = TCPTV_KEEP_IDLE;
516 	tcp_keepintvl = TCPTV_KEEPINTVL;
517 	tcp_keepcnt = TCPTV_KEEPCNT;
518 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
519 	tcp_msl = TCPTV_MSL;
520 
521 	microuptime(&tcp_uptime);
522 	read_frandom(&tcp_now, sizeof(tcp_now));
523 
524 	/* Starts tcp internal clock at a random value */
525 	tcp_now = tcp_now & 0x3fffffff;
526 
527 	/* expose initial uptime/now via systcl for utcp to keep time sync */
528 	tcp_now_init = tcp_now;
529 	tcp_microuptime_init =
530 	    (uint32_t)(tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC));
531 	SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
532 	SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);
533 
534 	tcp_tfo_init();
535 
536 	LIST_INIT(&tcb);
537 	tcbinfo.ipi_listhead = &tcb;
538 
539 	pcbinfo = &tcbinfo;
540 
541 	/*
542 	 * allocate group, lock attributes and lock for tcp pcb mutexes
543 	 */
544 	pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb",
545 	    LCK_GRP_ATTR_NULL);
546 	lck_attr_setdefault(&pcbinfo->ipi_lock_attr);
547 	lck_rw_init(&pcbinfo->ipi_lock, pcbinfo->ipi_lock_grp,
548 	    &pcbinfo->ipi_lock_attr);
549 
550 	if (tcp_tcbhashsize == 0) {
551 		/* Set to default */
552 		tcp_tcbhashsize = 512;
553 	}
554 
555 	if (!powerof2(tcp_tcbhashsize)) {
556 		int old_hash_size = tcp_tcbhashsize;
557 		tcp_tcbhashsize = scale_to_powerof2(tcp_tcbhashsize);
558 		/* Lower limit of 16  */
559 		if (tcp_tcbhashsize < 16) {
560 			tcp_tcbhashsize = 16;
561 		}
562 		printf("WARNING: TCB hash size not a power of 2, "
563 		    "scaled from %d to %d.\n",
564 		    old_hash_size,
565 		    tcp_tcbhashsize);
566 	}
567 
568 	tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB,
569 	    &tcbinfo.ipi_hashmask);
570 	tcbinfo.ipi_porthashbase = hashinit(tcp_tcbhashsize, M_PCB,
571 	    &tcbinfo.ipi_porthashmask);
572 	str_size = (vm_size_t)P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t));
573 	tcbinfo.ipi_zone = zone_create("tcpcb", str_size, ZC_NONE);
574 
575 	tcbinfo.ipi_gc = tcp_gc;
576 	tcbinfo.ipi_timer = tcp_itimer;
577 	in_pcbinfo_attach(&tcbinfo);
578 
579 	str_size = (vm_size_t)P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t));
580 	sack_hole_zone = zone_create("sack_hole zone", str_size, ZC_NONE);
581 
582 	str_size = (vm_size_t)P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t));
583 	tcp_reass_zone = zone_create("tcp_reass_zone", str_size, ZC_NONE);
584 
585 	str_size = (vm_size_t)P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t));
586 	tcp_bwmeas_zone = zone_create("tcp_bwmeas_zone", str_size, ZC_ZFREE_CLEARMEM);
587 
588 	str_size = (vm_size_t)P2ROUNDUP(sizeof(struct tcp_ccstate), sizeof(u_int64_t));
589 	tcp_cc_zone = zone_create("tcp_cc_zone", str_size, ZC_NONE);
590 
591 	str_size = (vm_size_t)P2ROUNDUP(sizeof(struct tcp_rxt_seg), sizeof(u_int64_t));
592 	tcp_rxt_seg_zone = zone_create("tcp_rxt_seg_zone", str_size, ZC_NONE);
593 
594 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
595 	if (max_protohdr < TCP_MINPROTOHDR) {
596 		_max_protohdr = TCP_MINPROTOHDR;
597 		_max_protohdr = (int)max_protohdr;   /* round it up */
598 	}
599 	if (max_linkhdr + max_protohdr > MCLBYTES) {
600 		panic("tcp_init");
601 	}
602 #undef TCP_MINPROTOHDR
603 
604 	/* Initialize time wait and timer lists */
605 	TAILQ_INIT(&tcp_tw_tailq);
606 
607 	bzero(&tcp_timer_list, sizeof(tcp_timer_list));
608 	LIST_INIT(&tcp_timer_list.lhead);
609 	/*
610 	 * allocate group and attribute for the tcp timer list
611 	 */
612 	tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist",
613 	    LCK_GRP_ATTR_NULL);
614 	lck_mtx_init(&tcp_timer_list.mtx, tcp_timer_list.mtx_grp,
615 	    LCK_ATTR_NULL);
616 
617 	tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL);
618 	if (tcp_timer_list.call == NULL) {
619 		panic("failed to allocate call entry 1 in tcp_init");
620 	}
621 
622 	/* Initialize TCP Cache */
623 	tcp_cache_init();
624 
625 	tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp");
626 	if (tcp_mpkl_log_object == NULL) {
627 		panic("MPKL_CREATE_LOGOBJECT failed");
628 	}
629 
630 	logging_config = atm_get_diagnostic_config();
631 	if (logging_config & 0x80000000) {
632 		tcp_log_privacy = 1;
633 	}
634 
635 	PE_parse_boot_argn("tcp_log", &tcp_log_enable_flags, sizeof(tcp_log_enable_flags));
636 
637 	/*
638 	 * If more than 4GB of actual memory is available, increase the
639 	 * maximum allowed receive and send socket buffer size.
640 	 */
641 	if (mem_actual >= (1ULL << (GBSHIFT + 2))) {
642 		tcp_autorcvbuf_max = 4 * 1024 * 1024;
643 		tcp_autosndbuf_max = 4 * 1024 * 1024;
644 
645 		SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
646 		SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
647 	}
648 }
649 
650 /*
651  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
652  * tcp_template used to store this data in mbufs, but we now recopy it out
653  * of the tcpcb each time to conserve mbufs.
654  */
655 void
tcp_fillheaders(struct mbuf * m,struct tcpcb * tp,void * ip_ptr,void * tcp_ptr)656 tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr)
657 {
658 	struct inpcb *inp = tp->t_inpcb;
659 	struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
660 
661 	if ((inp->inp_vflag & INP_IPV6) != 0) {
662 		struct ip6_hdr *ip6;
663 
664 		ip6 = (struct ip6_hdr *)ip_ptr;
665 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
666 		    (inp->inp_flow & IPV6_FLOWINFO_MASK);
667 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
668 		    (IPV6_VERSION & IPV6_VERSION_MASK);
669 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
670 		ip6->ip6_nxt = IPPROTO_TCP;
671 		ip6->ip6_hlim = 0;
672 		ip6->ip6_src = inp->in6p_laddr;
673 		ip6->ip6_dst = inp->in6p_faddr;
674 		if (m->m_flags & M_PKTHDR) {
675 			uint32_t lifscope = inp->inp_lifscope != 0 ? inp->inp_lifscope : inp->inp_fifscope;
676 			uint32_t fifscope = inp->inp_fifscope != 0 ? inp->inp_fifscope : inp->inp_lifscope;
677 			ip6_output_setsrcifscope(m, lifscope, NULL);
678 			ip6_output_setdstifscope(m, fifscope, NULL);
679 		}
680 		tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
681 		    htonl(sizeof(struct tcphdr) + IPPROTO_TCP));
682 	} else {
683 		struct ip *ip = (struct ip *) ip_ptr;
684 
685 		ip->ip_vhl = IP_VHL_BORING;
686 		ip->ip_tos = 0;
687 		ip->ip_len = 0;
688 		ip->ip_id = 0;
689 		ip->ip_off = 0;
690 		ip->ip_ttl = 0;
691 		ip->ip_sum = 0;
692 		ip->ip_p = IPPROTO_TCP;
693 		ip->ip_src = inp->inp_laddr;
694 		ip->ip_dst = inp->inp_faddr;
695 		tcp_hdr->th_sum =
696 		    in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
697 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
698 	}
699 
700 	tcp_hdr->th_sport = inp->inp_lport;
701 	tcp_hdr->th_dport = inp->inp_fport;
702 	tcp_hdr->th_seq = 0;
703 	tcp_hdr->th_ack = 0;
704 	tcp_hdr->th_x2 = 0;
705 	tcp_hdr->th_off = 5;
706 	tcp_hdr->th_flags = 0;
707 	tcp_hdr->th_win = 0;
708 	tcp_hdr->th_urp = 0;
709 }
710 
711 /*
712  * Create template to be used to send tcp packets on a connection.
713  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
714  * use for this function is in keepalives, which use tcp_respond.
715  */
716 struct tcptemp *
tcp_maketemplate(struct tcpcb * tp)717 tcp_maketemplate(struct tcpcb *tp)
718 {
719 	struct mbuf *m;
720 	struct tcptemp *n;
721 
722 	m = m_get(M_DONTWAIT, MT_HEADER);
723 	if (m == NULL) {
724 		return NULL;
725 	}
726 	m->m_len = sizeof(struct tcptemp);
727 	n = mtod(m, struct tcptemp *);
728 
729 	tcp_fillheaders(m, tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
730 	return n;
731 }
732 
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: "ti" is the historical name of the header argument; it is
 * passed here as the ipgen/th pair.  If m != NULL, then it must
 * point to *inside* the mbuf.
 */
747 void
tcp_respond(struct tcpcb * tp,void * ipgen,struct tcphdr * th,struct mbuf * m,tcp_seq ack,tcp_seq seq,uint8_t flags,struct tcp_respond_args * tra)748 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
749     tcp_seq ack, tcp_seq seq, uint8_t flags, struct tcp_respond_args *tra)
750 {
751 	uint16_t tlen;
752 	int win = 0;
753 	struct route *ro = 0;
754 	struct route sro;
755 	struct ip *ip;
756 	struct tcphdr *nth;
757 	struct route_in6 *ro6 = 0;
758 	struct route_in6 sro6;
759 	struct ip6_hdr *ip6;
760 	int isipv6;
761 	struct ifnet *outif;
762 	int sotc = SO_TC_UNSPEC;
763 	bool check_qos_marking_again = FALSE;
764 	uint32_t sifscope = IFSCOPE_NONE, fifscope = IFSCOPE_NONE;
765 
766 	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
767 	ip6 = ipgen;
768 	ip = ipgen;
769 
770 	if (tp) {
771 		check_qos_marking_again = tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE ? FALSE : TRUE;
772 		sifscope = tp->t_inpcb->inp_lifscope;
773 		fifscope = tp->t_inpcb->inp_fifscope;
774 		if (!(flags & TH_RST)) {
775 			win = tcp_sbspace(tp);
776 			if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) {
777 				win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
778 			}
779 		}
780 		if (isipv6) {
781 			ro6 = &tp->t_inpcb->in6p_route;
782 		} else {
783 			ro = &tp->t_inpcb->inp_route;
784 		}
785 	} else {
786 		if (isipv6) {
787 			ro6 = &sro6;
788 			bzero(ro6, sizeof(*ro6));
789 		} else {
790 			ro = &sro;
791 			bzero(ro, sizeof(*ro));
792 		}
793 	}
794 	if (m == 0) {
795 		m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
796 		if (m == NULL) {
797 			return;
798 		}
799 		tlen = 0;
800 		m->m_data += max_linkhdr;
801 		if (isipv6) {
802 			VERIFY((MHLEN - max_linkhdr) >=
803 			    (sizeof(*ip6) + sizeof(*nth)));
804 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
805 			    sizeof(struct ip6_hdr));
806 			ip6 = mtod(m, struct ip6_hdr *);
807 			nth = (struct tcphdr *)(void *)(ip6 + 1);
808 		} else {
809 			VERIFY((MHLEN - max_linkhdr) >=
810 			    (sizeof(*ip) + sizeof(*nth)));
811 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
812 			ip = mtod(m, struct ip *);
813 			nth = (struct tcphdr *)(void *)(ip + 1);
814 		}
815 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
816 #if MPTCP
817 		if ((tp) && (tp->t_mpflags & TMPF_RESET)) {
818 			flags = (TH_RST | TH_ACK);
819 		} else
820 #endif
821 		flags = TH_ACK;
822 	} else {
823 		m_freem(m->m_next);
824 		m->m_next = 0;
825 		m->m_data = (caddr_t)ipgen;
826 		/* m_len is set later */
827 		tlen = 0;
828 #define xchg(a, b, type) { type t; t = a; a = b; b = t; }
829 		if (isipv6) {
830 			ip6_getsrcifaddr_info(m, &sifscope, NULL);
831 			ip6_getdstifaddr_info(m, &fifscope, NULL);
832 			if (!in6_embedded_scope) {
833 				m->m_pkthdr.pkt_flags &= ~PKTF_IFAINFO;
834 			}
835 			/* Expect 32-bit aligned IP on strict-align platforms */
836 			IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
837 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
838 			nth = (struct tcphdr *)(void *)(ip6 + 1);
839 		} else {
840 			/* Expect 32-bit aligned IP on strict-align platforms */
841 			IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
842 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
843 			nth = (struct tcphdr *)(void *)(ip + 1);
844 		}
845 		if (th != nth) {
846 			/*
847 			 * this is usually a case when an extension header
848 			 * exists between the IPv6 header and the
849 			 * TCP header.
850 			 */
851 			nth->th_sport = th->th_sport;
852 			nth->th_dport = th->th_dport;
853 		}
854 		xchg(nth->th_dport, nth->th_sport, n_short);
855 #undef xchg
856 	}
857 	if (isipv6) {
858 		ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
859 		    tlen));
860 		tlen += sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
861 		ip6_output_setsrcifscope(m, sifscope, NULL);
862 		ip6_output_setdstifscope(m, fifscope, NULL);
863 	} else {
864 		tlen += sizeof(struct tcpiphdr);
865 		ip->ip_len = tlen;
866 		ip->ip_ttl = (uint8_t)ip_defttl;
867 	}
868 	m->m_len = tlen;
869 	m->m_pkthdr.len = tlen;
870 	m->m_pkthdr.rcvif = 0;
871 	if (tra->keep_alive) {
872 		m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE;
873 	}
874 
875 	nth->th_seq = htonl(seq);
876 	nth->th_ack = htonl(ack);
877 	nth->th_x2 = 0;
878 	nth->th_off = sizeof(struct tcphdr) >> 2;
879 	nth->th_flags = flags;
880 	if (tp) {
881 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
882 	} else {
883 		nth->th_win = htons((u_short)win);
884 	}
885 	nth->th_urp = 0;
886 	if (isipv6) {
887 		nth->th_sum = 0;
888 		nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
889 		    htonl((tlen - sizeof(struct ip6_hdr)) + IPPROTO_TCP));
890 		m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
891 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
892 		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
893 		    ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
894 	} else {
895 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
896 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
897 		m->m_pkthdr.csum_flags = CSUM_TCP;
898 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
899 	}
900 #if TCPDEBUG
901 	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
902 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
903 	}
904 #endif
905 
906 #if NECP
907 	necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0, 0, 0);
908 #endif /* NECP */
909 
910 #if IPSEC
911 	if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
912 	    ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
913 		m_freem(m);
914 		return;
915 	}
916 #endif
917 
918 	if (tp != NULL) {
919 		u_int32_t svc_flags = 0;
920 		if (isipv6) {
921 			svc_flags |= PKT_SCF_IPV6;
922 		}
923 		sotc = tp->t_inpcb->inp_socket->so_traffic_class;
924 		if ((flags & TH_RST) == 0) {
925 			set_packet_service_class(m, tp->t_inpcb->inp_socket,
926 			    sotc, svc_flags);
927 		} else {
928 			m_set_service_class(m, MBUF_SC_BK_SYS);
929 		}
930 
931 		/* Embed flowhash and flow control flags */
932 		m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
933 		m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
934 		m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
935 		m->m_pkthdr.pkt_proto = IPPROTO_TCP;
936 		m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
937 		m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;
938 
939 		if (flags & TH_RST) {
940 			m->m_pkthdr.comp_gencnt = tp->t_comp_gencnt;
941 		}
942 	} else {
943 		if (flags & TH_RST) {
944 			m->m_pkthdr.comp_gencnt = TCP_ACK_COMPRESSION_DUMMY;
945 			m_set_service_class(m, MBUF_SC_BK_SYS);
946 		}
947 	}
948 
949 	if (isipv6) {
950 		struct ip6_out_args ip6oa;
951 		bzero(&ip6oa, sizeof(ip6oa));
952 		ip6oa.ip6oa_boundif = tra->ifscope;
953 		ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
954 		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
955 		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
956 
957 		if (tra->ifscope != IFSCOPE_NONE) {
958 			ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
959 		}
960 		if (tra->nocell) {
961 			ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
962 		}
963 		if (tra->noexpensive) {
964 			ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
965 		}
966 		if (tra->noconstrained) {
967 			ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
968 		}
969 		if (tra->awdl_unrestricted) {
970 			ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
971 		}
972 		if (tra->intcoproc_allowed) {
973 			ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
974 		}
975 		ip6oa.ip6oa_sotc = sotc;
976 		if (tp != NULL) {
977 			if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
978 				ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
979 			}
980 			ip6oa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
981 			if (check_qos_marking_again) {
982 				ip6oa.ip6oa_flags |= IP6OAF_REDO_QOSMARKING_POLICY;
983 			}
984 			ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
985 		}
986 		(void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
987 		    NULL, &ip6oa);
988 
989 		if (check_qos_marking_again) {
990 			struct inpcb *inp = tp->t_inpcb;
991 			inp->inp_policyresult.results.qos_marking_gencount = ip6oa.qos_marking_gencount;
992 			if (ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED) {
993 				inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
994 			} else {
995 				inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
996 			}
997 		}
998 
999 		if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
1000 		    (outif = ro6->ro_rt->rt_ifp) !=
1001 		    tp->t_inpcb->in6p_last_outifp) {
1002 			tp->t_inpcb->in6p_last_outifp = outif;
1003 #if SKYWALK
1004 			if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
1005 				netns_set_ifnet(&tp->t_inpcb->inp_netns_token,
1006 				    tp->t_inpcb->in6p_last_outifp);
1007 			}
1008 #endif /* SKYWALK */
1009 		}
1010 
1011 		if (ro6 == &sro6) {
1012 			ROUTE_RELEASE(ro6);
1013 		}
1014 	} else {
1015 		struct ip_out_args ipoa;
1016 		bzero(&ipoa, sizeof(ipoa));
1017 		ipoa.ipoa_boundif = tra->ifscope;
1018 		ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
1019 		ipoa.ipoa_sotc = SO_TC_UNSPEC;
1020 		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1021 
1022 		if (tra->ifscope != IFSCOPE_NONE) {
1023 			ipoa.ipoa_flags |= IPOAF_BOUND_IF;
1024 		}
1025 		if (tra->nocell) {
1026 			ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
1027 		}
1028 		if (tra->noexpensive) {
1029 			ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
1030 		}
1031 		if (tra->noconstrained) {
1032 			ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
1033 		}
1034 		if (tra->awdl_unrestricted) {
1035 			ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
1036 		}
1037 		ipoa.ipoa_sotc = sotc;
1038 		if (tp != NULL) {
1039 			if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
1040 				ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1041 			}
1042 			if (!(tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE)) {
1043 				ipoa.ipoa_flags |= IPOAF_REDO_QOSMARKING_POLICY;
1044 			}
1045 			ipoa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
1046 			ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
1047 		}
1048 		if (ro != &sro) {
1049 			/* Copy the cached route and take an extra reference */
1050 			inp_route_copyout(tp->t_inpcb, &sro);
1051 		}
1052 		/*
1053 		 * For consistency, pass a local route copy.
1054 		 */
1055 		(void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
1056 
1057 		if (check_qos_marking_again) {
1058 			struct inpcb *inp = tp->t_inpcb;
1059 			inp->inp_policyresult.results.qos_marking_gencount = ipoa.qos_marking_gencount;
1060 			if (ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED) {
1061 				inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
1062 			} else {
1063 				inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
1064 			}
1065 		}
1066 		if (tp != NULL && sro.ro_rt != NULL &&
1067 		    (outif = sro.ro_rt->rt_ifp) !=
1068 		    tp->t_inpcb->inp_last_outifp) {
1069 			tp->t_inpcb->inp_last_outifp = outif;
1070 #if SKYWALK
1071 			if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
1072 				netns_set_ifnet(&tp->t_inpcb->inp_netns_token, outif);
1073 			}
1074 #endif /* SKYWALK */
1075 		}
1076 		if (ro != &sro) {
1077 			/* Synchronize cached PCB route */
1078 			inp_route_copyin(tp->t_inpcb, &sro);
1079 		} else {
1080 			ROUTE_RELEASE(&sro);
1081 		}
1082 	}
1083 }
1084 
1085 /*
1086  * Create a new TCP control block, making an
1087  * empty reassembly queue and hooking it to the argument
1088  * protocol control block.  The `inp' parameter must have
1089  * come from the zone allocator set up in tcp_init().
1090  */
1091 struct tcpcb *
tcp_newtcpcb(struct inpcb * inp)1092 tcp_newtcpcb(struct inpcb *inp)
1093 {
1094 	struct inp_tp *it;
1095 	struct tcpcb *tp;
1096 	struct socket *so = inp->inp_socket;
1097 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1098 	uint32_t random_32;
1099 
1100 	calculate_tcp_clock();
1101 
1102 	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
1103 		it = (struct inp_tp *)(void *)inp;
1104 		tp = &it->tcb;
1105 	} else {
1106 		tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
1107 	}
1108 
1109 	bzero((char *) tp, sizeof(struct tcpcb));
1110 	LIST_INIT(&tp->t_segq);
1111 	tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
1112 
1113 	tp->t_flags = (TF_REQ_SCALE | TF_REQ_TSTMP);
1114 	tp->t_flagsext |= TF_SACK_ENABLE;
1115 
1116 	TAILQ_INIT(&tp->snd_holes);
1117 	SLIST_INIT(&tp->t_rxt_segments);
1118 	SLIST_INIT(&tp->t_notify_ack);
1119 	tp->t_inpcb = inp;
1120 	/*
1121 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
1122 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
1123 	 * reasonable initial retransmit time.
1124 	 */
1125 	tp->t_srtt = TCPTV_SRTTBASE;
1126 	tp->t_rttvar =
1127 	    ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1128 	tp->t_rttmin = tcp_TCPTV_MIN;
1129 	tp->t_rxtcur = TCPTV_RTOBASE;
1130 
1131 	if (tcp_use_newreno) {
1132 		/* use newreno by default */
1133 		tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
1134 #if (DEVELOPMENT || DEBUG)
1135 	} else if (tcp_use_ledbat) {
1136 		/* use ledbat for testing */
1137 		tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX;
1138 #endif
1139 	} else {
1140 		tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
1141 	}
1142 
1143 	tcp_cc_allocate_state(tp);
1144 
1145 	if (CC_ALGO(tp)->init != NULL) {
1146 		CC_ALGO(tp)->init(tp);
1147 	}
1148 
1149 	/* Initialize rledbat if we are using recv_bg */
1150 	if (tcp_rledbat == 1 && TCP_RECV_BG(inp->inp_socket) &&
1151 	    tcp_cc_rledbat.init != NULL) {
1152 		tcp_cc_rledbat.init(tp);
1153 	}
1154 
1155 	tp->snd_cwnd = tcp_initial_cwnd(tp);
1156 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1157 	tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1158 	tp->t_rcvtime = tcp_now;
1159 	tp->tentry.timer_start = tcp_now;
1160 	tp->rcv_unackwin = tcp_now;
1161 	tp->t_persist_timeout = tcp_max_persist_timeout;
1162 	tp->t_persist_stop = 0;
1163 	tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1164 	tp->t_rexmtthresh = (uint8_t)tcprexmtthresh;
1165 	tp->rfbuf_ts = tcp_now;
1166 	tp->rfbuf_space = tcp_initial_cwnd(tp);
1167 	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
1168 
1169 	/* Enable bandwidth measurement on this connection */
1170 	tp->t_flagsext |= TF_MEASURESNDBW;
1171 	if (tp->t_bwmeas == NULL) {
1172 		tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1173 		if (tp->t_bwmeas == NULL) {
1174 			tp->t_flagsext &= ~TF_MEASURESNDBW;
1175 		}
1176 	}
1177 
1178 	/* Clear time wait tailq entry */
1179 	tp->t_twentry.tqe_next = NULL;
1180 	tp->t_twentry.tqe_prev = NULL;
1181 
1182 	read_frandom(&random_32, sizeof(random_32));
1183 	if (__probable(tcp_do_ack_compression)) {
1184 		tp->t_comp_gencnt = random_32;
1185 		if (tp->t_comp_gencnt <= TCP_ACK_COMPRESSION_DUMMY) {
1186 			tp->t_comp_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1;
1187 		}
1188 		tp->t_comp_lastinc = tcp_now;
1189 	}
1190 
1191 	if (__probable(tcp_randomize_timestamps)) {
1192 		tp->t_ts_offset = random_32;
1193 	}
1194 
1195 	/*
1196 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
1197 	 * because the socket may be bound to an IPv6 wildcard address,
1198 	 * which may match an IPv4-mapped IPv6 address.
1199 	 */
1200 	inp->inp_ip_ttl = (uint8_t)ip_defttl;
1201 	inp->inp_ppcb = (caddr_t)tp;
1202 	return tp;            /* XXX */
1203 }
1204 
1205 /*
1206  * Drop a TCP connection, reporting
1207  * the specified error.  If connection is synchronized,
1208  * then send a RST to peer.
1209  */
1210 struct tcpcb *
tcp_drop(struct tcpcb * tp,int errno)1211 tcp_drop(struct tcpcb *tp, int errno)
1212 {
1213 	struct socket *so = tp->t_inpcb->inp_socket;
1214 #if CONFIG_DTRACE
1215 	struct inpcb *inp = tp->t_inpcb;
1216 #endif
1217 
1218 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
1219 		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1220 		    struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1221 		tp->t_state = TCPS_CLOSED;
1222 		(void) tcp_output(tp);
1223 		tcpstat.tcps_drops++;
1224 	} else {
1225 		tcpstat.tcps_conndrops++;
1226 	}
1227 	if (errno == ETIMEDOUT && tp->t_softerror) {
1228 		errno = tp->t_softerror;
1229 	}
1230 	so->so_error = (u_short)errno;
1231 
1232 	TCP_LOG_CONNECTION_SUMMARY(tp);
1233 
1234 	return tcp_close(tp);
1235 }
1236 
1237 void
tcp_getrt_rtt(struct tcpcb * tp,struct rtentry * rt)1238 tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
1239 {
1240 	u_int32_t rtt = rt->rt_rmx.rmx_rtt;
1241 	int isnetlocal = (tp->t_flags & TF_LOCAL);
1242 
1243 	TCP_LOG_RTM_RTT(tp, rt);
1244 
1245 	if (rtt != 0 && tcp_init_rtt_from_cache != 0) {
1246 		/*
1247 		 * XXX the lock bit for RTT indicates that the value
1248 		 * is also a minimum value; this is subject to time.
1249 		 */
1250 		if (rt->rt_rmx.rmx_locks & RTV_RTT) {
1251 			tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
1252 		} else {
1253 			tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN :
1254 			    TCPTV_REXMTMIN;
1255 		}
1256 
1257 		tp->t_srtt =
1258 		    rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1259 		tcpstat.tcps_usedrtt++;
1260 
1261 		if (rt->rt_rmx.rmx_rttvar) {
1262 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1263 			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1264 			tcpstat.tcps_usedrttvar++;
1265 		} else {
1266 			/* default variation is +- 1 rtt */
1267 			tp->t_rttvar =
1268 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1269 		}
1270 
1271 		/*
1272 		 * The RTO formula in the route metric case is based on:
1273 		 *     srtt + 4 * rttvar
1274 		 * modulo the min, max and slop
1275 		 */
1276 		TCPT_RANGESET(tp->t_rxtcur,
1277 		    TCP_REXMTVAL(tp),
1278 		    tp->t_rttmin, TCPTV_REXMTMAX,
1279 		    TCP_ADD_REXMTSLOP(tp));
1280 	}
1281 
1282 	TCP_LOG_RTT_INFO(tp);
1283 }
1284 
1285 static inline void
tcp_create_ifnet_stats_per_flow(struct tcpcb * tp,struct ifnet_stats_per_flow * ifs)1286 tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
1287     struct ifnet_stats_per_flow *ifs)
1288 {
1289 	struct inpcb *inp;
1290 	struct socket *so;
1291 	if (tp == NULL || ifs == NULL) {
1292 		return;
1293 	}
1294 
1295 	bzero(ifs, sizeof(*ifs));
1296 	inp = tp->t_inpcb;
1297 	so = inp->inp_socket;
1298 
1299 	ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? 0 : 1;
1300 	ifs->local = (tp->t_flags & TF_LOCAL) ? 1 : 0;
1301 	ifs->connreset = (so->so_error == ECONNRESET) ? 1 : 0;
1302 	ifs->conntimeout = (so->so_error == ETIMEDOUT) ? 1 : 0;
1303 	ifs->ecn_flags = tp->ecn_flags;
1304 	ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
1305 	ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1306 	ifs->rxmitpkts = tp->t_stat.rxmitpkts;
1307 	ifs->rcvoopack = tp->t_rcvoopack;
1308 	ifs->pawsdrop = tp->t_pawsdrop;
1309 	ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
1310 	ifs->reordered_pkts = tp->t_reordered_pkts;
1311 	ifs->dsack_sent = tp->t_dsack_sent;
1312 	ifs->dsack_recvd = tp->t_dsack_recvd;
1313 	ifs->srtt = tp->t_srtt;
1314 	ifs->rttupdated = tp->t_rttupdated;
1315 	ifs->rttvar = tp->t_rttvar;
1316 	ifs->rttmin = get_base_rtt(tp);
1317 	if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > 0) {
1318 		ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
1319 	} else {
1320 		ifs->bw_sndbw_max = 0;
1321 	}
1322 	if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > 0) {
1323 		ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
1324 	} else {
1325 		ifs->bw_rcvbw_max = 0;
1326 	}
1327 	ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
1328 	ifs->txpackets = inp->inp_stat->txpackets;
1329 	ifs->rxpackets = inp->inp_stat->rxpackets;
1330 }
1331 
1332 static inline void
tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow * ifs,struct if_tcp_ecn_perf_stat * stat)1333 tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
1334     struct if_tcp_ecn_perf_stat *stat)
1335 {
1336 	u_int64_t curval, oldval;
1337 	stat->total_txpkts += ifs->txpackets;
1338 	stat->total_rxpkts += ifs->rxpackets;
1339 	stat->total_rxmitpkts += ifs->rxmitpkts;
1340 	stat->total_oopkts += ifs->rcvoopack;
1341 	stat->total_reorderpkts += (ifs->reordered_pkts +
1342 	    ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);
1343 
1344 	/* Average RTT */
1345 	curval = ifs->srtt >> TCP_RTT_SHIFT;
1346 	if (curval > 0 && ifs->rttupdated >= 16) {
1347 		if (stat->rtt_avg == 0) {
1348 			stat->rtt_avg = curval;
1349 		} else {
1350 			oldval = stat->rtt_avg;
1351 			stat->rtt_avg = ((oldval << 4) - oldval + curval) >> 4;
1352 		}
1353 	}
1354 
1355 	/* RTT variance */
1356 	curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1357 	if (curval > 0 && ifs->rttupdated >= 16) {
1358 		if (stat->rtt_var == 0) {
1359 			stat->rtt_var = curval;
1360 		} else {
1361 			oldval = stat->rtt_var;
1362 			stat->rtt_var =
1363 			    ((oldval << 4) - oldval + curval) >> 4;
1364 		}
1365 	}
1366 
1367 	/* SACK episodes */
1368 	stat->sack_episodes += ifs->sack_recovery_episodes;
1369 	if (ifs->connreset) {
1370 		stat->rst_drop++;
1371 	}
1372 }
1373 
1374 static inline void
tcp_flow_lim_stats(struct ifnet_stats_per_flow * ifs,struct if_lim_perf_stat * stat)1375 tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
1376     struct if_lim_perf_stat *stat)
1377 {
1378 	u_int64_t curval, oldval;
1379 
1380 	stat->lim_total_txpkts += ifs->txpackets;
1381 	stat->lim_total_rxpkts += ifs->rxpackets;
1382 	stat->lim_total_retxpkts += ifs->rxmitpkts;
1383 	stat->lim_total_oopkts += ifs->rcvoopack;
1384 
1385 	if (ifs->bw_sndbw_max > 0) {
1386 		/* convert from bytes per ms to bits per second */
1387 		ifs->bw_sndbw_max *= 8000;
1388 		stat->lim_ul_max_bandwidth = MAX(stat->lim_ul_max_bandwidth,
1389 		    ifs->bw_sndbw_max);
1390 	}
1391 
1392 	if (ifs->bw_rcvbw_max > 0) {
1393 		/* convert from bytes per ms to bits per second */
1394 		ifs->bw_rcvbw_max *= 8000;
1395 		stat->lim_dl_max_bandwidth = MAX(stat->lim_dl_max_bandwidth,
1396 		    ifs->bw_rcvbw_max);
1397 	}
1398 
1399 	/* Average RTT */
1400 	curval = ifs->srtt >> TCP_RTT_SHIFT;
1401 	if (curval > 0 && ifs->rttupdated >= 16) {
1402 		if (stat->lim_rtt_average == 0) {
1403 			stat->lim_rtt_average = curval;
1404 		} else {
1405 			oldval = stat->lim_rtt_average;
1406 			stat->lim_rtt_average =
1407 			    ((oldval << 4) - oldval + curval) >> 4;
1408 		}
1409 	}
1410 
1411 	/* RTT variance */
1412 	curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1413 	if (curval > 0 && ifs->rttupdated >= 16) {
1414 		if (stat->lim_rtt_variance == 0) {
1415 			stat->lim_rtt_variance = curval;
1416 		} else {
1417 			oldval = stat->lim_rtt_variance;
1418 			stat->lim_rtt_variance =
1419 			    ((oldval << 4) - oldval + curval) >> 4;
1420 		}
1421 	}
1422 
1423 	if (stat->lim_rtt_min == 0) {
1424 		stat->lim_rtt_min = ifs->rttmin;
1425 	} else {
1426 		stat->lim_rtt_min = MIN(stat->lim_rtt_min, ifs->rttmin);
1427 	}
1428 
1429 	/* connection timeouts */
1430 	stat->lim_conn_attempts++;
1431 	if (ifs->conntimeout) {
1432 		stat->lim_conn_timeouts++;
1433 	}
1434 
1435 	/* bytes sent using background delay-based algorithms */
1436 	stat->lim_bk_txpkts += ifs->bk_txpackets;
1437 }
1438 
1439 /*
1440  * Close a TCP control block:
1441  *	discard all space held by the tcp
1442  *	discard internet protocol block
1443  *	wake up any sleepers
1444  */
1445 struct tcpcb *
tcp_close(struct tcpcb * tp)1446 tcp_close(struct tcpcb *tp)
1447 {
1448 	struct inpcb *inp = tp->t_inpcb;
1449 	struct socket *so = inp->inp_socket;
1450 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1451 	struct route *ro;
1452 	struct rtentry *rt;
1453 	int dosavessthresh;
1454 	struct ifnet_stats_per_flow ifs;
1455 
1456 	/* tcp_close was called previously, bail */
1457 	if (inp->inp_ppcb == NULL) {
1458 		return NULL;
1459 	}
1460 
1461 	tcp_del_fsw_flow(tp);
1462 
1463 	tcp_canceltimers(tp);
1464 	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);
1465 
1466 	/*
1467 	 * If another thread for this tcp is currently in ip (indicated by
1468 	 * the TF_SENDINPROG flag), defer the cleanup until after it returns
1469 	 * back to tcp.  This is done to serialize the close until after all
1470 	 * pending output is finished, in order to avoid having the PCB be
1471 	 * detached and the cached route cleaned, only for ip to cache the
1472 	 * route back into the PCB again.  Note that we've cleared all the
1473 	 * timers at this point.  Set TF_CLOSING to indicate to tcp_output()
1474 	 * that is should call us again once it returns from ip; at that
1475 	 * point both flags should be cleared and we can proceed further
1476 	 * with the cleanup.
1477 	 */
1478 	if ((tp->t_flags & TF_CLOSING) ||
1479 	    inp->inp_sndinprog_cnt > 0) {
1480 		tp->t_flags |= TF_CLOSING;
1481 		return NULL;
1482 	}
1483 
1484 	TCP_LOG_CONNECTION_SUMMARY(tp);
1485 
1486 	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1487 	    struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1488 
1489 	ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
1490 	rt = ro->ro_rt;
1491 	if (rt != NULL) {
1492 		RT_LOCK_SPIN(rt);
1493 	}
1494 
1495 	/*
1496 	 * If we got enough samples through the srtt filter,
1497 	 * save the rtt and rttvar in the routing entry.
1498 	 * 'Enough' is arbitrarily defined as the 16 samples.
1499 	 * 16 samples is enough for the srtt filter to converge
1500 	 * to within 5% of the correct value; fewer samples and
1501 	 * we could save a very bogus rtt.
1502 	 *
1503 	 * Don't update the default route's characteristics and don't
1504 	 * update anything that the user "locked".
1505 	 */
1506 	if (tp->t_rttupdated >= 16) {
1507 		u_int32_t i = 0;
1508 		bool log_rtt = false;
1509 
1510 		if (isipv6) {
1511 			struct sockaddr_in6 *sin6;
1512 
1513 			if (rt == NULL) {
1514 				goto no_valid_rt;
1515 			}
1516 			sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt);
1517 			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1518 				goto no_valid_rt;
1519 			}
1520 		} else if (ROUTE_UNUSABLE(ro) ||
1521 		    SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
1522 			DTRACE_TCP4(state__change, void, NULL,
1523 			    struct inpcb *, inp, struct tcpcb *, tp,
1524 			    int32_t, TCPS_CLOSED);
1525 			tp->t_state = TCPS_CLOSED;
1526 			goto no_valid_rt;
1527 		}
1528 
1529 		RT_LOCK_ASSERT_HELD(rt);
1530 		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1531 			i = tp->t_srtt *
1532 			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1533 			if (rt->rt_rmx.rmx_rtt && i) {
1534 				/*
1535 				 * filter this update to half the old & half
1536 				 * the new values, converting scale.
1537 				 * See route.h and tcp_var.h for a
1538 				 * description of the scaling constants.
1539 				 */
1540 				rt->rt_rmx.rmx_rtt =
1541 				    (rt->rt_rmx.rmx_rtt + i) / 2;
1542 			} else {
1543 				rt->rt_rmx.rmx_rtt = i;
1544 			}
1545 			tcpstat.tcps_cachedrtt++;
1546 			log_rtt = true;
1547 		}
1548 		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1549 			i = tp->t_rttvar *
1550 			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1551 			if (rt->rt_rmx.rmx_rttvar && i) {
1552 				rt->rt_rmx.rmx_rttvar =
1553 				    (rt->rt_rmx.rmx_rttvar + i) / 2;
1554 			} else {
1555 				rt->rt_rmx.rmx_rttvar = i;
1556 			}
1557 			tcpstat.tcps_cachedrttvar++;
1558 			log_rtt = true;
1559 		}
1560 		if (log_rtt) {
1561 			TCP_LOG_RTM_RTT(tp, rt);
1562 			TCP_LOG_RTT_INFO(tp);
1563 		}
1564 		/*
1565 		 * The old comment here said:
1566 		 * update the pipelimit (ssthresh) if it has been updated
1567 		 * already or if a pipesize was specified & the threshhold
1568 		 * got below half the pipesize.  I.e., wait for bad news
1569 		 * before we start updating, then update on both good
1570 		 * and bad news.
1571 		 *
1572 		 * But we want to save the ssthresh even if no pipesize is
1573 		 * specified explicitly in the route, because such
1574 		 * connections still have an implicit pipesize specified
1575 		 * by the global tcp_sendspace.  In the absence of a reliable
1576 		 * way to calculate the pipesize, it will have to do.
1577 		 */
1578 		i = tp->snd_ssthresh;
1579 		if (rt->rt_rmx.rmx_sendpipe != 0) {
1580 			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
1581 		} else {
1582 			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
1583 		}
1584 		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1585 		    i != 0 && rt->rt_rmx.rmx_ssthresh != 0) ||
1586 		    dosavessthresh) {
1587 			/*
1588 			 * convert the limit from user data bytes to
1589 			 * packets then to packet data bytes.
1590 			 */
1591 			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
1592 			if (i < 2) {
1593 				i = 2;
1594 			}
1595 			i *= (u_int32_t)(tp->t_maxseg +
1596 			    isipv6 ? sizeof(struct ip6_hdr) +
1597 			    sizeof(struct tcphdr) :
1598 			    sizeof(struct tcpiphdr));
1599 			if (rt->rt_rmx.rmx_ssthresh) {
1600 				rt->rt_rmx.rmx_ssthresh =
1601 				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
1602 			} else {
1603 				rt->rt_rmx.rmx_ssthresh = i;
1604 			}
1605 			tcpstat.tcps_cachedssthresh++;
1606 		}
1607 	}
1608 
1609 	/*
1610 	 * Mark route for deletion if no information is cached.
1611 	 */
1612 	if (rt != NULL && (so->so_flags & SOF_OVERFLOW)) {
1613 		if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1614 		    rt->rt_rmx.rmx_rtt == 0) {
1615 			rt->rt_flags |= RTF_DELCLONE;
1616 		}
1617 	}
1618 
1619 no_valid_rt:
1620 	if (rt != NULL) {
1621 		RT_UNLOCK(rt);
1622 	}
1623 
1624 	/* free the reassembly queue, if any */
1625 	(void) tcp_freeq(tp);
1626 
1627 	/* performance stats per interface */
1628 	tcp_create_ifnet_stats_per_flow(tp, &ifs);
1629 	tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);
1630 
1631 	tcp_free_sackholes(tp);
1632 	tcp_notify_ack_free(tp);
1633 
1634 	inp_decr_sndbytes_allunsent(so, tp->snd_una);
1635 
1636 	if (tp->t_bwmeas != NULL) {
1637 		tcp_bwmeas_free(tp);
1638 	}
1639 	tcp_rxtseg_clean(tp);
1640 	/* Free the packet list */
1641 	if (tp->t_pktlist_head != NULL) {
1642 		m_freem_list(tp->t_pktlist_head);
1643 	}
1644 	TCP_PKTLIST_CLEAR(tp);
1645 
1646 	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1647 		inp->inp_saved_ppcb = (caddr_t) tp;
1648 	}
1649 
1650 	tp->t_state = TCPS_CLOSED;
1651 
1652 	/*
1653 	 * Issue a wakeup before detach so that we don't miss
1654 	 * a wakeup
1655 	 */
1656 	sodisconnectwakeup(so);
1657 
1658 	/*
1659 	 * Make sure to clear the TCP Keep Alive Offload as it is
1660 	 * ref counted on the interface
1661 	 */
1662 	tcp_clear_keep_alive_offload(so);
1663 
1664 	/*
1665 	 * If this is a socket that does not want to wakeup the device
1666 	 * for it's traffic, the application might need to know that the
1667 	 * socket is closed, send a notification.
1668 	 */
1669 	if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
1670 	    inp->inp_state != INPCB_STATE_DEAD &&
1671 	    !(inp->inp_flags2 & INP2_TIMEWAIT)) {
1672 		socket_post_kev_msg_closed(so);
1673 	}
1674 
1675 	if (CC_ALGO(tp)->cleanup != NULL) {
1676 		CC_ALGO(tp)->cleanup(tp);
1677 	}
1678 
1679 	if (tp->t_ccstate != NULL) {
1680 		zfree(tcp_cc_zone, tp->t_ccstate);
1681 		tp->t_ccstate = NULL;
1682 	}
1683 	tp->tcp_cc_index = TCP_CC_ALGO_NONE;
1684 
1685 	if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.cleanup != NULL) {
1686 		tcp_cc_rledbat.cleanup(tp);
1687 	}
1688 
1689 	/* Can happen if we close the socket before receiving the third ACK */
1690 	if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
1691 		OSDecrementAtomic(&tcp_tfo_halfcnt);
1692 
1693 		/* Panic if something has gone terribly wrong. */
1694 		VERIFY(tcp_tfo_halfcnt >= 0);
1695 
1696 		tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
1697 	}
1698 
1699 	if (SOCK_CHECK_DOM(so, PF_INET6)) {
1700 		in6_pcbdetach(inp);
1701 	} else {
1702 		in_pcbdetach(inp);
1703 	}
1704 
1705 	/*
1706 	 * Call soisdisconnected after detach because it might unlock the socket
1707 	 */
1708 	soisdisconnected(so);
1709 	tcpstat.tcps_closed++;
1710 	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
1711 	    tcpstat.tcps_closed, 0, 0, 0, 0);
1712 	return NULL;
1713 }
1714 
1715 int
tcp_freeq(struct tcpcb * tp)1716 tcp_freeq(struct tcpcb *tp)
1717 {
1718 	struct tseg_qent *q;
1719 	int rv = 0;
1720 	int count = 0;
1721 
1722 	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1723 		LIST_REMOVE(q, tqe_q);
1724 		tp->t_reassq_mbcnt -= MSIZE + (q->tqe_m->m_flags & M_EXT) ?
1725 		    q->tqe_m->m_ext.ext_size : 0;
1726 		m_freem(q->tqe_m);
1727 		zfree(tcp_reass_zone, q);
1728 		rv = 1;
1729 		count++;
1730 	}
1731 	tp->t_reassqlen = 0;
1732 	if (count > 0) {
1733 		OSAddAtomic(-count, &tcp_reass_total_qlen);
1734 	}
1735 	return rv;
1736 }
1737 
1738 
1739 void
tcp_drain(void)1740 tcp_drain(void)
1741 {
1742 	struct inpcb *inp;
1743 	struct tcpcb *tp;
1744 
1745 	if (!lck_rw_try_lock_exclusive(&tcbinfo.ipi_lock)) {
1746 		return;
1747 	}
1748 
1749 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1750 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
1751 		    WNT_STOPUSING) {
1752 			socket_lock(inp->inp_socket, 1);
1753 			if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
1754 			    == WNT_STOPUSING) {
1755 				/* lost a race, try the next one */
1756 				socket_unlock(inp->inp_socket, 1);
1757 				continue;
1758 			}
1759 			tp = intotcpcb(inp);
1760 
1761 			so_drain_extended_bk_idle(inp->inp_socket);
1762 
1763 			socket_unlock(inp->inp_socket, 1);
1764 		}
1765 	}
1766 	lck_rw_done(&tcbinfo.ipi_lock);
1767 }
1768 
1769 /*
1770  * Notify a tcp user of an asynchronous error;
1771  * store error as soft error, but wake up user
1772  * (for now, won't do anything until can select for soft error).
1773  *
1774  * Do not wake up user since there currently is no mechanism for
1775  * reporting soft errors (yet - a kqueue filter may be added).
1776  */
1777 static void
tcp_notify(struct inpcb * inp,int error)1778 tcp_notify(struct inpcb *inp, int error)
1779 {
1780 	struct tcpcb *tp;
1781 
1782 	if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) {
1783 		return; /* pcb is gone already */
1784 	}
1785 	tp = (struct tcpcb *)inp->inp_ppcb;
1786 
1787 	VERIFY(tp != NULL);
1788 	/*
1789 	 * Ignore some errors if we are hooked up.
1790 	 * If connection hasn't completed, has retransmitted several times,
1791 	 * and receives a second error, give up now.  This is better
1792 	 * than waiting a long time to establish a connection that
1793 	 * can never complete.
1794 	 */
1795 	if (tp->t_state == TCPS_ESTABLISHED &&
1796 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
1797 	    error == EHOSTDOWN)) {
1798 		if (inp->inp_route.ro_rt) {
1799 			rtfree(inp->inp_route.ro_rt);
1800 			inp->inp_route.ro_rt = (struct rtentry *)NULL;
1801 		}
1802 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1803 	    tp->t_softerror) {
1804 		tcp_drop(tp, error);
1805 	} else {
1806 		tp->t_softerror = error;
1807 	}
1808 }
1809 
1810 struct bwmeas *
tcp_bwmeas_alloc(struct tcpcb * tp)1811 tcp_bwmeas_alloc(struct tcpcb *tp)
1812 {
1813 	struct bwmeas *elm;
1814 	elm = zalloc_flags(tcp_bwmeas_zone, Z_ZERO | Z_WAITOK);
1815 	elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1816 	elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1817 	return elm;
1818 }
1819 
1820 void
tcp_bwmeas_free(struct tcpcb * tp)1821 tcp_bwmeas_free(struct tcpcb *tp)
1822 {
1823 	zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1824 	tp->t_bwmeas = NULL;
1825 	tp->t_flagsext &= ~(TF_MEASURESNDBW);
1826 }
1827 
1828 int
get_tcp_inp_list(struct inpcb ** inp_list,int n,inp_gen_t gencnt)1829 get_tcp_inp_list(struct inpcb **inp_list, int n, inp_gen_t gencnt)
1830 {
1831 	struct tcpcb *tp;
1832 	struct inpcb *inp;
1833 	int i = 0;
1834 
1835 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1836 		if (inp->inp_gencnt <= gencnt &&
1837 		    inp->inp_state != INPCB_STATE_DEAD) {
1838 			inp_list[i++] = inp;
1839 		}
1840 		if (i >= n) {
1841 			break;
1842 		}
1843 	}
1844 
1845 	TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
1846 		inp = tp->t_inpcb;
1847 		if (inp->inp_gencnt <= gencnt &&
1848 		    inp->inp_state != INPCB_STATE_DEAD) {
1849 			inp_list[i++] = inp;
1850 		}
1851 		if (i >= n) {
1852 			break;
1853 		}
1854 	}
1855 	return i;
1856 }
1857 
1858 /*
1859  * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1860  * The otcpcb data structure is passed to user space and must not change.
1861  */
1862 static void
tcpcb_to_otcpcb(struct tcpcb * tp,struct otcpcb * otp)1863 tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1864 {
1865 	otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
1866 	otp->t_dupacks = tp->t_dupacks;
1867 	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1868 	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1869 	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1870 	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1871 	otp->t_inpcb =
1872 	    (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRPERM(tp->t_inpcb);
1873 	otp->t_state = tp->t_state;
1874 	otp->t_flags = tp->t_flags;
1875 	otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1876 	otp->snd_una = tp->snd_una;
1877 	otp->snd_max = tp->snd_max;
1878 	otp->snd_nxt = tp->snd_nxt;
1879 	otp->snd_up = tp->snd_up;
1880 	otp->snd_wl1 = tp->snd_wl1;
1881 	otp->snd_wl2 = tp->snd_wl2;
1882 	otp->iss = tp->iss;
1883 	otp->irs = tp->irs;
1884 	otp->rcv_nxt = tp->rcv_nxt;
1885 	otp->rcv_adv = tp->rcv_adv;
1886 	otp->rcv_wnd = tp->rcv_wnd;
1887 	otp->rcv_up = tp->rcv_up;
1888 	otp->snd_wnd = tp->snd_wnd;
1889 	otp->snd_cwnd = tp->snd_cwnd;
1890 	otp->snd_ssthresh = tp->snd_ssthresh;
1891 	otp->t_maxopd = tp->t_maxopd;
1892 	otp->t_rcvtime = tp->t_rcvtime;
1893 	otp->t_starttime = tp->t_starttime;
1894 	otp->t_rtttime = tp->t_rtttime;
1895 	otp->t_rtseq = tp->t_rtseq;
1896 	otp->t_rxtcur = tp->t_rxtcur;
1897 	otp->t_maxseg = tp->t_maxseg;
1898 	otp->t_srtt = tp->t_srtt;
1899 	otp->t_rttvar = tp->t_rttvar;
1900 	otp->t_rxtshift = tp->t_rxtshift;
1901 	otp->t_rttmin = tp->t_rttmin;
1902 	otp->t_rttupdated = tp->t_rttupdated;
1903 	otp->max_sndwnd = tp->max_sndwnd;
1904 	otp->t_softerror = tp->t_softerror;
1905 	otp->t_oobflags = tp->t_oobflags;
1906 	otp->t_iobc = tp->t_iobc;
1907 	otp->snd_scale = tp->snd_scale;
1908 	otp->rcv_scale = tp->rcv_scale;
1909 	otp->request_r_scale = tp->request_r_scale;
1910 	otp->requested_s_scale = tp->requested_s_scale;
1911 	otp->ts_recent = tp->ts_recent;
1912 	otp->ts_recent_age = tp->ts_recent_age;
1913 	otp->last_ack_sent = tp->last_ack_sent;
1914 	otp->cc_send = 0;
1915 	otp->cc_recv = 0;
1916 	otp->snd_recover = tp->snd_recover;
1917 	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1918 	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1919 	otp->t_badrxtwin = 0;
1920 }
1921 
1922 static int
1923 tcp_pcblist SYSCTL_HANDLER_ARGS
1924 {
1925 #pragma unused(oidp, arg1, arg2)
1926 	int error, i = 0, n, sz;
1927 	struct inpcb **inp_list;
1928 	inp_gen_t gencnt;
1929 	struct xinpgen xig;
1930 
1931 	/*
1932 	 * The process of preparing the TCB list is too time-consuming and
1933 	 * resource-intensive to repeat twice on every request.
1934 	 */
1935 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
1936 	if (req->oldptr == USER_ADDR_NULL) {
1937 		n = tcbinfo.ipi_count;
1938 		req->oldidx = 2 * (sizeof(xig))
1939 		    + (n + n / 8) * sizeof(struct xtcpcb);
1940 		lck_rw_done(&tcbinfo.ipi_lock);
1941 		return 0;
1942 	}
1943 
1944 	if (req->newptr != USER_ADDR_NULL) {
1945 		lck_rw_done(&tcbinfo.ipi_lock);
1946 		return EPERM;
1947 	}
1948 
1949 	/*
1950 	 * OK, now we're committed to doing something.
1951 	 */
1952 	gencnt = tcbinfo.ipi_gencnt;
1953 	sz = n = tcbinfo.ipi_count;
1954 
1955 	bzero(&xig, sizeof(xig));
1956 	xig.xig_len = sizeof(xig);
1957 	xig.xig_count = n;
1958 	xig.xig_gen = gencnt;
1959 	xig.xig_sogen = so_gencnt;
1960 	error = SYSCTL_OUT(req, &xig, sizeof(xig));
1961 	if (error) {
1962 		lck_rw_done(&tcbinfo.ipi_lock);
1963 		return error;
1964 	}
1965 	/*
1966 	 * We are done if there is no pcb
1967 	 */
1968 	if (n == 0) {
1969 		lck_rw_done(&tcbinfo.ipi_lock);
1970 		return 0;
1971 	}
1972 
1973 	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
1974 	if (inp_list == NULL) {
1975 		lck_rw_done(&tcbinfo.ipi_lock);
1976 		return ENOMEM;
1977 	}
1978 
1979 	n = get_tcp_inp_list(inp_list, n, gencnt);
1980 
1981 	error = 0;
1982 	for (i = 0; i < n; i++) {
1983 		struct xtcpcb xt;
1984 		caddr_t inp_ppcb;
1985 		struct inpcb *inp;
1986 
1987 		inp = inp_list[i];
1988 
1989 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1990 			continue;
1991 		}
1992 		socket_lock(inp->inp_socket, 1);
1993 		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1994 			socket_unlock(inp->inp_socket, 1);
1995 			continue;
1996 		}
1997 		if (inp->inp_gencnt > gencnt) {
1998 			socket_unlock(inp->inp_socket, 1);
1999 			continue;
2000 		}
2001 
2002 		bzero(&xt, sizeof(xt));
2003 		xt.xt_len = sizeof(xt);
2004 		/* XXX should avoid extra copy */
2005 		inpcb_to_compat(inp, &xt.xt_inp);
2006 		inp_ppcb = inp->inp_ppcb;
2007 		if (inp_ppcb != NULL) {
2008 			tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb,
2009 			    &xt.xt_tp);
2010 		} else {
2011 			bzero((char *) &xt.xt_tp, sizeof(xt.xt_tp));
2012 		}
2013 		if (inp->inp_socket) {
2014 			sotoxsocket(inp->inp_socket, &xt.xt_socket);
2015 		}
2016 
2017 		socket_unlock(inp->inp_socket, 1);
2018 
2019 		error = SYSCTL_OUT(req, &xt, sizeof(xt));
2020 	}
2021 	if (!error) {
2022 		/*
2023 		 * Give the user an updated idea of our state.
2024 		 * If the generation differs from what we told
2025 		 * her before, she knows that something happened
2026 		 * while we were processing this request, and it
2027 		 * might be necessary to retry.
2028 		 */
2029 		bzero(&xig, sizeof(xig));
2030 		xig.xig_len = sizeof(xig);
2031 		xig.xig_gen = tcbinfo.ipi_gencnt;
2032 		xig.xig_sogen = so_gencnt;
2033 		xig.xig_count = tcbinfo.ipi_count;
2034 		error = SYSCTL_OUT(req, &xig, sizeof(xig));
2035 	}
2036 
2037 	lck_rw_done(&tcbinfo.ipi_lock);
2038 	kfree_type(struct inpcb *, sz, inp_list);
2039 	return error;
2040 }
2041 
2042 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
2043     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2044     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
2045 
2046 #if XNU_TARGET_OS_OSX
2047 
2048 static void
tcpcb_to_xtcpcb64(struct tcpcb * tp,struct xtcpcb64 * otp)2049 tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
2050 {
2051 	otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
2052 	otp->t_dupacks = tp->t_dupacks;
2053 	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
2054 	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
2055 	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
2056 	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
2057 	otp->t_state = tp->t_state;
2058 	otp->t_flags = tp->t_flags;
2059 	otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
2060 	otp->snd_una = tp->snd_una;
2061 	otp->snd_max = tp->snd_max;
2062 	otp->snd_nxt = tp->snd_nxt;
2063 	otp->snd_up = tp->snd_up;
2064 	otp->snd_wl1 = tp->snd_wl1;
2065 	otp->snd_wl2 = tp->snd_wl2;
2066 	otp->iss = tp->iss;
2067 	otp->irs = tp->irs;
2068 	otp->rcv_nxt = tp->rcv_nxt;
2069 	otp->rcv_adv = tp->rcv_adv;
2070 	otp->rcv_wnd = tp->rcv_wnd;
2071 	otp->rcv_up = tp->rcv_up;
2072 	otp->snd_wnd = tp->snd_wnd;
2073 	otp->snd_cwnd = tp->snd_cwnd;
2074 	otp->snd_ssthresh = tp->snd_ssthresh;
2075 	otp->t_maxopd = tp->t_maxopd;
2076 	otp->t_rcvtime = tp->t_rcvtime;
2077 	otp->t_starttime = tp->t_starttime;
2078 	otp->t_rtttime = tp->t_rtttime;
2079 	otp->t_rtseq = tp->t_rtseq;
2080 	otp->t_rxtcur = tp->t_rxtcur;
2081 	otp->t_maxseg = tp->t_maxseg;
2082 	otp->t_srtt = tp->t_srtt;
2083 	otp->t_rttvar = tp->t_rttvar;
2084 	otp->t_rxtshift = tp->t_rxtshift;
2085 	otp->t_rttmin = tp->t_rttmin;
2086 	otp->t_rttupdated = tp->t_rttupdated;
2087 	otp->max_sndwnd = tp->max_sndwnd;
2088 	otp->t_softerror = tp->t_softerror;
2089 	otp->t_oobflags = tp->t_oobflags;
2090 	otp->t_iobc = tp->t_iobc;
2091 	otp->snd_scale = tp->snd_scale;
2092 	otp->rcv_scale = tp->rcv_scale;
2093 	otp->request_r_scale = tp->request_r_scale;
2094 	otp->requested_s_scale = tp->requested_s_scale;
2095 	otp->ts_recent = tp->ts_recent;
2096 	otp->ts_recent_age = tp->ts_recent_age;
2097 	otp->last_ack_sent = tp->last_ack_sent;
2098 	otp->cc_send = 0;
2099 	otp->cc_recv = 0;
2100 	otp->snd_recover = tp->snd_recover;
2101 	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
2102 	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
2103 	otp->t_badrxtwin = 0;
2104 }
2105 
2106 
2107 static int
2108 tcp_pcblist64 SYSCTL_HANDLER_ARGS
2109 {
2110 #pragma unused(oidp, arg1, arg2)
2111 	int error, i = 0, n, sz;
2112 	struct inpcb **inp_list;
2113 	inp_gen_t gencnt;
2114 	struct xinpgen xig;
2115 
2116 	/*
2117 	 * The process of preparing the TCB list is too time-consuming and
2118 	 * resource-intensive to repeat twice on every request.
2119 	 */
2120 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
2121 	if (req->oldptr == USER_ADDR_NULL) {
2122 		n = tcbinfo.ipi_count;
2123 		req->oldidx = 2 * (sizeof(xig))
2124 		    + (n + n / 8) * sizeof(struct xtcpcb64);
2125 		lck_rw_done(&tcbinfo.ipi_lock);
2126 		return 0;
2127 	}
2128 
2129 	if (req->newptr != USER_ADDR_NULL) {
2130 		lck_rw_done(&tcbinfo.ipi_lock);
2131 		return EPERM;
2132 	}
2133 
2134 	/*
2135 	 * OK, now we're committed to doing something.
2136 	 */
2137 	gencnt = tcbinfo.ipi_gencnt;
2138 	sz = n = tcbinfo.ipi_count;
2139 
2140 	bzero(&xig, sizeof(xig));
2141 	xig.xig_len = sizeof(xig);
2142 	xig.xig_count = n;
2143 	xig.xig_gen = gencnt;
2144 	xig.xig_sogen = so_gencnt;
2145 	error = SYSCTL_OUT(req, &xig, sizeof(xig));
2146 	if (error) {
2147 		lck_rw_done(&tcbinfo.ipi_lock);
2148 		return error;
2149 	}
2150 	/*
2151 	 * We are done if there is no pcb
2152 	 */
2153 	if (n == 0) {
2154 		lck_rw_done(&tcbinfo.ipi_lock);
2155 		return 0;
2156 	}
2157 
2158 	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
2159 	if (inp_list == NULL) {
2160 		lck_rw_done(&tcbinfo.ipi_lock);
2161 		return ENOMEM;
2162 	}
2163 
2164 	n = get_tcp_inp_list(inp_list, n, gencnt);
2165 
2166 	error = 0;
2167 	for (i = 0; i < n; i++) {
2168 		struct xtcpcb64 xt;
2169 		struct inpcb *inp;
2170 
2171 		inp = inp_list[i];
2172 
2173 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
2174 			continue;
2175 		}
2176 		socket_lock(inp->inp_socket, 1);
2177 		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2178 			socket_unlock(inp->inp_socket, 1);
2179 			continue;
2180 		}
2181 		if (inp->inp_gencnt > gencnt) {
2182 			socket_unlock(inp->inp_socket, 1);
2183 			continue;
2184 		}
2185 
2186 		bzero(&xt, sizeof(xt));
2187 		xt.xt_len = sizeof(xt);
2188 		inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
2189 		xt.xt_inpcb.inp_ppcb =
2190 		    (uint64_t)VM_KERNEL_ADDRPERM(inp->inp_ppcb);
2191 		if (inp->inp_ppcb != NULL) {
2192 			tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb,
2193 			    &xt);
2194 		}
2195 		if (inp->inp_socket) {
2196 			sotoxsocket64(inp->inp_socket,
2197 			    &xt.xt_inpcb.xi_socket);
2198 		}
2199 
2200 		socket_unlock(inp->inp_socket, 1);
2201 
2202 		error = SYSCTL_OUT(req, &xt, sizeof(xt));
2203 	}
2204 	if (!error) {
2205 		/*
2206 		 * Give the user an updated idea of our state.
2207 		 * If the generation differs from what we told
2208 		 * her before, she knows that something happened
2209 		 * while we were processing this request, and it
2210 		 * might be necessary to retry.
2211 		 */
2212 		bzero(&xig, sizeof(xig));
2213 		xig.xig_len = sizeof(xig);
2214 		xig.xig_gen = tcbinfo.ipi_gencnt;
2215 		xig.xig_sogen = so_gencnt;
2216 		xig.xig_count = tcbinfo.ipi_count;
2217 		error = SYSCTL_OUT(req, &xig, sizeof(xig));
2218 	}
2219 
2220 	lck_rw_done(&tcbinfo.ipi_lock);
2221 	kfree_type(struct inpcb *, sz, inp_list);
2222 	return error;
2223 }
2224 
2225 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
2226     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2227     tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
2228 
2229 #endif /* XNU_TARGET_OS_OSX */
2230 
2231 static int
2232 tcp_pcblist_n SYSCTL_HANDLER_ARGS
2233 {
2234 #pragma unused(oidp, arg1, arg2)
2235 	int error = 0;
2236 
2237 	error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
2238 
2239 	return error;
2240 }
2241 
2242 
2243 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
2244     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2245     tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
2246 
2247 static int
2248 tcp_progress_indicators SYSCTL_HANDLER_ARGS
2249 {
2250 #pragma unused(oidp, arg1, arg2)
2251 
2252 	return ntstat_tcp_progress_indicators(req);
2253 }
2254 
2255 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress,
2256     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
2257     tcp_progress_indicators, "S", "Various items that indicate the current state of progress on the link");
2258 
2259 
2260 static int
2261 tcp_progress_probe_enable SYSCTL_HANDLER_ARGS
2262 {
2263 #pragma unused(oidp, arg1, arg2)
2264 
2265 	return ntstat_tcp_progress_enable(req);
2266 }
2267 
2268 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress_enable,
2269     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
2270     tcp_progress_probe_enable, "S", "Enable/disable TCP keepalive probing on the specified link(s)");
2271 
2272 
2273 __private_extern__ void
tcp_get_ports_used(ifnet_t ifp,int protocol,uint32_t flags,bitstr_t * bitfield)2274 tcp_get_ports_used(ifnet_t ifp, int protocol, uint32_t flags,
2275     bitstr_t *bitfield)
2276 {
2277 	inpcb_get_ports_used(ifp, protocol, flags, bitfield,
2278 	    &tcbinfo);
2279 }
2280 
2281 __private_extern__ uint32_t
tcp_count_opportunistic(unsigned int ifindex,u_int32_t flags)2282 tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
2283 {
2284 	return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
2285 }
2286 
2287 __private_extern__ uint32_t
tcp_find_anypcb_byaddr(struct ifaddr * ifa)2288 tcp_find_anypcb_byaddr(struct ifaddr *ifa)
2289 {
2290 #if SKYWALK
2291 	if (netns_is_enabled()) {
2292 		return netns_find_anyres_byaddr(ifa, IPPROTO_TCP);
2293 	} else
2294 #endif /* SKYWALK */
2295 	return inpcb_find_anypcb_byaddr(ifa, &tcbinfo);
2296 }
2297 
2298 static void
tcp_handle_msgsize(struct ip * ip,struct inpcb * inp)2299 tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
2300 {
2301 	struct rtentry *rt = NULL;
2302 	u_short ifscope = IFSCOPE_NONE;
2303 	int mtu;
2304 	struct sockaddr_in icmpsrc = {
2305 		.sin_len = sizeof(struct sockaddr_in),
2306 		.sin_family = AF_INET, .sin_port = 0, .sin_addr = { .s_addr = 0 },
2307 		.sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
2308 	};
2309 	struct icmp *icp = NULL;
2310 
2311 	icp = (struct icmp *)(void *)
2312 	    ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2313 
2314 	icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
2315 
2316 	/*
2317 	 * MTU discovery:
2318 	 * If we got a needfrag and there is a host route to the
2319 	 * original destination, and the MTU is not locked, then
2320 	 * set the MTU in the route to the suggested new value
2321 	 * (if given) and then notify as usual.  The ULPs will
2322 	 * notice that the MTU has changed and adapt accordingly.
2323 	 * If no new MTU was suggested, then we guess a new one
2324 	 * less than the current value.  If the new MTU is
2325 	 * unreasonably small (defined by sysctl tcp_minmss), then
2326 	 * we reset the MTU to the interface value and enable the
2327 	 * lock bit, indicating that we are no longer doing MTU
2328 	 * discovery.
2329 	 */
2330 	if (ROUTE_UNUSABLE(&(inp->inp_route)) == false) {
2331 		rt = inp->inp_route.ro_rt;
2332 	}
2333 
2334 	/*
2335 	 * icmp6_mtudisc_update scopes the routing lookup
2336 	 * to the incoming interface (delivered from mbuf
2337 	 * packet header.
2338 	 * That is mostly ok but for asymmetric networks
2339 	 * that may be an issue.
2340 	 * Frag needed OR Packet too big really communicates
2341 	 * MTU for the out data path.
2342 	 * Take the interface scope from cached route or
2343 	 * the last outgoing interface from inp
2344 	 */
2345 	if (rt != NULL) {
2346 		ifscope = (rt->rt_ifp != NULL) ?
2347 		    rt->rt_ifp->if_index : IFSCOPE_NONE;
2348 	} else {
2349 		ifscope = (inp->inp_last_outifp != NULL) ?
2350 		    inp->inp_last_outifp->if_index : IFSCOPE_NONE;
2351 	}
2352 
2353 	if ((rt == NULL) ||
2354 	    !(rt->rt_flags & RTF_HOST) ||
2355 	    (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) {
2356 		rt = rtalloc1_scoped((struct sockaddr *)&icmpsrc, 0,
2357 		    RTF_CLONING | RTF_PRCLONING, ifscope);
2358 	} else if (rt) {
2359 		RT_LOCK(rt);
2360 		rtref(rt);
2361 		RT_UNLOCK(rt);
2362 	}
2363 
2364 	if (rt != NULL) {
2365 		RT_LOCK(rt);
2366 		if ((rt->rt_flags & RTF_HOST) &&
2367 		    !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
2368 			mtu = ntohs(icp->icmp_nextmtu);
2369 			/*
2370 			 * XXX Stock BSD has changed the following
2371 			 * to compare with icp->icmp_ip.ip_len
2372 			 * to converge faster when sent packet
2373 			 * < route's MTU. We may want to adopt
2374 			 * that change.
2375 			 */
2376 			if (mtu == 0) {
2377 				mtu = ip_next_mtu(rt->rt_rmx.
2378 				    rmx_mtu, 1);
2379 			}
2380 #if DEBUG_MTUDISC
2381 			printf("MTU for %s reduced to %d\n",
2382 			    inet_ntop(AF_INET,
2383 			    &icmpsrc.sin_addr, ipv4str,
2384 			    sizeof(ipv4str)), mtu);
2385 #endif
2386 			if (mtu < max(296, (tcp_minmss +
2387 			    sizeof(struct tcpiphdr)))) {
2388 				rt->rt_rmx.rmx_locks |= RTV_MTU;
2389 			} else if (rt->rt_rmx.rmx_mtu > mtu) {
2390 				rt->rt_rmx.rmx_mtu = mtu;
2391 			}
2392 		}
2393 		RT_UNLOCK(rt);
2394 		rtfree(rt);
2395 	}
2396 }
2397 
2398 void
tcp_ctlinput(int cmd,struct sockaddr * sa,void * vip,__unused struct ifnet * ifp)2399 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip, __unused struct ifnet *ifp)
2400 {
2401 	tcp_seq icmp_tcp_seq;
2402 	struct ipctlparam *ctl_param = vip;
2403 	struct ip *ip = NULL;
2404 	struct mbuf *m = NULL;
2405 	struct in_addr faddr;
2406 	struct inpcb *inp;
2407 	struct tcpcb *tp;
2408 	struct tcphdr *th;
2409 	struct icmp *icp;
2410 	size_t off;
2411 #if SKYWALK
2412 	union sockaddr_in_4_6 sock_laddr;
2413 	struct protoctl_ev_val prctl_ev_val;
2414 #endif /* SKYWALK */
2415 	void (*notify)(struct inpcb *, int) = tcp_notify;
2416 
2417 	if (ctl_param != NULL) {
2418 		ip = ctl_param->ipc_icmp_ip;
2419 		icp = ctl_param->ipc_icmp;
2420 		m = ctl_param->ipc_m;
2421 		off = ctl_param->ipc_off;
2422 	} else {
2423 		ip = NULL;
2424 		icp = NULL;
2425 		m = NULL;
2426 		off = 0;
2427 	}
2428 
2429 	faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr;
2430 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) {
2431 		return;
2432 	}
2433 
2434 	if ((unsigned)cmd >= PRC_NCMDS) {
2435 		return;
2436 	}
2437 
2438 	/* Source quench is deprecated */
2439 	if (cmd == PRC_QUENCH) {
2440 		return;
2441 	}
2442 
2443 	if (cmd == PRC_MSGSIZE) {
2444 		notify = tcp_mtudisc;
2445 	} else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2446 	    cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
2447 	    cmd == PRC_TIMXCEED_INTRANS) && ip) {
2448 		notify = tcp_drop_syn_sent;
2449 	}
2450 	/*
2451 	 * Hostdead is ugly because it goes linearly through all PCBs.
2452 	 * XXX: We never get this from ICMP, otherwise it makes an
2453 	 * excellent DoS attack on machines with many connections.
2454 	 */
2455 	else if (cmd == PRC_HOSTDEAD) {
2456 		ip = NULL;
2457 	} else if (inetctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
2458 		return;
2459 	}
2460 
2461 #if SKYWALK
2462 	bzero(&prctl_ev_val, sizeof(prctl_ev_val));
2463 	bzero(&sock_laddr, sizeof(sock_laddr));
2464 #endif /* SKYWALK */
2465 
2466 	if (ip == NULL) {
2467 		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
2468 #if SKYWALK
2469 		protoctl_event_enqueue_nwk_wq_entry(ifp, NULL,
2470 		    sa, 0, 0, IPPROTO_TCP, cmd, NULL);
2471 #endif /* SKYWALK */
2472 		return;
2473 	}
2474 
2475 	/* Check if we can safely get the sport, dport and the sequence number from the tcp header. */
2476 	if (m == NULL ||
2477 	    (m->m_len < off + (sizeof(unsigned short) + sizeof(unsigned short) + sizeof(tcp_seq)))) {
2478 		/* Insufficient length */
2479 		return;
2480 	}
2481 
2482 	th = (struct tcphdr*)(void*)(mtod(m, uint8_t*) + off);
2483 	icmp_tcp_seq = ntohl(th->th_seq);
2484 
2485 	inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
2486 	    ip->ip_src, th->th_sport, 0, NULL);
2487 
2488 	if (inp == NULL ||
2489 	    inp->inp_socket == NULL) {
2490 #if SKYWALK
2491 		if (cmd == PRC_MSGSIZE) {
2492 			prctl_ev_val.val = ntohs(icp->icmp_nextmtu);
2493 		}
2494 		prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2495 
2496 		sock_laddr.sin.sin_family = AF_INET;
2497 		sock_laddr.sin.sin_len = sizeof(sock_laddr.sin);
2498 		sock_laddr.sin.sin_addr = ip->ip_src;
2499 
2500 		protoctl_event_enqueue_nwk_wq_entry(ifp,
2501 		    (struct sockaddr *)&sock_laddr, sa,
2502 		    th->th_sport, th->th_dport, IPPROTO_TCP,
2503 		    cmd, &prctl_ev_val);
2504 #endif /* SKYWALK */
2505 		return;
2506 	}
2507 
2508 	socket_lock(inp->inp_socket, 1);
2509 	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2510 	    WNT_STOPUSING) {
2511 		socket_unlock(inp->inp_socket, 1);
2512 		return;
2513 	}
2514 
2515 	if (PRC_IS_REDIRECT(cmd)) {
2516 		/* signal EHOSTDOWN, as it flushes the cached route */
2517 		(*notify)(inp, EHOSTDOWN);
2518 	} else {
2519 		tp = intotcpcb(inp);
2520 		if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2521 		    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2522 			if (cmd == PRC_MSGSIZE) {
2523 				tcp_handle_msgsize(ip, inp);
2524 			}
2525 
2526 			(*notify)(inp, inetctlerrmap[cmd]);
2527 		}
2528 	}
2529 	socket_unlock(inp->inp_socket, 1);
2530 }
2531 
2532 void
tcp6_ctlinput(int cmd,struct sockaddr * sa,void * d,__unused struct ifnet * ifp)2533 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
2534 {
2535 	tcp_seq icmp_tcp_seq;
2536 	struct in6_addr *dst;
2537 	void (*notify)(struct inpcb *, int) = tcp_notify;
2538 	struct ip6_hdr *ip6;
2539 	struct mbuf *m;
2540 	struct inpcb *inp;
2541 	struct tcpcb *tp;
2542 	struct icmp6_hdr *icmp6;
2543 	struct ip6ctlparam *ip6cp = NULL;
2544 	const struct sockaddr_in6 *sa6_src = NULL;
2545 	unsigned int mtu;
2546 	unsigned int off;
2547 
2548 	struct tcp_ports {
2549 		uint16_t th_sport;
2550 		uint16_t th_dport;
2551 	} t_ports;
2552 #if SKYWALK
2553 	union sockaddr_in_4_6 sock_laddr;
2554 	struct protoctl_ev_val prctl_ev_val;
2555 #endif /* SKYWALK */
2556 
2557 	if (sa->sa_family != AF_INET6 ||
2558 	    sa->sa_len != sizeof(struct sockaddr_in6)) {
2559 		return;
2560 	}
2561 
2562 	/* Source quench is deprecated */
2563 	if (cmd == PRC_QUENCH) {
2564 		return;
2565 	}
2566 
2567 	if ((unsigned)cmd >= PRC_NCMDS) {
2568 		return;
2569 	}
2570 
2571 	/* if the parameter is from icmp6, decode it. */
2572 	if (d != NULL) {
2573 		ip6cp = (struct ip6ctlparam *)d;
2574 		icmp6 = ip6cp->ip6c_icmp6;
2575 		m = ip6cp->ip6c_m;
2576 		ip6 = ip6cp->ip6c_ip6;
2577 		off = ip6cp->ip6c_off;
2578 		sa6_src = ip6cp->ip6c_src;
2579 		dst = ip6cp->ip6c_finaldst;
2580 	} else {
2581 		m = NULL;
2582 		ip6 = NULL;
2583 		off = 0;        /* fool gcc */
2584 		sa6_src = &sa6_any;
2585 		dst = NULL;
2586 	}
2587 
2588 	if (cmd == PRC_MSGSIZE) {
2589 		notify = tcp_mtudisc;
2590 	} else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2591 	    cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
2592 	    ip6 != NULL) {
2593 		notify = tcp_drop_syn_sent;
2594 	}
2595 	/*
2596 	 * Hostdead is ugly because it goes linearly through all PCBs.
2597 	 * XXX: We never get this from ICMP, otherwise it makes an
2598 	 * excellent DoS attack on machines with many connections.
2599 	 */
2600 	else if (cmd == PRC_HOSTDEAD) {
2601 		ip6 = NULL;
2602 	} else if (inet6ctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
2603 		return;
2604 	}
2605 
2606 #if SKYWALK
2607 	bzero(&prctl_ev_val, sizeof(prctl_ev_val));
2608 	bzero(&sock_laddr, sizeof(sock_laddr));
2609 #endif /* SKYWALK */
2610 
2611 	if (ip6 == NULL) {
2612 		in6_pcbnotify(&tcbinfo, sa, 0, (struct sockaddr *)(size_t)sa6_src,
2613 		    0, cmd, NULL, notify);
2614 #if SKYWALK
2615 		protoctl_event_enqueue_nwk_wq_entry(ifp, NULL, sa,
2616 		    0, 0, IPPROTO_TCP, cmd, NULL);
2617 #endif /* SKYWALK */
2618 		return;
2619 	}
2620 
2621 	/* Check if we can safely get the ports from the tcp hdr */
2622 	if (m == NULL ||
2623 	    (m->m_pkthdr.len <
2624 	    (int32_t) (off + sizeof(struct tcp_ports)))) {
2625 		return;
2626 	}
2627 	bzero(&t_ports, sizeof(struct tcp_ports));
2628 	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
2629 
2630 	off += sizeof(struct tcp_ports);
2631 	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
2632 		return;
2633 	}
2634 	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
2635 	icmp_tcp_seq = ntohl(icmp_tcp_seq);
2636 
2637 	if (cmd == PRC_MSGSIZE) {
2638 		mtu = ntohl(icmp6->icmp6_mtu);
2639 		/*
2640 		 * If no alternative MTU was proposed, or the proposed
2641 		 * MTU was too small, set to the min.
2642 		 */
2643 		if (mtu < IPV6_MMTU) {
2644 			mtu = IPV6_MMTU - 8;
2645 		}
2646 	}
2647 
2648 	inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, t_ports.th_dport, ip6_input_getdstifscope(m),
2649 	    &ip6->ip6_src, t_ports.th_sport, ip6_input_getsrcifscope(m), 0, NULL);
2650 
2651 	if (inp == NULL ||
2652 	    inp->inp_socket == NULL) {
2653 #if SKYWALK
2654 		if (cmd == PRC_MSGSIZE) {
2655 			prctl_ev_val.val = mtu;
2656 		}
2657 		prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2658 
2659 		sock_laddr.sin6.sin6_family = AF_INET6;
2660 		sock_laddr.sin6.sin6_len = sizeof(sock_laddr.sin6);
2661 		sock_laddr.sin6.sin6_addr = ip6->ip6_src;
2662 
2663 		protoctl_event_enqueue_nwk_wq_entry(ifp,
2664 		    (struct sockaddr *)&sock_laddr, sa,
2665 		    t_ports.th_sport, t_ports.th_dport, IPPROTO_TCP,
2666 		    cmd, &prctl_ev_val);
2667 #endif /* SKYWALK */
2668 		return;
2669 	}
2670 
2671 	socket_lock(inp->inp_socket, 1);
2672 	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2673 	    WNT_STOPUSING) {
2674 		socket_unlock(inp->inp_socket, 1);
2675 		return;
2676 	}
2677 
2678 	if (PRC_IS_REDIRECT(cmd)) {
2679 		/* signal EHOSTDOWN, as it flushes the cached route */
2680 		(*notify)(inp, EHOSTDOWN);
2681 	} else {
2682 		tp = intotcpcb(inp);
2683 		if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2684 		    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2685 			if (cmd == PRC_MSGSIZE) {
2686 				/*
2687 				 * Only process the offered MTU if it
2688 				 * is smaller than the current one.
2689 				 */
2690 				if (mtu < tp->t_maxseg +
2691 				    (sizeof(struct tcphdr) + sizeof(struct ip6_hdr))) {
2692 					(*notify)(inp, inetctlerrmap[cmd]);
2693 				}
2694 			} else {
2695 				(*notify)(inp, inetctlerrmap[cmd]);
2696 			}
2697 		}
2698 	}
2699 	socket_unlock(inp->inp_socket, 1);
2700 }
2701 
2702 
2703 /*
2704  * Following is where TCP initial sequence number generation occurs.
2705  *
2706  * There are two places where we must use initial sequence numbers:
2707  * 1.  In SYN-ACK packets.
2708  * 2.  In SYN packets.
2709  *
2710  * The ISNs in SYN-ACK packets have no monotonicity requirement,
2711  * and should be as unpredictable as possible to avoid the possibility
2712  * of spoofing and/or connection hijacking.  To satisfy this
2713  * requirement, SYN-ACK ISNs are generated via the arc4random()
2714  * function.  If exact RFC 1948 compliance is requested via sysctl,
2715  * these ISNs will be generated just like those in SYN packets.
2716  *
2717  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
2718  * depends on this property.  In addition, these ISNs should be
2719  * unguessable so as to prevent connection hijacking.  To satisfy
2720  * the requirements of this situation, the algorithm outlined in
2721  * RFC 1948 is used to generate sequence numbers.
2722  *
2723  * For more information on the theory of operation, please see
2724  * RFC 1948.
2725  *
2726  * Implementation details:
2727  *
2728  * Time is based off the system timer, and is corrected so that it
2729  * increases by one megabyte per second.  This allows for proper
2730  * recycling on high speed LANs while still leaving over an hour
2731  * before rollover.
2732  *
2733  * Two sysctls control the generation of ISNs:
2734  *
2735  * net.inet.tcp.isn_reseed_interval controls the number of seconds
2736  * between seeding of isn_secret.  This is normally set to zero,
2737  * as reseeding should not be necessary.
2738  *
2739  * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
2740  * strictly.  When strict compliance is requested, reseeding is
2741  * disabled and SYN-ACKs will be generated in the same manner as
2742  * SYNs.  Strict mode is disabled by default.
2743  *
2744  */
2745 
2746 #define ISN_BYTES_PER_SECOND 1048576
2747 
2748 tcp_seq
tcp_new_isn(struct tcpcb * tp)2749 tcp_new_isn(struct tcpcb *tp)
2750 {
2751 	u_int32_t md5_buffer[4];
2752 	tcp_seq new_isn;
2753 	struct timeval timenow;
2754 	u_char isn_secret[32];
2755 	long isn_last_reseed = 0;
2756 	MD5_CTX isn_ctx;
2757 
2758 	/* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
2759 	if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) &&
2760 	    tcp_strict_rfc1948 == 0)
2761 #ifdef __APPLE__
2762 	{ return RandomULong(); }
2763 #else
2764 	{ return arc4random(); }
2765 #endif
2766 	getmicrotime(&timenow);
2767 
2768 	/* Seed if this is the first use, reseed if requested. */
2769 	if ((isn_last_reseed == 0) ||
2770 	    ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
2771 	    (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval * hz)
2772 	    < (u_int)timenow.tv_sec))) {
2773 #ifdef __APPLE__
2774 		read_frandom(&isn_secret, sizeof(isn_secret));
2775 #else
2776 		read_random_unlimited(&isn_secret, sizeof(isn_secret));
2777 #endif
2778 		isn_last_reseed = timenow.tv_sec;
2779 	}
2780 
2781 	/* Compute the md5 hash and return the ISN. */
2782 	MD5Init(&isn_ctx);
2783 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
2784 	    sizeof(u_short));
2785 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
2786 	    sizeof(u_short));
2787 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
2788 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
2789 		    sizeof(struct in6_addr));
2790 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
2791 		    sizeof(struct in6_addr));
2792 	} else {
2793 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
2794 		    sizeof(struct in_addr));
2795 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
2796 		    sizeof(struct in_addr));
2797 	}
2798 	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
2799 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
2800 	new_isn = (tcp_seq) md5_buffer[0];
2801 	new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
2802 	return new_isn;
2803 }
2804 
2805 
2806 /*
2807  * When a specific ICMP unreachable message is received and the
2808  * connection state is SYN-SENT, drop the connection.  This behavior
2809  * is controlled by the icmp_may_rst sysctl.
2810  */
2811 void
tcp_drop_syn_sent(struct inpcb * inp,int errno)2812 tcp_drop_syn_sent(struct inpcb *inp, int errno)
2813 {
2814 	struct tcpcb *tp = intotcpcb(inp);
2815 
2816 	if (tp && tp->t_state == TCPS_SYN_SENT) {
2817 		tcp_drop(tp, errno);
2818 	}
2819 }
2820 
2821 /*
2822  * When `need fragmentation' ICMP is received, update our idea of the MSS
2823  * based on the new value in the route.  Also nudge TCP to send something,
2824  * since we know the packet we just sent was dropped.
2825  * This duplicates some code in the tcp_mss() function in tcp_input.c.
2826  */
2827 void
tcp_mtudisc(struct inpcb * inp,__unused int errno)2828 tcp_mtudisc(struct inpcb *inp, __unused int errno)
2829 {
2830 	struct tcpcb *tp = intotcpcb(inp);
2831 	struct rtentry *rt;
2832 	struct socket *so = inp->inp_socket;
2833 	int mss;
2834 	u_int32_t mtu;
2835 	u_int32_t protoHdrOverhead = sizeof(struct tcpiphdr);
2836 	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
2837 
2838 	/*
2839 	 * Nothing left to send after the socket is defunct or TCP is in the closed state
2840 	 */
2841 	if ((so->so_state & SS_DEFUNCT) || (tp != NULL && tp->t_state == TCPS_CLOSED)) {
2842 		return;
2843 	}
2844 
2845 	if (isipv6) {
2846 		protoHdrOverhead = sizeof(struct ip6_hdr) +
2847 		    sizeof(struct tcphdr);
2848 	}
2849 
2850 	if (tp != NULL) {
2851 		if (isipv6) {
2852 			rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2853 		} else {
2854 			rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2855 		}
2856 		if (!rt || !rt->rt_rmx.rmx_mtu) {
2857 			tp->t_maxopd = tp->t_maxseg =
2858 			    isipv6 ? tcp_v6mssdflt :
2859 			    tcp_mssdflt;
2860 
2861 			/* Route locked during lookup above */
2862 			if (rt != NULL) {
2863 				RT_UNLOCK(rt);
2864 			}
2865 			return;
2866 		}
2867 		mtu = rt->rt_rmx.rmx_mtu;
2868 
2869 		/* Route locked during lookup above */
2870 		RT_UNLOCK(rt);
2871 
2872 #if NECP
2873 		// Adjust MTU if necessary.
2874 		mtu = necp_socket_get_effective_mtu(inp, mtu);
2875 #endif /* NECP */
2876 		mss = mtu - protoHdrOverhead;
2877 
2878 		if (tp->t_maxopd) {
2879 			mss = min(mss, tp->t_maxopd);
2880 		}
2881 		/*
2882 		 * XXX - The above conditional probably violates the TCP
2883 		 * spec.  The problem is that, since we don't know the
2884 		 * other end's MSS, we are supposed to use a conservative
2885 		 * default.  But, if we do that, then MTU discovery will
2886 		 * never actually take place, because the conservative
2887 		 * default is much less than the MTUs typically seen
2888 		 * on the Internet today.  For the moment, we'll sweep
2889 		 * this under the carpet.
2890 		 *
2891 		 * The conservative default might not actually be a problem
2892 		 * if the only case this occurs is when sending an initial
2893 		 * SYN with options and data to a host we've never talked
2894 		 * to before.  Then, they will reply with an MSS value which
2895 		 * will get recorded and the new parameters should get
2896 		 * recomputed.  For Further Study.
2897 		 */
2898 		if (tp->t_maxopd <= mss) {
2899 			return;
2900 		}
2901 		tp->t_maxopd = mss;
2902 
2903 		if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
2904 		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) {
2905 			mss -= TCPOLEN_TSTAMP_APPA;
2906 		}
2907 
2908 #if MPTCP
2909 		mss -= mptcp_adj_mss(tp, TRUE);
2910 #endif
2911 		if (so->so_snd.sb_hiwat < mss) {
2912 			mss = so->so_snd.sb_hiwat;
2913 		}
2914 
2915 		tp->t_maxseg = mss;
2916 
2917 		ASSERT(tp->t_maxseg);
2918 
2919 		/*
2920 		 * Reset the slow-start flight size as it may depends on the
2921 		 * new MSS
2922 		 */
2923 		if (CC_ALGO(tp)->cwnd_init != NULL) {
2924 			CC_ALGO(tp)->cwnd_init(tp);
2925 		}
2926 
2927 		if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) {
2928 			tcp_cc_rledbat.rwnd_init(tp);
2929 		}
2930 
2931 		tcpstat.tcps_mturesent++;
2932 		tp->t_rtttime = 0;
2933 		tp->snd_nxt = tp->snd_una;
2934 		tcp_output(tp);
2935 	}
2936 }
2937 
2938 /*
2939  * Look-up the routing entry to the peer of this inpcb.  If no route
2940  * is found and it cannot be allocated the return NULL.  This routine
2941  * is called by TCP routines that access the rmx structure and by tcp_mss
2942  * to get the interface MTU.  If a route is found, this routine will
2943  * hold the rtentry lock; the caller is responsible for unlocking.
2944  */
2945 struct rtentry *
tcp_rtlookup(struct inpcb * inp,unsigned int input_ifscope)2946 tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope)
2947 {
2948 	struct route *ro;
2949 	struct rtentry *rt;
2950 	struct tcpcb *tp;
2951 
2952 	LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2953 
2954 	ro = &inp->inp_route;
2955 	if ((rt = ro->ro_rt) != NULL) {
2956 		RT_LOCK(rt);
2957 	}
2958 
2959 	if (ROUTE_UNUSABLE(ro)) {
2960 		if (rt != NULL) {
2961 			RT_UNLOCK(rt);
2962 			rt = NULL;
2963 		}
2964 		ROUTE_RELEASE(ro);
2965 		/* No route yet, so try to acquire one */
2966 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
2967 			unsigned int ifscope;
2968 
2969 			ro->ro_dst.sa_family = AF_INET;
2970 			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2971 			((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
2972 			    inp->inp_faddr;
2973 
2974 			/*
2975 			 * If the socket was bound to an interface, then
2976 			 * the bound-to-interface takes precedence over
2977 			 * the inbound interface passed in by the caller
2978 			 * (if we get here as part of the output path then
2979 			 * input_ifscope is IFSCOPE_NONE).
2980 			 */
2981 			ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2982 			    inp->inp_boundifp->if_index : input_ifscope;
2983 
2984 			rtalloc_scoped(ro, ifscope);
2985 			if ((rt = ro->ro_rt) != NULL) {
2986 				RT_LOCK(rt);
2987 			}
2988 		}
2989 	}
2990 	if (rt != NULL) {
2991 		RT_LOCK_ASSERT_HELD(rt);
2992 	}
2993 
2994 	/*
2995 	 * Update MTU discovery determination. Don't do it if:
2996 	 *	1) it is disabled via the sysctl
2997 	 *	2) the route isn't up
2998 	 *	3) the MTU is locked (if it is, then discovery has been
2999 	 *	   disabled)
3000 	 */
3001 
3002 	tp = intotcpcb(inp);
3003 
3004 	if (!path_mtu_discovery || ((rt != NULL) &&
3005 	    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3006 		tp->t_flags &= ~TF_PMTUD;
3007 	} else {
3008 		tp->t_flags |= TF_PMTUD;
3009 	}
3010 
3011 	if (rt != NULL && rt->rt_ifp != NULL) {
3012 		somultipages(inp->inp_socket,
3013 		    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3014 		tcp_set_tso(tp, rt->rt_ifp);
3015 		soif2kcl(inp->inp_socket,
3016 		    (rt->rt_ifp->if_eflags & IFEF_2KCL));
3017 		tcp_set_ecn(tp, rt->rt_ifp);
3018 		if (inp->inp_last_outifp == NULL) {
3019 			inp->inp_last_outifp = rt->rt_ifp;
3020 #if SKYWALK
3021 			if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3022 				netns_set_ifnet(&inp->inp_netns_token,
3023 				    inp->inp_last_outifp);
3024 			}
3025 #endif /* SKYWALK */
3026 		}
3027 	}
3028 
3029 	/* Note if the peer is local */
3030 	if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3031 	    (rt->rt_gateway->sa_family == AF_LINK ||
3032 	    rt->rt_ifp->if_flags & IFF_LOOPBACK ||
3033 	    in_localaddr(inp->inp_faddr))) {
3034 		tp->t_flags |= TF_LOCAL;
3035 	}
3036 
3037 	/*
3038 	 * Caller needs to call RT_UNLOCK(rt).
3039 	 */
3040 	return rt;
3041 }
3042 
3043 struct rtentry *
tcp_rtlookup6(struct inpcb * inp,unsigned int input_ifscope)3044 tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
3045 {
3046 	struct route_in6 *ro6;
3047 	struct rtentry *rt;
3048 	struct tcpcb *tp;
3049 
3050 	LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
3051 
3052 	ro6 = &inp->in6p_route;
3053 	if ((rt = ro6->ro_rt) != NULL) {
3054 		RT_LOCK(rt);
3055 	}
3056 
3057 	if (ROUTE_UNUSABLE(ro6)) {
3058 		if (rt != NULL) {
3059 			RT_UNLOCK(rt);
3060 			rt = NULL;
3061 		}
3062 		ROUTE_RELEASE(ro6);
3063 		/* No route yet, so try to acquire one */
3064 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
3065 			struct sockaddr_in6 *dst6;
3066 			unsigned int ifscope;
3067 
3068 			dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
3069 			dst6->sin6_family = AF_INET6;
3070 			dst6->sin6_len = sizeof(*dst6);
3071 			dst6->sin6_addr = inp->in6p_faddr;
3072 
3073 			/*
3074 			 * If the socket was bound to an interface, then
3075 			 * the bound-to-interface takes precedence over
3076 			 * the inbound interface passed in by the caller
3077 			 * (if we get here as part of the output path then
3078 			 * input_ifscope is IFSCOPE_NONE).
3079 			 */
3080 			ifscope = (inp->inp_flags & INP_BOUND_IF) ?
3081 			    inp->inp_boundifp->if_index : input_ifscope;
3082 
3083 			rtalloc_scoped((struct route *)ro6, ifscope);
3084 			if ((rt = ro6->ro_rt) != NULL) {
3085 				RT_LOCK(rt);
3086 			}
3087 		}
3088 	}
3089 	if (rt != NULL) {
3090 		RT_LOCK_ASSERT_HELD(rt);
3091 	}
3092 
3093 	/*
3094 	 * Update path MTU Discovery determination
3095 	 * while looking up the route:
3096 	 *  1) we have a valid route to the destination
3097 	 *  2) the MTU is not locked (if it is, then discovery has been
3098 	 *    disabled)
3099 	 */
3100 
3101 
3102 	tp = intotcpcb(inp);
3103 
3104 	/*
3105 	 * Update MTU discovery determination. Don't do it if:
3106 	 *	1) it is disabled via the sysctl
3107 	 *	2) the route isn't up
3108 	 *	3) the MTU is locked (if it is, then discovery has been
3109 	 *	   disabled)
3110 	 */
3111 
3112 	if (!path_mtu_discovery || ((rt != NULL) &&
3113 	    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3114 		tp->t_flags &= ~TF_PMTUD;
3115 	} else {
3116 		tp->t_flags |= TF_PMTUD;
3117 	}
3118 
3119 	if (rt != NULL && rt->rt_ifp != NULL) {
3120 		somultipages(inp->inp_socket,
3121 		    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3122 		tcp_set_tso(tp, rt->rt_ifp);
3123 		soif2kcl(inp->inp_socket,
3124 		    (rt->rt_ifp->if_eflags & IFEF_2KCL));
3125 		tcp_set_ecn(tp, rt->rt_ifp);
3126 		if (inp->inp_last_outifp == NULL) {
3127 			inp->inp_last_outifp = rt->rt_ifp;
3128 #if SKYWALK
3129 			if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3130 				netns_set_ifnet(&inp->inp_netns_token,
3131 				    inp->inp_last_outifp);
3132 			}
3133 #endif /* SKYWALK */
3134 		}
3135 
3136 		/* Note if the peer is local */
3137 		if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3138 		    (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
3139 		    IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
3140 		    rt->rt_gateway->sa_family == AF_LINK ||
3141 		    in6_localaddr(&inp->in6p_faddr))) {
3142 			tp->t_flags |= TF_LOCAL;
3143 		}
3144 	}
3145 
3146 	/*
3147 	 * Caller needs to call RT_UNLOCK(rt).
3148 	 */
3149 	return rt;
3150 }
3151 
3152 #if IPSEC
3153 /* compute ESP/AH header size for TCP, including outer IP header. */
3154 size_t
ipsec_hdrsiz_tcp(struct tcpcb * tp)3155 ipsec_hdrsiz_tcp(struct tcpcb *tp)
3156 {
3157 	struct inpcb *inp;
3158 	struct mbuf *m;
3159 	size_t hdrsiz;
3160 	struct ip *ip;
3161 	struct ip6_hdr *ip6 = NULL;
3162 	struct tcphdr *th;
3163 
3164 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) {
3165 		return 0;
3166 	}
3167 	MGETHDR(m, M_DONTWAIT, MT_DATA);        /* MAC-OK */
3168 	if (!m) {
3169 		return 0;
3170 	}
3171 
3172 	if ((inp->inp_vflag & INP_IPV6) != 0) {
3173 		ip6 = mtod(m, struct ip6_hdr *);
3174 		th = (struct tcphdr *)(void *)(ip6 + 1);
3175 		m->m_pkthdr.len = m->m_len =
3176 		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
3177 		tcp_fillheaders(m, tp, ip6, th);
3178 		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
3179 	} else {
3180 		ip = mtod(m, struct ip *);
3181 		th = (struct tcphdr *)(ip + 1);
3182 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
3183 		tcp_fillheaders(m, tp, ip, th);
3184 		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
3185 	}
3186 	m_free(m);
3187 	return hdrsiz;
3188 }
3189 #endif /* IPSEC */
3190 
3191 int
tcp_lock(struct socket * so,int refcount,void * lr)3192 tcp_lock(struct socket *so, int refcount, void *lr)
3193 {
3194 	void *lr_saved;
3195 
3196 	if (lr == NULL) {
3197 		lr_saved = __builtin_return_address(0);
3198 	} else {
3199 		lr_saved = lr;
3200 	}
3201 
3202 retry:
3203 	if (so->so_pcb != NULL) {
3204 		if (so->so_flags & SOF_MP_SUBFLOW) {
3205 			struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3206 			struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
3207 
3208 			socket_lock(mp_so, refcount);
3209 
3210 			/*
3211 			 * Check if we became non-MPTCP while waiting for the lock.
3212 			 * If yes, we have to retry to grab the right lock.
3213 			 */
3214 			if (!(so->so_flags & SOF_MP_SUBFLOW)) {
3215 				socket_unlock(mp_so, refcount);
3216 				goto retry;
3217 			}
3218 		} else {
3219 			lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3220 
3221 			if (so->so_flags & SOF_MP_SUBFLOW) {
3222 				/*
3223 				 * While waiting for the lock, we might have
3224 				 * become MPTCP-enabled (see mptcp_subflow_socreate).
3225 				 */
3226 				lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3227 				goto retry;
3228 			}
3229 		}
3230 	} else {
3231 		panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s",
3232 		    so, lr_saved, solockhistory_nr(so));
3233 		/* NOTREACHED */
3234 	}
3235 
3236 	if (so->so_usecount < 0) {
3237 		panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s",
3238 		    so, so->so_pcb, lr_saved, so->so_usecount,
3239 		    solockhistory_nr(so));
3240 		/* NOTREACHED */
3241 	}
3242 	if (refcount) {
3243 		so->so_usecount++;
3244 	}
3245 	so->lock_lr[so->next_lock_lr] = lr_saved;
3246 	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3247 	return 0;
3248 }
3249 
3250 int
tcp_unlock(struct socket * so,int refcount,void * lr)3251 tcp_unlock(struct socket *so, int refcount, void *lr)
3252 {
3253 	void *lr_saved;
3254 
3255 	if (lr == NULL) {
3256 		lr_saved = __builtin_return_address(0);
3257 	} else {
3258 		lr_saved = lr;
3259 	}
3260 
3261 #ifdef MORE_TCPLOCK_DEBUG
3262 	printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
3263 	    "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
3264 	    (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
3265 	    (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
3266 	    so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
3267 #endif
3268 	if (refcount) {
3269 		so->so_usecount--;
3270 	}
3271 
3272 	if (so->so_usecount < 0) {
3273 		panic("tcp_unlock: so=%p usecount=%x lrh= %s",
3274 		    so, so->so_usecount, solockhistory_nr(so));
3275 		/* NOTREACHED */
3276 	}
3277 	if (so->so_pcb == NULL) {
3278 		panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s",
3279 		    so, so->so_usecount, lr_saved, solockhistory_nr(so));
3280 		/* NOTREACHED */
3281 	} else {
3282 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
3283 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3284 
3285 		if (so->so_flags & SOF_MP_SUBFLOW) {
3286 			struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3287 			struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
3288 
3289 			socket_lock_assert_owned(mp_so);
3290 
3291 			socket_unlock(mp_so, refcount);
3292 		} else {
3293 			LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3294 			    LCK_MTX_ASSERT_OWNED);
3295 			lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3296 		}
3297 	}
3298 	return 0;
3299 }
3300 
3301 lck_mtx_t *
tcp_getlock(struct socket * so,int flags)3302 tcp_getlock(struct socket *so, int flags)
3303 {
3304 	struct inpcb *inp = sotoinpcb(so);
3305 
3306 	if (so->so_pcb) {
3307 		if (so->so_usecount < 0) {
3308 			panic("tcp_getlock: so=%p usecount=%x lrh= %s",
3309 			    so, so->so_usecount, solockhistory_nr(so));
3310 		}
3311 
3312 		if (so->so_flags & SOF_MP_SUBFLOW) {
3313 			struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3314 			struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
3315 
3316 			return mp_so->so_proto->pr_getlock(mp_so, flags);
3317 		} else {
3318 			return &inp->inpcb_mtx;
3319 		}
3320 	} else {
3321 		panic("tcp_getlock: so=%p NULL so_pcb %s",
3322 		    so, solockhistory_nr(so));
3323 		return so->so_proto->pr_domain->dom_mtx;
3324 	}
3325 }
3326 
3327 /*
3328  * Determine if we can grow the recieve socket buffer to avoid sending
3329  * a zero window update to the peer. We allow even socket buffers that
3330  * have fixed size (set by the application) to grow if the resource
3331  * constraints are met. They will also be trimmed after the application
3332  * reads data.
3333  */
3334 static void
tcp_sbrcv_grow_rwin(struct tcpcb * tp,struct sockbuf * sb)3335 tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb)
3336 {
3337 	u_int32_t rcvbufinc = tp->t_maxseg << 4;
3338 	u_int32_t rcvbuf = sb->sb_hiwat;
3339 	struct socket *so = tp->t_inpcb->inp_socket;
3340 
3341 	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
3342 		return;
3343 	}
3344 
3345 	if (tcp_do_autorcvbuf == 1 &&
3346 	    tcp_cansbgrow(sb) &&
3347 	    (tp->t_flags & TF_SLOWLINK) == 0 &&
3348 	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
3349 	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
3350 	    rcvbuf < tcp_autorcvbuf_max &&
3351 	    (sb->sb_idealsize > 0 &&
3352 	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
3353 		sbreserve(sb,
3354 		    min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
3355 	}
3356 }
3357 
3358 int32_t
tcp_sbspace(struct tcpcb * tp)3359 tcp_sbspace(struct tcpcb *tp)
3360 {
3361 	struct socket *so = tp->t_inpcb->inp_socket;
3362 	struct sockbuf *sb = &so->so_rcv;
3363 	u_int32_t rcvbuf;
3364 	int32_t space;
3365 	int32_t pending = 0;
3366 
3367 	if (so->so_flags & SOF_MP_SUBFLOW) {
3368 		/* We still need to grow TCP's buffer to have a BDP-estimate */
3369 		tcp_sbrcv_grow_rwin(tp, sb);
3370 
3371 		return mptcp_sbspace(tptomptp(tp));
3372 	}
3373 
3374 	tcp_sbrcv_grow_rwin(tp, sb);
3375 
3376 	/* hiwat might have changed */
3377 	rcvbuf = sb->sb_hiwat;
3378 
3379 	space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
3380 	    (sb->sb_mbmax - sb->sb_mbcnt)));
3381 	if (space < 0) {
3382 		space = 0;
3383 	}
3384 
3385 #if CONTENT_FILTER
3386 	/* Compensate for data being processed by content filters */
3387 	pending = cfil_sock_data_space(sb);
3388 #endif /* CONTENT_FILTER */
3389 	if (pending > space) {
3390 		space = 0;
3391 	} else {
3392 		space -= pending;
3393 	}
3394 
3395 	/*
3396 	 * Avoid increasing window size if the current window
3397 	 * is already very low, we could be in "persist" mode and
3398 	 * we could break some apps (see rdar://5409343)
3399 	 */
3400 
3401 	if (space < tp->t_maxseg) {
3402 		return space;
3403 	}
3404 
3405 	/* Clip window size for slower link */
3406 
3407 	if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0) {
3408 		return imin(space, slowlink_wsize);
3409 	}
3410 
3411 	return space;
3412 }
3413 /*
3414  * Checks TCP Segment Offloading capability for a given connection
3415  * and interface pair.
3416  */
3417 void
tcp_set_tso(struct tcpcb * tp,struct ifnet * ifp)3418 tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp)
3419 {
3420 	struct inpcb *inp;
3421 	int isipv6;
3422 	struct ifnet *tunnel_ifp = NULL;
3423 #define IFNET_TSO_MASK (IFNET_TSO_IPV6 | IFNET_TSO_IPV4)
3424 
3425 	tp->t_flags &= ~TF_TSO;
3426 
3427 	/*
3428 	 * Bail if there's a non-TSO-capable filter on the interface.
3429 	 */
3430 	if (ifp == NULL || ifp->if_flt_no_tso_count > 0) {
3431 		return;
3432 	}
3433 
3434 	inp = tp->t_inpcb;
3435 	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
3436 
3437 #if MPTCP
3438 	/*
3439 	 * We can't use TSO if this tcpcb belongs to an MPTCP session.
3440 	 */
3441 	if (inp->inp_socket->so_flags & SOF_MP_SUBFLOW) {
3442 		return;
3443 	}
3444 #endif
3445 	/*
3446 	 * We can't use TSO if the TSO capability of the tunnel interface does
3447 	 * not match the capability of another interface known by TCP
3448 	 */
3449 	if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) {
3450 		u_int tunnel_if_index = inp->inp_policyresult.results.result_parameter.tunnel_interface_index;
3451 
3452 		if (tunnel_if_index != 0) {
3453 			ifnet_head_lock_shared();
3454 			tunnel_ifp = ifindex2ifnet[tunnel_if_index];
3455 			ifnet_head_done();
3456 		}
3457 
3458 		if (tunnel_ifp == NULL) {
3459 			return;
3460 		}
3461 
3462 		if ((ifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3463 			if (tso_debug > 0) {
3464 				os_log(OS_LOG_DEFAULT,
3465 				    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with ifp %s",
3466 				    __func__,
3467 				    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3468 				    tunnel_ifp->if_xname, ifp->if_xname);
3469 			}
3470 			return;
3471 		}
3472 		if (inp->inp_last_outifp != NULL &&
3473 		    (inp->inp_last_outifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3474 			if (tso_debug > 0) {
3475 				os_log(OS_LOG_DEFAULT,
3476 				    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_last_outifp %s",
3477 				    __func__,
3478 				    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3479 				    tunnel_ifp->if_xname, inp->inp_last_outifp->if_xname);
3480 			}
3481 			return;
3482 		}
3483 		if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp != NULL &&
3484 		    (inp->inp_boundifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3485 			if (tso_debug > 0) {
3486 				os_log(OS_LOG_DEFAULT,
3487 				    "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_boundifp %s",
3488 				    __func__,
3489 				    ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3490 				    tunnel_ifp->if_xname, inp->inp_boundifp->if_xname);
3491 			}
3492 			return;
3493 		}
3494 	}
3495 
3496 	if (isipv6) {
3497 		if (ifp->if_hwassist & IFNET_TSO_IPV6) {
3498 			tp->t_flags |= TF_TSO;
3499 			if (ifp->if_tso_v6_mtu != 0) {
3500 				tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
3501 			} else {
3502 				tp->tso_max_segment_size = TCP_MAXWIN;
3503 			}
3504 		}
3505 	} else {
3506 		if (ifp->if_hwassist & IFNET_TSO_IPV4) {
3507 			tp->t_flags |= TF_TSO;
3508 			if (ifp->if_tso_v4_mtu != 0) {
3509 				tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
3510 			} else {
3511 				tp->tso_max_segment_size = TCP_MAXWIN;
3512 			}
3513 			if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3514 				tp->tso_max_segment_size -=
3515 				    CLAT46_HDR_EXPANSION_OVERHD;
3516 			}
3517 		}
3518 	}
3519 
3520 	if (tso_debug > 1) {
3521 		os_log(OS_LOG_DEFAULT, "%s: %u > %u TSO %d ifp %s",
3522 		    __func__,
3523 		    ntohs(tp->t_inpcb->inp_lport),
3524 		    ntohs(tp->t_inpcb->inp_fport),
3525 		    (tp->t_flags & TF_TSO) != 0,
3526 		    ifp != NULL ? ifp->if_xname : "<NULL>");
3527 	}
3528 }
3529 
/*
 * Convert a struct timeval to TCP clock ticks: TCP_RETRANSHZ ticks per
 * second plus one tick per TCP_RETRANSHZ_TO_USEC microseconds, with any
 * sub-tick remainder truncated.  The argument is evaluated twice.
 */
#define TIMEVAL_TO_TCPHZ(_tv_) \
	((uint32_t)((_tv_).tv_sec * TCP_RETRANSHZ + \
	(_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC))
3532 
3533 /*
3534  * Function to calculate the tcp clock. The tcp clock will get updated
3535  * at the boundaries of the tcp layer. This is done at 3 places:
3536  * 1. Right before processing an input tcp packet
3537  * 2. Whenever a connection wants to access the network using tcp_usrreqs
3538  * 3. When a tcp timer fires or before tcp slow timeout
3539  *
3540  */
3541 
3542 void
calculate_tcp_clock(void)3543 calculate_tcp_clock(void)
3544 {
3545 	struct timeval tv = tcp_uptime;
3546 	struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC};
3547 	struct timeval now, hold_now;
3548 	uint32_t incr = 0;
3549 
3550 	microuptime(&now);
3551 
3552 	/*
3553 	 * Update coarse-grained networking timestamp (in sec.); the idea
3554 	 * is to update the counter returnable via net_uptime() when
3555 	 * we read time.
3556 	 */
3557 	net_update_uptime_with_time(&now);
3558 
3559 	timevaladd(&tv, &interval);
3560 	if (timevalcmp(&now, &tv, >)) {
3561 		/* time to update the clock */
3562 		lck_spin_lock(&tcp_uptime_lock);
3563 		if (timevalcmp(&tcp_uptime, &now, >=)) {
3564 			/* clock got updated while waiting for the lock */
3565 			lck_spin_unlock(&tcp_uptime_lock);
3566 			return;
3567 		}
3568 
3569 		microuptime(&now);
3570 		hold_now = now;
3571 		tv = tcp_uptime;
3572 		timevalsub(&now, &tv);
3573 
3574 		incr = TIMEVAL_TO_TCPHZ(now);
3575 
3576 		/* Account for the previous remainder */
3577 		uint32_t remaining_us = (now.tv_usec % TCP_RETRANSHZ_TO_USEC) +
3578 		    tcp_now_remainder_us;
3579 		if (remaining_us >= TCP_RETRANSHZ_TO_USEC) {
3580 			incr += (remaining_us / TCP_RETRANSHZ_TO_USEC);
3581 		}
3582 
3583 		if (incr > 0) {
3584 			tcp_uptime = hold_now;
3585 			tcp_now_remainder_us = remaining_us % TCP_RETRANSHZ_TO_USEC;
3586 			tcp_now += incr;
3587 		}
3588 
3589 		lck_spin_unlock(&tcp_uptime_lock);
3590 	}
3591 }
3592 
3593 /*
3594  * Compute receive window scaling that we are going to request
3595  * for this connection based on  sb_hiwat. Try to leave some
3596  * room to potentially increase the window size upto a maximum
3597  * defined by the constant tcp_autorcvbuf_max.
3598  */
3599 void
tcp_set_max_rwinscale(struct tcpcb * tp,struct socket * so)3600 tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so)
3601 {
3602 	uint32_t maxsockbufsize;
3603 
3604 	tp->request_r_scale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale);
3605 	maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
3606 	    so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
3607 
3608 	/*
3609 	 * Window scale should not exceed what is needed
3610 	 * to send the max receive window size; adding 1 to TCP_MAXWIN
3611 	 * ensures that.
3612 	 */
3613 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3614 	    ((TCP_MAXWIN + 1) << tp->request_r_scale) < maxsockbufsize) {
3615 		tp->request_r_scale++;
3616 	}
3617 	tp->request_r_scale = MIN(tp->request_r_scale, TCP_MAX_WINSHIFT);
3618 }
3619 
3620 int
tcp_notsent_lowat_check(struct socket * so)3621 tcp_notsent_lowat_check(struct socket *so)
3622 {
3623 	struct inpcb *inp = sotoinpcb(so);
3624 	struct tcpcb *tp = NULL;
3625 	int notsent = 0;
3626 
3627 	if (inp != NULL) {
3628 		tp = intotcpcb(inp);
3629 	}
3630 
3631 	if (tp == NULL) {
3632 		return 0;
3633 	}
3634 
3635 	notsent = so->so_snd.sb_cc -
3636 	    (tp->snd_nxt - tp->snd_una);
3637 
3638 	/*
3639 	 * When we send a FIN or SYN, not_sent can be negative.
3640 	 * In that case also we need to send a write event to the
3641 	 * process if it is waiting. In the FIN case, it will
3642 	 * get an error from send because cantsendmore will be set.
3643 	 */
3644 	if (notsent <= tp->t_notsent_lowat) {
3645 		return 1;
3646 	}
3647 
3648 	/*
3649 	 * When Nagle's algorithm is not disabled, it is better
3650 	 * to wakeup the client until there is atleast one
3651 	 * maxseg of data to write.
3652 	 */
3653 	if ((tp->t_flags & TF_NODELAY) == 0 &&
3654 	    notsent > 0 && notsent < tp->t_maxseg) {
3655 		return 1;
3656 	}
3657 	return 0;
3658 }
3659 
3660 void
tcp_rxtseg_insert(struct tcpcb * tp,tcp_seq start,tcp_seq end)3661 tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3662 {
3663 	struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL;
3664 	uint16_t rxcount = 0;
3665 
3666 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3667 		tp->t_dsack_lastuna = tp->snd_una;
3668 	}
3669 	/*
3670 	 * First check if there is a segment already existing for this
3671 	 * sequence space.
3672 	 */
3673 
3674 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3675 		if (SEQ_GT(rxseg->rx_start, start)) {
3676 			break;
3677 		}
3678 		prev = rxseg;
3679 	}
3680 	next = rxseg;
3681 
3682 	/* check if prev seg is for this sequence */
3683 	if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3684 	    SEQ_GEQ(prev->rx_end, end)) {
3685 		prev->rx_count++;
3686 		return;
3687 	}
3688 
3689 	/*
3690 	 * There are a couple of possibilities at this point.
3691 	 * 1. prev overlaps with the beginning of this sequence
3692 	 * 2. next overlaps with the end of this sequence
3693 	 * 3. there is no overlap.
3694 	 */
3695 
3696 	if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3697 		if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3698 			start = prev->rx_end + 1;
3699 			prev->rx_count++;
3700 		} else {
3701 			prev->rx_end = (start - 1);
3702 			rxcount = prev->rx_count;
3703 		}
3704 	}
3705 
3706 	if (next != NULL && SEQ_LT(next->rx_start, end)) {
3707 		if (SEQ_LEQ(next->rx_end, end)) {
3708 			end = next->rx_start - 1;
3709 			next->rx_count++;
3710 		} else {
3711 			next->rx_start = end + 1;
3712 			rxcount = next->rx_count;
3713 		}
3714 	}
3715 	if (!SEQ_LT(start, end)) {
3716 		return;
3717 	}
3718 
3719 	if (tcp_rxt_seg_max > 0 && tp->t_rxt_seg_count >= tcp_rxt_seg_max) {
3720 		rxseg = SLIST_FIRST(&tp->t_rxt_segments);
3721 		if (prev == rxseg) {
3722 			prev = NULL;
3723 		}
3724 		SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3725 		    tcp_rxt_seg, rx_link);
3726 
3727 		tcp_rxt_seg_drop++;
3728 		tp->t_rxt_seg_drop++;
3729 		TCP_LOG(tp, "removed rxseg list overflow %u:%u ",
3730 		    rxseg->rx_start, rxseg->rx_end);
3731 		zfree(tcp_rxt_seg_zone, rxseg);
3732 
3733 		tp->t_rxt_seg_count -= 1;
3734 	}
3735 
3736 	rxseg = zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
3737 	rxseg->rx_start = start;
3738 	rxseg->rx_end = end;
3739 	rxseg->rx_count = rxcount + 1;
3740 
3741 	if (prev != NULL) {
3742 		SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3743 	} else {
3744 		SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3745 	}
3746 	tp->t_rxt_seg_count += 1;
3747 }
3748 
3749 struct tcp_rxt_seg *
tcp_rxtseg_find(struct tcpcb * tp,tcp_seq start,tcp_seq end)3750 tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3751 {
3752 	struct tcp_rxt_seg *rxseg;
3753 
3754 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3755 		return NULL;
3756 	}
3757 
3758 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3759 		if (SEQ_LEQ(rxseg->rx_start, start) &&
3760 		    SEQ_GEQ(rxseg->rx_end, end)) {
3761 			return rxseg;
3762 		}
3763 		if (SEQ_GT(rxseg->rx_start, start)) {
3764 			break;
3765 		}
3766 	}
3767 	return NULL;
3768 }
3769 
3770 void
tcp_rxtseg_set_spurious(struct tcpcb * tp,tcp_seq start,tcp_seq end)3771 tcp_rxtseg_set_spurious(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3772 {
3773 	struct tcp_rxt_seg *rxseg;
3774 
3775 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3776 		return;
3777 	}
3778 
3779 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3780 		if (SEQ_GEQ(rxseg->rx_start, start) &&
3781 		    SEQ_LEQ(rxseg->rx_end, end)) {
3782 			/*
3783 			 * If the segment was retransmitted only once, mark it as
3784 			 * spurious.
3785 			 */
3786 			if (rxseg->rx_count == 1) {
3787 				rxseg->rx_flags |= TCP_RXT_SPURIOUS;
3788 			}
3789 		}
3790 
3791 		if (SEQ_GEQ(rxseg->rx_start, end)) {
3792 			break;
3793 		}
3794 	}
3795 	return;
3796 }
3797 
3798 void
tcp_rxtseg_clean(struct tcpcb * tp)3799 tcp_rxtseg_clean(struct tcpcb *tp)
3800 {
3801 	struct tcp_rxt_seg *rxseg, *next;
3802 
3803 	SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3804 		SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3805 		    tcp_rxt_seg, rx_link);
3806 		zfree(tcp_rxt_seg_zone, rxseg);
3807 	}
3808 	tp->t_rxt_seg_count = 0;
3809 	tp->t_dsack_lastuna = tp->snd_max;
3810 }
3811 
3812 boolean_t
tcp_rxtseg_detect_bad_rexmt(struct tcpcb * tp,tcp_seq th_ack)3813 tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3814 {
3815 	boolean_t bad_rexmt;
3816 	struct tcp_rxt_seg *rxseg;
3817 
3818 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3819 		return FALSE;
3820 	}
3821 
3822 	/*
3823 	 * If all of the segments in this window are not cumulatively
3824 	 * acknowledged, then there can still be undetected packet loss.
3825 	 * Do not restore congestion window in that case.
3826 	 */
3827 	if (SEQ_LT(th_ack, tp->snd_recover)) {
3828 		return FALSE;
3829 	}
3830 
3831 	bad_rexmt = TRUE;
3832 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3833 		if (!(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3834 			bad_rexmt = FALSE;
3835 			break;
3836 		}
3837 	}
3838 	return bad_rexmt;
3839 }
3840 
3841 boolean_t
tcp_rxtseg_dsack_for_tlp(struct tcpcb * tp)3842 tcp_rxtseg_dsack_for_tlp(struct tcpcb *tp)
3843 {
3844 	boolean_t dsack_for_tlp = FALSE;
3845 	struct tcp_rxt_seg *rxseg;
3846 
3847 	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3848 		return FALSE;
3849 	}
3850 
3851 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3852 		if (rxseg->rx_count == 1 &&
3853 		    SLIST_NEXT(rxseg, rx_link) == NULL &&
3854 		    (rxseg->rx_flags & TCP_RXT_DSACK_FOR_TLP)) {
3855 			dsack_for_tlp = TRUE;
3856 			break;
3857 		}
3858 	}
3859 	return dsack_for_tlp;
3860 }
3861 
3862 u_int32_t
tcp_rxtseg_total_size(struct tcpcb * tp)3863 tcp_rxtseg_total_size(struct tcpcb *tp)
3864 {
3865 	struct tcp_rxt_seg *rxseg;
3866 	u_int32_t total_size = 0;
3867 
3868 	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3869 		total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
3870 	}
3871 	return total_size;
3872 }
3873 
3874 void
tcp_get_connectivity_status(struct tcpcb * tp,struct tcp_conn_status * connstatus)3875 tcp_get_connectivity_status(struct tcpcb *tp,
3876     struct tcp_conn_status *connstatus)
3877 {
3878 	if (tp == NULL || connstatus == NULL) {
3879 		return;
3880 	}
3881 	bzero(connstatus, sizeof(*connstatus));
3882 	if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
3883 		if (TCPS_HAVEESTABLISHED(tp->t_state)) {
3884 			connstatus->write_probe_failed = 1;
3885 		} else {
3886 			connstatus->conn_probe_failed = 1;
3887 		}
3888 	}
3889 	if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX) {
3890 		connstatus->read_probe_failed = 1;
3891 	}
3892 	if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
3893 	    (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) {
3894 		connstatus->probe_activated = 1;
3895 	}
3896 }
3897 
3898 boolean_t
tfo_enabled(const struct tcpcb * tp)3899 tfo_enabled(const struct tcpcb *tp)
3900 {
3901 	return (tp->t_flagsext & TF_FASTOPEN)? TRUE : FALSE;
3902 }
3903 
3904 void
tcp_disable_tfo(struct tcpcb * tp)3905 tcp_disable_tfo(struct tcpcb *tp)
3906 {
3907 	tp->t_flagsext &= ~TF_FASTOPEN;
3908 }
3909 
3910 static struct mbuf *
tcp_make_keepalive_frame(struct tcpcb * tp,struct ifnet * ifp,boolean_t is_probe)3911 tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp,
3912     boolean_t is_probe)
3913 {
3914 	struct inpcb *inp = tp->t_inpcb;
3915 	struct tcphdr *th;
3916 	u_int8_t *data;
3917 	int win = 0;
3918 	struct mbuf *m;
3919 
3920 	/*
3921 	 * The code assumes the IP + TCP headers fit in an mbuf packet header
3922 	 */
3923 	_CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
3924 	_CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
3925 
3926 	MGETHDR(m, M_WAIT, MT_HEADER);
3927 	if (m == NULL) {
3928 		return NULL;
3929 	}
3930 	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
3931 
3932 	data = mbuf_datastart(m);
3933 
3934 	if (inp->inp_vflag & INP_IPV4) {
3935 		bzero(data, sizeof(struct ip) + sizeof(struct tcphdr));
3936 		th = (struct tcphdr *)(void *) (data + sizeof(struct ip));
3937 		m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
3938 		m->m_pkthdr.len = m->m_len;
3939 	} else {
3940 		VERIFY(inp->inp_vflag & INP_IPV6);
3941 
3942 		bzero(data, sizeof(struct ip6_hdr)
3943 		    + sizeof(struct tcphdr));
3944 		th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr));
3945 		m->m_len = sizeof(struct ip6_hdr) +
3946 		    sizeof(struct tcphdr);
3947 		m->m_pkthdr.len = m->m_len;
3948 	}
3949 
3950 	tcp_fillheaders(m, tp, data, th);
3951 
3952 	if (inp->inp_vflag & INP_IPV4) {
3953 		struct ip *ip;
3954 
3955 		ip = (__typeof__(ip))(void *)data;
3956 
3957 		ip->ip_id = rfc6864 ? 0 : ip_randomid((uint64_t)m);
3958 		ip->ip_off = htons(IP_DF);
3959 		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
3960 		ip->ip_ttl = inp->inp_ip_ttl;
3961 		ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
3962 		ip->ip_sum = in_cksum_hdr(ip);
3963 	} else {
3964 		struct ip6_hdr *ip6;
3965 
3966 		ip6 = (__typeof__(ip6))(void *)data;
3967 
3968 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
3969 		ip6->ip6_hlim = in6_selecthlim(inp, ifp);
3970 		ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
3971 
3972 		if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
3973 			ip6->ip6_src.s6_addr16[1] = 0;
3974 		}
3975 		if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
3976 			ip6->ip6_dst.s6_addr16[1] = 0;
3977 		}
3978 	}
3979 	th->th_flags = TH_ACK;
3980 
3981 	win = tcp_sbspace(tp);
3982 	if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale)) {
3983 		win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
3984 	}
3985 	th->th_win = htons((u_short) (win >> tp->rcv_scale));
3986 
3987 	if (is_probe) {
3988 		th->th_seq = htonl(tp->snd_una - 1);
3989 	} else {
3990 		th->th_seq = htonl(tp->snd_una);
3991 	}
3992 	th->th_ack = htonl(tp->rcv_nxt);
3993 
3994 	/* Force recompute TCP checksum to be the final value */
3995 	th->th_sum = 0;
3996 	if (inp->inp_vflag & INP_IPV4) {
3997 		th->th_sum = inet_cksum(m, IPPROTO_TCP,
3998 		    sizeof(struct ip), sizeof(struct tcphdr));
3999 	} else {
4000 		th->th_sum = inet6_cksum(m, IPPROTO_TCP,
4001 		    sizeof(struct ip6_hdr), sizeof(struct tcphdr));
4002 	}
4003 
4004 	return m;
4005 }
4006 
4007 void
tcp_fill_keepalive_offload_frames(ifnet_t ifp,struct ifnet_keepalive_offload_frame * frames_array,u_int32_t frames_array_count,size_t frame_data_offset,u_int32_t * used_frames_count)4008 tcp_fill_keepalive_offload_frames(ifnet_t ifp,
4009     struct ifnet_keepalive_offload_frame *frames_array,
4010     u_int32_t frames_array_count, size_t frame_data_offset,
4011     u_int32_t *used_frames_count)
4012 {
4013 	struct inpcb *inp;
4014 	inp_gen_t gencnt;
4015 	u_int32_t frame_index = *used_frames_count;
4016 
4017 	if (ifp == NULL || frames_array == NULL ||
4018 	    frames_array_count == 0 ||
4019 	    frame_index >= frames_array_count ||
4020 	    frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4021 		return;
4022 	}
4023 
4024 	/*
4025 	 * This function is called outside the regular TCP processing
4026 	 * so we need to update the TCP clock.
4027 	 */
4028 	calculate_tcp_clock();
4029 
4030 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
4031 	gencnt = tcbinfo.ipi_gencnt;
4032 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4033 		struct socket *so;
4034 		struct ifnet_keepalive_offload_frame *frame;
4035 		struct mbuf *m = NULL;
4036 		struct tcpcb *tp = intotcpcb(inp);
4037 
4038 		if (frame_index >= frames_array_count) {
4039 			break;
4040 		}
4041 
4042 		if (inp->inp_gencnt > gencnt ||
4043 		    inp->inp_state == INPCB_STATE_DEAD) {
4044 			continue;
4045 		}
4046 
4047 		if ((so = inp->inp_socket) == NULL ||
4048 		    (so->so_state & SS_DEFUNCT)) {
4049 			continue;
4050 		}
4051 		/*
4052 		 * check for keepalive offload flag without socket
4053 		 * lock to avoid a deadlock
4054 		 */
4055 		if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4056 			continue;
4057 		}
4058 
4059 		if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
4060 			continue;
4061 		}
4062 		if (inp->inp_ppcb == NULL ||
4063 		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
4064 			continue;
4065 		}
4066 		socket_lock(so, 1);
4067 		/* Release the want count */
4068 		if (inp->inp_ppcb == NULL ||
4069 		    (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
4070 			socket_unlock(so, 1);
4071 			continue;
4072 		}
4073 		if ((inp->inp_vflag & INP_IPV4) &&
4074 		    (inp->inp_laddr.s_addr == INADDR_ANY ||
4075 		    inp->inp_faddr.s_addr == INADDR_ANY)) {
4076 			socket_unlock(so, 1);
4077 			continue;
4078 		}
4079 		if ((inp->inp_vflag & INP_IPV6) &&
4080 		    (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
4081 		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
4082 			socket_unlock(so, 1);
4083 			continue;
4084 		}
4085 		if (inp->inp_lport == 0 || inp->inp_fport == 0) {
4086 			socket_unlock(so, 1);
4087 			continue;
4088 		}
4089 		if (inp->inp_last_outifp == NULL ||
4090 		    inp->inp_last_outifp->if_index != ifp->if_index) {
4091 			socket_unlock(so, 1);
4092 			continue;
4093 		}
4094 		if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
4095 		    sizeof(struct ip) + sizeof(struct tcphdr) >
4096 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4097 			socket_unlock(so, 1);
4098 			continue;
4099 		} else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
4100 		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
4101 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4102 			socket_unlock(so, 1);
4103 			continue;
4104 		}
4105 		/*
4106 		 * There is no point in waking up the device for connections
4107 		 * that are not established. Long lived connection are meant
4108 		 * for processes that will sent and receive data
4109 		 */
4110 		if (tp->t_state != TCPS_ESTABLISHED) {
4111 			socket_unlock(so, 1);
4112 			continue;
4113 		}
4114 		/*
4115 		 * This inp has all the information that is needed to
4116 		 * generate an offload frame.
4117 		 */
4118 		frame = &frames_array[frame_index];
4119 		frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
4120 		frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
4121 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
4122 		    IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
4123 		frame->interval = (uint16_t)(tp->t_keepidle > 0 ? tp->t_keepidle :
4124 		    tcp_keepidle);
4125 		frame->keep_cnt = (uint8_t)TCP_CONN_KEEPCNT(tp);
4126 		frame->keep_retry = (uint16_t)TCP_CONN_KEEPINTVL(tp);
4127 		if (so->so_options & SO_NOWAKEFROMSLEEP) {
4128 			frame->flags |=
4129 			    IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP;
4130 		}
4131 		frame->local_port = ntohs(inp->inp_lport);
4132 		frame->remote_port = ntohs(inp->inp_fport);
4133 		frame->local_seq = tp->snd_nxt;
4134 		frame->remote_seq = tp->rcv_nxt;
4135 		if (inp->inp_vflag & INP_IPV4) {
4136 			ASSERT(frame_data_offset + sizeof(struct ip) + sizeof(struct tcphdr) <= UINT8_MAX);
4137 			frame->length = (uint8_t)(frame_data_offset +
4138 			    sizeof(struct ip) + sizeof(struct tcphdr));
4139 			frame->reply_length =  frame->length;
4140 
4141 			frame->addr_length = sizeof(struct in_addr);
4142 			bcopy(&inp->inp_laddr, frame->local_addr,
4143 			    sizeof(struct in_addr));
4144 			bcopy(&inp->inp_faddr, frame->remote_addr,
4145 			    sizeof(struct in_addr));
4146 		} else {
4147 			struct in6_addr *ip6;
4148 
4149 			ASSERT(frame_data_offset + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= UINT8_MAX);
4150 			frame->length = (uint8_t)(frame_data_offset +
4151 			    sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
4152 			frame->reply_length =  frame->length;
4153 
4154 			frame->addr_length = sizeof(struct in6_addr);
4155 			ip6 = (struct in6_addr *)(void *)frame->local_addr;
4156 			bcopy(&inp->in6p_laddr, ip6, sizeof(struct in6_addr));
4157 			if (IN6_IS_SCOPE_EMBED(ip6)) {
4158 				ip6->s6_addr16[1] = 0;
4159 			}
4160 
4161 			ip6 = (struct in6_addr *)(void *)frame->remote_addr;
4162 			bcopy(&inp->in6p_faddr, ip6, sizeof(struct in6_addr));
4163 			if (IN6_IS_SCOPE_EMBED(ip6)) {
4164 				ip6->s6_addr16[1] = 0;
4165 			}
4166 		}
4167 
4168 		/*
4169 		 * First the probe
4170 		 */
4171 		m = tcp_make_keepalive_frame(tp, ifp, TRUE);
4172 		if (m == NULL) {
4173 			socket_unlock(so, 1);
4174 			continue;
4175 		}
4176 		bcopy(m->m_data, frame->data + frame_data_offset,
4177 		    m->m_len);
4178 		m_freem(m);
4179 
4180 		/*
4181 		 * Now the response packet to incoming probes
4182 		 */
4183 		m = tcp_make_keepalive_frame(tp, ifp, FALSE);
4184 		if (m == NULL) {
4185 			socket_unlock(so, 1);
4186 			continue;
4187 		}
4188 		bcopy(m->m_data, frame->reply_data + frame_data_offset,
4189 		    m->m_len);
4190 		m_freem(m);
4191 
4192 		frame_index++;
4193 		socket_unlock(so, 1);
4194 	}
4195 	lck_rw_done(&tcbinfo.ipi_lock);
4196 	*used_frames_count = frame_index;
4197 }
4198 
4199 static bool
inp_matches_kao_frame(ifnet_t ifp,struct ifnet_keepalive_offload_frame * frame,struct inpcb * inp)4200 inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame,
4201     struct inpcb *inp)
4202 {
4203 	if (inp->inp_ppcb == NULL) {
4204 		return false;
4205 	}
4206 	/* Release the want count */
4207 	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
4208 		return false;
4209 	}
4210 	if (inp->inp_last_outifp == NULL ||
4211 	    inp->inp_last_outifp->if_index != ifp->if_index) {
4212 		return false;
4213 	}
4214 	if (frame->local_port != ntohs(inp->inp_lport) ||
4215 	    frame->remote_port != ntohs(inp->inp_fport)) {
4216 		return false;
4217 	}
4218 	if (inp->inp_vflag & INP_IPV4) {
4219 		if (memcmp(&inp->inp_laddr, frame->local_addr,
4220 		    sizeof(struct in_addr)) != 0 ||
4221 		    memcmp(&inp->inp_faddr, frame->remote_addr,
4222 		    sizeof(struct in_addr)) != 0) {
4223 			return false;
4224 		}
4225 	} else if (inp->inp_vflag & INP_IPV6) {
4226 		if (memcmp(&inp->inp_laddr, frame->local_addr,
4227 		    sizeof(struct in6_addr)) != 0 ||
4228 		    memcmp(&inp->inp_faddr, frame->remote_addr,
4229 		    sizeof(struct in6_addr)) != 0) {
4230 			return false;
4231 		}
4232 	} else {
4233 		return false;
4234 	}
4235 	return true;
4236 }
4237 
4238 int
tcp_notify_kao_timeout(ifnet_t ifp,struct ifnet_keepalive_offload_frame * frame)4239 tcp_notify_kao_timeout(ifnet_t ifp,
4240     struct ifnet_keepalive_offload_frame *frame)
4241 {
4242 	struct inpcb *inp = NULL;
4243 	struct socket *so = NULL;
4244 	bool found = false;
4245 
4246 	/*
4247 	 *  Unlock the list before posting event on the matching socket
4248 	 */
4249 	lck_rw_lock_shared(&tcbinfo.ipi_lock);
4250 
4251 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4252 		if ((so = inp->inp_socket) == NULL ||
4253 		    (so->so_state & SS_DEFUNCT)) {
4254 			continue;
4255 		}
4256 		if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4257 			continue;
4258 		}
4259 		if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
4260 			continue;
4261 		}
4262 		if (inp->inp_ppcb == NULL ||
4263 		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
4264 			continue;
4265 		}
4266 		socket_lock(so, 1);
4267 		if (inp_matches_kao_frame(ifp, frame, inp)) {
4268 			/*
4269 			 * Keep the matching socket locked
4270 			 */
4271 			found = true;
4272 			break;
4273 		}
4274 		socket_unlock(so, 1);
4275 	}
4276 	lck_rw_done(&tcbinfo.ipi_lock);
4277 
4278 	if (found) {
4279 		ASSERT(inp != NULL);
4280 		ASSERT(so != NULL);
4281 		ASSERT(so == inp->inp_socket);
4282 		/*
4283 		 * Drop the TCP connection like tcptimers() does
4284 		 */
4285 		struct tcpcb *tp = inp->inp_ppcb;
4286 
4287 		tcpstat.tcps_keepdrops++;
4288 		soevent(so,
4289 		    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
4290 		tp = tcp_drop(tp, ETIMEDOUT);
4291 
4292 		tcpstat.tcps_ka_offload_drops++;
4293 		os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n",
4294 		    __func__, frame->local_port, frame->remote_port);
4295 
4296 		socket_unlock(so, 1);
4297 	}
4298 
4299 	return 0;
4300 }
4301 
4302 errno_t
tcp_notify_ack_id_valid(struct tcpcb * tp,struct socket * so,u_int32_t notify_id)4303 tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
4304     u_int32_t notify_id)
4305 {
4306 	struct tcp_notify_ack_marker *elm;
4307 
4308 	if (so->so_snd.sb_cc == 0) {
4309 		return ENOBUFS;
4310 	}
4311 
4312 	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4313 		/* Duplicate id is not allowed */
4314 		if (elm->notify_id == notify_id) {
4315 			return EINVAL;
4316 		}
4317 		/* Duplicate position is not allowed */
4318 		if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc) {
4319 			return EINVAL;
4320 		}
4321 	}
4322 	return 0;
4323 }
4324 
4325 errno_t
tcp_add_notify_ack_marker(struct tcpcb * tp,u_int32_t notify_id)4326 tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
4327 {
4328 	struct tcp_notify_ack_marker *nm, *elm = NULL;
4329 	struct socket *so = tp->t_inpcb->inp_socket;
4330 
4331 	nm = kalloc_type(struct tcp_notify_ack_marker, M_WAIT | Z_ZERO);
4332 	if (nm == NULL) {
4333 		return ENOMEM;
4334 	}
4335 	nm->notify_id = notify_id;
4336 	nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;
4337 
4338 	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4339 		if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una)) {
4340 			break;
4341 		}
4342 	}
4343 
4344 	if (elm == NULL) {
4345 		VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
4346 		SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
4347 	} else {
4348 		SLIST_INSERT_AFTER(elm, nm, notify_next);
4349 	}
4350 	tp->t_notify_ack_count++;
4351 	return 0;
4352 }
4353 
4354 void
tcp_notify_ack_free(struct tcpcb * tp)4355 tcp_notify_ack_free(struct tcpcb *tp)
4356 {
4357 	struct tcp_notify_ack_marker *elm, *next;
4358 	if (SLIST_EMPTY(&tp->t_notify_ack)) {
4359 		return;
4360 	}
4361 
4362 	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
4363 		SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
4364 		    notify_next);
4365 		kfree_type(struct tcp_notify_ack_marker, elm);
4366 	}
4367 	SLIST_INIT(&tp->t_notify_ack);
4368 	tp->t_notify_ack_count = 0;
4369 }
4370 
4371 inline void
tcp_notify_acknowledgement(struct tcpcb * tp,struct socket * so)4372 tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so)
4373 {
4374 	struct tcp_notify_ack_marker *elm;
4375 
4376 	elm = SLIST_FIRST(&tp->t_notify_ack);
4377 	if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4378 		soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK);
4379 	}
4380 }
4381 
4382 void
tcp_get_notify_ack_count(struct tcpcb * tp,struct tcp_notify_ack_complete * retid)4383 tcp_get_notify_ack_count(struct tcpcb *tp,
4384     struct tcp_notify_ack_complete *retid)
4385 {
4386 	struct tcp_notify_ack_marker *elm;
4387 	uint32_t  complete = 0;
4388 
4389 	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4390 		if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4391 			ASSERT(complete < UINT32_MAX);
4392 			complete++;
4393 		} else {
4394 			break;
4395 		}
4396 	}
4397 	retid->notify_pending = tp->t_notify_ack_count - complete;
4398 	retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, complete);
4399 }
4400 
4401 void
tcp_get_notify_ack_ids(struct tcpcb * tp,struct tcp_notify_ack_complete * retid)4402 tcp_get_notify_ack_ids(struct tcpcb *tp,
4403     struct tcp_notify_ack_complete *retid)
4404 {
4405 	size_t i = 0;
4406 	struct tcp_notify_ack_marker *elm, *next;
4407 
4408 	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
4409 		if (i >= retid->notify_complete_count) {
4410 			break;
4411 		}
4412 		if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4413 			retid->notify_complete_id[i++] = elm->notify_id;
4414 			SLIST_REMOVE(&tp->t_notify_ack, elm,
4415 			    tcp_notify_ack_marker, notify_next);
4416 			kfree_type(struct tcp_notify_ack_marker, elm);
4417 			tp->t_notify_ack_count--;
4418 		} else {
4419 			break;
4420 		}
4421 	}
4422 }
4423 
4424 bool
tcp_notify_ack_active(struct socket * so)4425 tcp_notify_ack_active(struct socket *so)
4426 {
4427 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
4428 	    SOCK_TYPE(so) == SOCK_STREAM) {
4429 		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4430 
4431 		if (!SLIST_EMPTY(&tp->t_notify_ack)) {
4432 			struct tcp_notify_ack_marker *elm;
4433 			elm = SLIST_FIRST(&tp->t_notify_ack);
4434 			if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4435 				return true;
4436 			}
4437 		}
4438 	}
4439 	return false;
4440 }
4441 
4442 inline int32_t
inp_get_sndbytes_allunsent(struct socket * so,u_int32_t th_ack)4443 inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
4444 {
4445 	struct inpcb *inp = sotoinpcb(so);
4446 	struct tcpcb *tp = intotcpcb(inp);
4447 
4448 	if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
4449 	    so->so_snd.sb_cc > 0) {
4450 		int32_t unsent, sent;
4451 		sent = tp->snd_max - th_ack;
4452 		if (tp->t_flags & TF_SENTFIN) {
4453 			sent--;
4454 		}
4455 		unsent = so->so_snd.sb_cc - sent;
4456 		return unsent;
4457 	}
4458 	return 0;
4459 }
4460 
/*
 * Increment the per-interface counter `_stat_` in either the IPv4 or the
 * IPv6 statistics block, selected by `_ipv4_`.
 *
 * The macro expects a variable named `ifp` to be in scope at the point
 * of use.  It is wrapped in do { } while (0) so that it behaves as a
 * single statement and can be followed by a semicolon anywhere,
 * including in an unbraced if/else.
 */
#define IFP_PER_FLOW_STAT(_ipv4_, _stat_) do { \
	if (_ipv4_) { \
	        ifp->if_ipv4_stat->_stat_++; \
	} else { \
	        ifp->if_ipv6_stat->_stat_++; \
	} \
} while (0)

/*
 * True when every bit of TE_ECN_ON is set in `_flags_`.  The argument is
 * parenthesized so that an expression such as (a | b) is masked as a
 * whole.
 */
#define FLOW_ECN_ENABLED(_flags_) \
    (((_flags_) & (TE_ECN_ON)) == (TE_ECN_ON))
4471 
4472 void
tcp_update_stats_per_flow(struct ifnet_stats_per_flow * ifs,struct ifnet * ifp)4473 tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
4474     struct ifnet *ifp)
4475 {
4476 	if (ifp == NULL || !IF_FULLY_ATTACHED(ifp)) {
4477 		return;
4478 	}
4479 
4480 	ifnet_lock_shared(ifp);
4481 	if (ifs->ecn_flags & TE_SETUPSENT) {
4482 		if (ifs->ecn_flags & TE_CLIENT_SETUP) {
4483 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
4484 			if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4485 				IFP_PER_FLOW_STAT(ifs->ipv4,
4486 				    ecn_client_success);
4487 			} else if (ifs->ecn_flags & TE_LOST_SYN) {
4488 				IFP_PER_FLOW_STAT(ifs->ipv4,
4489 				    ecn_syn_lost);
4490 			} else {
4491 				IFP_PER_FLOW_STAT(ifs->ipv4,
4492 				    ecn_peer_nosupport);
4493 			}
4494 		} else {
4495 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
4496 			if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4497 				IFP_PER_FLOW_STAT(ifs->ipv4,
4498 				    ecn_server_success);
4499 			} else if (ifs->ecn_flags & TE_LOST_SYN) {
4500 				IFP_PER_FLOW_STAT(ifs->ipv4,
4501 				    ecn_synack_lost);
4502 			} else {
4503 				IFP_PER_FLOW_STAT(ifs->ipv4,
4504 				    ecn_peer_nosupport);
4505 			}
4506 		}
4507 	} else {
4508 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
4509 	}
4510 	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4511 		if (ifs->ecn_flags & TE_RECV_ECN_CE) {
4512 			tcpstat.tcps_ecn_conn_recv_ce++;
4513 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
4514 		}
4515 		if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
4516 			tcpstat.tcps_ecn_conn_recv_ece++;
4517 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
4518 		}
4519 		if (ifs->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) {
4520 			if (ifs->txretransmitbytes > 0 ||
4521 			    ifs->rxoutoforderbytes > 0) {
4522 				tcpstat.tcps_ecn_conn_pl_ce++;
4523 				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
4524 			} else {
4525 				tcpstat.tcps_ecn_conn_nopl_ce++;
4526 				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
4527 			}
4528 		} else {
4529 			if (ifs->txretransmitbytes > 0 ||
4530 			    ifs->rxoutoforderbytes > 0) {
4531 				tcpstat.tcps_ecn_conn_plnoce++;
4532 				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
4533 			}
4534 		}
4535 	}
4536 
4537 	/* Other stats are interesting for non-local connections only */
4538 	if (ifs->local) {
4539 		ifnet_lock_done(ifp);
4540 		return;
4541 	}
4542 
4543 	if (ifs->ipv4) {
4544 		ifp->if_ipv4_stat->timestamp = net_uptime();
4545 		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4546 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_on);
4547 		} else {
4548 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_off);
4549 		}
4550 	} else {
4551 		ifp->if_ipv6_stat->timestamp = net_uptime();
4552 		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4553 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_on);
4554 		} else {
4555 			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_off);
4556 		}
4557 	}
4558 
4559 	if (ifs->rxmit_drop) {
4560 		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4561 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
4562 		} else {
4563 			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
4564 		}
4565 	}
4566 	if (ifs->ecn_fallback_synloss) {
4567 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
4568 	}
4569 	if (ifs->ecn_fallback_droprst) {
4570 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
4571 	}
4572 	if (ifs->ecn_fallback_droprxmt) {
4573 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
4574 	}
4575 	if (ifs->ecn_fallback_ce) {
4576 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
4577 	}
4578 	if (ifs->ecn_fallback_reorder) {
4579 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
4580 	}
4581 	if (ifs->ecn_recv_ce > 0) {
4582 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
4583 	}
4584 	if (ifs->ecn_recv_ece > 0) {
4585 		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);
4586 	}
4587 
4588 	tcp_flow_lim_stats(ifs, &ifp->if_lim_stat);
4589 	ifnet_lock_done(ifp);
4590 }
4591 
4592 #if SKYWALK
4593 
4594 #include <skywalk/core/skywalk_var.h>
4595 
4596 void
tcp_add_fsw_flow(struct tcpcb * tp,struct ifnet * ifp)4597 tcp_add_fsw_flow(struct tcpcb *tp, struct ifnet *ifp)
4598 {
4599 	struct inpcb *inp = tp->t_inpcb;
4600 	struct socket *so = inp->inp_socket;
4601 	uuid_t fsw_uuid;
4602 	struct nx_flow_req nfr;
4603 	int err;
4604 
4605 	if (sk_fsw_rx_agg_tcp == 0) {
4606 		return;
4607 	}
4608 
4609 	if (ifp == NULL || kern_nexus_get_flowswitch_instance(ifp, fsw_uuid)) {
4610 		TCP_LOG_FSW_FLOW(tp, "skip ifp no fsw");
4611 		return;
4612 	}
4613 
4614 	memset(&nfr, 0, sizeof(nfr));
4615 
4616 	if (inp->inp_vflag & INP_IPV4) {
4617 		ASSERT(!(inp->inp_laddr.s_addr == INADDR_ANY ||
4618 		    inp->inp_faddr.s_addr == INADDR_ANY ||
4619 		    IN_MULTICAST(ntohl(inp->inp_laddr.s_addr)) ||
4620 		    IN_MULTICAST(ntohl(inp->inp_faddr.s_addr))));
4621 		nfr.nfr_saddr.sin.sin_len = sizeof(struct sockaddr_in);
4622 		nfr.nfr_saddr.sin.sin_family = AF_INET;
4623 		nfr.nfr_saddr.sin.sin_port = inp->inp_lport;
4624 		memcpy(&nfr.nfr_saddr.sin.sin_addr, &inp->inp_laddr,
4625 		    sizeof(struct in_addr));
4626 		nfr.nfr_daddr.sin.sin_len = sizeof(struct sockaddr_in);
4627 		nfr.nfr_daddr.sin.sin_family = AF_INET;
4628 		nfr.nfr_daddr.sin.sin_port = inp->inp_fport;
4629 		memcpy(&nfr.nfr_daddr.sin.sin_addr, &inp->inp_faddr,
4630 		    sizeof(struct in_addr));
4631 	} else {
4632 		ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
4633 		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
4634 		    IN6_IS_ADDR_MULTICAST(&inp->in6p_laddr) ||
4635 		    IN6_IS_ADDR_MULTICAST(&inp->in6p_faddr)));
4636 		nfr.nfr_saddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
4637 		nfr.nfr_saddr.sin6.sin6_family = AF_INET6;
4638 		nfr.nfr_saddr.sin6.sin6_port = inp->inp_lport;
4639 		memcpy(&nfr.nfr_saddr.sin6.sin6_addr, &inp->in6p_laddr,
4640 		    sizeof(struct in6_addr));
4641 		nfr.nfr_daddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
4642 		nfr.nfr_daddr.sin.sin_family = AF_INET6;
4643 		nfr.nfr_daddr.sin6.sin6_port = inp->inp_fport;
4644 		memcpy(&nfr.nfr_daddr.sin6.sin6_addr, &inp->in6p_faddr,
4645 		    sizeof(struct in6_addr));
4646 		/* clear embedded scope ID */
4647 		if (IN6_IS_SCOPE_EMBED(&nfr.nfr_saddr.sin6.sin6_addr)) {
4648 			nfr.nfr_saddr.sin6.sin6_addr.s6_addr16[1] = 0;
4649 		}
4650 		if (IN6_IS_SCOPE_EMBED(&nfr.nfr_daddr.sin6.sin6_addr)) {
4651 			nfr.nfr_daddr.sin6.sin6_addr.s6_addr16[1] = 0;
4652 		}
4653 	}
4654 
4655 	nfr.nfr_nx_port = 1;
4656 	nfr.nfr_ip_protocol = IPPROTO_TCP;
4657 	nfr.nfr_transport_protocol = IPPROTO_TCP;
4658 	nfr.nfr_flags = NXFLOWREQF_ASIS;
4659 	nfr.nfr_epid = (so != NULL ? so->last_pid : 0);
4660 	if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
4661 		nfr.nfr_port_reservation = inp->inp_netns_token;
4662 		nfr.nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
4663 	}
4664 	nfr.nfr_inp_flowhash = inp->inp_flowhash;
4665 
4666 	uuid_generate_random(nfr.nfr_flow_uuid);
4667 	err = kern_nexus_flow_add(kern_nexus_shared_controller(), fsw_uuid,
4668 	    &nfr, sizeof(nfr));
4669 
4670 	if (err == 0) {
4671 		uuid_copy(tp->t_fsw_uuid, fsw_uuid);
4672 		uuid_copy(tp->t_flow_uuid, nfr.nfr_flow_uuid);
4673 	}
4674 
4675 	TCP_LOG_FSW_FLOW(tp, "add err %d\n", err);
4676 }
4677 
4678 void
tcp_del_fsw_flow(struct tcpcb * tp)4679 tcp_del_fsw_flow(struct tcpcb *tp)
4680 {
4681 	if (uuid_is_null(tp->t_fsw_uuid) || uuid_is_null(tp->t_flow_uuid)) {
4682 		return;
4683 	}
4684 
4685 	struct nx_flow_req nfr;
4686 	uuid_copy(nfr.nfr_flow_uuid, tp->t_flow_uuid);
4687 
4688 	/* It's possible for this call to fail if the nexus has detached */
4689 	int err = kern_nexus_flow_del(kern_nexus_shared_controller(),
4690 	    tp->t_fsw_uuid, &nfr, sizeof(nfr));
4691 	VERIFY(err == 0 || err == ENOENT || err == ENXIO);
4692 
4693 	uuid_clear(tp->t_fsw_uuid);
4694 	uuid_clear(tp->t_flow_uuid);
4695 
4696 	TCP_LOG_FSW_FLOW(tp, "del err %d\n", err);
4697 }
4698 
4699 #endif /* SKYWALK */
4700