xref: /xnu-10063.121.3/bsd/netinet/tcp_timer.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
61  * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62  */
63 
64 #include "tcp_includes.h"
65 
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/mbuf.h>
70 #include <sys/sysctl.h>
71 #include <sys/socket.h>
72 #include <sys/socketvar.h>
73 #include <sys/protosw.h>
74 #include <sys/domain.h>
75 #include <sys/mcache.h>
76 #include <sys/queue.h>
77 #include <kern/locks.h>
78 #include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */
79 #include <mach/boolean.h>
80 
81 #include <net/route.h>
82 #include <net/if_var.h>
83 #include <net/ntstat.h>
84 
85 #include <netinet/in.h>
86 #include <netinet/in_systm.h>
87 #include <netinet/in_pcb.h>
88 #include <netinet/in_var.h>
89 #include <netinet6/in6_pcb.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_cache.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/tcp_cc.h>
98 #include <netinet6/tcp6_var.h>
99 #include <netinet/tcpip.h>
100 #if TCPDEBUG
101 #include <netinet/tcp_debug.h>
102 #endif
103 #include <netinet/tcp_log.h>
104 
105 #include <sys/kdebug.h>
106 #include <mach/sdt.h>
107 #include <netinet/mptcp_var.h>
108 #include <net/content_filter.h>
109 #include <net/sockaddr_utils.h>
110 
111 /* Max number of times a stretch ack can be delayed on a connection */
112 #define TCP_STRETCHACK_DELAY_THRESHOLD  5
113 
114 /*
115  * If the host processor has been sleeping for too long, this is the threshold
116  * used to avoid sending stale retransmissions.
117  */
118 #define TCP_SLEEP_TOO_LONG      (10 * 60 * 1000) /* 10 minutes in ms */
119 
120 /* tcp timer list */
121 struct tcptimerlist tcp_timer_list;
122 
123 /* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
124 struct tcptailq tcp_tw_tailq;
125 
126 
127 static int
128 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
129 {
130 #pragma unused(arg2)
131 	int error, temp;
132 	long s, tt;
133 
134 	tt = *(int *)arg1;
135 	s = tt * 1000 / TCP_RETRANSHZ;
136 	if (tt < 0 || s > INT_MAX) {
137 		return EINVAL;
138 	}
139 	temp = (int)s;
140 
141 	error = sysctl_handle_int(oidp, &temp, 0, req);
142 	if (error || !req->newptr) {
143 		return error;
144 	}
145 
146 	tt = (long)temp * TCP_RETRANSHZ / 1000;
147 	if (tt < 1 || tt > INT_MAX) {
148 		return EINVAL;
149 	}
150 
151 	*(int *)arg1 = (int)tt;
152 	SYSCTL_SKMEM_UPDATE_AT_OFFSET(arg2, *(int*)arg1);
153 	return 0;
154 }
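
/*
 * Illustrative sketch (not part of the original file): the two conversions
 * performed by sysctl_msec_to_ticks() above, written as standalone helpers.
 * The arithmetic is widened to long, matching the handler, so its INT_MAX
 * range checks stay meaningful.
 */
static long
ticks_to_msec_example(int ticks)
{
	return (long)ticks * 1000 / TCP_RETRANSHZ;  /* handler rejects > INT_MAX */
}

static long
msec_to_ticks_example(int msec)
{
	return (long)msec * TCP_RETRANSHZ / 1000;   /* handler rejects < 1 or > INT_MAX */
}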
155 
156 #if SYSCTL_SKMEM
157 int     tcp_keepinit = TCPTV_KEEP_INIT;
158 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
159     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
160     &tcp_keepinit, offsetof(skmem_sysctl, tcp.keepinit),
161     sysctl_msec_to_ticks, "I", "");
162 
163 int     tcp_keepidle = TCPTV_KEEP_IDLE;
164 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
165     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
166     &tcp_keepidle, offsetof(skmem_sysctl, tcp.keepidle),
167     sysctl_msec_to_ticks, "I", "");
168 
169 int     tcp_keepintvl = TCPTV_KEEPINTVL;
170 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
171     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
172     &tcp_keepintvl, offsetof(skmem_sysctl, tcp.keepintvl),
173     sysctl_msec_to_ticks, "I", "");
174 
175 SYSCTL_SKMEM_TCP_INT(OID_AUTO, keepcnt,
176     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
177     int, tcp_keepcnt, TCPTV_KEEPCNT, "number of times to repeat keepalive");
178 
179 int     tcp_msl = TCPTV_MSL;
180 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
181     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
182     &tcp_msl, offsetof(skmem_sysctl, tcp.msl),
183     sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
184 #else /* SYSCTL_SKMEM */
185 int     tcp_keepinit;
186 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
187     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
188     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
189 
190 int     tcp_keepidle;
191 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
192     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
193     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
194 
195 int     tcp_keepintvl;
196 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
197     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
198     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
199 
200 int     tcp_keepcnt;
201 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
202     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
203     &tcp_keepcnt, 0, "number of times to repeat keepalive");
204 
205 int     tcp_msl;
206 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
207     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
208     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
209 #endif /* SYSCTL_SKMEM */
210 
211 /*
212  * Avoid DoS with connections half-closed in FIN_WAIT_2
213  */
214 int     tcp_fin_timeout = TCPTV_FINWAIT2;
215 
216 static int
217 sysctl_tcp_fin_timeout SYSCTL_HANDLER_ARGS
218 {
219 #pragma unused(arg2)
220 	int error;
221 	int value = tcp_fin_timeout;
222 
223 	error = sysctl_handle_int(oidp, &value, 0, req);
224 	if (error != 0 || req->newptr == USER_ADDR_NULL) {
225 		return error;
226 	}
227 
228 	if (value == -1) {
229 		/* Reset to default value */
230 		value = TCPTV_FINWAIT2;
231 	} else {
232 		/* Convert from milliseconds */
233 		long big_value = value * TCP_RETRANSHZ / 1000;
234 
235 		if (big_value < 0 || big_value > INT_MAX) {
236 			return EINVAL;
237 		}
238 		value = (int)big_value;
239 	}
240 	tcp_fin_timeout = value;
241 	SYSCTL_SKMEM_UPDATE_AT_OFFSET(arg2, value);
242 	return 0;
243 }
244 
245 #if SYSCTL_SKMEM
246 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fin_timeout,
247     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
248     &tcp_fin_timeout, offsetof(skmem_sysctl, tcp.fin_timeout),
249     sysctl_tcp_fin_timeout, "I", "");
250 #else /* SYSCTL_SKMEM */
251 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fin_timeout,
252     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
253     &tcp_fin_timeout, 0,
254     sysctl_tcp_fin_timeout, "I", "");
255 #endif /* SYSCTL_SKMEM */
256 
257 /*
258  * Avoid DoS via TCP Robustness in Persist Condition
259  * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
260  * by allowing a system wide maximum persistence timeout value when in
261  * Zero Window Probe mode.
262  *
263  * Expressed in milliseconds to be consistent with other timeout-related
264  * values; the TCP socket option is in seconds.
265  */
266 #if SYSCTL_SKMEM
267 u_int32_t tcp_max_persist_timeout = 0;
268 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
269     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
270     &tcp_max_persist_timeout, offsetof(skmem_sysctl, tcp.max_persist_timeout),
271     sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP");
272 #else /* SYSCTL_SKMEM */
273 u_int32_t tcp_max_persist_timeout = 0;
274 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
275     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
276     &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
277     "Maximum persistence timeout for ZWP");
278 #endif /* SYSCTL_SKMEM */
279 
280 SYSCTL_SKMEM_TCP_INT(OID_AUTO, always_keepalive,
281     CTLFLAG_RW | CTLFLAG_LOCKED, static int, always_keepalive, 0,
282     "Assume SO_KEEPALIVE on all TCP connections");
283 
284 /*
285  * This parameter determines how long the timer list will stay in fast or
286  * quick mode even though all connections are idle. In this state, the
287  * timer will run more frequently anticipating new data.
288  */
289 SYSCTL_SKMEM_TCP_INT(OID_AUTO, timer_fastmode_idlemax,
290     CTLFLAG_RW | CTLFLAG_LOCKED, int, timer_fastmode_idlemax,
291     TCP_FASTMODE_IDLERUN_MAX, "Maximum idle generations in fast mode");
292 
293 /*
294  * See tcp_syn_backoff[] for interval values between SYN retransmits;
295  * the value set below defines the number of retransmits before we
296  * disable the timestamp and window scaling options during subsequent
297  * SYN retransmits.  Setting it to 0 disables the dropping off of those
298  * two options.
299  */
300 SYSCTL_SKMEM_TCP_INT(OID_AUTO, broken_peer_syn_rexmit_thres,
301     CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_broken_peer_syn_rxmit_thres,
302     10, "Number of retransmitted SYNs before disabling RFC 1323 "
303     "options on local connections");
304 
305 static int tcp_timer_advanced = 0;
306 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
307     CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
308     "Number of times one of the timers was advanced");
309 
310 static int tcp_resched_timerlist = 0;
311 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
312     CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
313     "Number of times timer list was rescheduled as part of processing a packet");
314 
315 SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_detection,
316     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_detect, 1,
317     "Path MTU Discovery Black Hole Detection");
318 
319 SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_mss,
320     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_mss, 1200,
321     "Path MTU Discovery Black Hole Detection lowered MSS");
322 
323 #if (DEBUG || DEVELOPMENT)
324 int tcp_probe_if_fix_port = 0;
325 SYSCTL_INT(_net_inet_tcp, OID_AUTO, probe_if_fix_port,
326     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
327     &tcp_probe_if_fix_port, 0, "");
328 #endif /* (DEBUG || DEVELOPMENT) */
329 
330 static u_int32_t tcp_mss_rec_medium = 1200;
331 static u_int32_t tcp_mss_rec_low = 512;
332 
333 #define TCP_REPORT_STATS_INTERVAL       43200 /* 12 hours, in seconds */
334 int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL;
335 
336 /* Whether garbage collection of "used" sockets was performed */
337 static boolean_t tcp_gc_done = FALSE;
338 
339 /* max idle probes */
340 int     tcp_maxpersistidle = TCPTV_KEEP_IDLE;
341 
342 /*
343  * TCP delack timer is set to 100 ms. Since the processing of timer list
344  * in fast mode will happen no faster than 100 ms, the delayed ack timer
345  * will fire somewhere between 100 and 200 ms.
346  */
347 int     tcp_delack = TCP_RETRANSHZ / 10;
348 
349 #if MPTCP
350 /*
351  * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
352  */
353 int     tcp_jack_rxmt = TCP_RETRANSHZ / 2;
354 #endif /* MPTCP */
355 
356 static boolean_t tcp_itimer_done = FALSE;
357 
358 static void tcp_remove_timer(struct tcpcb *tp);
359 static void tcp_sched_timerlist(uint32_t offset);
360 static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode,
361     u_int16_t probe_if_index);
362 static inline void tcp_set_lotimer_index(struct tcpcb *);
363 __private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
364 static inline void tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp);
365 __private_extern__ void tcp_report_stats(void);
366 
367 static  u_int64_t tcp_last_report_time;
368 
369 /*
370  * Structure to store previously reported stats so that we can send
371  * incremental changes in each report interval.
372  */
373 struct tcp_last_report_stats {
374 	u_int32_t       tcps_connattempt;
375 	u_int32_t       tcps_accepts;
376 	u_int32_t       tcps_ecn_client_setup;
377 	u_int32_t       tcps_ecn_server_setup;
378 	u_int32_t       tcps_ecn_client_success;
379 	u_int32_t       tcps_ecn_server_success;
380 	u_int32_t       tcps_ecn_not_supported;
381 	u_int32_t       tcps_ecn_lost_syn;
382 	u_int32_t       tcps_ecn_lost_synack;
383 	u_int32_t       tcps_ecn_recv_ce;
384 	u_int32_t       tcps_ecn_recv_ece;
385 	u_int32_t       tcps_ecn_sent_ece;
386 	u_int32_t       tcps_ecn_conn_recv_ce;
387 	u_int32_t       tcps_ecn_conn_recv_ece;
388 	u_int32_t       tcps_ecn_conn_plnoce;
389 	u_int32_t       tcps_ecn_conn_pl_ce;
390 	u_int32_t       tcps_ecn_conn_nopl_ce;
391 	u_int32_t       tcps_ecn_fallback_synloss;
392 	u_int32_t       tcps_ecn_fallback_reorder;
393 	u_int32_t       tcps_ecn_fallback_ce;
394 
395 	/* TFO-related statistics */
396 	u_int32_t       tcps_tfo_syn_data_rcv;
397 	u_int32_t       tcps_tfo_cookie_req_rcv;
398 	u_int32_t       tcps_tfo_cookie_sent;
399 	u_int32_t       tcps_tfo_cookie_invalid;
400 	u_int32_t       tcps_tfo_cookie_req;
401 	u_int32_t       tcps_tfo_cookie_rcv;
402 	u_int32_t       tcps_tfo_syn_data_sent;
403 	u_int32_t       tcps_tfo_syn_data_acked;
404 	u_int32_t       tcps_tfo_syn_loss;
405 	u_int32_t       tcps_tfo_blackhole;
406 	u_int32_t       tcps_tfo_cookie_wrong;
407 	u_int32_t       tcps_tfo_no_cookie_rcv;
408 	u_int32_t       tcps_tfo_heuristics_disable;
409 	u_int32_t       tcps_tfo_sndblackhole;
410 
411 	/* MPTCP-related statistics */
412 	u_int32_t       tcps_mptcp_handover_attempt;
413 	u_int32_t       tcps_mptcp_interactive_attempt;
414 	u_int32_t       tcps_mptcp_aggregate_attempt;
415 	u_int32_t       tcps_mptcp_fp_handover_attempt;
416 	u_int32_t       tcps_mptcp_fp_interactive_attempt;
417 	u_int32_t       tcps_mptcp_fp_aggregate_attempt;
418 	u_int32_t       tcps_mptcp_heuristic_fallback;
419 	u_int32_t       tcps_mptcp_fp_heuristic_fallback;
420 	u_int32_t       tcps_mptcp_handover_success_wifi;
421 	u_int32_t       tcps_mptcp_handover_success_cell;
422 	u_int32_t       tcps_mptcp_interactive_success;
423 	u_int32_t       tcps_mptcp_aggregate_success;
424 	u_int32_t       tcps_mptcp_fp_handover_success_wifi;
425 	u_int32_t       tcps_mptcp_fp_handover_success_cell;
426 	u_int32_t       tcps_mptcp_fp_interactive_success;
427 	u_int32_t       tcps_mptcp_fp_aggregate_success;
428 	u_int32_t       tcps_mptcp_handover_cell_from_wifi;
429 	u_int32_t       tcps_mptcp_handover_wifi_from_cell;
430 	u_int32_t       tcps_mptcp_interactive_cell_from_wifi;
431 	u_int64_t       tcps_mptcp_handover_cell_bytes;
432 	u_int64_t       tcps_mptcp_interactive_cell_bytes;
433 	u_int64_t       tcps_mptcp_aggregate_cell_bytes;
434 	u_int64_t       tcps_mptcp_handover_all_bytes;
435 	u_int64_t       tcps_mptcp_interactive_all_bytes;
436 	u_int64_t       tcps_mptcp_aggregate_all_bytes;
437 	u_int32_t       tcps_mptcp_back_to_wifi;
438 	u_int32_t       tcps_mptcp_wifi_proxy;
439 	u_int32_t       tcps_mptcp_cell_proxy;
440 	u_int32_t       tcps_mptcp_triggered_cell;
441 };
442 
443 
444 /* Returns true if the timer is on the timer list */
445 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
446 
447 /* Run the TCP timer list at least once every hour */
448 #define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)
449 
450 
451 static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
452 static boolean_t tcp_garbage_collect(struct inpcb *, int);
453 
454 #define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
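
/*
 * Illustrative sketch (not part of the original file): TIMERENTRY_TO_TP above
 * is a "container_of" computation -- given a pointer to the embedded
 * tentry.le.le_next field, subtracting that field's offset within struct
 * tcpcb recovers a pointer to the enclosing tcpcb. Generic form (offsetof
 * comes from <stddef.h>):
 */
#define CONTAINER_OF_EXAMPLE(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))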
455 
456 #define VERIFY_NEXT_LINK(elm, field) do {       \
457 	if (LIST_NEXT((elm),field) != NULL &&   \
458 	    LIST_NEXT((elm),field)->field.le_prev !=    \
459 	        &((elm)->field.le_next))        \
460 	        panic("Bad link elm %p next->prev != elm", (elm));      \
461 } while(0)
462 
463 #define VERIFY_PREV_LINK(elm, field) do {       \
464 	if (*(elm)->field.le_prev != (elm))     \
465 	        panic("Bad link elm %p prev->next != elm", (elm));      \
466 } while(0)
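
/*
 * Illustrative sketch (not part of the original file): the invariant checked
 * by VERIFY_NEXT_LINK/VERIFY_PREV_LINK above. In a BSD queue(3) LIST, an
 * element's le_prev points at the le_next slot of its predecessor (or at the
 * list head's first pointer), so both directions can be cross-checked.
 */
struct list_node_example {
	struct list_node_example *next;    /* plays the role of le_next */
	struct list_node_example **prev;   /* plays the role of le_prev */
};

static int
links_consistent_example(struct list_node_example *elm)
{
	return (elm->next == NULL || elm->next->prev == &elm->next) &&
	    *elm->prev == elm;
}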
467 
468 #define TCP_SET_TIMER_MODE(mode, i) do { \
469 	if (IS_TIMER_HZ_10MS(i)) \
470 	        (mode) |= TCP_TIMERLIST_10MS_MODE; \
471 	else if (IS_TIMER_HZ_100MS(i)) \
472 	        (mode) |= TCP_TIMERLIST_100MS_MODE; \
473 	else \
474 	        (mode) |= TCP_TIMERLIST_500MS_MODE; \
475 } while(0)
476 
477 #if (DEVELOPMENT || DEBUG)
478 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_medium,
479     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_medium, 0,
480     "Medium MSS based on recommendation in link status report");
481 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_low,
482     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_low, 0,
483     "Low MSS based on recommendation in link status report");
484 
485 static int32_t tcp_change_mss_recommended = 0;
486 static int
487 sysctl_change_mss_recommended SYSCTL_HANDLER_ARGS
488 {
489 #pragma unused(oidp, arg1, arg2)
490 	int i, err = 0, changed = 0;
491 	struct ifnet *ifp;
492 	struct if_link_status ifsr;
493 	struct if_cellular_status_v1 *new_cell_sr;
494 	err = sysctl_io_number(req, tcp_change_mss_recommended,
495 	    sizeof(int32_t), &i, &changed);
496 	if (changed) {
497 		if (i < 0 || i > UINT16_MAX) {
498 			return EINVAL;
499 		}
500 		ifnet_head_lock_shared();
501 		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
502 			if (IFNET_IS_CELLULAR(ifp)) {
503 				bzero(&ifsr, sizeof(ifsr));
504 				new_cell_sr = &ifsr.ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
505 				ifsr.ifsr_version = IF_CELLULAR_STATUS_REPORT_CURRENT_VERSION;
506 				ifsr.ifsr_len = sizeof(*new_cell_sr);
507 
508 				/* Set MSS recommended */
509 				new_cell_sr->valid_bitmask |= IF_CELL_UL_MSS_RECOMMENDED_VALID;
510 				new_cell_sr->mss_recommended = (uint16_t)i;
511 				err = ifnet_link_status_report(ifp, new_cell_sr, sizeof(new_cell_sr));
512 				if (err == 0) {
513 					tcp_change_mss_recommended = i;
514 				} else {
515 					break;
516 				}
517 			}
518 		}
519 		ifnet_head_done();
520 	}
521 	return err;
522 }
523 
524 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, change_mss_recommended,
525     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_change_mss_recommended,
526     0, sysctl_change_mss_recommended, "IU", "Change MSS recommended");
527 
528 SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval,
529     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0,
530     "Report stats interval");
531 #endif /* (DEVELOPMENT || DEBUG) */
532 
533 /*
534  * Compare two timers. If there is a reset of the sign bit, it is
535  * safe to assume that the timer has wrapped around. By doing a
536  * signed comparison, we take care of wrap-around such that the
537  * value with the sign bit reset is actually ahead of the other.
538  */
539 inline int32_t
540 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
541 {
542 	return (int32_t)((t1 + toff1) - (t2 + toff2));
543 }
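
/*
 * Illustrative sketch (not part of the original file): the wrap-around
 * behavior timer_diff() relies on. With 32-bit timestamps, unsigned
 * subtraction wraps modulo 2^32, and reinterpreting the result as signed
 * restores the ordering as long as the two values are within 2^31 ticks
 * of each other.
 */
static int
timer_is_earlier_example(uint32_t a, uint32_t b)
{
	/* e.g. a = 0xfffffff0 (before wrap), b = 0x10 (after wrap):
	 * (int32_t)(a - b) == -32, so a correctly sorts before b. */
	return (int32_t)(a - b) < 0;
}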
544 
545 /*
546  * Add to tcp timewait list, delay is given in milliseconds.
547  */
548 static void
549 add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
550 {
551 	struct inpcbinfo *pcbinfo = &tcbinfo;
552 	struct inpcb *inp = tp->t_inpcb;
553 	uint32_t timer;
554 
555 	/* pcb list should be locked when we get here */
556 	LCK_RW_ASSERT(&pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
557 
558 	/* We may get here multiple times, so check */
559 	if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
560 		pcbinfo->ipi_twcount++;
561 		inp->inp_flags2 |= INP2_TIMEWAIT;
562 
563 		/* Remove from global inp list */
564 		LIST_REMOVE(inp, inp_list);
565 	} else {
566 		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
567 	}
568 
569 	/* Compute the time at which this socket can be closed */
570 	timer = tcp_now + delay;
571 
572 	/* We will use the TCPT_2MSL timer for tracking this delay */
573 
574 	if (TIMER_IS_ON_LIST(tp)) {
575 		tcp_remove_timer(tp);
576 	}
577 	tp->t_timer[TCPT_2MSL] = timer;
578 
579 	TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
580 }
581 
582 void
583 add_to_time_wait(struct tcpcb *tp, uint32_t delay)
584 {
585 	if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP) {
586 		socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);
587 	}
588 
589 	tcp_del_fsw_flow(tp);
590 
591 	/* 19182803: Notify nstat that connection is closing before waiting. */
592 	nstat_pcb_detach(tp->t_inpcb);
593 
594 #if CONTENT_FILTER
595 	if ((tp->t_inpcb->inp_socket->so_flags & SOF_CONTENT_FILTER) != 0) {
596 		/* If filter present, allow filter to finish processing all queued up data before adding to time wait queue */
597 		(void) cfil_sock_tcp_add_time_wait(tp->t_inpcb->inp_socket);
598 	} else
599 #endif /* CONTENT_FILTER */
600 	{
601 		add_to_time_wait_now(tp, delay);
602 	}
603 }
604 
605 void
606 add_to_time_wait_now(struct tcpcb *tp, uint32_t delay)
607 {
608 	struct inpcbinfo *pcbinfo = &tcbinfo;
609 
610 	if (!lck_rw_try_lock_exclusive(&pcbinfo->ipi_lock)) {
611 		socket_unlock(tp->t_inpcb->inp_socket, 0);
612 		lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
613 		socket_lock(tp->t_inpcb->inp_socket, 0);
614 	}
615 	add_to_time_wait_locked(tp, delay);
616 	lck_rw_done(&pcbinfo->ipi_lock);
617 
618 	inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
619 }
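
/*
 * Illustrative sketch (not part of the original file): the lock-ordering
 * dance in add_to_time_wait_now() above. A blocking acquire of the pcbinfo
 * list lock must not happen while the per-socket lock is held, so after a
 * failed trylock the socket lock is dropped, the list lock is taken
 * blocking, and the socket lock is re-taken. example_lock_t, lock_try(),
 * lock() and unlock() are hypothetical stand-ins, not xnu APIs.
 */
typedef struct example_lock example_lock_t;       /* hypothetical */
extern int  lock_try(example_lock_t *);           /* non-blocking; 1 on success */
extern void lock(example_lock_t *);               /* blocking */
extern void unlock(example_lock_t *);

static void
ordered_lock_example(example_lock_t *list_lock, example_lock_t *sock_lock)
{
	/* caller enters holding sock_lock */
	if (!lock_try(list_lock)) {
		unlock(sock_lock);    /* back off to respect lock ordering */
		lock(list_lock);
		lock(sock_lock);      /* state may have changed; caller re-checks */
	}
	/* both locks held here */
}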
620 
621 /* If this is on time wait queue, remove it. */
622 void
623 tcp_remove_from_time_wait(struct inpcb *inp)
624 {
625 	struct tcpcb *tp = intotcpcb(inp);
626 	if (inp->inp_flags2 & INP2_TIMEWAIT) {
627 		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
628 	}
629 }
630 
631 static boolean_t
632 tcp_garbage_collect(struct inpcb *inp, int istimewait)
633 {
634 	boolean_t active = FALSE;
635 	struct socket *so, *mp_so = NULL;
636 	struct tcpcb *tp;
637 
638 	so = inp->inp_socket;
639 	tp = intotcpcb(inp);
640 
641 	if (so->so_flags & SOF_MP_SUBFLOW) {
642 		mp_so = mptetoso(tptomptp(tp)->mpt_mpte);
643 		if (!socket_try_lock(mp_so)) {
644 			mp_so = NULL;
645 			active = TRUE;
646 			goto out;
647 		}
648 		if (mpsotomppcb(mp_so)->mpp_inside > 0) {
649 			os_log(mptcp_log_handle, "%s - %lx: Still inside %d usecount %d\n", __func__,
650 			    (unsigned long)VM_KERNEL_ADDRPERM(mpsotompte(mp_so)),
651 			    mpsotomppcb(mp_so)->mpp_inside,
652 			    mp_so->so_usecount);
653 			socket_unlock(mp_so, 0);
654 			mp_so = NULL;
655 			active = TRUE;
656 			goto out;
657 		}
658 		/* We call socket_unlock with refcount further below */
659 		mp_so->so_usecount++;
660 		tptomptp(tp)->mpt_mpte->mpte_mppcb->mpp_inside++;
661 	}
662 
663 	/*
664 	 * Skip if still in use or busy; it would have been more efficient
665 	 * if we were to test so_usecount against 0, but this isn't possible
666 	 * due to the current implementation of tcp_dropdropablreq() where
667 	 * overflow sockets that are eligible for garbage collection have
668 	 * their usecounts set to 1.
669 	 */
670 	if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx)) {
671 		active = TRUE;
672 		goto out;
673 	}
674 
675 	/* Check again under the lock */
676 	if (so->so_usecount > 1) {
677 		if (inp->inp_wantcnt == WNT_STOPUSING) {
678 			active = TRUE;
679 		}
680 		lck_mtx_unlock(&inp->inpcb_mtx);
681 		goto out;
682 	}
683 
684 	if (istimewait && TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
685 	    tp->t_state != TCPS_CLOSED) {
686 		/* Become a regular mutex */
687 		lck_mtx_convert_spin(&inp->inpcb_mtx);
688 		tcp_close(tp);
689 	}
690 
691 	/*
692 	 * Overflowed socket dropped from the listening queue?  Do this
693 	 * only if we are called to clean up the time wait slots, since
694 	 * tcp_dropdropablreq() considers a socket to have been fully
695 	 * dropped after add_to_time_wait() is finished.
696 	 * Also handle the case of connections getting closed by the peer
697 	 * while in the queue as seen with rdar://6422317
698 	 *
699 	 */
700 	if (so->so_usecount == 1 &&
701 	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
702 	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
703 	    (so->so_head != NULL) &&
704 	    ((so->so_state & (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) ==
705 	    (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE))))) {
706 		if (inp->inp_state != INPCB_STATE_DEAD) {
707 			/* Become a regular mutex */
708 			lck_mtx_convert_spin(&inp->inpcb_mtx);
709 			if (SOCK_CHECK_DOM(so, PF_INET6)) {
710 				in6_pcbdetach(inp);
711 			} else {
712 				in_pcbdetach(inp);
713 			}
714 		}
715 		VERIFY(so->so_usecount > 0);
716 		so->so_usecount--;
717 		if (inp->inp_wantcnt == WNT_STOPUSING) {
718 			active = TRUE;
719 		}
720 		lck_mtx_unlock(&inp->inpcb_mtx);
721 		goto out;
722 	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
723 		lck_mtx_unlock(&inp->inpcb_mtx);
724 		active = FALSE;
725 		goto out;
726 	}
727 
728 	/*
729 	 * We get here because the PCB is no longer searchable
730 	 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
731 	 * (usecount is 0).  This covers all cases, including overflow
732 	 * sockets and those that are considered as "embryonic",
733 	 * i.e. created by sonewconn() in TCP input path, and have
734 	 * not yet been committed.  For the former, we reduce the usecount
735 	 * to 0 as done by the code above.  For the latter, the usecount
736 	 * would have been reduced to 0 as part of calling soabort() when the
737 	 * socket is dropped at the end of tcp_input().
738 	 */
739 	if (so->so_usecount == 0) {
740 		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
741 		    struct tcpcb *, tp, int32_t, TCPS_CLOSED);
742 		/* Become a regular mutex */
743 		lck_mtx_convert_spin(&inp->inpcb_mtx);
744 
745 		/*
746 		 * If this tp still happens to be on the timer list,
747 		 * take it out
748 		 */
749 		if (TIMER_IS_ON_LIST(tp)) {
750 			tcp_remove_timer(tp);
751 		}
752 
753 		if (inp->inp_state != INPCB_STATE_DEAD) {
754 			if (SOCK_CHECK_DOM(so, PF_INET6)) {
755 				in6_pcbdetach(inp);
756 			} else {
757 				in_pcbdetach(inp);
758 			}
759 		}
760 
761 		if (mp_so) {
762 			mptcp_subflow_del(tptomptp(tp)->mpt_mpte, tp->t_mpsub);
763 
764 			/* so is now unlinked from mp_so - let's drop the lock */
765 			socket_unlock(mp_so, 1);
766 			mp_so = NULL;
767 		}
768 
769 		in_pcbdispose(inp);
770 		active = FALSE;
771 		goto out;
772 	}
773 
774 	lck_mtx_unlock(&inp->inpcb_mtx);
775 	active = TRUE;
776 
777 out:
778 	if (mp_so) {
779 		socket_unlock(mp_so, 1);
780 	}
781 
782 	return active;
783 }
784 
785 /*
786  * TCP garbage collector callback (inpcb_timer_func_t).
787  *
788  * Returns the number of pcbs that will need to be gc-ed soon;
789  * returning > 0 will keep the timer active.
790  */
791 void
tcp_gc(struct inpcbinfo * ipi)792 tcp_gc(struct inpcbinfo *ipi)
793 {
794 	struct inpcb *inp, *nxt;
795 	struct tcpcb *tw_tp, *tw_ntp;
796 #if TCPDEBUG
797 	int ostate;
798 #endif
799 #if  KDEBUG
800 	static int tws_checked = 0;
801 #endif
802 
803 	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
804 
805 	/*
806 	 * Update tcp_now here as it may get used while
807 	 * processing the slow timer.
808 	 */
809 	calculate_tcp_clock();
810 
811 	/*
812 	 * Garbage collect socket/tcpcb: We need to acquire the list lock
813 	 * exclusively to do this
814 	 */
815 
816 	if (lck_rw_try_lock_exclusive(&ipi->ipi_lock) == FALSE) {
817 		/* don't sweat it this time; cleanup was done last time */
818 		if (tcp_gc_done == TRUE) {
819 			tcp_gc_done = FALSE;
820 			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
821 			    tws_checked, cur_tw_slot, 0, 0, 0);
822 			/* Lock upgrade failed, give up this round */
823 			os_atomic_inc(&ipi->ipi_gc_req.intimer_fast, relaxed);
824 			return;
825 		}
826 		/* Upgrade failed, lost the lock; now take it again, exclusive */
827 		lck_rw_lock_exclusive(&ipi->ipi_lock);
828 	}
829 	tcp_gc_done = TRUE;
830 
831 	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
832 		if (tcp_garbage_collect(inp, 0)) {
833 			os_atomic_inc(&ipi->ipi_gc_req.intimer_fast, relaxed);
834 		}
835 	}
836 
837 	/* Now cleanup the time wait ones */
838 	TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
839 		/*
840 		 * We check the timestamp here without holding the
841 		 * socket lock for better performance. If there are
842 		 * any pcbs in time-wait, the timer will get rescheduled.
843 		 * Hence some error in this check can be tolerated.
844 		 *
845 		 * Sometimes a socket on time-wait queue can be closed if
846 		 * 2MSL timer expired but the application still has a
847 		 * usecount on it.
848 		 */
849 		if (tw_tp->t_state == TCPS_CLOSED ||
850 		    TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
851 			if (tcp_garbage_collect(tw_tp->t_inpcb, 1)) {
852 				os_atomic_inc(&ipi->ipi_gc_req.intimer_lazy, relaxed);
853 			}
854 		}
855 	}
856 
857 	/* take into account pcbs that are still in time_wait_slots */
858 	os_atomic_add(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount, relaxed);
859 
860 	lck_rw_done(&ipi->ipi_lock);
861 
862 	/* Clean up the socache while we are here */
863 	if (so_cache_timer()) {
864 		os_atomic_inc(&ipi->ipi_gc_req.intimer_lazy, relaxed);
865 	}
866 
867 	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
868 	    cur_tw_slot, 0, 0, 0);
869 
870 	return;
871 }
872 
873 /*
874  * Cancel all timers for TCP tp.
875  */
876 void
877 tcp_canceltimers(struct tcpcb *tp)
878 {
879 	int i;
880 
881 	tcp_remove_timer(tp);
882 	for (i = 0; i < TCPT_NTIMERS; i++) {
883 		tp->t_timer[i] = 0;
884 	}
885 	tp->tentry.timer_start = tcp_now;
886 	tp->tentry.index = TCPT_NONE;
887 }
888 
889 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
890 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
891 
892 int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
893 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
894 
895 static int tcp_totbackoff = 511;        /* sum of tcp_backoff[] */
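
/*
 * Illustrative sketch (not part of the original file): how tcp_backoff[] is
 * consumed by the retransmit path below (see TCPT_REXMT). This is a
 * simplified mirror of TCPT_RANGESET(tp->t_rxtcur,
 * TCP_REXMTVAL(tp) * tcp_backoff[shift], tp->t_rttmin, TCPTV_REXMTMAX, ...)
 * with plain arguments instead of the real macros.
 */
static int
rto_for_shift_example(int rexmtval, int rxtshift, int rttmin, int rexmtmax)
{
	int rexmt = rexmtval * tcp_backoff[rxtshift];  /* exponential backoff */

	if (rexmt < rttmin)            /* clamp to the RTT floor */
		rexmt = rttmin;
	else if (rexmt > rexmtmax)     /* and to the RTO ceiling */
		rexmt = rexmtmax;
	return rexmt;
}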
896 
897 void
898 tcp_rexmt_save_state(struct tcpcb *tp)
899 {
900 	u_int32_t fsize;
901 	if (TSTMP_SUPPORTED(tp)) {
902 		/*
903 		 * Since timestamps are supported on the connection,
904 		 * we can do recovery as described in rfc 4015.
905 		 */
906 		fsize = tp->snd_max - tp->snd_una;
907 		tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
908 		tp->snd_recover_prev = tp->snd_recover;
909 	} else {
910 		/*
911 		 * Timestamp option is not supported on this connection.
912 		 * Record ssthresh and cwnd so they can
913 		 * be recovered if this turns out to be a "bad" retransmit.
914 		 * A retransmit is considered "bad" if an ACK for this
915 		 * segment is received within RTT/2 interval; the assumption
916 		 * here is that the ACK was already in flight.  See
917 		 * "On Estimating End-to-End Network Path Properties" by
918 		 * Allman and Paxson for more details.
919 		 */
920 		tp->snd_cwnd_prev = tp->snd_cwnd;
921 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
922 		tp->snd_recover_prev = tp->snd_recover;
923 		if (IN_FASTRECOVERY(tp)) {
924 			tp->t_flags |= TF_WASFRECOVERY;
925 		} else {
926 			tp->t_flags &= ~TF_WASFRECOVERY;
927 		}
928 	}
929 	tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
930 	tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
931 	tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
932 }
933 
934 /*
935  * Revert to the older segment size if there is an indication that PMTU
936  * blackhole detection was not needed.
937  */
938 void
939 tcp_pmtud_revert_segment_size(struct tcpcb *tp)
940 {
941 	int32_t optlen;
942 
943 	VERIFY(tp->t_pmtud_saved_maxopd > 0);
944 	tp->t_flags |= TF_PMTUD;
945 	tp->t_flags &= ~TF_BLACKHOLE;
946 	optlen = tp->t_maxopd - tp->t_maxseg;
947 	tp->t_maxopd = tp->t_pmtud_saved_maxopd;
948 	tp->t_maxseg = tp->t_maxopd - optlen;
949 
950 	/*
951 	 * Reset the slow-start flight size as it
952 	 * may depend on the new MSS
953 	 */
954 	if (CC_ALGO(tp)->cwnd_init != NULL) {
955 		CC_ALGO(tp)->cwnd_init(tp);
956 	}
957 
958 	if (TCP_USE_RLEDBAT(tp, tp->t_inpcb->inp_socket) &&
959 	    tcp_cc_rledbat.rwnd_init != NULL) {
960 		tcp_cc_rledbat.rwnd_init(tp);
961 	}
962 
963 	tp->t_pmtud_start_ts = 0;
964 	tcpstat.tcps_pmtudbh_reverted++;
965 
966 	/* change MSS according to recommendation, if there was one */
967 	tcp_update_mss_locked(tp->t_inpcb->inp_socket, NULL);
968 }
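
/*
 * Illustrative sketch (not part of the original file): the invariant that
 * tcp_pmtud_revert_segment_size() maintains. t_maxopd covers payload plus
 * TCP options, so the current option overhead (optlen) is carried across
 * any change of t_maxopd.
 */
static void
apply_new_maxopd_example(int32_t *maxopd, int32_t *maxseg, int32_t new_maxopd)
{
	int32_t optlen = *maxopd - *maxseg;  /* option bytes per segment */

	*maxopd = new_maxopd;
	*maxseg = new_maxopd - optlen;       /* payload changes by the same amount */
}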
969 
970 static uint32_t
971 tcp_pmtud_black_holed_next_mss(struct tcpcb *tp)
972 {
973 	/* Reduce the MSS to intermediary value */
974 	if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
975 		return tcp_pmtud_black_hole_mss;
976 	} else {
977 		if (tp->t_inpcb->inp_vflag & INP_IPV4) {
978 			return tcp_mssdflt;
979 		} else {
980 			return tcp_v6mssdflt;
981 		}
982 	}
983 }
984 
985 /*
986  * Send a packet designed to force a response
987  * if the peer is up and reachable:
988  * either an ACK if the connection is still alive,
989  * or an RST if the peer has closed the connection
990  * due to timeout or reboot.
991  * Using sequence number tp->snd_una-1
992  * causes the transmitted zero-length segment
993  * to lie outside the receive window;
994  * by the protocol spec, this requires the
995  * correspondent TCP to respond.
996  */
997 static bool
998 tcp_send_keep_alive(struct tcpcb *tp)
999 {
1000 	struct tcptemp *t_template;
1001 	struct mbuf *m;
1002 
1003 	tcpstat.tcps_keepprobe++;
1004 	t_template = tcp_maketemplate(tp, &m);
1005 	if (t_template != NULL) {
1006 		struct inpcb *inp = tp->t_inpcb;
1007 		struct tcp_respond_args tra;
1008 
1009 		bzero(&tra, sizeof(tra));
1010 		tra.nocell = INP_NO_CELLULAR(inp) ? 1 : 0;
1011 		tra.noexpensive = INP_NO_EXPENSIVE(inp) ? 1 : 0;
1012 		tra.noconstrained = INP_NO_CONSTRAINED(inp) ? 1 : 0;
1013 		tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp) ? 1 : 0;
1014 		tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp) ? 1 : 0;
1015 		tra.management_allowed = INP_MANAGEMENT_ALLOWED(inp) ? 1 : 0;
1016 		tra.keep_alive = 1;
1017 		if (tp->t_inpcb->inp_flags & INP_BOUND_IF) {
1018 			tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
1019 		} else {
1020 			tra.ifscope = IFSCOPE_NONE;
1021 		}
1022 		tcp_respond(tp, t_template->tt_ipgen,
1023 		    &t_template->tt_t, (struct mbuf *)NULL,
1024 		    tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
1025 		(void) m_free(m);
1026 		return true;
1027 	} else {
1028 		return false;
1029 	}
1030 }
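
/*
 * Illustrative sketch (not part of the original file): why a probe at
 * snd_una - 1 forces a reply, per the comment above tcp_send_keep_alive().
 * Once all data is acknowledged the peer's rcv_nxt equals our snd_una, so
 * the probe's sequence number lies just below the receive window and a live
 * peer must answer with a bare ACK (cf. RFC 1122, section 4.2.3.6).
 */
static int
probe_is_below_window_example(uint32_t snd_una, uint32_t peer_rcv_nxt)
{
	uint32_t probe_seq = snd_una - 1;
	/* signed difference: the same wrap-safe idiom as timer_diff() above */
	return (int32_t)(probe_seq - peer_rcv_nxt) < 0;
}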
1031 
1032 /*
1033  * TCP timer processing.
1034  */
1035 struct tcpcb *
1036 tcp_timers(struct tcpcb *tp, int timer)
1037 {
1038 	int32_t rexmt, optlen = 0, idle_time = 0;
1039 	struct socket *so;
1040 #if TCPDEBUG
1041 	int ostate;
1042 #endif
1043 	u_int64_t accsleep_ms;
1044 	u_int64_t last_sleep_ms = 0;
1045 
1046 	so = tp->t_inpcb->inp_socket;
1047 	idle_time = tcp_now - tp->t_rcvtime;
1048 
1049 	switch (timer) {
1050 	/*
1051 	 * 2 MSL timeout in shutdown went off.  If we're closed but
1052 	 * still waiting for peer to close and connection has been idle
1053 	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
1054 	 * delete connection control block.
1055 	 * Otherwise (this case shouldn't happen) check again in a bit;
1056 	 * we keep the socket in the main list in that case.
1057 	 */
1058 	case TCPT_2MSL:
1059 		tcp_free_sackholes(tp);
1060 		if (tp->t_state != TCPS_TIME_WAIT &&
1061 		    tp->t_state != TCPS_FIN_WAIT_2 &&
1062 		    ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
1063 			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
1064 			    (u_int32_t)TCP_CONN_KEEPINTVL(tp));
1065 		} else {
1066 			if (tp->t_state == TCPS_FIN_WAIT_2) {
1067 				TCP_LOG_DROP_PCB(NULL, NULL, tp, false,
1068 				    "FIN wait timeout drop");
1069 				tcpstat.tcps_fin_timeout_drops++;
1070 				tp = tcp_drop(tp, 0);
1071 			} else {
1072 				tp = tcp_close(tp);
1073 			}
1074 			return tp;
1075 		}
1076 		break;
1077 
1078 	/*
1079 	 * Retransmission timer went off.  Message has not
1080 	 * been acked within retransmit interval.  Back off
1081 	 * to a longer retransmit interval and retransmit one segment.
1082 	 */
1083 	case TCPT_REXMT:
1084 		absolutetime_to_nanoseconds(mach_absolutetime_asleep,
1085 		    &accsleep_ms);
1086 		accsleep_ms = accsleep_ms / 1000000UL;
1087 		if (accsleep_ms > tp->t_accsleep_ms) {
1088 			last_sleep_ms = accsleep_ms - tp->t_accsleep_ms;
1089 		}
1090 		/*
1091 		 * Drop a connection in the retransmit timer
1092 		 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
1093 		 * times
1094 		 * 2. If the time spent in this retransmission episode is
1095 		 * more than the time limit set with TCP_RXT_CONNDROPTIME
1096 		 * socket option
1097 		 * 3. If TCP_RXT_FINDROP socket option was set and
1098 		 * we have already retransmitted the FIN 3 times without
1099 		 * receiving an ack
1100 		 */
1101 		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
1102 		    (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
1103 		    (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
1104 		    ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
1105 		    (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4) ||
1106 		    (tp->t_rxtshift > 4 && last_sleep_ms >= TCP_SLEEP_TOO_LONG)) {
1107 			if (tp->t_state == TCPS_ESTABLISHED &&
1108 			    tp->t_rxt_minimum_timeout > 0) {
1109 				/*
1110 				 * Avoid dropping a connection if minimum
1111 				 * timeout is set and that time did not
1112 				 * pass. We will retry sending
1113 				 * retransmissions at the maximum interval
1114 				 */
1115 				if (TSTMP_LT(tcp_now, (tp->t_rxtstart +
1116 				    tp->t_rxt_minimum_timeout))) {
1117 					tp->t_rxtshift = TCP_MAXRXTSHIFT - 1;
1118 					goto retransmit_packet;
1119 				}
1120 			}
1121 			if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
1122 				tcpstat.tcps_rxtfindrop++;
1123 			} else if (last_sleep_ms >= TCP_SLEEP_TOO_LONG) {
1124 				tcpstat.tcps_drop_after_sleep++;
1125 			} else {
1126 				tcpstat.tcps_timeoutdrop++;
1127 			}
1128 			if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) {
1129 				if (TCP_ECN_ENABLED(tp)) {
1130 					INP_INC_IFNET_STAT(tp->t_inpcb,
1131 					    ecn_on.rxmit_drop);
1132 				} else {
1133 					INP_INC_IFNET_STAT(tp->t_inpcb,
1134 					    ecn_off.rxmit_drop);
1135 				}
1136 			}
1137 			tp->t_rxtshift = TCP_MAXRXTSHIFT;
1138 			soevent(so,
1139 			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
1140 
1141 			if (TCP_ECN_ENABLED(tp) &&
1142 			    tp->t_state == TCPS_ESTABLISHED) {
1143 				tcp_heuristic_ecn_droprxmt(tp);
1144 			}
1145 
1146 			TCP_LOG_DROP_PCB(NULL, NULL, tp, false,
1147 			    "retransmission timeout drop");
1148 			tp = tcp_drop(tp, tp->t_softerror ?
1149 			    tp->t_softerror : ETIMEDOUT);
1150 
1151 			break;
1152 		}
1153 retransmit_packet:
1154 		tcpstat.tcps_rexmttimeo++;
1155 		tp->t_accsleep_ms = accsleep_ms;
1156 
1157 		if (tp->t_rxtshift == 1 &&
1158 		    tp->t_state == TCPS_ESTABLISHED) {
1159 			/* Set the time at which retransmission started. */
1160 			tp->t_rxtstart = tcp_now;
1161 
1162 			/*
1163 			 * if this is the first retransmit timeout, save
1164 			 * the state so that we can recover if the timeout
1165 			 * is spurious.
1166 			 */
1167 			tcp_rexmt_save_state(tp);
1168 			tcp_ccdbg_trace(tp, NULL, TCP_CC_FIRST_REXMT);
1169 		}
1170 #if MPTCP
1171 		if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
1172 		    (tp->t_state == TCPS_ESTABLISHED) &&
1173 		    (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
1174 			mptcp_act_on_txfail(so);
1175 		}
1176 
1177 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1178 		    (so->so_flags & SOF_MP_SUBFLOW)) {
1179 			struct mptses *mpte = tptomptp(tp)->mpt_mpte;
1180 
1181 			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
1182 			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1183 				mptcp_check_subflows_and_add(mpte);
1184 			}
1185 		}
1186 #endif /* MPTCP */
1187 
1188 		if (tp->t_adaptive_wtimo > 0 &&
1189 		    tp->t_rxtshift > tp->t_adaptive_wtimo &&
1190 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
1191 			/* Send an event to the application */
1192 			soevent(so,
1193 			    (SO_FILT_HINT_LOCKED |
1194 			    SO_FILT_HINT_ADAPTIVE_WTIMO));
1195 		}
1196 
1197 		/*
1198 		 * If this is a retransmit timeout after PTO, the PTO
1199 		 * was not effective
1200 		 */
1201 		if (tp->t_flagsext & TF_SENT_TLPROBE) {
1202 			tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1203 			tcpstat.tcps_rto_after_pto++;
1204 		}
1205 
1206 		if (tp->t_flagsext & TF_DELAY_RECOVERY) {
1207 			/*
1208 			 * Retransmit timer fired before entering recovery
1209 			 * on a connection with packet re-ordering. This
1210 			 * suggests that the reordering metrics computed
1211 			 * are not accurate.
1212 			 */
1213 			tp->t_reorderwin = 0;
1214 			tp->t_timer[TCPT_DELAYFR] = 0;
1215 			tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
1216 		}
1217 
1218 		if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
1219 		    tp->t_state == TCPS_SYN_RECEIVED) {
1220 			tcp_disable_tfo(tp);
1221 		}
1222 
1223 		if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
1224 		    !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1225 		    (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
1226 		    !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
1227 		    ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
1228 		    tp->t_rxtshift > 4)) {
1229 			/*
1230 			 * For regular retransmissions, the first one is done
1231 			 * as a tail-loss probe.
1232 			 * Thus, if rxtshift > 1, this means we have sent the segment
1233 			 * a total of 3 times.
1234 			 *
1235 			 * If we are in SYN-SENT state, then there is no tail-loss
1236 			 * probe thus we have to let rxtshift go up to 3.
1237 			 */
1238 			tcp_heuristic_tfo_middlebox(tp);
1239 
1240 			so->so_error = ENODATA;
1241 			soevent(so,
1242 			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
1243 			sorwakeup(so);
1244 			sowwakeup(so);
1245 
1246 			tp->t_tfo_stats |= TFO_S_SEND_BLACKHOLE;
1247 			tcpstat.tcps_tfo_sndblackhole++;
1248 		}
1249 
1250 		if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
1251 		    !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1252 		    (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) &&
1253 		    tp->t_rxtshift > 3) {
1254 			if (TSTMP_GT(tp->t_sndtime - 10 * TCP_RETRANSHZ, tp->t_rcvtime)) {
1255 				tcp_heuristic_tfo_middlebox(tp);
1256 
1257 				so->so_error = ENODATA;
1258 				soevent(so,
1259 				    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
1260 				sorwakeup(so);
1261 				sowwakeup(so);
1262 			}
1263 		}
1264 
1265 		if (tp->t_state == TCPS_SYN_SENT) {
1266 			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
1267 			tp->t_stat.synrxtshift = tp->t_rxtshift;
1268 			tp->t_stat.rxmitsyns++;
1269 
1270 			/* When retransmitting, disable TFO */
1271 			if (tfo_enabled(tp) &&
1272 			    !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) {
1273 				tcp_disable_tfo(tp);
1274 				tp->t_tfo_flags |= TFO_F_SYN_LOSS;
1275 			}
1276 		} else {
1277 			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
1278 		}
1279 
1280 		TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX,
1281 		    TCP_ADD_REXMTSLOP(tp));
1282 		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1283 
1284 		TCP_LOG_RTT_INFO(tp);
1285 
1286 		if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) {
1287 			goto fc_output;
1288 		}
1289 
1290 		tcp_free_sackholes(tp);
1291 		/*
1292 		 * Check for potential Path MTU Discovery Black Hole
1293 		 */
1294 		if (tcp_pmtud_black_hole_detect &&
1295 		    !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
1296 		    (tp->t_state == TCPS_ESTABLISHED)) {
1297 			if ((tp->t_flags & TF_PMTUD) &&
1298 			    tp->t_pmtud_lastseg_size > tcp_pmtud_black_holed_next_mss(tp) &&
1299 			    tp->t_rxtshift == 2) {
1300 				/*
1301 				 * Enter Path MTU Black-hole Detection mechanism:
1302 				 * - Disable Path MTU Discovery (IP "DF" bit).
1303 				 * - Reduce MTU to lower value than what we
1304 				 * negotiated with the peer.
1305 				 */
1306 				/* Disable Path MTU Discovery for now */
1307 				tp->t_flags &= ~TF_PMTUD;
1308 				/* Record that we may have found a black hole */
1309 				tp->t_flags |= TF_BLACKHOLE;
1310 				optlen = tp->t_maxopd - tp->t_maxseg;
1311 				/* Keep track of previous MSS */
1312 				tp->t_pmtud_saved_maxopd = tp->t_maxopd;
1313 				tp->t_pmtud_start_ts = tcp_now;
1314 				if (tp->t_pmtud_start_ts == 0) {
1315 					tp->t_pmtud_start_ts++;
1316 				}
1317 				/* Reduce the MSS to intermediary value */
1318 				tp->t_maxopd = tcp_pmtud_black_holed_next_mss(tp);
1319 				tp->t_maxseg = tp->t_maxopd - optlen;
1320 
1321 				/*
1322 				 * Reset the slow-start flight size
1323 				 * as it may depend on the new MSS
1324 				 */
1325 				if (CC_ALGO(tp)->cwnd_init != NULL) {
1326 					CC_ALGO(tp)->cwnd_init(tp);
1327 				}
1328 				tp->snd_cwnd = tp->t_maxseg;
1329 
1330 				if (TCP_USE_RLEDBAT(tp, so) &&
1331 				    tcp_cc_rledbat.rwnd_init != NULL) {
1332 					tcp_cc_rledbat.rwnd_init(tp);
1333 				}
1334 			}
1335 			/*
1336 			 * If further retransmissions are still
1337 			 * unsuccessful with a lowered MTU, maybe this
1338 			 * isn't a Black Hole and we restore the previous
1339 			 * MSS and blackhole detection flags.
1340 			 */
1341 			else {
1342 				if ((tp->t_flags & TF_BLACKHOLE) &&
1343 				    (tp->t_rxtshift > 4)) {
1344 					tcp_pmtud_revert_segment_size(tp);
1345 					tp->snd_cwnd = tp->t_maxseg;
1346 				}
1347 			}
1348 		}
1349 
1350 		/*
1351 		 * Disable rfc1323 and rfc1644 if we haven't got any
1352 		 * response to our SYN (after we reach the threshold)
1353 		 * to work-around some broken terminal servers (most of
1354 		 * which have hopefully been retired) that have bad VJ
1355 		 * header compression code which trashes TCP segments
1356 		 * containing unknown-to-them TCP options.
1357 		 * Do this only on non-local connections.
1358 		 */
1359 		if (tp->t_state == TCPS_SYN_SENT &&
1360 		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) {
1361 			tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_REQ_CC);
1362 		}
1363 
1364 		/*
1365 		 * If losing, let the lower level know and try for
1366 		 * a better route.  Also, if we backed off this far,
1367 		 * our srtt estimate is probably bogus.  Clobber it
1368 		 * so we'll take the next rtt measurement as our srtt;
1369 		 * move the current srtt into rttvar to keep the current
1370 		 * retransmit times until then.
1371 		 */
1372 		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
1373 			if (!(tp->t_inpcb->inp_vflag & INP_IPV4)) {
1374 				in6_losing(tp->t_inpcb);
1375 			} else {
1376 				in_losing(tp->t_inpcb);
1377 			}
1378 			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
1379 			tp->t_srtt = 0;
1380 		}
1381 		tp->snd_nxt = tp->snd_una;
1382 		/*
1383 		 * Note:  We overload snd_recover to function also as the
1384 		 * snd_last variable described in RFC 2582
1385 		 */
1386 		tp->snd_recover = tp->snd_max;
1387 		/*
1388 		 * Force a segment to be sent.
1389 		 */
1390 		tp->t_flags |= TF_ACKNOW;
1391 
1392 		/* If timing a segment in this window, stop the timer */
1393 		tp->t_rtttime = 0;
1394 
1395 		if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1) {
1396 			tcpstat.tcps_tailloss_rto++;
1397 		}
1398 
1399 
1400 		/*
1401 		 * RFC 5681 says: when a TCP sender detects segment loss
1402 		 * using retransmit timer and the given segment has already
1403 		 * been retransmitted by way of the retransmission timer at
1404 		 * least once, the value of ssthresh is held constant
1405 		 */
1406 		if (tp->t_rxtshift == 1 &&
1407 		    CC_ALGO(tp)->after_timeout != NULL) {
1408 			CC_ALGO(tp)->after_timeout(tp);
1409 			/*
1410 			 * CWR notifications are to be sent on new data
1411 			 * right after Fast Retransmits and ECE
1412 			 * notification receipts.
1413 			 */
1414 			if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) {
1415 				tp->ecn_flags |= TE_SENDCWR;
1416 			}
1417 		}
1418 
1419 		EXIT_FASTRECOVERY(tp);
1420 
1421 		/* Exit cwnd non validated phase */
1422 		tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
1423 
1424 
1425 fc_output:
1426 		tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);
1427 
1428 		(void) tcp_output(tp);
1429 		break;
1430 
1431 	/*
1432 	 * Persistence timer into zero window.
1433 	 * Force a byte to be output, if possible.
1434 	 */
1435 	case TCPT_PERSIST:
1436 		tcpstat.tcps_persisttimeo++;
1437 		/*
1438 		 * Hack: if the peer is dead/unreachable, we do not
1439 		 * time out if the window is closed.  After a full
1440 		 * backoff, drop the connection if the idle time
1441 		 * (no responses to probes) reaches the maximum
1442 		 * backoff that we would use if retransmitting.
1443 		 *
1444 		 * Drop the connection if we reached the maximum allowed time for
1445 		 * Zero Window Probes without a non-zero update from the peer.
1446 		 * See rdar://5805356
1447 		 */
1448 		if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
1449 		    (idle_time >= tcp_maxpersistidle ||
1450 		    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
1451 		    ((tp->t_persist_stop != 0) &&
1452 		    TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
1453 			TCP_LOG_DROP_PCB(NULL, NULL, tp, false, "persist timeout drop");
1454 			tcpstat.tcps_persistdrop++;
1455 			soevent(so,
1456 			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
1457 			tp = tcp_drop(tp, ETIMEDOUT);
1458 			break;
1459 		}
1460 		tcp_setpersist(tp);
1461 		tp->t_flagsext |= TF_FORCE;
1462 		(void) tcp_output(tp);
1463 		tp->t_flagsext &= ~TF_FORCE;
1464 		break;
1465 
1466 	/*
1467 	 * Keep-alive timer went off; send something
1468 	 * or drop connection if idle for too long.
1469 	 */
1470 	case TCPT_KEEP:
1471 #if FLOW_DIVERT
1472 		if (tp->t_inpcb->inp_socket->so_flags & SOF_FLOW_DIVERT) {
1473 			break;
1474 		}
1475 #endif /* FLOW_DIVERT */
1476 
1477 		tcpstat.tcps_keeptimeo++;
1478 #if MPTCP
1479 		/*
1480 		 * Regular TCP connections do not send keepalives after closing;
1481 		 * MPTCP must not either, after sending Data FINs.
1482 		 */
1483 		struct mptcb *mp_tp = tptomptp(tp);
1484 		if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
1485 		    (tp->t_state > TCPS_ESTABLISHED)) {
1486 			goto dropit;
1487 		} else if (mp_tp != NULL) {
1488 			if ((mptcp_ok_to_keepalive(mp_tp) == 0)) {
1489 				goto dropit;
1490 			}
1491 		}
1492 #endif /* MPTCP */
1493 		if (tp->t_state < TCPS_ESTABLISHED) {
1494 			goto dropit;
1495 		}
1496 		if ((always_keepalive ||
1497 		    (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
1498 		    (tp->t_flagsext & TF_DETECT_READSTALL) ||
1499 		    (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
1500 		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
1501 			if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp)) {
1502 				TCP_LOG_DROP_PCB(NULL, NULL, tp, false,
1503 				    "keep alive timeout drop");
1504 				goto dropit;
1505 			}
1506 
1507 			if (tcp_send_keep_alive(tp)) {
1508 				if (tp->t_flagsext & TF_DETECT_READSTALL) {
1509 					tp->t_rtimo_probes++;
1510 				}
1511 
1512 				TCP_LOG_KEEP_ALIVE(tp, idle_time);
1513 			}
1514 
1515 			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1516 			    TCP_CONN_KEEPINTVL(tp));
1517 		} else {
1518 			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1519 			    TCP_CONN_KEEPIDLE(tp));
1520 		}
1521 		if (tp->t_flagsext & TF_DETECT_READSTALL) {
1522 			struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
1523 			bool reenable_probe = false;
1524 			/*
1525 			 * The keep alive packets sent to detect a read
1526 			 * stall did not get a response from the
1527 			 * peer. Generate more keep-alives to confirm this.
1528 			 * If the number of probes sent reaches the limit,
1529 			 * generate an event.
1530 			 */
1531 			if (tp->t_adaptive_rtimo > 0) {
1532 				if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
1533 					/* Generate an event */
1534 					soevent(so,
1535 					    (SO_FILT_HINT_LOCKED |
1536 					    SO_FILT_HINT_ADAPTIVE_RTIMO));
1537 					tcp_keepalive_reset(tp);
1538 				} else {
1539 					reenable_probe = true;
1540 				}
1541 			} else if (outifp != NULL &&
1542 			    (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY) &&
1543 			    tp->t_rtimo_probes <= TCP_CONNECTIVITY_PROBES_MAX) {
1544 				reenable_probe = true;
1545 			} else {
1546 				tp->t_flagsext &= ~TF_DETECT_READSTALL;
1547 			}
1548 			if (reenable_probe) {
1549 				int ind = min(tp->t_rtimo_probes,
1550 				    TCP_MAXRXTSHIFT);
1551 				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
1552 					tp, tcp_backoff[ind] * TCP_REXMTVAL(tp));
1553 			}
1554 		}
1555 		if (tp->t_tfo_probe_state == TFO_PROBE_PROBING) {
1556 			int ind;
1557 
1558 			tp->t_tfo_probes++;
1559 			ind = min(tp->t_tfo_probes, TCP_MAXRXTSHIFT);
1560 
1561 			/*
1562 			 * We take the minimum of the time set by true
1563 			 * keepalive (see above) and the backed-off RTO. That
1564 			 * way we back off in case of packet loss but will never
1565 			 * time out more slowly than regular keepalive due to
1566 			 * the backing off.
1567 			 */
1568 			tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
1569 				    tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
1570 			    tp->t_timer[TCPT_KEEP]);
1571 		} else if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
1572 		    !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1573 		    tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
1574 			/* Still no data! Let's assume a TFO-error and err out... */
1575 			tcp_heuristic_tfo_middlebox(tp);
1576 
1577 			so->so_error = ENODATA;
1578 			soevent(so,
1579 			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
1580 			sorwakeup(so);
1581 			tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE;
1582 			tcpstat.tcps_tfo_blackhole++;
1583 		}
1584 		break;
1585 	case TCPT_DELACK:
1586 		if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
1587 			tp->t_flags &= ~TF_DELACK;
1588 			tp->t_timer[TCPT_DELACK] = 0;
1589 			tp->t_flags |= TF_ACKNOW;
1590 
1591 			/*
1592 			 * If delayed ack timer fired while stretching
1593 			 * acks, count the number of times the streaming
1594 			 * detection was not correct. If this exceeds a
1595 			 * threshold, disable stretch ack on this
1596 			 * connection
1597 			 *
1598 			 * Also, go back to acking every other packet.
1599 			 */
1600 			if ((tp->t_flags & TF_STRETCHACK)) {
1601 				if (tp->t_unacksegs > 1 &&
1602 				    tp->t_unacksegs < maxseg_unacked) {
1603 					tp->t_stretchack_delayed++;
1604 				}
1605 
1606 				if (tp->t_stretchack_delayed >
1607 				    TCP_STRETCHACK_DELAY_THRESHOLD) {
1608 					tp->t_flagsext |= TF_DISABLE_STRETCHACK;
1609 					/*
1610 					 * Note the time at which stretch
1611 					 * ack was disabled automatically
1612 					 */
1613 					tp->rcv_nostrack_ts = tcp_now;
1614 					tcpstat.tcps_nostretchack++;
1615 					tp->t_stretchack_delayed = 0;
1616 					tp->rcv_nostrack_pkts = 0;
1617 				}
1618 				tcp_reset_stretch_ack(tp);
1619 			}
1620 			tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
1621 
1622 			/*
1623 			 * If we are measuring inter packet arrival jitter
1624 			 * for throttling a connection, this delayed ack
1625 			 * might be the reason for accumulating some
1626 			 * jitter. So let's restart the measurement.
1627 			 */
1628 			CLEAR_IAJ_STATE(tp);
1629 
1630 			tcpstat.tcps_delack++;
1631 			tp->t_stat.delayed_acks_sent++;
1632 			(void) tcp_output(tp);
1633 		}
1634 		break;
1635 
1636 #if MPTCP
1637 	case TCPT_JACK_RXMT:
1638 		if ((tp->t_state == TCPS_ESTABLISHED) &&
1639 		    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
1640 		    (tp->t_mpflags & TMPF_JOINED_FLOW)) {
1641 			if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
1642 				tcpstat.tcps_timeoutdrop++;
1643 				soevent(so,
1644 				    (SO_FILT_HINT_LOCKED |
1645 				    SO_FILT_HINT_TIMEOUT));
1646 				tp = tcp_drop(tp, tp->t_softerror ?
1647 				    tp->t_softerror : ETIMEDOUT);
1648 				break;
1649 			}
1650 			tcpstat.tcps_join_rxmts++;
1651 			tp->t_mpflags |= TMPF_SND_JACK;
1652 			tp->t_flags |= TF_ACKNOW;
1653 
1654 			/*
1655 			 * No backoff is implemented for simplicity for this
1656 			 * corner case.
1657 			 */
1658 			(void) tcp_output(tp);
1659 		}
1660 		break;
1661 	case TCPT_CELLICON:
1662 	{
1663 		struct mptses *mpte = tptomptp(tp)->mpt_mpte;
1664 
1665 		tp->t_timer[TCPT_CELLICON] = 0;
1666 
1667 		if (mpte->mpte_cellicon_increments == 0) {
1668 			/* Cell-icon not set by this connection */
1669 			break;
1670 		}
1671 
1672 		if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
1673 			mptcp_unset_cellicon(mpte, NULL, 1);
1674 		}
1675 
1676 		if (mpte->mpte_cellicon_increments) {
1677 			tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
1678 		}
1679 
1680 		break;
1681 	}
1682 #endif /* MPTCP */
1683 
1684 	case TCPT_PTO:
1685 	{
1686 		int32_t ret = 0;
1687 
1688 		if (!(tp->t_flagsext & TF_IF_PROBING)) {
1689 			tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1690 		}
1691 		/*
1692 		 * Check if the connection is in the right state to
1693 		 * send a probe
1694 		 */
1695 		if ((tp->t_state != TCPS_ESTABLISHED ||
1696 		    tp->t_rxtshift > 0 ||
1697 		    tp->snd_max == tp->snd_una ||
1698 		    !SACK_ENABLED(tp) ||
1699 		    (tcp_do_better_lr != 1 && !TAILQ_EMPTY(&tp->snd_holes)) ||
1700 		    IN_FASTRECOVERY(tp)) &&
1701 		    !(tp->t_flagsext & TF_IF_PROBING)) {
1702 			break;
1703 		}
1704 
1705 		/*
1706 		 * When the interface state is changed, explicitly reset the
1707 		 * retransmission timer state for both SYN and data packets, because
1708 		 * we do not want to wait unnecessarily or time out too quickly if
1709 		 * the link characteristics have changed drastically.
1710 		 */
1711 		if (tp->t_flagsext & TF_IF_PROBING) {
1712 			tp->t_rxtshift = 0;
1713 			if (tp->t_state == TCPS_SYN_SENT) {
1714 				tp->t_stat.synrxtshift = tp->t_rxtshift;
1715 			}
1716 			/*
1717 			 * Reset to the default RTO
1718 			 */
1719 			tp->t_srtt = TCPTV_SRTTBASE;
1720 			tp->t_rttvar =
1721 			    ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1722 			tp->t_rttmin = tp->t_flags & TF_LOCAL ? tcp_TCPTV_MIN :
1723 			    TCPTV_REXMTMIN;
1724 			TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1725 			    tp->t_rttmin, TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp));
1726 			TCP_LOG_RTT_INFO(tp);
1727 		}
1728 
1729 		if (tp->t_state == TCPS_SYN_SENT) {
1730 			/*
1731 			 * The PTO for SYN_SENT reinitializes TCP as if it was a fresh
1732 			 * connection attempt
1733 			 */
1734 			tp->snd_nxt = tp->snd_una;
1735 			/*
1736 			 * Note:  We overload snd_recover to function also as the
1737 			 * snd_last variable described in RFC 2582
1738 			 */
1739 			tp->snd_recover = tp->snd_max;
1740 			/*
1741 			 * Force a segment to be sent.
1742 			 */
1743 			tp->t_flags |= TF_ACKNOW;
1744 
1745 			/* If timing a segment in this window, stop the timer */
1746 			tp->t_rtttime = 0;
1747 		} else {
1748 			int32_t snd_len;
1749 
1750 			/*
1751 			 * If there is no new data to send or if the
1752 			 * connection is limited by receive window then
1753 			 * retransmit the last segment, otherwise send
1754 			 * new data.
1755 			 */
1756 			snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
1757 			    - (tp->snd_max - tp->snd_una);
1758 			if (snd_len > 0) {
1759 				tp->snd_nxt = tp->snd_max;
1760 			} else {
1761 				snd_len = min((tp->snd_max - tp->snd_una),
1762 				    tp->t_maxseg);
1763 				tp->snd_nxt = tp->snd_max - snd_len;
1764 			}
1765 		}
1766 
1767 		tcpstat.tcps_pto++;
1768 		if (tp->t_flagsext & TF_IF_PROBING) {
1769 			tcpstat.tcps_probe_if++;
1770 		}
1771 
1772 		/* If timing a segment in this window, stop the timer */
1773 		tp->t_rtttime = 0;
1774 		/* Note that tail loss probe is being sent. Exclude IF probe */
1775 		if (!(tp->t_flagsext & TF_IF_PROBING)) {
1776 			tp->t_flagsext |= TF_SENT_TLPROBE;
1777 			tp->t_tlpstart = tcp_now;
1778 		}
1779 
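		/*
		 * Temporarily inflate the congestion window by one MSS so
		 * that tcp_output() below can emit the probe even when the
		 * connection is cwnd-limited; the window is deflated again
		 * after the probe has been sent.
		 */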
1780 		tp->snd_cwnd += tp->t_maxseg;
1781 		/*
1782 		 * When tail-loss-probe fires, we reset the RTO timer, because
1783 		 * a probe just got sent, so we are good to push out the timer.
1784 		 *
1785 		 * Set to 0 to ensure that tcp_output() will reschedule it
1786 		 */
1787 		tp->t_timer[TCPT_REXMT] = 0;
1788 		ret = tcp_output(tp);
1789 
1790 #if (DEBUG || DEVELOPMENT)
1791 		if ((tp->t_flagsext & TF_IF_PROBING) &&
1792 		    ((IFNET_IS_COMPANION_LINK(tp->t_inpcb->inp_last_outifp)) ||
1793 		    tp->t_state == TCPS_SYN_SENT)) {
1794 			if (ret == 0 && tcp_probe_if_fix_port > 0 &&
1795 			    tcp_probe_if_fix_port <= IPPORT_HILASTAUTO) {
1796 				tp->t_timer[TCPT_REXMT] = 0;
1797 				tcp_set_lotimer_index(tp);
1798 			}
1799 
1800 			os_log(OS_LOG_DEFAULT,
1801 			    "%s: sent %s probe for %u > %u on interface %s"
1802 			    " (%u) %s(%d)",
1803 			    __func__,
1804 			    tp->t_state == TCPS_SYN_SENT ? "SYN" : "data",
1805 			    ntohs(tp->t_inpcb->inp_lport),
1806 			    ntohs(tp->t_inpcb->inp_fport),
1807 			    if_name(tp->t_inpcb->inp_last_outifp),
1808 			    tp->t_inpcb->inp_last_outifp->if_index,
1809 			    ret == 0 ? "succeeded" : "failed", ret);
1810 		}
1811 #endif /* DEBUG || DEVELOPMENT */
1812 
1813 		/*
1814 		 * When there is data (or a SYN) to send, the above call to
1815 		 * tcp_output() should have armed either the REXMT or the
1816 		 * PERSIST timer. If it didn't, something is wrong and this
1817 		 * connection would idle around forever. Let's make sure that
1818 		 * at least the REXMT timer is set.
1819 		 */
1820 		if (tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0 &&
1821 		    (tp->t_inpcb->inp_socket->so_snd.sb_cc != 0 || tp->t_state == TCPS_SYN_SENT ||
1822 		    tp->t_state == TCPS_SYN_RECEIVED)) {
1823 			tp->t_timer[TCPT_REXMT] =
1824 			    OFFSET_FROM_START(tp, tp->t_rxtcur);
1825 
1826 			os_log(OS_LOG_DEFAULT,
1827 			    "%s: tcp_output() returned %u with retransmission timer disabled "
1828 			    "for %u > %u in state %d, reset timer to %d",
1829 			    __func__, ret,
1830 			    ntohs(tp->t_inpcb->inp_lport),
1831 			    ntohs(tp->t_inpcb->inp_fport),
1832 			    tp->t_state,
1833 			    tp->t_timer[TCPT_REXMT]);
1834 
1835 			tcp_check_timer_state(tp);
1836 		}
1837 		tp->snd_cwnd -= tp->t_maxseg;
1838 
1839 		if (!(tp->t_flagsext & TF_IF_PROBING)) {
1840 			tp->t_tlphighrxt = tp->snd_nxt;
1841 		}
1842 		break;
1843 	}
1844 	case TCPT_DELAYFR:
1845 		tp->t_flagsext &= ~TF_DELAY_RECOVERY;
1846 
1847 		/*
1848 		 * Don't do anything if one of the following is true:
1849 		 * - the connection is already in recovery
1850 		 * - all data up to snd_recover has been acknowledged
1851 		 * - retransmit timeout has fired
1852 		 */
1853 		if (IN_FASTRECOVERY(tp) ||
1854 		    SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
1855 		    tp->t_rxtshift > 0) {
1856 			break;
1857 		}
1858 
1859 		VERIFY(SACK_ENABLED(tp));
1860 		tcp_rexmt_save_state(tp);
1861 		if (CC_ALGO(tp)->pre_fr != NULL) {
1862 			CC_ALGO(tp)->pre_fr(tp);
1863 			if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) {
1864 				tp->ecn_flags |= TE_SENDCWR;
1865 			}
1866 		}
1867 		ENTER_FASTRECOVERY(tp);
1868 
1869 		tp->t_timer[TCPT_REXMT] = 0;
1870 		tcpstat.tcps_sack_recovery_episode++;
1871 		tp->t_sack_recovery_episode++;
1872 		tp->sack_newdata = tp->snd_nxt;
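		/*
		 * As with regular SACK-based fast recovery, start
		 * retransmitting with a congestion window of a single
		 * segment; pre_fr() above has already let the congestion
		 * control algorithm adjust ssthresh.
		 */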
1873 		tp->snd_cwnd = tp->t_maxseg;
1874 		tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
1875 		(void) tcp_output(tp);
1876 		break;
1877 
1878 dropit:
1879 		tcpstat.tcps_keepdrops++;
1880 		soevent(so,
1881 		    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
1882 		tp = tcp_drop(tp, ETIMEDOUT);
1883 		break;
1884 	}
1885 #if TCPDEBUG
1886 	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
1887 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
1888 		    PRU_SLOWTIMO);
1889 	}
1890 #endif
1891 	return tp;
1892 }
1893 
1894 /* Remove a timer entry from timer list */
1895 void
1896 tcp_remove_timer(struct tcpcb *tp)
1897 {
1898 	struct tcptimerlist *listp = &tcp_timer_list;
1899 
1900 	socket_lock_assert_owned(tp->t_inpcb->inp_socket);
1901 	if (!(TIMER_IS_ON_LIST(tp))) {
1902 		return;
1903 	}
1904 	lck_mtx_lock(&listp->mtx);
1905 
1906 	/* Check if pcb is on timer list again after acquiring the lock */
1907 	if (!(TIMER_IS_ON_LIST(tp))) {
1908 		lck_mtx_unlock(&listp->mtx);
1909 		return;
1910 	}
1911 
1912 	if (listp->next_te != NULL && listp->next_te == &tp->tentry) {
1913 		listp->next_te = LIST_NEXT(&tp->tentry, le);
1914 	}
1915 
1916 	LIST_REMOVE(&tp->tentry, le);
1917 	tp->t_flags &= ~(TF_TIMER_ONLIST);
1918 
1919 	listp->entries--;
1920 
1921 	tp->tentry.le.le_next = NULL;
1922 	tp->tentry.le.le_prev = NULL;
1923 	lck_mtx_unlock(&listp->mtx);
1924 }
1925 
1926 /*
1927  * Function to check if the timerlist needs to be rescheduled to run
1928  * the timer entry correctly. Basically, this is to check if we can avoid
1929  * taking the list lock.
1930  */
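/*
 * Illustrative example (hypothetical numbers): if the list is already
 * scheduled to run 60ms after a 100ms-mode timer would like to fire,
 * the difference is within TCP_TIMER_100MS_QUANTUM, so the pending run
 * is considered close enough and the timer simply fires a little late.
 */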
1931 
1932 static boolean_t
1933 need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
1934 {
1935 	struct tcptimerlist *listp = &tcp_timer_list;
1936 	int32_t diff;
1937 
1938 	/*
1939 	 * If the list is being processed then the state of the list is
1940 	 * in flux. In this case always acquire the lock and set the state
1941 	 * correctly.
1942 	 */
1943 	if (listp->running) {
1944 		return TRUE;
1945 	}
1946 
1947 	if (!listp->scheduled) {
1948 		return TRUE;
1949 	}
1950 
1951 	diff = timer_diff(listp->runtime, 0, runtime, 0);
1952 	if (diff <= 0) {
1953 		/* The list is going to run before this timer */
1954 		return FALSE;
1955 	} else {
1956 		if (mode & TCP_TIMERLIST_10MS_MODE) {
1957 			if (diff <= TCP_TIMER_10MS_QUANTUM) {
1958 				return FALSE;
1959 			}
1960 		} else if (mode & TCP_TIMERLIST_100MS_MODE) {
1961 			if (diff <= TCP_TIMER_100MS_QUANTUM) {
1962 				return FALSE;
1963 			}
1964 		} else {
1965 			if (diff <= TCP_TIMER_500MS_QUANTUM) {
1966 				return FALSE;
1967 			}
1968 		}
1969 	}
1970 	return TRUE;
1971 }
1972 
1973 void
1974 tcp_sched_timerlist(uint32_t offset)
1975 {
1976 	uint64_t deadline = 0;
1977 	struct tcptimerlist *listp = &tcp_timer_list;
1978 
1979 	LCK_MTX_ASSERT(&listp->mtx, LCK_MTX_ASSERT_OWNED);
1980 
1981 	offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
1982 	listp->runtime = tcp_now + offset;
1983 	listp->schedtime = tcp_now;
1984 	if (listp->runtime == 0) {
1985 		listp->runtime++;
1986 		offset++;
1987 	}
1988 
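	/*
	 * offset is in milliseconds (TCP timer ticks); a scale factor of
	 * USEC_PER_SEC nanoseconds per unit makes each unit of the interval
	 * one millisecond for clock_interval_to_deadline().
	 */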
1989 	clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);
1990 
1991 	thread_call_enter_delayed(listp->call, deadline);
1992 	listp->scheduled = TRUE;
1993 }
1994 
1995 /*
1996  * Function to run the timers for a connection.
1997  *
1998  * Returns the offset of next timer to be run for this connection which
1999  * can be used to reschedule the timerlist.
2000  *
2001  * te_mode is an out parameter that indicates the modes of active
2002  * timers for this connection.
2003  */
2004 u_int32_t
2005 tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
2006     u_int16_t probe_if_index)
2007 {
2008 	struct socket *so;
2009 	u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
2010 	u_int32_t timer_val, offset = 0, lo_timer = 0;
2011 	int32_t diff;
2012 	boolean_t needtorun[TCPT_NTIMERS];
2013 	int count = 0;
2014 
2015 	VERIFY(tp != NULL);
2016 	bzero(needtorun, sizeof(needtorun));
2017 	*te_mode = 0;
2018 
2019 	socket_lock(tp->t_inpcb->inp_socket, 1);
2020 
2021 	so = tp->t_inpcb->inp_socket;
2022 	/* Release the want count on inp */
2023 	if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
2024 	    == WNT_STOPUSING) {
2025 		if (TIMER_IS_ON_LIST(tp)) {
2026 			tcp_remove_timer(tp);
2027 		}
2028 
2029 		/* Looks like the TCP connection got closed while we
2030 		 * were waiting for the lock. Done.
2031 		 */
2032 		goto done;
2033 	}
2034 
2035 	/*
2036 	 * If this connection is over an interface that needs to
2037 	 * be probed, send probe packets to reinitiate communication.
2038 	 */
2039 	if (TCP_IF_STATE_CHANGED(tp, probe_if_index)) {
2040 		tp->t_flagsext |= TF_IF_PROBING;
2041 		tcp_timers(tp, TCPT_PTO);
2042 		tp->t_timer[TCPT_PTO] = 0;
2043 		tp->t_flagsext &= ~TF_IF_PROBING;
2044 	}
2045 
2046 	/*
2047 	 * Since the timer thread needs to wait for tcp lock, it may race
2048 	 * with another thread that can cancel or reschedule the timer
2049 	 * that is about to run. Check if we need to run anything.
2050 	 */
2051 	if ((index = tp->tentry.index) == TCPT_NONE) {
2052 		goto done;
2053 	}
2054 
2055 	timer_val = tp->t_timer[index];
2056 
2057 	diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
2058 	if (diff > 0) {
2059 		if (tp->tentry.index != TCPT_NONE) {
2060 			offset = diff;
2061 			*(te_mode) = tp->tentry.mode;
2062 		}
2063 		goto done;
2064 	}
2065 
2066 	tp->t_timer[index] = 0;
2067 	if (timer_val > 0) {
2068 		tp = tcp_timers(tp, index);
2069 		if (tp == NULL) {
2070 			goto done;
2071 		}
2072 	}
2073 
2074 	/*
2075 	 * Check if there are any other timers that need to be run.
2076 	 * While doing it, adjust the timer values wrt tcp_now.
2077 	 */
2078 	tp->tentry.mode = 0;
2079 	for (i = 0; i < TCPT_NTIMERS; ++i) {
2080 		if (tp->t_timer[i] != 0) {
2081 			diff = timer_diff(tp->tentry.timer_start,
2082 			    tp->t_timer[i], tcp_now, 0);
2083 			if (diff <= 0) {
2084 				needtorun[i] = TRUE;
2085 				count++;
2086 			} else {
2087 				tp->t_timer[i] = diff;
2088 				needtorun[i] = FALSE;
2089 				if (lo_timer == 0 || diff < lo_timer) {
2090 					lo_timer = diff;
2091 					lo_index = i;
2092 				}
2093 				TCP_SET_TIMER_MODE(tp->tentry.mode, i);
2094 			}
2095 		}
2096 	}
2097 
2098 	tp->tentry.timer_start = tcp_now;
2099 	tp->tentry.index = lo_index;
2100 	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
2101 
2102 	if (tp->tentry.index != TCPT_NONE) {
2103 		tp->tentry.runtime = tp->tentry.timer_start +
2104 		    tp->t_timer[tp->tentry.index];
2105 		if (tp->tentry.runtime == 0) {
2106 			tp->tentry.runtime++;
2107 		}
2108 	}
2109 
2110 	if (count > 0) {
2111 		/* run any other timers outstanding at this time. */
2112 		for (i = 0; i < TCPT_NTIMERS; ++i) {
2113 			if (needtorun[i]) {
2114 				tp->t_timer[i] = 0;
2115 				tp = tcp_timers(tp, i);
2116 				if (tp == NULL) {
2117 					offset = 0;
2118 					*(te_mode) = 0;
2119 					goto done;
2120 				}
2121 			}
2122 		}
2123 		tcp_set_lotimer_index(tp);
2124 	}
2125 
2126 	if (tp->tentry.index < TCPT_NONE) {
2127 		offset = tp->t_timer[tp->tentry.index];
2128 		*(te_mode) = tp->tentry.mode;
2129 	}
2130 
2131 done:
2132 	if (tp != NULL && tp->tentry.index == TCPT_NONE) {
2133 		tcp_remove_timer(tp);
2134 		offset = 0;
2135 	}
2136 
2137 	socket_unlock(so, 1);
2138 	return offset;
2139 }
2140 
2141 void
2142 tcp_run_timerlist(void * arg1, void * arg2)
2143 {
2144 #pragma unused(arg1, arg2)
2145 	struct tcptimerentry *te, *next_te;
2146 	struct tcptimerlist *listp = &tcp_timer_list;
2147 	struct tcpcb *tp;
2148 	uint32_t next_timer = 0; /* offset of the next timer on the list */
2149 	u_int16_t te_mode = 0;  /* modes of all active timers in a tcpcb */
2150 	u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
2151 	uint32_t active_count = 0;
2152 
2153 	calculate_tcp_clock();
2154 
2155 	lck_mtx_lock(&listp->mtx);
2156 
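	/*
	 * Record how late (in milliseconds) the timer list is firing
	 * relative to the time it was scheduled for; the histogram below
	 * feeds the tcps_timer_drift_* counters.
	 */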
2157 	int32_t drift = tcp_now - listp->runtime;
2158 	if (drift <= 1) {
2159 		tcpstat.tcps_timer_drift_le_1_ms++;
2160 	} else if (drift <= 10) {
2161 		tcpstat.tcps_timer_drift_le_10_ms++;
2162 	} else if (drift <= 20) {
2163 		tcpstat.tcps_timer_drift_le_20_ms++;
2164 	} else if (drift <= 50) {
2165 		tcpstat.tcps_timer_drift_le_50_ms++;
2166 	} else if (drift <= 100) {
2167 		tcpstat.tcps_timer_drift_le_100_ms++;
2168 	} else if (drift <= 200) {
2169 		tcpstat.tcps_timer_drift_le_200_ms++;
2170 	} else if (drift <= 500) {
2171 		tcpstat.tcps_timer_drift_le_500_ms++;
2172 	} else if (drift <= 1000) {
2173 		tcpstat.tcps_timer_drift_le_1000_ms++;
2174 	} else {
2175 		tcpstat.tcps_timer_drift_gt_1000_ms++;
2176 	}
2177 
2178 	listp->running = TRUE;
2179 
2180 	LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
2181 		uint32_t offset = 0;
2182 		uint32_t runtime = te->runtime;
2183 
2184 		tp = TIMERENTRY_TO_TP(te);
2185 
2186 		/*
2187 		 * An interface probe may need to happen before the previously scheduled runtime
2188 		 */
2189 		if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now) &&
2190 		    !TCP_IF_STATE_CHANGED(tp, listp->probe_if_index)) {
2191 			offset = timer_diff(runtime, 0, tcp_now, 0);
2192 			if (next_timer == 0 || offset < next_timer) {
2193 				next_timer = offset;
2194 			}
2195 			list_mode |= te->mode;
2196 			continue;
2197 		}
2198 
2199 		/*
2200 		 * Acquire an inp wantcnt on the inpcb so that the socket
2201 		 * won't get detached even if tcp_close is called
2202 		 */
2203 		if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
2204 		    == WNT_STOPUSING) {
2205 			/*
2206 			 * Somehow this pcb went into a dead state while
2207 			 * on the timer list; just take it off the list.
2208 			 * Since the timer list entry pointers are
2209 			 * protected by the timer list lock, we can
2210 			 * do it here without the socket lock.
2211 			 */
2212 			if (TIMER_IS_ON_LIST(tp)) {
2213 				tp->t_flags &= ~(TF_TIMER_ONLIST);
2214 				LIST_REMOVE(&tp->tentry, le);
2215 				listp->entries--;
2216 
2217 				tp->tentry.le.le_next = NULL;
2218 				tp->tentry.le.le_prev = NULL;
2219 			}
2220 			continue;
2221 		}
2222 		active_count++;
2223 
2224 		/*
2225 		 * Store the next timerentry pointer before releasing the
2226 		 * list lock. If that entry has to be removed when we
2227 		 * release the lock, this pointer will be updated to the
2228 		 * element after that.
2229 		 */
2230 		listp->next_te = next_te;
2231 
2232 		VERIFY_NEXT_LINK(&tp->tentry, le);
2233 		VERIFY_PREV_LINK(&tp->tentry, le);
2234 
2235 		lck_mtx_unlock(&listp->mtx);
2236 
2237 		offset = tcp_run_conn_timer(tp, &te_mode,
2238 		    listp->probe_if_index);
2239 
2240 		lck_mtx_lock(&listp->mtx);
2241 
2242 		next_te = listp->next_te;
2243 		listp->next_te = NULL;
2244 
2245 		if (offset > 0 && te_mode != 0) {
2246 			list_mode |= te_mode;
2247 
2248 			if (next_timer == 0 || offset < next_timer) {
2249 				next_timer = offset;
2250 			}
2251 		}
2252 	}
2253 
2254 	if (!LIST_EMPTY(&listp->lhead)) {
2255 		uint32_t next_mode = 0;
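		/*
		 * Pick the finest quantum required by any active timer:
		 * 10ms if any fast timer is pending, then 100ms, otherwise
		 * fall back to 500ms mode. pref_mode may have been set by
		 * tcp_sched_timers() while this run was in progress and is
		 * honored as well.
		 */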
2256 		if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
2257 		    (listp->pref_mode & TCP_TIMERLIST_10MS_MODE)) {
2258 			next_mode = TCP_TIMERLIST_10MS_MODE;
2259 		} else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
2260 		    (listp->pref_mode & TCP_TIMERLIST_100MS_MODE)) {
2261 			next_mode = TCP_TIMERLIST_100MS_MODE;
2262 		} else {
2263 			next_mode = TCP_TIMERLIST_500MS_MODE;
2264 		}
2265 
2266 		if (next_mode != TCP_TIMERLIST_500MS_MODE) {
2267 			listp->idleruns = 0;
2268 		} else {
2269 			/*
2270 			 * the next required mode is slow mode, but if
2271 			 * the last one was a faster mode and we did not
2272 			 * have enough idle runs, repeat the last mode.
2273 			 *
2274 			 * We try to keep the timer list in fast mode for
2275 			 * some idle time in expectation of new data.
2276 			 */
2277 			if (listp->mode != next_mode &&
2278 			    listp->idleruns < timer_fastmode_idlemax) {
2279 				listp->idleruns++;
2280 				next_mode = listp->mode;
2281 				next_timer = TCP_TIMER_100MS_QUANTUM;
2282 			} else {
2283 				listp->idleruns = 0;
2284 			}
2285 		}
2286 		listp->mode = next_mode;
2287 		if (listp->pref_offset != 0) {
2288 			next_timer = min(listp->pref_offset, next_timer);
2289 		}
2290 
2291 		if (listp->mode == TCP_TIMERLIST_500MS_MODE) {
2292 			next_timer = max(next_timer,
2293 			    TCP_TIMER_500MS_QUANTUM);
2294 		}
2295 
2296 		tcp_sched_timerlist(next_timer);
2297 	} else {
2298 		/*
2299 		 * Nothing on the list needs rescheduling, but keep the
2300 		 * timer list running periodically, at a much coarser granularity.
2301 		 */
2302 		tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
2303 	}
2304 
2305 	listp->running = FALSE;
2306 	listp->pref_mode = 0;
2307 	listp->pref_offset = 0;
2308 	listp->probe_if_index = 0;
2309 
2310 	lck_mtx_unlock(&listp->mtx);
2311 }
2312 
2313 /*
2314  * Function to check if the timerlist needs to be rescheduled to run this
2315  * connection's timers correctly.
2316  */
2317 void
2318 tcp_sched_timers(struct tcpcb *tp)
2319 {
2320 	struct tcptimerentry *te = &tp->tentry;
2321 	u_int16_t index = te->index;
2322 	u_int16_t mode = te->mode;
2323 	struct tcptimerlist *listp = &tcp_timer_list;
2324 	int32_t offset = 0;
2325 	boolean_t list_locked = FALSE;
2326 
2327 	if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
2328 		/* Just return without adding the dead pcb to the list */
2329 		if (TIMER_IS_ON_LIST(tp)) {
2330 			tcp_remove_timer(tp);
2331 		}
2332 		return;
2333 	}
2334 
2335 	if (index == TCPT_NONE) {
2336 		/* Nothing to run */
2337 		tcp_remove_timer(tp);
2338 		return;
2339 	}
2340 
2341 	/*
2342 	 * compute the offset at which the next timer for this connection
2343 	 * has to run.
2344 	 */
2345 	offset = timer_diff(te->runtime, 0, tcp_now, 0);
2346 	if (offset <= 0) {
2347 		offset = 1;
2348 		tcp_timer_advanced++;
2349 	}
2350 
2351 	if (!TIMER_IS_ON_LIST(tp)) {
2352 		if (!list_locked) {
2353 			lck_mtx_lock(&listp->mtx);
2354 			list_locked = TRUE;
2355 		}
2356 
2357 		if (!TIMER_IS_ON_LIST(tp)) {
2358 			LIST_INSERT_HEAD(&listp->lhead, te, le);
2359 			tp->t_flags |= TF_TIMER_ONLIST;
2360 
2361 			listp->entries++;
2362 			if (listp->entries > listp->maxentries) {
2363 				listp->maxentries = listp->entries;
2364 			}
2365 
2366 			/* if the list is not scheduled, just schedule it */
2367 			if (!listp->scheduled) {
2368 				goto schedule;
2369 			}
2370 		}
2371 	}
2372 
2373 	/*
2374 	 * Timer entry is currently on the list, check if the list needs
2375 	 * to be rescheduled.
2376 	 */
2377 	if (need_to_resched_timerlist(te->runtime, mode)) {
2378 		tcp_resched_timerlist++;
2379 
2380 		if (!list_locked) {
2381 			lck_mtx_lock(&listp->mtx);
2382 			list_locked = TRUE;
2383 		}
2384 
2385 		VERIFY_NEXT_LINK(te, le);
2386 		VERIFY_PREV_LINK(te, le);
2387 
2388 		if (listp->running) {
2389 			listp->pref_mode |= mode;
2390 			if (listp->pref_offset == 0 ||
2391 			    offset < listp->pref_offset) {
2392 				listp->pref_offset = offset;
2393 			}
2394 		} else {
2395 			/*
2396 			 * The list could have got rescheduled while
2397 			 * this thread was waiting for the lock
2398 			 */
2399 			if (listp->scheduled) {
2400 				int32_t diff;
2401 				diff = timer_diff(listp->runtime, 0,
2402 				    tcp_now, offset);
2403 				if (diff <= 0) {
2404 					goto done;
2405 				} else {
2406 					goto schedule;
2407 				}
2408 			} else {
2409 				goto schedule;
2410 			}
2411 		}
2412 	}
2413 	goto done;
2414 
2415 schedule:
2416 	/*
2417 	 * Since a connection with timers is getting scheduled, the timer
2418 	 * list moves from idle to active state, which is why the idle-run
2419 	 * counter (idleruns) is reset
2420 	 */
2421 	if (mode & TCP_TIMERLIST_10MS_MODE) {
2422 		listp->mode = TCP_TIMERLIST_10MS_MODE;
2423 		listp->idleruns = 0;
2424 		offset = min(offset, TCP_TIMER_10MS_QUANTUM);
2425 	} else if (mode & TCP_TIMERLIST_100MS_MODE) {
2426 		if (listp->mode > TCP_TIMERLIST_100MS_MODE) {
2427 			listp->mode = TCP_TIMERLIST_100MS_MODE;
2428 		}
2429 		listp->idleruns = 0;
2430 		offset = min(offset, TCP_TIMER_100MS_QUANTUM);
2431 	}
2432 	tcp_sched_timerlist(offset);
2433 
2434 done:
2435 	if (list_locked) {
2436 		lck_mtx_unlock(&listp->mtx);
2437 	}
2438 
2439 	return;
2440 }
2441 
2442 static inline void
2443 tcp_set_lotimer_index(struct tcpcb *tp)
2444 {
2445 	uint16_t i, lo_index = TCPT_NONE, mode = 0;
2446 	uint32_t lo_timer = 0;
2447 	for (i = 0; i < TCPT_NTIMERS; ++i) {
2448 		if (tp->t_timer[i] != 0) {
2449 			TCP_SET_TIMER_MODE(mode, i);
2450 			if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
2451 				lo_timer = tp->t_timer[i];
2452 				lo_index = i;
2453 			}
2454 		}
2455 	}
2456 	tp->tentry.index = lo_index;
2457 	tp->tentry.mode = mode;
2458 	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
2459 
2460 	if (tp->tentry.index != TCPT_NONE) {
2461 		tp->tentry.runtime = tp->tentry.timer_start
2462 		    + tp->t_timer[tp->tentry.index];
2463 		if (tp->tentry.runtime == 0) {
2464 			tp->tentry.runtime++;
2465 		}
2466 	}
2467 }
2468 
2469 void
2470 tcp_check_timer_state(struct tcpcb *tp)
2471 {
2472 	socket_lock_assert_owned(tp->t_inpcb->inp_socket);
2473 
2474 	if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT) {
2475 		return;
2476 	}
2477 
2478 	tcp_set_lotimer_index(tp);
2479 
2480 	tcp_sched_timers(tp);
2481 	return;
2482 }
2483 
2484 static inline void
2485 tcp_cumulative_stat(u_int32_t cur, u_int32_t *prev, u_int32_t *dest)
2486 {
2487 	/* handle wrap around */
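	/*
	 * Example: with cur == 2 and *prev == 0xfffffffe, the unsigned
	 * subtraction wraps to 4 and the cast keeps the positive delta;
	 * a negative result (e.g. after a counter reset) reports 0.
	 */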
2488 	int32_t diff = (int32_t) (cur - *prev);
2489 	if (diff > 0) {
2490 		*dest = diff;
2491 	} else {
2492 		*dest = 0;
2493 	}
2494 	*prev = cur;
2495 	return;
2496 }
2497 
2498 static inline void
2499 tcp_cumulative_stat64(u_int64_t cur, u_int64_t *prev, u_int64_t *dest)
2500 {
2501 	/* handle wrap around */
2502 	int64_t diff = (int64_t) (cur - *prev);
2503 	if (diff > 0) {
2504 		*dest = diff;
2505 	} else {
2506 		*dest = 0;
2507 	}
2508 	*prev = cur;
2509 	return;
2510 }
2511 
2512 __private_extern__ void
2513 tcp_report_stats(void)
2514 {
2515 	struct nstat_sysinfo_data data;
2516 	struct sockaddr_in dst;
2517 	struct sockaddr_in6 dst6;
2518 	struct rtentry *rt = NULL;
2519 	static struct tcp_last_report_stats prev;
2520 	u_int64_t var, uptime;
2521 
2522 #define stat    data.u.tcp_stats
2523 	if (((uptime = net_uptime()) - tcp_last_report_time) <
2524 	    tcp_report_stats_interval) {
2525 		return;
2526 	}
2527 
2528 	tcp_last_report_time = uptime;
2529 
2530 	bzero(&data, sizeof(data));
2531 	data.flags = NSTAT_SYSINFO_TCP_STATS;
2532 
2533 	SOCKADDR_ZERO(&dst, sizeof(dst));
2534 	dst.sin_len = sizeof(dst);
2535 	dst.sin_family = AF_INET;
2536 
2537 	/* ipv4 avg rtt */
2538 	lck_mtx_lock(rnh_lock);
2539 	rt = rt_lookup(TRUE, SA(&dst), NULL,
2540 	    rt_tables[AF_INET], IFSCOPE_NONE);
2541 	lck_mtx_unlock(rnh_lock);
2542 	if (rt != NULL) {
2543 		RT_LOCK(rt);
2544 		if (rt_primary_default(rt, rt_key(rt)) &&
2545 		    rt->rt_stats != NULL) {
2546 			stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
2547 		}
2548 		RT_UNLOCK(rt);
2549 		rtfree(rt);
2550 		rt = NULL;
2551 	}
2552 
2553 	/* ipv6 avg rtt */
2554 	SOCKADDR_ZERO(&dst6, sizeof(dst6));
2555 	dst6.sin6_len = sizeof(dst6);
2556 	dst6.sin6_family = AF_INET6;
2557 
2558 	lck_mtx_lock(rnh_lock);
2559 	rt = rt_lookup(TRUE, SA(&dst6), NULL,
2560 	    rt_tables[AF_INET6], IFSCOPE_NONE);
2561 	lck_mtx_unlock(rnh_lock);
2562 	if (rt != NULL) {
2563 		RT_LOCK(rt);
2564 		if (rt_primary_default(rt, rt_key(rt)) &&
2565 		    rt->rt_stats != NULL) {
2566 			stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
2567 		}
2568 		RT_UNLOCK(rt);
2569 		rtfree(rt);
2570 		rt = NULL;
2571 	}
2572 
2573 	/* send packet loss rate, shift by 10 for precision */
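	/*
	 * e.g. 1000 retransmits out of 100000 sent packets yields
	 * (1000 << 10) * 100 / 100000 = 1024, i.e. roughly 1% expressed
	 * in Q10 fixed point (illustrative numbers).
	 */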
2574 	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
2575 		var = tcpstat.tcps_sndrexmitpack << 10;
2576 		stat.send_plr = (uint32_t)((var * 100) / tcpstat.tcps_sndpack);
2577 	}
2578 
2579 	/* recv packet loss rate, shift by 10 for precision */
2580 	if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
2581 		var = tcpstat.tcps_recovered_pkts << 10;
2582 		stat.recv_plr = (uint32_t)((var * 100) / tcpstat.tcps_rcvpack);
2583 	}
2584 
2585 	/* RTO after tail loss, shift by 10 for precision */
2586 	if (tcpstat.tcps_sndrexmitpack > 0
2587 	    && tcpstat.tcps_tailloss_rto > 0) {
2588 		var = tcpstat.tcps_tailloss_rto << 10;
2589 		stat.send_tlrto_rate =
2590 		    (uint32_t)((var * 100) / tcpstat.tcps_sndrexmitpack);
2591 	}
2592 
2593 	/* packet reordering */
2594 	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
2595 		var = tcpstat.tcps_reordered_pkts << 10;
2596 		stat.send_reorder_rate =
2597 		    (uint32_t)((var * 100) / tcpstat.tcps_sndpack);
2598 	}
2599 
2600 	if (tcp_ecn_outbound == 1) {
2601 		stat.ecn_client_enabled = 1;
2602 	}
2603 	if (tcp_ecn_inbound == 1) {
2604 		stat.ecn_server_enabled = 1;
2605 	}
2606 	tcp_cumulative_stat(tcpstat.tcps_connattempt,
2607 	    &prev.tcps_connattempt, &stat.connection_attempts);
2608 	tcp_cumulative_stat(tcpstat.tcps_accepts,
2609 	    &prev.tcps_accepts, &stat.connection_accepts);
2610 	tcp_cumulative_stat(tcpstat.tcps_ecn_client_setup,
2611 	    &prev.tcps_ecn_client_setup, &stat.ecn_client_setup);
2612 	tcp_cumulative_stat(tcpstat.tcps_ecn_server_setup,
2613 	    &prev.tcps_ecn_server_setup, &stat.ecn_server_setup);
2614 	tcp_cumulative_stat(tcpstat.tcps_ecn_client_success,
2615 	    &prev.tcps_ecn_client_success, &stat.ecn_client_success);
2616 	tcp_cumulative_stat(tcpstat.tcps_ecn_server_success,
2617 	    &prev.tcps_ecn_server_success, &stat.ecn_server_success);
2618 	tcp_cumulative_stat(tcpstat.tcps_ecn_not_supported,
2619 	    &prev.tcps_ecn_not_supported, &stat.ecn_not_supported);
2620 	tcp_cumulative_stat(tcpstat.tcps_ecn_lost_syn,
2621 	    &prev.tcps_ecn_lost_syn, &stat.ecn_lost_syn);
2622 	tcp_cumulative_stat(tcpstat.tcps_ecn_lost_synack,
2623 	    &prev.tcps_ecn_lost_synack, &stat.ecn_lost_synack);
2624 	tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ce,
2625 	    &prev.tcps_ecn_recv_ce, &stat.ecn_recv_ce);
2626 	tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ece,
2627 	    &prev.tcps_ecn_recv_ece, &stat.ecn_recv_ece);
2630 	tcp_cumulative_stat(tcpstat.tcps_ecn_sent_ece,
2631 	    &prev.tcps_ecn_sent_ece, &stat.ecn_sent_ece);
2634 	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ce,
2635 	    &prev.tcps_ecn_conn_recv_ce, &stat.ecn_conn_recv_ce);
2636 	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ece,
2637 	    &prev.tcps_ecn_conn_recv_ece, &stat.ecn_conn_recv_ece);
2638 	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_plnoce,
2639 	    &prev.tcps_ecn_conn_plnoce, &stat.ecn_conn_plnoce);
2640 	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_pl_ce,
2641 	    &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce);
2642 	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce,
2643 	    &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce);
2644 	tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_synloss,
2645 	    &prev.tcps_ecn_fallback_synloss, &stat.ecn_fallback_synloss);
2646 	tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_reorder,
2647 	    &prev.tcps_ecn_fallback_reorder, &stat.ecn_fallback_reorder);
2648 	tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_ce,
2649 	    &prev.tcps_ecn_fallback_ce, &stat.ecn_fallback_ce);
2650 	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv,
2651 	    &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv);
2652 	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv,
2653 	    &prev.tcps_tfo_cookie_req_rcv, &stat.tfo_cookie_req_rcv);
2654 	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_sent,
2655 	    &prev.tcps_tfo_cookie_sent, &stat.tfo_cookie_sent);
2656 	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_invalid,
2657 	    &prev.tcps_tfo_cookie_invalid, &stat.tfo_cookie_invalid);
2658 	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req,
2659 	    &prev.tcps_tfo_cookie_req, &stat.tfo_cookie_req);
2660 	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_rcv,
2661 	    &prev.tcps_tfo_cookie_rcv, &stat.tfo_cookie_rcv);
2662 	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_sent,
2663 	    &prev.tcps_tfo_syn_data_sent, &stat.tfo_syn_data_sent);
2664 	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_acked,
2665 	    &prev.tcps_tfo_syn_data_acked, &stat.tfo_syn_data_acked);
2666 	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_loss,
2667 	    &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss);
2668 	tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole,
2669 	    &prev.tcps_tfo_blackhole, &stat.tfo_blackhole);
2670 	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_wrong,
2671 	    &prev.tcps_tfo_cookie_wrong, &stat.tfo_cookie_wrong);
2672 	tcp_cumulative_stat(tcpstat.tcps_tfo_no_cookie_rcv,
2673 	    &prev.tcps_tfo_no_cookie_rcv, &stat.tfo_no_cookie_rcv);
2674 	tcp_cumulative_stat(tcpstat.tcps_tfo_heuristics_disable,
2675 	    &prev.tcps_tfo_heuristics_disable, &stat.tfo_heuristics_disable);
2676 	tcp_cumulative_stat(tcpstat.tcps_tfo_sndblackhole,
2677 	    &prev.tcps_tfo_sndblackhole, &stat.tfo_sndblackhole);
2678 
2679 
2680 	tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_attempt,
2681 	    &prev.tcps_mptcp_handover_attempt, &stat.mptcp_handover_attempt);
2682 	tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_attempt,
2683 	    &prev.tcps_mptcp_interactive_attempt, &stat.mptcp_interactive_attempt);
2684 	tcp_cumulative_stat(tcpstat.tcps_mptcp_aggregate_attempt,
2685 	    &prev.tcps_mptcp_aggregate_attempt, &stat.mptcp_aggregate_attempt);
2686 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_attempt,
2687 	    &prev.tcps_mptcp_fp_handover_attempt, &stat.mptcp_fp_handover_attempt);
2688 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_interactive_attempt,
2689 	    &prev.tcps_mptcp_fp_interactive_attempt, &stat.mptcp_fp_interactive_attempt);
2690 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_aggregate_attempt,
2691 	    &prev.tcps_mptcp_fp_aggregate_attempt, &stat.mptcp_fp_aggregate_attempt);
2692 	tcp_cumulative_stat(tcpstat.tcps_mptcp_heuristic_fallback,
2693 	    &prev.tcps_mptcp_heuristic_fallback, &stat.mptcp_heuristic_fallback);
2694 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_heuristic_fallback,
2695 	    &prev.tcps_mptcp_fp_heuristic_fallback, &stat.mptcp_fp_heuristic_fallback);
2696 	tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_success_wifi,
2697 	    &prev.tcps_mptcp_handover_success_wifi, &stat.mptcp_handover_success_wifi);
2698 	tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_success_cell,
2699 	    &prev.tcps_mptcp_handover_success_cell, &stat.mptcp_handover_success_cell);
2700 	tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_success,
2701 	    &prev.tcps_mptcp_interactive_success, &stat.mptcp_interactive_success);
2702 	tcp_cumulative_stat(tcpstat.tcps_mptcp_aggregate_success,
2703 	    &prev.tcps_mptcp_aggregate_success, &stat.mptcp_aggregate_success);
2704 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_success_wifi,
2705 	    &prev.tcps_mptcp_fp_handover_success_wifi, &stat.mptcp_fp_handover_success_wifi);
2706 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_success_cell,
2707 	    &prev.tcps_mptcp_fp_handover_success_cell, &stat.mptcp_fp_handover_success_cell);
2708 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_interactive_success,
2709 	    &prev.tcps_mptcp_fp_interactive_success, &stat.mptcp_fp_interactive_success);
2710 	tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_aggregate_success,
2711 	    &prev.tcps_mptcp_fp_aggregate_success, &stat.mptcp_fp_aggregate_success);
2712 	tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_cell_from_wifi,
2713 	    &prev.tcps_mptcp_handover_cell_from_wifi, &stat.mptcp_handover_cell_from_wifi);
2714 	tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_wifi_from_cell,
2715 	    &prev.tcps_mptcp_handover_wifi_from_cell, &stat.mptcp_handover_wifi_from_cell);
2716 	tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_cell_from_wifi,
2717 	    &prev.tcps_mptcp_interactive_cell_from_wifi, &stat.mptcp_interactive_cell_from_wifi);
2718 	tcp_cumulative_stat64(tcpstat.tcps_mptcp_handover_cell_bytes,
2719 	    &prev.tcps_mptcp_handover_cell_bytes, &stat.mptcp_handover_cell_bytes);
2720 	tcp_cumulative_stat64(tcpstat.tcps_mptcp_interactive_cell_bytes,
2721 	    &prev.tcps_mptcp_interactive_cell_bytes, &stat.mptcp_interactive_cell_bytes);
2722 	tcp_cumulative_stat64(tcpstat.tcps_mptcp_aggregate_cell_bytes,
2723 	    &prev.tcps_mptcp_aggregate_cell_bytes, &stat.mptcp_aggregate_cell_bytes);
2724 	tcp_cumulative_stat64(tcpstat.tcps_mptcp_handover_all_bytes,
2725 	    &prev.tcps_mptcp_handover_all_bytes, &stat.mptcp_handover_all_bytes);
2726 	tcp_cumulative_stat64(tcpstat.tcps_mptcp_interactive_all_bytes,
2727 	    &prev.tcps_mptcp_interactive_all_bytes, &stat.mptcp_interactive_all_bytes);
2728 	tcp_cumulative_stat64(tcpstat.tcps_mptcp_aggregate_all_bytes,
2729 	    &prev.tcps_mptcp_aggregate_all_bytes, &stat.mptcp_aggregate_all_bytes);
2730 	tcp_cumulative_stat(tcpstat.tcps_mptcp_back_to_wifi,
2731 	    &prev.tcps_mptcp_back_to_wifi, &stat.mptcp_back_to_wifi);
2732 	tcp_cumulative_stat(tcpstat.tcps_mptcp_wifi_proxy,
2733 	    &prev.tcps_mptcp_wifi_proxy, &stat.mptcp_wifi_proxy);
2734 	tcp_cumulative_stat(tcpstat.tcps_mptcp_cell_proxy,
2735 	    &prev.tcps_mptcp_cell_proxy, &stat.mptcp_cell_proxy);
2736 	tcp_cumulative_stat(tcpstat.tcps_mptcp_triggered_cell,
2737 	    &prev.tcps_mptcp_triggered_cell, &stat.mptcp_triggered_cell);
2738 
2739 	nstat_sysinfo_send_data(&data);
2740 
2741 #undef  stat
2742 }
2743 
2744 void
2745 tcp_interface_send_probe(u_int16_t probe_if_index)
2746 {
2747 	int32_t offset = 0;
2748 	struct tcptimerlist *listp = &tcp_timer_list;
2749 
2750 	/* Make sure TCP clock is up to date */
2751 	calculate_tcp_clock();
2752 
2753 	lck_mtx_lock(&listp->mtx);
2754 	if (listp->probe_if_index > 0 && listp->probe_if_index != probe_if_index) {
2755 		tcpstat.tcps_probe_if_conflict++;
2756 		os_log(OS_LOG_DEFAULT,
2757 		    "%s: probe_if_index %u conflicts with %u, tcps_probe_if_conflict %u\n",
2758 		    __func__, probe_if_index, listp->probe_if_index,
2759 		    tcpstat.tcps_probe_if_conflict);
2760 		goto done;
2761 	}
2762 
2763 	listp->probe_if_index = probe_if_index;
2764 	if (listp->running) {
2765 		os_log(OS_LOG_DEFAULT, "%s: timer list already running for if_index %u\n",
2766 		    __func__, probe_if_index);
2767 		goto done;
2768 	}
2769 
2770 	/*
2771 	 * Reschedule the timerlist to run within the next 10ms, which is
2772 	 * the fastest that we can do.
2773 	 */
2774 	offset = TCP_TIMER_10MS_QUANTUM;
2775 	if (listp->scheduled) {
2776 		int32_t diff;
2777 		diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2778 		if (diff <= 0) {
2779 			/* The timer will fire sooner than what's needed */
2780 			os_log(OS_LOG_DEFAULT,
2781 			    "%s: timer will fire sooner than needed for if_index %u\n",
2782 			    __func__, probe_if_index);
2783 			goto done;
2784 		}
2785 	}
2786 	listp->mode = TCP_TIMERLIST_10MS_MODE;
2787 	listp->idleruns = 0;
2788 
2789 	tcp_sched_timerlist(offset);
2790 
2791 done:
2792 	lck_mtx_unlock(&listp->mtx);
2793 	return;
2794 }
2795 
2796 /*
2797  * Enable read probes on this connection, if:
2798  * - it is in established state
2799  * - doesn't have any data outstanding
2800  * - the outgoing ifp matches
2801  * - we have not already sent any read probes
2802  */
2803 static void
2804 tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp)
2805 {
2806 	if (tp->t_state == TCPS_ESTABLISHED &&
2807 	    tp->snd_max == tp->snd_una &&
2808 	    tp->t_inpcb->inp_last_outifp == ifp &&
2809 	    !(tp->t_flagsext & TF_DETECT_READSTALL) &&
2810 	    tp->t_rtimo_probes == 0) {
2811 		tp->t_flagsext |= TF_DETECT_READSTALL;
2812 		tp->t_rtimo_probes = 0;
2813 		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2814 		    TCP_TIMER_10MS_QUANTUM);
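		/*
		 * Poke the timer entry directly so that the probe fires
		 * within the next 10ms; the caller is expected to
		 * reschedule the timer list itself, which avoids taking
		 * the timer list lock for every connection here.
		 */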
2815 		if (tp->tentry.index == TCPT_NONE) {
2816 			tp->tentry.index = TCPT_KEEP;
2817 			tp->tentry.runtime = tcp_now +
2818 			    TCP_TIMER_10MS_QUANTUM;
2819 		} else {
2820 			int32_t diff = 0;
2821 
2822 			/* Reset runtime to be in next 10ms */
2823 			diff = timer_diff(tp->tentry.runtime, 0,
2824 			    tcp_now, TCP_TIMER_10MS_QUANTUM);
2825 			if (diff > 0) {
2826 				tp->tentry.index = TCPT_KEEP;
2827 				tp->tentry.runtime = tcp_now +
2828 				    TCP_TIMER_10MS_QUANTUM;
2829 				if (tp->tentry.runtime == 0) {
2830 					tp->tentry.runtime++;
2831 				}
2832 			}
2833 		}
2834 	}
2835 }
2836 
2837 /*
2838  * Disable read probe and reset the keep alive timer
2839  */
2840 static void
2841 tcp_disable_read_probe(struct tcpcb *tp)
2842 {
2843 	if (tp->t_adaptive_rtimo == 0 &&
2844 	    ((tp->t_flagsext & TF_DETECT_READSTALL) ||
2845 	    tp->t_rtimo_probes > 0)) {
2846 		tcp_keepalive_reset(tp);
2847 
2848 		if (tp->t_mpsub) {
2849 			mptcp_reset_keepalive(tp);
2850 		}
2851 	}
2852 }
2853 
2854 /*
2855  * Reschedule the tcp timerlist in the next 10ms to re-enable read/write
2856  * probes on connections going over a particular interface.
2857  */
2858 void
2859 tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable)
2860 {
2861 	int32_t offset;
2862 	struct tcptimerlist *listp = &tcp_timer_list;
2863 	struct inpcbinfo *pcbinfo = &tcbinfo;
2864 	struct inpcb *inp, *nxt;
2865 
2866 	if (ifp == NULL) {
2867 		return;
2868 	}
2869 
2870 	/* update clock */
2871 	calculate_tcp_clock();
2872 
2873 	/*
2874 	 * Enable keep alive timer on all connections that are
2875 	 * active/established on this interface.
2876 	 */
2877 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
2878 
2879 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, nxt) {
2880 		struct tcpcb *tp = NULL;
2881 		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) ==
2882 		    WNT_STOPUSING) {
2883 			continue;
2884 		}
2885 
2886 		/* Acquire lock to look at the state of the connection */
2887 		socket_lock(inp->inp_socket, 1);
2888 
2889 		/* Release the want count */
2890 		if (inp->inp_ppcb == NULL ||
2891 		    (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
2892 			socket_unlock(inp->inp_socket, 1);
2893 			continue;
2894 		}
2895 		tp = intotcpcb(inp);
2896 		if (enable) {
2897 			tcp_enable_read_probe(tp, ifp);
2898 		} else {
2899 			tcp_disable_read_probe(tp);
2900 		}
2901 
2902 		socket_unlock(inp->inp_socket, 1);
2903 	}
2904 	lck_rw_done(&pcbinfo->ipi_lock);
2905 
2906 	lck_mtx_lock(&listp->mtx);
2907 	if (listp->running) {
2908 		listp->pref_mode |= TCP_TIMERLIST_10MS_MODE;
2909 		goto done;
2910 	}
2911 
2912 	/* Reschedule within the next 10ms */
2913 	offset = TCP_TIMER_10MS_QUANTUM;
2914 	if (listp->scheduled) {
2915 		int32_t diff;
2916 		diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2917 		if (diff <= 0) {
2918 			/* The timer will fire sooner than what's needed */
2919 			goto done;
2920 		}
2921 	}
2922 	listp->mode = TCP_TIMERLIST_10MS_MODE;
2923 	listp->idleruns = 0;
2924 
2925 	tcp_sched_timerlist(offset);
2926 done:
2927 	lck_mtx_unlock(&listp->mtx);
2928 	return;
2929 }
2930 
2931 inline void
2932 tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp)
2933 {
2934 	struct if_cellular_status_v1 *ifsr;
2935 	u_int32_t optlen;
2936 	ifsr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
2937 	if (ifsr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) {
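		/*
		 * t_maxopd counts the MSS plus TCP option overhead;
		 * remember the option bytes so the recommended ceiling can
		 * be applied to t_maxopd and the same overhead subtracted
		 * back out for t_maxseg below.
		 */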
2938 		optlen = tp->t_maxopd - tp->t_maxseg;
2939 
2940 		if (ifsr->mss_recommended ==
2941 		    IF_CELL_UL_MSS_RECOMMENDED_NONE &&
2942 		    tp->t_cached_maxopd > 0 &&
2943 		    tp->t_maxopd < tp->t_cached_maxopd) {
2944 			tp->t_maxopd = tp->t_cached_maxopd;
2945 			tcpstat.tcps_mss_to_default++;
2946 		} else if (ifsr->mss_recommended ==
2947 		    IF_CELL_UL_MSS_RECOMMENDED_MEDIUM &&
2948 		    tp->t_maxopd > tcp_mss_rec_medium) {
2949 			tp->t_cached_maxopd = tp->t_maxopd;
2950 			tp->t_maxopd = tcp_mss_rec_medium;
2951 			tcpstat.tcps_mss_to_medium++;
2952 		} else if (ifsr->mss_recommended ==
2953 		    IF_CELL_UL_MSS_RECOMMENDED_LOW &&
2954 		    tp->t_maxopd > tcp_mss_rec_low) {
2955 			tp->t_cached_maxopd = tp->t_maxopd;
2956 			tp->t_maxopd = tcp_mss_rec_low;
2957 			tcpstat.tcps_mss_to_low++;
2958 		}
2959 		tp->t_maxseg = tp->t_maxopd - optlen;
2960 
2961 		/*
2962 		 * clear the cached value if it is the same as the current one
2963 		 */
2964 		if (tp->t_maxopd == tp->t_cached_maxopd) {
2965 			tp->t_cached_maxopd = 0;
2966 		}
2967 	}
2968 }
2969 
2970 void
2971 tcp_update_mss_locked(struct socket *so, struct ifnet *ifp)
2972 {
2973 	struct inpcb *inp = sotoinpcb(so);
2974 	struct tcpcb *tp = intotcpcb(inp);
2975 
2976 	if (ifp == NULL && (ifp = inp->inp_last_outifp) == NULL) {
2977 		return;
2978 	}
2979 
2980 	if (!IFNET_IS_CELLULAR(ifp)) {
2981 		/*
2982 		 * This optimization is implemented for cellular
2983 		 * networks only
2984 		 */
2985 		return;
2986 	}
2987 	if (tp->t_state <= TCPS_CLOSE_WAIT) {
2988 		/*
2989 		 * If the connection is currently doing or has done PMTU
2990 		 * blackhole detection, do not change the MSS
2991 		 */
2992 		if (tp->t_flags & TF_BLACKHOLE) {
2993 			return;
2994 		}
2995 		if (ifp->if_link_status == NULL) {
2996 			return;
2997 		}
2998 		tcp_update_mss_core(tp, ifp);
2999 	}
3000 }
3001 
3002 void
3003 tcp_itimer(struct inpcbinfo *ipi)
3004 {
3005 	struct inpcb *inp, *nxt;
3006 
3007 	if (lck_rw_try_lock_exclusive(&ipi->ipi_lock) == FALSE) {
3008 		if (tcp_itimer_done == TRUE) {
3009 			tcp_itimer_done = FALSE;
3010 			os_atomic_inc(&ipi->ipi_timer_req.intimer_fast, relaxed);
3011 			return;
3012 		}
3013 		/* Try-lock failed, now block to take the lock exclusively */
3014 		lck_rw_lock_exclusive(&ipi->ipi_lock);
3015 	}
3016 	tcp_itimer_done = TRUE;
3017 
3018 	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
3019 		struct socket *so;
3020 		struct ifnet *ifp;
3021 
3022 		if (inp->inp_ppcb == NULL ||
3023 		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
3024 			continue;
3025 		}
3026 		so = inp->inp_socket;
3027 		ifp = inp->inp_last_outifp;
3028 		socket_lock(so, 1);
3029 		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
3030 			socket_unlock(so, 1);
3031 			continue;
3032 		}
3033 		so_check_extended_bk_idle_time(so);
3034 		if (ipi->ipi_flags & INPCBINFO_UPDATE_MSS) {
3035 			tcp_update_mss_locked(so, NULL);
3036 		}
3037 		socket_unlock(so, 1);
3038 
3039 		/*
3040 		 * Defunct all system-initiated background sockets if the
3041 		 * socket is using the cellular interface and the interface
3042 		 * has its LQM set to abort.
3043 		 */
3044 		if ((ipi->ipi_flags & INPCBINFO_HANDLE_LQM_ABORT) &&
3045 		    IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class) &&
3046 		    ifp != NULL && IFNET_IS_CELLULAR(ifp) &&
3047 		    (ifp->if_interface_state.valid_bitmask &
3048 		    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
3049 		    ifp->if_interface_state.lqm_state ==
3050 		    IFNET_LQM_THRESH_ABORT) {
3051 			socket_defunct(current_proc(), so,
3052 			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
3053 		}
3054 	}
3055 
3056 	ipi->ipi_flags &= ~(INPCBINFO_UPDATE_MSS | INPCBINFO_HANDLE_LQM_ABORT);
3057 	lck_rw_done(&ipi->ipi_lock);
3058 }
3059