1 /*
2 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <stdint.h>
71 #define _IP_VHL
72
73 #include "tcp_includes.h"
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/kernel.h>
78 #include <sys/sysctl.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/protosw.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <os/ptrtools.h>
85 #include <kern/clock.h>
86
87 #include <net/route.h>
88 #include <net/ntstat.h>
89 #include <net/if_var.h>
90 #include <net/if.h>
91 #include <net/if_types.h>
92 #include <net/dlil.h>
93 #include <net/droptap.h>
94
95 #include <netinet/in.h>
96 #include <netinet/in_systm.h>
97 #include <netinet/in_var.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/ip_var.h>
102 #include <mach/sdt.h>
103 #include <netinet6/in6_pcb.h>
104 #include <netinet/ip6.h>
105 #include <netinet6/ip6_var.h>
106 #include <netinet/tcp.h>
107 #include <netinet/tcp_cache.h>
108 #include <netinet/tcp_fsm.h>
109 #include <netinet/tcp_seq.h>
110 #include <netinet/tcp_timer.h>
111 #include <netinet/tcp_var.h>
112 #include <netinet/tcpip.h>
113 #include <netinet/tcp_cc.h>
114 #include <netinet/tcp_log.h>
115 #include <sys/kdebug.h>
116 #include <mach/sdt.h>
117
118 #if IPSEC
119 #include <netinet6/ipsec.h>
120 #endif /*IPSEC*/
121
122 #if MPTCP
123 #include <netinet/mptcp_var.h>
124 #include <netinet/mptcp.h>
125 #include <netinet/mptcp_opt.h>
126 #include <netinet/mptcp_seq.h>
127 #endif
128
129 #include <corecrypto/ccaes.h>
130
131 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
132 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
133 #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
134
135 SYSCTL_SKMEM_TCP_INT(OID_AUTO, path_mtu_discovery,
136 CTLFLAG_RW | CTLFLAG_LOCKED, int, path_mtu_discovery, 1,
137 "Enable Path MTU Discovery");
138
139 SYSCTL_SKMEM_TCP_INT(OID_AUTO, local_slowstart_flightsize,
140 CTLFLAG_RW | CTLFLAG_LOCKED, int, ss_fltsz_local, 8,
141 "Slow start flight size for local networks");
142
143 SYSCTL_SKMEM_TCP_INT(OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
144 int, tcp_do_tso, 1, "Enable TCP Segmentation Offload");
145
146 SYSCTL_SKMEM_TCP_INT(OID_AUTO, ecn_setup_percentage,
147 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ecn_setup_percentage, 100,
148 "Max ECN setup percentage");
149
150 SYSCTL_SKMEM_TCP_INT(OID_AUTO, accurate_ecn,
151 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_acc_ecn, 0,
152 "Accurate ECN mode (0: disable, 1: enable Accurate ECN feedback");
153
154 int tcp_l4s_developer = 0;
155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, l4s_developer,
156 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_l4s_developer, 0,
157 "Developer L4S mode (0: system, 1: force enable L4S, 2: force disable L4S");
158
159 SYSCTL_SKMEM_TCP_INT(OID_AUTO, l4s,
160 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_l4s, 0,
161 "System L4S mode (0: disable, 1: enable L4S");
162
163 SYSCTL_SKMEM_TCP_INT(OID_AUTO, link_heuristics_flags,
164 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_link_heuristics_flags, TCP_LINK_HEURISTICS_DEFAULT,
165 "TCP LQM heuristics flags (1:rxmtcomp 2:noackpro 4:synrxmt 8:stealth 0x10:rtomin 0x20:notlp)");
166
167 SYSCTL_SKMEM_TCP_INT(OID_AUTO, link_heuristics_rto_min,
168 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_link_heuristics_rto_min, TCP_DEFAULT_LINK_HEUR_RTOMIN,
169 "");
170
171
172 // TO BE REMOVED
173 SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_ack_compression,
174 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_ack_compression, 1,
175 "Enable TCP ACK compression (on (cell only): 1, off: 0, on (all interfaces): 2)");
176
177 SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_compression_rate,
178 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_compression_rate, TCP_COMP_CHANGE_RATE,
179 "Rate at which we force sending new ACKs (in ms)");
180
181 SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_timestamps,
182 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_randomize_timestamps, 1,
183 "Randomize TCP timestamps to prevent tracking (on: 1, off: 0)");
184
185 static int
186 sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
187 {
188 #pragma unused(oidp, arg1, arg2)
189 int i, err = 0, changed = 0;
190
191 err = sysctl_io_number(req, tcp_ecn, sizeof(int32_t),
192 &i, &changed);
193 if (err != 0 || req->newptr == USER_ADDR_NULL) {
194 return err;
195 }
196
197 if (changed) {
198 tcp_ecn = i;
199 SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn, tcp_ecn);
200 }
201 return err;
202 }
203
204 /* TODO: remove ecn_initiate_out once libnetcore ECN cleanup changes land */
205 int tcp_ecn_outbound = 2;
206 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
207 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
208 sysctl_change_ecn_setting, "IU",
209 "Initiate ECN for outbound connections");
210
211 int tcp_ecn = 1;
212 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn,
213 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn, 0,
214 sysctl_change_ecn_setting, "IU",
215 "ECN system setting (0: disable, 1: enable)");
216
217 SYSCTL_SKMEM_TCP_INT(OID_AUTO, packetchain,
218 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_packet_chaining, 50,
219 "Enable TCP output packet chaining");
220
221 SYSCTL_SKMEM_TCP_INT(OID_AUTO, socket_unlocked_on_output,
222 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_output_unlocked, 1,
223 "Unlock TCP when sending packets down to IP");
224
225 SYSCTL_SKMEM_TCP_INT(OID_AUTO, min_iaj_win,
226 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_min_iaj_win, MIN_IAJ_WIN,
227 "Minimum recv win based on inter-packet arrival jitter");
228
229 SYSCTL_SKMEM_TCP_INT(OID_AUTO, acc_iaj_react_limit,
230 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_acc_iaj_react_limit,
231 ACC_IAJ_REACT_LIMIT, "Accumulated IAJ when receiver starts to react");
232
233 SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufinc,
234 CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_inc,
235 8 * 1024, "Increment in send socket bufffer size");
236
237 SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufmax,
238 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN, uint32_t, tcp_autosndbuf_max, 2 * 1024 * 1024,
239 "Maximum send socket buffer size");
240
241 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_recvbg,
242 CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_use_rtt_recvbg, 1,
243 "Use RTT for bg recv algorithm");
244
245 SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_throttle_minwin,
246 CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_recv_throttle_minwin, 16 * 1024,
247 "Minimum recv win for throttling");
248
249 SYSCTL_SKMEM_TCP_INT(OID_AUTO, enable_tlp,
250 CTLFLAG_RW | CTLFLAG_LOCKED,
251 int32_t, tcp_enable_tlp, 1, "Enable Tail loss probe");
252
253 static int32_t packchain_newlist = 0;
254 static int32_t packchain_looped = 0;
255 static int32_t packchain_sent = 0;
256
257 /* temporary: for testing */
258 #if IPSEC
259 extern int ipsec_bypass;
260 #endif
261
262 extern int slowlink_wsize; /* window correction for slow links */
263
264 extern u_int32_t kipf_count;
265
266 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *,
267 int, struct mbuf *, int, int, boolean_t);
268 static int tcp_recv_throttle(struct tcpcb *tp);
269
270 __attribute__((noinline))
271 static int32_t
tcp_tfo_check(struct tcpcb * tp,int32_t len)272 tcp_tfo_check(struct tcpcb *tp, int32_t len)
273 {
274 struct socket *__single so = tp->t_inpcb->inp_socket;
275 unsigned int optlen = 0;
276 unsigned int cookie_len;
277
278 if (tp->t_flags & TF_NOOPT) {
279 goto fallback;
280 }
281
282 if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
283 !tcp_heuristic_do_tfo(tp)) {
284 tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE;
285 tcpstat.tcps_tfo_heuristics_disable++;
286 goto fallback;
287 }
288
289 if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
290 return len;
291 }
292
293 optlen += TCPOLEN_MAXSEG;
294
295 if (tp->t_flags & TF_REQ_SCALE) {
296 optlen += 4;
297 }
298
299 #if MPTCP
300 if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
301 (tp->t_rxtshift <= mptcp_mpcap_retries ||
302 (tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE))) {
303 optlen += sizeof(struct mptcp_mpcapable_opt_common) + sizeof(mptcp_key_t);
304 }
305 #endif /* MPTCP */
306
307 if (tp->t_flags & TF_REQ_TSTMP) {
308 optlen += TCPOLEN_TSTAMP_APPA;
309 }
310
311 if (SACK_ENABLED(tp)) {
312 optlen += TCPOLEN_SACK_PERMITTED;
313 }
314
315 /* Now, decide whether to use TFO or not */
316
317 /* Don't even bother trying if there is no space at all... */
318 if (MAX_TCPOPTLEN - optlen < TCPOLEN_FASTOPEN_REQ) {
319 goto fallback;
320 }
321
322 cookie_len = tcp_cache_get_cookie_len(tp);
323 if (cookie_len == 0) {
324 /* No cookie, so we request one */
325 return 0;
326 }
327
328 /* There is not enough space for the cookie, so we cannot do TFO */
329 if (MAX_TCPOPTLEN - optlen < cookie_len) {
330 goto fallback;
331 }
332
333 /* Do not send SYN+data if there is more in the queue than MSS */
334 if (so->so_snd.sb_cc > (tp->t_maxopd - MAX_TCPOPTLEN)) {
335 goto fallback;
336 }
337
338 /* Ok, everything looks good. We can go on and do TFO */
339 return len;
340
341 fallback:
342 tcp_disable_tfo(tp);
343 return 0;
344 }
345
346 /* Returns the number of bytes written to the TCP option-space */
347 __attribute__((noinline))
348 static unsigned int
349 tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned int optlen,
350 u_char *__counted_by(optlen + 2 + TFO_COOKIE_LEN_DEFAULT) opt)
351 {
352 u_char out[CCAES_BLOCK_SIZE];
353 unsigned ret = 0;
354 u_char *bp;
355
356 if (MAX_TCPOPTLEN - optlen <
357 TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT) {
358 return ret;
359 }
360
361 tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
362
363 bp = opt + optlen;
364
365 *bp++ = TCPOPT_FASTOPEN;
366 *bp++ = 2 + TFO_COOKIE_LEN_DEFAULT;
367 memcpy(bp, out, TFO_COOKIE_LEN_DEFAULT);
368 ret += 2 + TFO_COOKIE_LEN_DEFAULT;
369
370 tp->t_tfo_stats |= TFO_S_COOKIE_SENT;
371 tcpstat.tcps_tfo_cookie_sent++;
372
373 return ret;
374 }
375
376 __attribute__((noinline))
377 static unsigned int
tcp_tfo_write_cookie(struct tcpcb * tp,unsigned int optlen,int32_t len,u_char * __counted_by (TCP_MAXOLEN)opt)378 tcp_tfo_write_cookie(struct tcpcb *tp, unsigned int optlen, int32_t len,
379 u_char *__counted_by(TCP_MAXOLEN) opt)
380 {
381 uint8_t tfo_len;
382 struct socket *__single so = tp->t_inpcb->inp_socket;
383 unsigned ret = 0;
384 int res;
385 u_char *bp;
386
387 if (TCPOLEN_FASTOPEN_REQ > MAX_TCPOPTLEN - optlen) {
388 return 0;
389 }
390 tfo_len = (uint8_t)(MAX_TCPOPTLEN - optlen - TCPOLEN_FASTOPEN_REQ);
391
392 if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
393 /* If there is some data, let's track it */
394 if (len > 0) {
395 tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
396 tcpstat.tcps_tfo_syn_data_sent++;
397 }
398
399 return 0;
400 }
401
402 bp = opt + optlen;
403
404 /*
405 * The cookie will be copied in the appropriate place within the
406 * TCP-option space. That way we avoid the need for an intermediate
407 * variable.
408 */
409 res = tcp_cache_get_cookie(tp, bp + TCPOLEN_FASTOPEN_REQ, tfo_len, &tfo_len);
410 if (res == 0) {
411 *bp++ = TCPOPT_FASTOPEN;
412 *bp++ = TCPOLEN_FASTOPEN_REQ;
413 ret += TCPOLEN_FASTOPEN_REQ;
414
415 tp->t_tfo_flags |= TFO_F_COOKIE_REQ;
416
417 tp->t_tfo_stats |= TFO_S_COOKIE_REQ;
418 tcpstat.tcps_tfo_cookie_req++;
419 } else {
420 *bp++ = TCPOPT_FASTOPEN;
421 *bp++ = TCPOLEN_FASTOPEN_REQ + tfo_len;
422
423 ret += TCPOLEN_FASTOPEN_REQ + tfo_len;
424
425 tp->t_tfo_flags |= TFO_F_COOKIE_SENT;
426
427 /* If there is some data, let's track it */
428 if (len > 0) {
429 tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
430 tcpstat.tcps_tfo_syn_data_sent++;
431 }
432 }
433
434 return ret;
435 }
436
437 static inline bool
tcp_send_ecn_flags_on_syn(struct tcpcb * tp)438 tcp_send_ecn_flags_on_syn(struct tcpcb *tp)
439 {
440 /* We allow Accurate ECN negotiation on first retransmission as well */
441 bool send_on_first_retrans = (tp->ecn_flags & TE_ACE_SETUPSENT) &&
442 (tp->t_rxtshift <= 1);
443
444 return !(tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) || send_on_first_retrans;
445 }
446
447 /*
448 * Returns the RTO deadline as future Mach time value
449 * based on the TCPT_REXMT timer, if set.
450 */
451 static uint64_t
tcp_calculate_rto_deadline(const struct tcpcb * tp,uint32_t local_tcp_now)452 tcp_calculate_rto_deadline(const struct tcpcb *tp, uint32_t local_tcp_now)
453 {
454 uint64_t rto_val = tp->t_timer[TCPT_REXMT];
455 uint64_t rto_deadline_hz;
456 uint64_t rto_offset_hz;
457 uint64_t rto_offset_nano;
458 uint64_t rto_deadline_mach;
459 int sojourn_factor = tcp_rto_sojourn_factor;
460
461 /*
462 * Check whether the TCPT_REXMT timer is set.
463 */
464 if (rto_val == 0) {
465 DTRACE_TCP3(rto_deadline, struct tcpcb *, tp, uint32_t, local_tcp_now, uint64_t, 0);
466 return 0;
467 }
468
469 /*
470 * Check whether the TCPT_REXMT timer has already expired.
471 */
472 rto_deadline_hz = tp->tentry.te_timer_start + rto_val;
473 if (rto_deadline_hz < local_tcp_now) {
474 DTRACE_TCP3(rto_deadline, struct tcpcb *, tp, uint32_t, local_tcp_now, uint64_t, 0);
475 return 0;
476 }
477
478 /*
479 * Sanity check the sojourn factor
480 */
481 if (sojourn_factor < 25) {
482 sojourn_factor = 25;
483 } else if (125 <= sojourn_factor) {
484 sojourn_factor = 125; /* For testing */
485 }
486
487 /*
488 * Convert the retransmit timer to Mach deadline in the future.
489 * This is done in two steps:
490 * 1. Convert the timer value from the TCP RETRANSHZ units to
491 * a wall-clock timestamp (in nanosecond resolution).
492 * 2. Convert the wall-clock timestamp to a Mach timestamp.
493 */
494 rto_offset_hz = rto_deadline_hz - local_tcp_now;
495 rto_offset_hz = (uint64_t)((rto_offset_hz * sojourn_factor) / 100);
496 rto_offset_nano = rto_offset_hz * TCP_RETRANSHZ_TO_USEC * NSEC_PER_USEC;
497 nanoseconds_to_deadline(rto_offset_nano, &rto_deadline_mach);
498
499 DTRACE_TCP3(rto_deadline, struct tcpcb *, tp, uint32_t, local_tcp_now, uint64_t, rto_deadline_mach);
500
501 return rto_deadline_mach;
502 }
503
504
505 void
tcp_set_l4s(struct tcpcb * tp,struct ifnet * ifp)506 tcp_set_l4s(struct tcpcb *tp, struct ifnet *ifp)
507 {
508 if (tp->t_state >= TCPS_ESTABLISHED) {
509 return;
510 }
511
512 /*
513 * L4S is enabled if,
514 * 1. It is not disabled explicitly by developer or interface setting or tcp options
515 * 2. It is enabled either by developer or interface setting or A/B deployment or tcp_options,
516 * It implicitly enables Accurate ECN which supports ACE and AccECN option for ECN feedback
517 */
518 bool l4s_disabled = (tcp_l4s_developer == tcp_l4s_developer_disable ||
519 (ifp != NULL && ifp->if_l4s_mode == IFRTYPE_L4S_DISABLE) ||
520 (tp->t_flagsext & TF_L4S_DISABLED) == 1);
521
522 tp->l4s_enabled = !l4s_disabled && (tcp_l4s_developer == tcp_l4s_developer_enable ||
523 (ifp != NULL && ifp->if_l4s_mode == IFRTYPE_L4S_ENABLE) || tcp_l4s == 1 ||
524 ((tp->t_flagsext & TF_L4S_ENABLED)));
525 }
526
527 void
tcp_set_accurate_ecn(struct tcpcb * tp)528 tcp_set_accurate_ecn(struct tcpcb *tp)
529 {
530 if ((tp->ecn_flags & TE_ACC_ECN_ON) == TE_ACC_ECN_ON) {
531 tp->accurate_ecn_on = true;
532 } else {
533 tp->accurate_ecn_on = false;
534 }
535 }
536
537 void
tcp_set_ecn(struct tcpcb * tp)538 tcp_set_ecn(struct tcpcb *tp)
539 {
540 bool ecn_enabled = tcp_ecn_enabled(tp->ecn_flags);
541
542 if (!ecn_enabled || !tcp_heuristic_do_ecn(tp)) {
543 tp->ecn_flags &= ~TE_ENABLE_ECN;
544 return;
545 }
546
547 /* ECN is enabled based on system settings */
548 tp->ecn_flags |= TE_ENABLE_ECN;
549
550 if (tp->l4s_enabled) {
551 /* Set the accurate ECN state */
552 if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_disabled) {
553 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_enabled;
554 }
555 if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_feature_disabled) {
556 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_enabled;
557 }
558 }
559 }
560
561 bool
tcp_ecn_enabled(uint32_t ecn_flags)562 tcp_ecn_enabled(uint32_t ecn_flags)
563 {
564 /*
565 * Socket option has precedence
566 */
567 if (ecn_flags & TE_ECN_MODE_ENABLE) {
568 return true;
569 }
570 if (ecn_flags & TE_ECN_MODE_DISABLE) {
571 return false;
572 }
573
574 /*
575 * System wide settings come last
576 */
577 if (tcp_ecn == 1) {
578 return true;
579 } else {
580 return false;
581 }
582 }
583
584 uint32_t
tcp_flight_size(struct tcpcb * tp)585 tcp_flight_size(struct tcpcb *tp)
586 {
587 int ret;
588
589 VERIFY(tp->sackhint.sack_bytes_acked >= 0);
590 VERIFY(tp->sackhint.sack_bytes_rexmit >= 0);
591
592 /*
593 * RFC6675, SetPipe (), SACK'd bytes are discounted. All the rest is still in-flight.
594 */
595 ret = tp->snd_nxt - tp->snd_una - tp->sackhint.sack_bytes_acked;
596
597 if (TCP_RACK_ENABLED(tp)) {
598 /* In flight is bytes sent - bytes that left the network + bytes retransmitted */
599 const uint32_t bytes_sent = SEQ_MAX(tp->snd_max, tp->snd_nxt) - tp->snd_una;
600 const uint32_t bytes_not_in_flight = tp->bytes_sacked + tp->bytes_lost;
601 ret = bytes_sent - bytes_not_in_flight + tp->bytes_retransmitted;
602 }
603
604 if (ret < 0) {
605 /* It shouldn't happen when RACK is enabled */
606 if (TCP_RACK_ENABLED(tp)) {
607 os_log_error(OS_LOG_DEFAULT, "flight_size (%d) can't be negative "
608 "(snd_nxt:%u snd_max:%u, snd_una:%u, sacked:%u lost:%u retransmitted:%u)",
609 ret, tp->snd_nxt, tp->snd_max, tp->snd_una,
610 tp->bytes_sacked, tp->bytes_lost, tp->bytes_retransmitted);
611 }
612 /*
613 * This happens when the RTO-timer fires because snd_nxt gets artificially
614 * decreased. If we then receive some SACK-blogs, sack_bytes_acked is
615 * going to be high.
616 */
617 ret = 0;
618 }
619
620 return ret;
621 }
622
/*
 * Append an Accurate ECN (AccECN) TCP option to the option space.
 *
 * Either of ECT0 or ECT1 flag should be set
 * when this function is called
 *
 * tp:     connection whose received ECT(1)/ECT(0)/CE byte counters
 *         (masked with TCP_ACO_MASK) are reported.
 * flags:  TCP header flags of the segment being built.
 * lp:     location where the option is written.
 * optlen: in/out; option bytes used so far. It is increased by the
 *         number of bytes added here (option plus NOP padding) and is
 *         left unchanged when no option is written.
 *
 * How many counters are carried depends on the space left
 * (TCP_MAXOLEN - *optlen): none (empty option, SYN segments only),
 * one, two or all three. The option kind and counter order are chosen
 * by TE_ACO_ECT1: TCPOPT_ACCECN1 starts with the ECT(1) counter,
 * TCPOPT_ACCECN0 with the ECT(0) counter.
 */
static void
tcp_add_accecn_option(struct tcpcb *tp, uint16_t flags, uint32_t *__indexable lp, uint8_t *optlen)
{
	uint8_t max_len = TCP_MAXOLEN - *optlen;
	uint8_t len = TCPOLEN_ACCECN_EMPTY;

	uint32_t e1b = (uint32_t)(tp->t_aecn.t_rcv_ect1_bytes & TCP_ACO_MASK);
	uint32_t e0b = (uint32_t)(tp->t_aecn.t_rcv_ect0_bytes & TCP_ACO_MASK);
	uint32_t ceb = (uint32_t)(tp->t_aecn.t_rcv_ce_bytes & TCP_ACO_MASK);

	if (max_len < TCPOLEN_ACCECN_EMPTY) {
		TCP_LOG(tp, "not enough space to add any AccECN option");
		return;
	}

	if (!(flags & TH_SYN || (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) ||
	    tp->snd_una == tp->iss + 1 ||
	    tp->ecn_flags & (TE_ACO_ECT1 | TE_ACO_ECT0))) {
		/*
		 * Since this is neither a SYN-ACK packet, nor the final ACK of
		 * the 3WHS (nor the first acked data segment) nor any of the ECT byte
		 * counter flags are set, no need to send the option.
		 */
		return;
	}

	if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
	    tp->t_rxtshift >= 1) {
		/*
		 * If this is a SYN-ACK retransmission (first),
		 * retry without AccECN option and just with ACE fields.
		 * From second retransmission onwards, we don't send any
		 * Accurate ECN state.
		 */
		return;
	}

	if (max_len < (TCPOLEN_ACCECN_EMPTY + 1 * TCPOLEN_ACCECN_COUNTER)) {
		/* Can carry EMPTY option (2 bytes) which can be used to test path in SYN-ACK packet */
		if (flags & TH_SYN) {
			/*
			 * The cast applies to the value of lp++, so a 16-bit
			 * store (kind, length) is done at the current position.
			 */
			*(uint16_t *)lp++ = htons((TCPOPT_ACCECN1 << 8) | len);
			*optlen += len;
		}
	} else if (max_len < (TCPOLEN_ACCECN_EMPTY + 2 * TCPOLEN_ACCECN_COUNTER)) {
		/* Can carry one option */
		len += 1 * TCPOLEN_ACCECN_COUNTER;
		/*
		 * First word: kind, length and the upper 16 bits of the counter.
		 * Following 16-bit store: the low counter byte and one NOP.
		 */
		if (tp->ecn_flags & TE_ACO_ECT1) {
			*lp++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff));
			*(uint16_t *)lp++ = htons((uint16_t)((e1b & 0xff) << 8) | TCPOPT_NOP);
		} else {
			*lp++ = htonl((TCPOPT_ACCECN0 << 24) | (len << 16) | ((e0b >> 8) & 0xffff));
			*(uint16_t *)lp++ = htons((uint16_t)((e0b & 0xff) << 8) | TCPOPT_NOP);
		}
		*optlen += len + 1; /* 1 NOPs */
	} else if (max_len < (TCPOLEN_ACCECN_EMPTY + 3 * TCPOLEN_ACCECN_COUNTER)) {
		/* Can carry two options */
		len += 2 * TCPOLEN_ACCECN_COUNTER;
		/*
		 * Second word: low byte of the first counter followed by the
		 * 24-bit CE byte counter.
		 */
		if (tp->ecn_flags & TE_ACO_ECT1) {
			*lp++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff));
			*lp++ = htonl(((e1b & 0xff) << 24) | (ceb & 0xffffff));
		} else {
			*lp++ = htonl((TCPOPT_ACCECN0 << 24) | (len << 16) | ((e0b >> 8) & 0xffff));
			*lp++ = htonl(((e0b & 0xff) << 24) | (ceb & 0xffffff));
		}
		*optlen += len; /* 0 NOPs */
	} else {
		/*
		 * TCP option sufficient to hold full AccECN option
		 * but send counter that changed during the entire connection.
		 */
		len += 3 * TCPOLEN_ACCECN_COUNTER;
		/* Can carry all three options */
		/* Third word: the remaining 24-bit counter followed by one NOP */
		if (tp->ecn_flags & TE_ACO_ECT1) {
			*lp++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff));
			*lp++ = htonl(((e1b & 0xff) << 24) | (ceb & 0xffffff));
			*lp++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP);
		} else {
			*lp++ = htonl((TCPOPT_ACCECN0 << 24) | (len << 16) | ((e0b >> 8) & 0xffff));
			*lp++ = htonl(((e0b & 0xff) << 24) | (ceb & 0xffffff));
			*lp++ = htonl(((e1b & 0xffffff) << 8) | TCPOPT_NOP);
		}
		*optlen += len + 1; /* 1 NOP */
	}
}
711
/*
 * Insert TCP options according to the supplied parameters to the place
 * optp in a consistent way. Can handle unaligned destinations.
 *
 * The order of the option processing is crucial for optimal packing and
 * alignment for the scarce option space.
 *
 * The optimal order for a SYN/SYN-ACK segment is:
 *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
 *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
 *
 * The SACK options should be last. SACK blocks consume 8*n+2 bytes.
 * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
 * At minimum we need 10 bytes (to generate 1 SACK block). If both
 * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
 * we only have 10 bytes for SACK options (40 - (12 + 18)).
 *
 * to:     options to emit; to_flags selects them. to_mss, to_tsval and
 *         to_tsecr are converted to network byte order in place.
 * optp:   destination for the encoded options.
 * optend: end of the destination buffer (bound of optp).
 *
 * Returns the number of option bytes written, padded to a multiple of
 * four and asserted not to exceed TCP_MAXOLEN.
 */
uint8_t
tcp_addoptions(struct tcpopt *to, u_char * __ended_by(optend) optp, u_char * optend)
{
	uint32_t mask;
	uint8_t optlen = 0;

	/* Walk the option flags one bit at a time, lowest bit first */
	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
		if ((to->to_flags & mask) != mask) {
			continue;
		}
		/* Option space is completely used up */
		if (optlen == TCP_MAXOLEN) {
			break;
		}
		/*
		 * In every case below the alignment NOPs are written before
		 * the space check, and `continue' moves on to the next flag
		 * of the enclosing for loop when the option does not fit.
		 *
		 * NOTE(review): the `optend = optend;' self-assignments
		 * presumably keep the __ended_by(optend) bound paired with
		 * each optp update for the bounds-safety checker -- confirm.
		 */
		switch (to->to_flags & mask) {
		case TOF_MSS:
			/* Start the MSS option on a 4-byte boundary */
			while (optlen % 4) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) {
				continue;
			}
			optlen += TCPOLEN_MAXSEG;
			*optp++ = TCPOPT_MAXSEG;
			*optp++ = TCPOLEN_MAXSEG;
			to->to_mss = htons(to->to_mss);
			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
			optp += sizeof(to->to_mss);
			optend = optend;
			break;
		case TOF_SCALE:
			/* Pad until the offset is odd (at least one NOP at offset 0) */
			while (!optlen || optlen % 2 != 1) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) {
				continue;
			}
			optlen += TCPOLEN_WINDOW;
			*optp++ = TCPOPT_WINDOW;
			*optp++ = TCPOLEN_WINDOW;
			*optp++ = to->to_wscale;
			break;
		case TOF_SACKPERM:
			/* Start SACK-permitted on an even offset */
			while (optlen % 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) {
				continue;
			}
			optlen += TCPOLEN_SACK_PERMITTED;
			*optp++ = TCPOPT_SACK_PERMITTED;
			*optp++ = TCPOLEN_SACK_PERMITTED;
			break;
		case TOF_TS:
			/* Pad until the offset is 2 modulo 4 */
			while (!optlen || optlen % 4 != 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) {
				continue;
			}
			optlen += TCPOLEN_TIMESTAMP;
			*optp++ = TCPOPT_TIMESTAMP;
			*optp++ = TCPOLEN_TIMESTAMP;
			to->to_tsval = htonl(to->to_tsval);
			to->to_tsecr = htonl(to->to_tsecr);
			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
			optp += sizeof(to->to_tsval);
			optend = optend;
			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
			optp += sizeof(to->to_tsecr);
			optend = optend;
			break;
		case TOF_SACK:
		{
			int sackblks = 0;
			struct sackblk *sack = (struct sackblk *)(void *)to->to_sacks;
			tcp_seq sack_seq;

			/* Pad until the offset is 2 modulo 4 */
			while (!optlen || optlen % 4 != 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			/* Room for the header plus at least one block is required */
			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) {
				continue;
			}
			optlen += TCPOLEN_SACKHDR;
			*optp++ = TCPOPT_SACK;
			/* Emit as many of the requested blocks as still fit */
			sackblks = min(to->to_nsacks,
			    (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
			*optp++ = TCPOLEN_SACKHDR + (uint8_t)sackblks * TCPOLEN_SACK;
			while (sackblks--) {
				/* Each block is its start and end in network byte order */
				sack_seq = htonl(sack->start);
				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
				optp += sizeof(sack_seq);
				optend = optend;
				sack_seq = htonl(sack->end);
				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
				optp += sizeof(sack_seq);
				optend = optend;
				optlen += TCPOLEN_SACK;
				sack++;
			}
			tcpstat.tcps_sack_send_blocks++;
			break;
		}
		default:
			/* SYN cookies are disabled when TFO is used */
			break;
		}
	}

	/* Terminate and pad TCP options to a 4 byte boundary. */
	if (optlen % 4) {
		optlen += TCPOLEN_EOL;
		*optp++ = TCPOPT_EOL;
	}
	/*
	 * According to RFC 793 (STD0007):
	 *   "The content of the header beyond the End-of-Option option
	 *    must be header padding (i.e., zero)."
	 *   and later: "The padding is composed of zeros."
	 *
	 * The padding bytes written here are TCPOPT_EOL as well.
	 */
	while (optlen % 4) {
		optlen += TCPOLEN_EOL;
		*optp++ = TCPOPT_EOL;
	}

	ASSERT(optlen <= TCP_MAXOLEN);
	return optlen;
}
862 /*
863 * Set up the ECN information for the <SYN,ACK> from
864 * client SYN information.
865 */
866 static uint16_t
tcp_accecn_synack_respond(struct tcpcb * tp,uint16_t thflags)867 tcp_accecn_synack_respond(struct tcpcb * tp, uint16_t thflags)
868 {
869 /* Server received either legacy or Accurate ECN setup SYN */
870 if (tp->ecn_flags & (TE_SETUPRECEIVED | TE_ACE_SETUPRECEIVED)) {
871 if (tcp_send_ecn_flags_on_syn(tp)) {
872 if (tp->l4s_enabled && (tp->ecn_flags & TE_ACE_SETUPRECEIVED)) {
873 /*
874 * Accurate ECN mode is on. Initialize packet and byte counters
875 * for the server sending SYN-ACK. Although s_cep will be initialized
876 * during input processing of ACK of SYN-ACK, initialize here as well
877 * in case ACK gets lost.
878 *
879 * Non-zero initial values are used to
880 * support a stateless handshake (see
881 * Section 5.1 of AccECN draft) and to be
882 * distinct from cases where the fields
883 * are incorrectly zeroed.
884 */
885 tp->t_aecn.t_rcv_ce_packets = 5;
886 tp->t_aecn.t_snd_ce_packets = 5;
887
888 /* Initialize CE byte counter to 0 */
889 tp->t_aecn.t_rcv_ce_bytes = tp->t_aecn.t_snd_ce_bytes = 0;
890
891 if (tp->ecn_flags & TE_ACE_SETUP_NON_ECT) {
892 tp->t_prev_ace_flags = TH_CWR;
893 thflags |= tp->t_prev_ace_flags;
894 /* Remove the setup flag as it is also used for final ACK */
895 tp->ecn_flags &= ~TE_ACE_SETUP_NON_ECT;
896 tcpstat.tcps_ecn_ace_syn_not_ect++;
897 } else if (tp->ecn_flags & TE_ACE_SETUP_ECT1) {
898 tp->t_prev_ace_flags = (TH_CWR | TH_ECE);
899 thflags |= tp->t_prev_ace_flags;
900 tp->ecn_flags &= ~TE_ACE_SETUP_ECT1;
901 tcpstat.tcps_ecn_ace_syn_ect1++;
902 } else if (tp->ecn_flags & TE_ACE_SETUP_ECT0) {
903 tp->t_prev_ace_flags = TH_AE;
904 thflags |= tp->t_prev_ace_flags;
905 tp->ecn_flags &= ~TE_ACE_SETUP_ECT0;
906 tcpstat.tcps_ecn_ace_syn_ect0++;
907 } else if (tp->ecn_flags & TE_ACE_SETUP_CE) {
908 tp->t_prev_ace_flags = (TH_AE | TH_CWR);
909 thflags |= tp->t_prev_ace_flags;
910 tp->ecn_flags &= ~TE_ACE_SETUP_CE;
911 /*
912 * Receive counter is updated on
913 * all acceptable packets except
914 * CE on SYN packets (SYN=1, ACK=0)
915 */
916 tcpstat.tcps_ecn_ace_syn_ce++;
917 } else {
918 if (tp->t_prev_ace_flags != 0) {
919 /* Set the flags for retransmitted SYN-ACK same as the previous one */
920 thflags |= tp->t_prev_ace_flags;
921 } else {
922 /* We shouldn't come here */
923 panic("ECN flags (0x%x) not set correctly", tp->ecn_flags);
924 }
925 }
926 /*
927 * We now send ECT1 packets when
928 * L4S and Accurate ECN mode is on
929 */
930 tp->ecn_flags |= TE_ACE_SETUPSENT;
931 if (tp->l4s_enabled) {
932 tp->ecn_flags |= TE_SENDIPECT;
933 tcp_set_accurate_ecn(tp);
934 }
935 } else if (tp->ecn_flags & TE_SETUPRECEIVED) {
936 /*
937 * Setting TH_ECE makes this an ECN-setup
938 * SYN-ACK
939 */
940 thflags |= TH_ECE;
941 /*
942 * Record that we sent the ECN-setup and
943 * default to setting IP ECT.
944 */
945 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
946 }
947 tcpstat.tcps_ecn_server_setup++;
948 tcpstat.tcps_ecn_server_success++;
949 } else {
950 /*
951 * For classic ECN, we sent an ECN-setup SYN-ACK but it was
952 * dropped. Fallback to non-ECN-setup
953 * SYN-ACK and clear flag to indicate that
954 * we should not send data with IP ECT set
955 *
956 * Pretend we didn't receive an
957 * ECN-setup SYN.
958 *
959 * We already incremented the counter
960 * assuming that the ECN setup will
961 * succeed. Decrementing here
962 * tcps_ecn_server_success to correct it.
963 *
964 * For Accurate ECN, we don't yet remove TE_ACE_SETUPRECEIVED
965 * as the client might have received Accurate ECN SYN-ACK.
966 * We decide Accurate ECN's state on processing last ACK from the client.
967 */
968 if (tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) {
969 tcpstat.tcps_ecn_lost_synack++;
970 tcpstat.tcps_ecn_server_success--;
971 tp->ecn_flags |= TE_LOST_SYNACK;
972 }
973 if (!tp->l4s_enabled) {
974 /* Do this only for classic ECN. */
975 tp->ecn_flags &=
976 ~(TE_SETUPRECEIVED | TE_SENDIPECT |
977 TE_SENDCWR);
978 }
979 }
980 }
981 return thflags;
982 }
983
984 /*
985 * TCP output routine: figure out what should be sent and send it.
986 *
987 * Returns: 0 Success
988 * EADDRNOTAVAIL
989 * ENOBUFS
990 * EMSGSIZE
991 * EHOSTUNREACH
992 * ENETDOWN
993 * ip_output_list:ENOMEM
994 * ip_output_list:EADDRNOTAVAIL
995 * ip_output_list:ENETUNREACH
996 * ip_output_list:EHOSTUNREACH
997 * ip_output_list:EACCES
998 * ip_output_list:EMSGSIZE
999 * ip_output_list:ENOBUFS
1000 * ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
1001 * ip6_output_list:EINVAL
1002 * ip6_output_list:EOPNOTSUPP
1003 * ip6_output_list:EHOSTUNREACH
1004 * ip6_output_list:EADDRNOTAVAIL
1005 * ip6_output_list:ENETUNREACH
1006 * ip6_output_list:EMSGSIZE
1007 * ip6_output_list:ENOBUFS
1008 * ip6_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
1009 */
1010 int
tcp_output(struct tcpcb * tp)1011 tcp_output(struct tcpcb *tp)
1012 {
1013 uint32_t tcp_now_local = os_access_once(tcp_now);
1014 uint32_t *tsvalptr;
1015 uint64_t pacing_tx_time;
1016 struct inpcb *__single inp = tp->t_inpcb;
1017 struct socket *__single so = inp->inp_socket;
1018 int32_t len, recwin, sendwin, off;
1019 uint32_t max_len = 0;
1020 uint16_t flags;
1021 int error;
1022 mbuf_ref_t m;
1023 struct ip *ip = NULL;
1024 struct ip6_hdr *ip6 = NULL;
1025 struct tcphdr *th;
1026 u_char opt[TCP_MAXOLEN];
1027 unsigned int ipoptlen, optlen, hdrlen;
1028 int idle, sendalot, lost = 0;
1029 int sendalot_cnt = 0;
1030 int i, rack_sack_rxmit = 0;
1031 int tso = 0;
1032 int sack_bytes_rxmt;
1033 tcp_seq old_snd_nxt = 0;
1034 struct sackhole *p;
1035 struct tcp_seg_sent *seg;
1036 #if IPSEC
1037 size_t ipsec_optlen = 0;
1038 #endif /* IPSEC */
1039 int idle_time = 0;
1040 struct mbuf *__single packetlist = NULL;
1041 struct mbuf *__single tp_inp_options = inp->inp_depend4.inp4_options;
1042 int isipv6 = inp->inp_vflag & INP_IPV6;
1043 int packchain_listadd = 0;
1044 int so_options = so->so_options;
1045 rtentry_ref_t rt;
1046 u_int32_t svc_flags = 0, allocated_len;
1047 #if MPTCP
1048 boolean_t mptcp_acknow;
1049 #endif /* MPTCP */
1050 stats_functional_type ifnet_count_type = stats_functional_type_unclassified;
1051 int sotc = so->so_traffic_class;
1052 boolean_t do_not_compress = FALSE;
1053 bool sack_rescue_rxt = false;
1054 bool sack_rxmted = false;
1055 bool link_heuristics_enabled = false;
1056
1057 struct ifnet *outifp = inp != NULL ? inp->inp_last_outifp : NULL;
1058
1059 /*
1060 * Determine length of data that should be transmitted,
1061 * and flags that will be used.
1062 * If there is some data or critical controls (SYN, RST)
1063 * to send, then transmit; otherwise, investigate further.
1064 */
1065 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
1066
1067 /* Since idle_time is a signed integer, the following integer subtraction
1068 * will take care of wrap around of tcp_now
1069 */
1070 idle_time = tcp_now_local - tp->t_rcvtime;
1071 if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
1072 if (CC_ALGO(tp)->after_idle != NULL &&
1073 ((tp->tcp_cc_index != TCP_CC_ALGO_CUBIC_INDEX &&
1074 tp->tcp_cc_index != TCP_CC_ALGO_PRAGUE_INDEX) ||
1075 idle_time >= TCP_CC_CWND_NONVALIDATED_PERIOD)) {
1076 CC_ALGO(tp)->after_idle(tp);
1077 tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
1078 }
1079
1080 /*
1081 * Do some other tasks that need to be done after
1082 * idle time
1083 */
1084 if (!SLIST_EMPTY(&tp->t_rxt_segments)) {
1085 tcp_rxtseg_clean(tp);
1086 }
1087
1088 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
1089 }
1090 tp->t_flags &= ~TF_LASTIDLE;
1091 if (idle) {
1092 if (tp->t_flags & TF_MORETOCOME) {
1093 tp->t_flags |= TF_LASTIDLE;
1094 idle = 0;
1095 }
1096 }
1097 #if MPTCP
1098 if (tp->t_mpflags & TMPF_RESET) {
1099 tcp_check_timer_state(tp);
1100 /*
1101 * Once a RST has been sent for an MPTCP subflow,
1102 * the subflow socket stays around until deleted.
1103 * No packets such as FINs must be sent after RST.
1104 */
1105 return 0;
1106 }
1107 #endif /* MPTCP */
1108
1109 link_heuristics_enabled = if_link_heuristics_enabled(outifp);
1110
1111 again:
1112 tcp_now_local = os_access_once(tcp_now);
1113 pacing_tx_time = 0;
1114 tsvalptr = NULL;
1115 #if MPTCP
1116 mptcp_acknow = FALSE;
1117
1118 if (so->so_flags & SOF_MP_SUBFLOW && SEQ_LT(tp->snd_nxt, tp->snd_una)) {
1119 os_log_error(mptcp_log_handle, "%s - %lx: snd_nxt is %u and snd_una is %u, cnt %d\n",
1120 __func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte),
1121 tp->snd_nxt, tp->snd_una, sendalot_cnt);
1122 }
1123 #endif
1124 do_not_compress = FALSE;
1125 sendalot_cnt++;
1126
1127 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1128
1129 if (isipv6) {
1130 KERNEL_DEBUG(DBG_LAYER_BEG,
1131 ((inp->inp_fport << 16) | inp->inp_lport),
1132 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
1133 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
1134 sendalot, 0, 0);
1135 } else {
1136 KERNEL_DEBUG(DBG_LAYER_BEG,
1137 ((inp->inp_fport << 16) | inp->inp_lport),
1138 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
1139 (inp->inp_faddr.s_addr & 0xffff)),
1140 sendalot, 0, 0);
1141 }
1142 /*
1143 * If the route generation id changed, we need to check that our
1144 * local (source) IP address is still valid. If it isn't either
1145 * return error or silently do nothing (assuming the address will
1146 * come back before the TCP connection times out).
1147 */
1148 rt = inp->inp_route.ro_rt;
1149 if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
1150 struct ifnet *ifp;
1151 struct in_ifaddr *ia = NULL;
1152 struct in6_ifaddr *ia6 = NULL;
1153 int found_srcaddr = 0;
1154
1155 /* disable multipages at the socket */
1156 somultipages(so, FALSE);
1157
1158 /* Disable TSO for the socket until we know more */
1159 tp->t_flags &= ~TF_TSO;
1160
1161 soif2kcl(so, FALSE);
1162
1163 if (isipv6) {
1164 ia6 = ifa_foraddr6(&inp->in6p_laddr);
1165 if (ia6 != NULL) {
1166 found_srcaddr = 1;
1167 }
1168 } else {
1169 ia = ifa_foraddr(inp->inp_laddr.s_addr);
1170 if (ia != NULL) {
1171 found_srcaddr = 1;
1172 }
1173 }
1174
1175 /* check that the source address is still valid */
1176 if (found_srcaddr == 0) {
1177 soevent(so,
1178 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR));
1179
1180 if (tp->t_state >= TCPS_CLOSE_WAIT) {
1181 tcp_drop(tp, EADDRNOTAVAIL);
1182 return EADDRNOTAVAIL;
1183 }
1184
1185 /*
1186 * Set retransmit timer if it wasn't set,
1187 * reset Persist timer and shift register as the
1188 * advertised peer window may not be valid anymore
1189 */
1190 if (tp->t_timer[TCPT_REXMT] == 0) {
1191 tcp_set_rto(tp);
1192 if (tp->t_timer[TCPT_PERSIST] != 0) {
1193 tp->t_timer[TCPT_PERSIST] = 0;
1194 tp->t_persist_stop = 0;
1195 TCP_RESET_REXMT_STATE(tp);
1196 }
1197 }
1198
1199 if (tp->t_pktlist_head != NULL) {
1200 m_drop_list(tp->t_pktlist_head, NULL, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_SRC_ADDR_NOT_AVAIL, NULL, 0);
1201 }
1202 TCP_PKTLIST_CLEAR(tp);
1203
1204 /* drop connection if source address isn't available */
1205 if (so->so_flags & SOF_NOADDRAVAIL) {
1206 tcp_drop(tp, EADDRNOTAVAIL);
1207 return EADDRNOTAVAIL;
1208 } else {
1209 TCP_LOG_OUTPUT(tp, "no source address silently ignored");
1210 tcp_check_timer_state(tp);
1211 return 0; /* silently ignore, keep data in socket: address may be back */
1212 }
1213 }
1214 if (ia != NULL) {
1215 ifa_remref(&ia->ia_ifa);
1216 }
1217
1218 if (ia6 != NULL) {
1219 ifa_remref(&ia6->ia_ifa);
1220 }
1221
1222 /*
1223 * Address is still valid; check for multipages capability
1224 * again in case the outgoing interface has changed.
1225 */
1226 RT_LOCK(rt);
1227 if ((ifp = rt->rt_ifp) != NULL) {
1228 somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
1229 tcp_set_tso(tp, ifp);
1230 soif2kcl(so, (ifp->if_eflags & IFEF_2KCL));
1231 /* Don't do ECN for Loopback & Cellular */
1232 if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0 && !IFNET_IS_CELLULAR(ifp)) {
1233 tcp_set_ecn(tp);
1234 }
1235
1236 /*
1237 * If the route changes, we cannot use the link heuristics
1238 * based on the previous outgoing interface
1239 */
1240 if (rt->rt_ifp != tp->t_inpcb->inp_last_outifp) {
1241 link_heuristics_enabled = false;
1242 tp->t_comp_rxmt_gencnt = 0;
1243 }
1244 }
1245 if (rt->rt_flags & RTF_UP) {
1246 RT_GENID_SYNC(rt);
1247 }
1248 /*
1249 * See if we should do MTU discovery. Don't do it if:
1250 * 1) it is disabled via the sysctl
1251 * 2) the route isn't up
1252 * 3) the MTU is locked (if it is, then discovery
1253 * has been disabled)
1254 */
1255
1256 if (!path_mtu_discovery || ((rt != NULL) &&
1257 (!(rt->rt_flags & RTF_UP) ||
1258 (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
1259 tp->t_flags &= ~TF_PMTUD;
1260 } else {
1261 tp->t_flags |= TF_PMTUD;
1262 }
1263
1264 RT_UNLOCK(rt);
1265 }
1266
1267 if (rt != NULL) {
1268 ifnet_count_type = IFNET_COUNT_TYPE(rt->rt_ifp);
1269 }
1270
1271 /*
1272 * If we've recently taken a timeout, snd_max will be greater than
1273 * snd_nxt. There may be SACK information that allows us to avoid
1274 * resending already delivered data. Adjust snd_nxt accordingly.
1275 * It is ok to use this function with RACK as well as it is estimating
1276 * max_len based on a SACK hole.
1277 */
1278 if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1279 if (TCP_RACK_ENABLED(tp)) {
1280 /*
1281 * Calculated in the same manner as when rack_in_recovery
1282 * is set and new data is transmitted after retransmitted data
1283 */
1284 int32_t cwin = tp->snd_cwnd - tcp_flight_size(tp);
1285 if (cwin > 0) {
1286 max_len = tcp_rack_adjust(tp, (uint32_t)cwin);
1287 }
1288 } else {
1289 max_len = tcp_sack_adjust(tp);
1290 }
1291 }
1292 sendalot = 0;
1293 off = tp->snd_nxt - tp->snd_una;
1294 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
1295
1296 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
1297 sendwin = min(sendwin, slowlink_wsize);
1298 }
1299
1300 flags = tcp_outflags[tp->t_state];
1301 /*
1302 * Send any SACK-generated retransmissions. If we're explicitly
1303 * trying to send out new data (when sendalot is 1), bypass this
1304 * function. If we retransmit in fast recovery mode, decrement
1305 * snd_cwnd, since we're replacing a (future) new transmission
1306 * with a retransmission now, and we previously incremented
1307 * snd_cwnd in tcp_input().
1308 */
1309 /*
1310 * Still in sack recovery, reset rxmit flag to zero.
1311 */
1312 rack_sack_rxmit = 0;
1313 sack_bytes_rxmt = 0;
1314 len = 0;
1315 p = NULL;
1316 seg = NULL;
1317
1318 if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp)) {
1319 int32_t cwin = min(tp->snd_wnd, tp->snd_cwnd) - tcp_flight_size(tp);
1320 if (cwin <= 0 && sack_rxmted == false) {
1321 /* Allow clocking out at least one segment per period */
1322 cwin = tp->t_maxseg;
1323 }
1324
1325 sack_rxmted = true;
1326 if (cwin < 0) {
1327 cwin = 0;
1328 }
1329
1330 if (TCP_RACK_ENABLED(tp)) {
1331 uint16_t rack_seg_len = 0;
1332 if ((seg = tcp_rack_output(tp, cwin, &rack_seg_len)) != NULL) {
1333 len = min(cwin, rack_seg_len);
1334
1335 if (len > 0) {
1336 off = seg->start_seq - tp->snd_una;
1337 rack_sack_rxmit = 1;
1338 sendalot = 1;
1339 tcpstat.tcps_rack_rexmits++;
1340 } else {
1341 seg = NULL;
1342 }
1343 }
1344 } else if ((p = tcp_sack_output(tp, &sack_bytes_rxmt)) != NULL) {
1345 /* Do not retransmit SACK segments beyond snd_recover */
1346 if (SEQ_GT(p->end, tp->snd_recover)) {
1347 /*
1348 * (At least) part of sack hole extends beyond
1349 * snd_recover. Check to see if we can rexmit data
1350 * for this hole.
1351 */
1352 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
1353 /*
1354 * Can't rexmit any more data for this hole.
1355 * That data will be rexmitted in the next
1356 * sack recovery episode, when snd_recover
1357 * moves past p->rxmit.
1358 */
1359 p = NULL;
1360 goto after_sack_rexmit;
1361 } else {
1362 /* Can rexmit part of the current hole */
1363 len = ((int32_t)min(cwin,
1364 tp->snd_recover - p->rxmit));
1365 }
1366 } else {
1367 len = ((int32_t)min(cwin, p->end - p->rxmit));
1368 }
1369 if (len > 0) {
1370 off = p->rxmit - tp->snd_una;
1371 rack_sack_rxmit = 1;
1372 sendalot = 1;
1373 /*
1374 * Optimization to avoid double retransmission due to SACK recovery
1375 * and when tp->snd_nxt points to already retransmitted segments
1376 */
1377 if (SEQ_LT(tp->snd_nxt, tp->snd_max) && SEQ_LEQ(tp->snd_nxt, p->rxmit) &&
1378 (uint32_t)len <= max_len) {
1379 sendalot = 0;
1380 }
1381
1382 tcpstat.tcps_sack_rexmits++;
1383 tcpstat.tcps_sack_rexmit_bytes +=
1384 min(len, tp->t_maxseg);
1385 } else {
1386 len = 0;
1387 }
1388 }
1389 }
1390 after_sack_rexmit:
1391 /*
1392 * Get standard flags, and add SYN or FIN if requested by 'hidden'
1393 * state flags.
1394 */
1395 if (tp->t_flags & TF_NEEDFIN) {
1396 flags |= TH_FIN;
1397 }
1398
1399 /*
1400 * If in persist timeout with window of 0, send 1 byte.
1401 * Otherwise, if window is small but nonzero
1402 * and timer expired, we will send what we can
1403 * and go to transmit state.
1404 */
1405 if (tp->t_flagsext & TF_FORCE) {
1406 if (sendwin == 0) {
1407 /*
1408 * If we still have some data to send, then
1409 * clear the FIN bit. Usually this would
1410 * happen below when it realizes that we
1411 * aren't sending all the data. However,
1412 * if we have exactly 1 byte of unsent data,
1413 * then it won't clear the FIN bit below,
1414 * and if we are in persist state, we wind
1415 * up sending the packet without recording
1416 * that we sent the FIN bit.
1417 *
1418 * We can't just blindly clear the FIN bit,
1419 * because if we don't have any more data
1420 * to send then the probe will be the FIN
1421 * itself.
1422 */
1423 if (off < so->so_snd.sb_cc) {
1424 flags &= ~TH_FIN;
1425 }
1426 sendwin = 1;
1427 } else {
1428 tp->t_timer[TCPT_PERSIST] = 0;
1429 tp->t_persist_stop = 0;
1430 TCP_RESET_REXMT_STATE(tp);
1431 }
1432 }
1433
1434 /*
1435 * If snd_nxt == snd_max and we have transmitted a FIN, the
1436 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
1437 * a negative length. This can also occur when TCP opens up
1438 * its congestion window while receiving additional duplicate
1439 * acks after fast-retransmit because TCP will reset snd_nxt
1440 * to snd_max after the fast-retransmit.
1441 *
1442 * In the normal retransmit-FIN-only case, however, snd_nxt will
1443 * be set to snd_una, the offset will be 0, and the length may
1444 * wind up 0.
1445 *
1446 * If rack_sack_rxmit is set we are retransmitting from
1447 * the scoreboard in which case len is already set.
1448 */
1449 bool rack_in_recovery = TCP_RACK_ENABLED(tp) && IN_FASTRECOVERY(tp);
1450 if (rack_sack_rxmit == 0) {
1451 if (sack_bytes_rxmt == 0 && !rack_in_recovery) {
1452 len = min(so->so_snd.sb_cc, sendwin) - off;
1453 } else {
1454 int32_t cwin = tp->snd_cwnd - tcp_flight_size(tp);
1455 if (cwin < 0) {
1456 cwin = 0;
1457 }
1458 /*
1459 * We are inside of a SACK recovery episode and are
1460 * sending new data, having retransmitted all the
1461 * data possible in the scoreboard.
1462 */
1463 len = min(so->so_snd.sb_cc, tp->snd_wnd) - off;
1464 /*
1465 * Don't remove this (len > 0) check !
1466 * We explicitly check for len > 0 here (although it
1467 * isn't really necessary), to work around a gcc
1468 * optimization issue - to force gcc to compute
1469 * len above. Without this check, the computation
1470 * of len is bungled by the optimizer.
1471 */
1472 if (len > 0) {
1473 len = imin(len, cwin);
1474 } else {
1475 len = 0;
1476 }
1477 /*
1478 * At this point SACK recovery can not send any
1479 * data from scoreboard or any new data. Check
1480 * if we can do a rescue retransmit towards the
1481 * tail end of recovery window.
1482 * We don't do rescue retransmit for RACK.
1483 */
1484 if (len == 0 && cwin > 0 &&
1485 SEQ_LT(tp->snd_fack, tp->snd_recover) &&
1486 !(tp->t_flagsext & TF_RESCUE_RXT) && !TCP_RACK_ENABLED(tp)) {
1487 len = min((tp->snd_recover - tp->snd_fack),
1488 tp->t_maxseg);
1489 len = imin(len, cwin);
1490 old_snd_nxt = tp->snd_nxt;
1491 sack_rescue_rxt = true;
1492 tp->snd_nxt = tp->snd_recover - len;
1493 /*
1494 * If FIN has been sent, snd_max
1495 * must have been advanced to cover it.
1496 */
1497 if ((tp->t_flags & TF_SENTFIN) &&
1498 tp->snd_max == tp->snd_recover) {
1499 tp->snd_nxt--;
1500 }
1501
1502 off = tp->snd_nxt - tp->snd_una;
1503 sendalot = 0;
1504 tp->t_flagsext |= TF_RESCUE_RXT;
1505 }
1506 }
1507 }
1508
1509 if (max_len != 0 && len > 0) {
1510 len = min(len, max_len);
1511 }
1512
1513 /*
1514 * Lop off SYN bit if it has already been sent. However, if this
1515 * is SYN-SENT state and if segment contains data and if we don't
1516 * know that foreign host supports TAO, suppress sending segment.
1517 */
1518 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
1519 if (tp->t_state == TCPS_SYN_RECEIVED && TFO_ENABLED(tp) && tp->snd_nxt == tp->snd_una + 1) {
1520 /* We are sending the SYN again! */
1521 off--;
1522 len++;
1523 } else {
1524 if (tp->t_state != TCPS_SYN_RECEIVED || TFO_ENABLED(tp)) {
1525 flags &= ~TH_SYN;
1526 }
1527
1528 off--;
1529 len++;
1530 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
1531 while (inp->inp_sndinprog_cnt == 0 &&
1532 tp->t_pktlist_head != NULL) {
1533 packetlist = tp->t_pktlist_head;
1534 packchain_listadd = tp->t_lastchain;
1535 packchain_sent++;
1536 TCP_PKTLIST_CLEAR(tp);
1537
1538 error = tcp_ip_output(so, tp, packetlist,
1539 packchain_listadd, tp_inp_options,
1540 (so_options & SO_DONTROUTE),
1541 (rack_sack_rxmit || (sack_bytes_rxmt != 0)),
1542 isipv6);
1543 }
1544
1545 /*
1546 * tcp was closed while we were in ip,
1547 * resume close
1548 */
1549 if (inp->inp_sndinprog_cnt == 0 &&
1550 (tp->t_flags & TF_CLOSING)) {
1551 tp->t_flags &= ~TF_CLOSING;
1552 (void) tcp_close(tp);
1553 } else {
1554 tcp_check_timer_state(tp);
1555 }
1556 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
1557 0, 0, 0, 0, 0);
1558 return 0;
1559 }
1560 }
1561 }
1562
1563 /*
1564 * Be careful not to send data and/or FIN on SYN segments.
1565 * This measure is needed to prevent interoperability problems
1566 * with not fully conformant TCP implementations.
1567 *
1568 * In case of TFO, we handle the setting of the len in
1569 * tcp_tfo_check. In case TFO is not enabled, never ever send
1570 * SYN+data.
1571 */
1572 if ((flags & TH_SYN) && !TFO_ENABLED(tp)) {
1573 len = 0;
1574 flags &= ~TH_FIN;
1575 }
1576
1577 /*
1578 * Don't send a RST with data.
1579 */
1580 if (flags & TH_RST) {
1581 len = 0;
1582 }
1583
1584 if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && TFO_ENABLED(tp)) {
1585 len = tcp_tfo_check(tp, len);
1586 }
1587
1588 if ((tp->tcp_cc_index == TCP_CC_ALGO_PRAGUE_INDEX || inp->inp_max_pacing_rate != UINT64_MAX) &&
1589 tp->t_pacer.tso_burst_size != 0 && len > 0 &&
1590 (uint32_t)len > tp->t_pacer.tso_burst_size) {
1591 len = tp->t_pacer.tso_burst_size;
1592 sendalot = 1;
1593 }
1594
1595 /*
1596 * The check here used to be (len < 0). Sometimes len is zero
1597 * when the congestion window is closed and we need to check
1598 * if persist timer has to be set in that case. But don't set
1599 * persist until connection is established.
1600 */
1601 if (len <= 0 && !(flags & TH_SYN)) {
1602 /*
1603 * If FIN has been sent but not acked,
1604 * but we haven't been called to retransmit,
1605 * len will be < 0. Otherwise, window shrank
1606 * after we sent into it. If window shrank to 0,
1607 * cancel pending retransmit, pull snd_nxt back
1608 * to (closed) window, and set the persist timer
1609 * if it isn't already going. If the window didn't
1610 * close completely, just wait for an ACK.
1611 */
1612 len = 0;
1613 if (sendwin == 0) {
1614 tp->t_timer[TCPT_REXMT] = 0;
1615 tp->t_timer[TCPT_PTO] = 0;
1616 TCP_RESET_REXMT_STATE(tp);
1617 tp->snd_nxt = tp->snd_una;
1618 off = 0;
1619 if (tp->t_timer[TCPT_PERSIST] == 0) {
1620 tcp_setpersist(tp);
1621 }
1622 }
1623 }
1624
1625 /*
1626 * Automatic sizing of send socket buffer. Increase the send
1627 * socket buffer size if all of the following criteria are met
1628 * 1. the receiver has enough buffer space for this data
1629 * 2. send buffer is filled to 7/8th with data (so we actually
1630 * have data to make use of it);
1631 * 3. our send window (slow start and congestion controlled) is
1632 * larger than sent but unacknowledged data in send buffer.
1633 */
1634 if (!INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
1635 (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE) {
1636 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
1637 so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
1638 sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
1639 if (sbreserve(&so->so_snd,
1640 min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
1641 tcp_autosndbuf_max)) == 1) {
1642 so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
1643 }
1644 }
1645 }
1646
1647 /*
1648 * Truncate to the maximum segment length or enable TCP Segmentation
1649 * Offloading (if supported by hardware) and ensure that FIN is removed
1650 * if the length no longer contains the last data byte.
1651 *
1652 * TSO may only be used if we are in a pure bulk sending state.
1653 * The presence of TCP-MD5, SACK retransmits, SACK advertisements,
1654 * filters and IP options, as well as disabling hardware checksum
1655 * offload prevent using TSO. With TSO the TCP header is the same
1656 * (except for the sequence number) for all generated packets. This
1657 * makes it impossible to transmit any options which vary per generated
1658 * segment or packet.
1659 *
1660 * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
1661 * removal of FIN (if not already caught here) are handled later after
1662 * the exact length of the TCP options are known.
1663 */
1664 #if IPSEC
1665 /*
1666 * Pre-calculate here as we save another lookup into the darknesses
1667 * of IPsec that way and can actually decide if TSO is ok.
1668 */
1669 if (ipsec_bypass == 0) {
1670 ipsec_optlen = ipsec_hdrsiz_tcp(tp);
1671 }
1672 #endif
1673 if (len > tp->t_maxseg) {
1674 if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
1675 kipf_count == 0 &&
1676 tp->rcv_numsacks == 0 && rack_sack_rxmit == 0 &&
1677 sack_bytes_rxmt == 0 &&
1678 inp->inp_options == NULL &&
1679 inp->in6p_options == NULL
1680 #if IPSEC
1681 && ipsec_optlen == 0
1682 #endif
1683 ) {
1684 tso = 1;
1685 sendalot = 0;
1686 } else {
1687 len = tp->t_maxseg;
1688 sendalot = 1;
1689 tso = 0;
1690 }
1691 } else {
1692 tso = 0;
1693 }
1694
1695 /* Send one segment or less as a tail loss probe */
1696 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1697 len = min(len, tp->t_maxseg);
1698 sendalot = 0;
1699 tso = 0;
1700 }
1701
1702 #if MPTCP
1703 if (so->so_flags & SOF_MP_SUBFLOW && off < 0) {
1704 os_log_error(mptcp_log_handle, "%s - %lx: offset is negative! len %d off %d\n",
1705 __func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte),
1706 len, off);
1707 }
1708
1709 if ((so->so_flags & SOF_MP_SUBFLOW) &&
1710 !(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
1711 int newlen = len;
1712 struct mptcb *mp_tp = tptomptp(tp);
1713 if (tp->t_state >= TCPS_ESTABLISHED &&
1714 (tp->t_mpflags & TMPF_SND_MPPRIO ||
1715 tp->t_mpflags & TMPF_SND_REM_ADDR ||
1716 tp->t_mpflags & TMPF_SND_MPFAIL ||
1717 (tp->t_mpflags & TMPF_SND_KEYS &&
1718 mp_tp->mpt_version == MPTCP_VERSION_0) ||
1719 tp->t_mpflags & TMPF_SND_JACK ||
1720 tp->t_mpflags & TMPF_MPTCP_ECHO_ADDR)) {
1721 if (len > 0) {
1722 len = 0;
1723 tso = 0;
1724 }
1725 /*
1726 * On a new subflow, don't try to send again, because
1727 * we are still waiting for the fourth ack.
1728 */
1729 if (!(tp->t_mpflags & TMPF_PREESTABLISHED)) {
1730 sendalot = 1;
1731 }
1732 mptcp_acknow = TRUE;
1733 } else {
1734 mptcp_acknow = FALSE;
1735 }
1736 /*
1737 * The contiguous bytes in the subflow socket buffer can be
1738 * discontiguous at the MPTCP level. Since only one DSS
1739 * option can be sent in one packet, reduce length to match
1740 * the contiguous MPTCP level. Set sendalot to send remainder.
1741 */
1742 if (len > 0 && off >= 0) {
1743 newlen = mptcp_adj_sendlen(so, off);
1744 }
1745
1746 if (newlen < len) {
1747 len = newlen;
1748 if (len <= tp->t_maxseg) {
1749 tso = 0;
1750 }
1751 }
1752 }
1753 #endif /* MPTCP */
1754
1755 if (rack_sack_rxmit) {
1756 if (TCP_RACK_ENABLED(tp)) {
1757 if (SEQ_LT(seg->start_seq + len, tp->snd_una + so->so_snd.sb_cc)) {
1758 flags &= ~TH_FIN;
1759 }
1760 } else {
1761 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) {
1762 flags &= ~TH_FIN;
1763 }
1764 }
1765 } else {
1766 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) {
1767 flags &= ~TH_FIN;
1768 }
1769 }
1770 /*
1771 * Compare available window to amount of window
1772 * known to peer (as advertised window less
1773 * next expected input). If the difference is at least two
1774 * max size segments, or at least 25% of the maximum possible
1775 * window, then want to send a window update to peer.
1776 */
1777 recwin = tcp_sbspace(tp);
1778
1779 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
1780 if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
1781 recwin < (int)tp->t_maxseg) {
1782 recwin = 0;
1783 }
1784 } else {
1785 struct mptcb *mp_tp = tptomptp(tp);
1786 struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
1787
1788 if (recwin < (int32_t)(mp_so->so_rcv.sb_hiwat / 4) &&
1789 recwin < (int)tp->t_maxseg) {
1790 recwin = 0;
1791 }
1792 }
1793
1794 #if TRAFFIC_MGT
1795 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
1796 /*
1797 * Timestamp MUST be supported to use rledbat unless we haven't
1798 * yet negotiated it.
1799 */
1800 if (TCP_RLEDBAT_ENABLED(tp) || (tcp_rledbat && tp->t_state <
1801 TCPS_ESTABLISHED)) {
1802 if (recwin > 0 && tcp_cc_rledbat.get_rlwin != NULL) {
1803 /* Min of flow control window and rledbat window */
1804 recwin = imin(recwin, tcp_cc_rledbat.get_rlwin(tp));
1805 }
1806 } else if (recwin > 0 && tcp_recv_throttle(tp)) {
1807 uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
1808 uint32_t bg_rwintop = tp->rcv_adv;
1809 if (SEQ_LT(bg_rwintop, tp->rcv_nxt + min_iaj_win)) {
1810 bg_rwintop = tp->rcv_nxt + min_iaj_win;
1811 }
1812 recwin = imin((int32_t)(bg_rwintop - tp->rcv_nxt),
1813 recwin);
1814 if (recwin < 0) {
1815 recwin = 0;
1816 }
1817 }
1818 }
1819 #endif /* TRAFFIC_MGT */
1820
1821 if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) {
1822 recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
1823 }
1824
1825 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
1826 if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) {
1827 recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
1828 }
1829 } else {
1830 struct mptcb *mp_tp = tptomptp(tp);
1831 int64_t recwin_announced = (int64_t)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt);
1832
1833 /* Don't remove what we announced at the MPTCP-layer */
1834 VERIFY(recwin_announced < INT32_MAX && recwin_announced > INT32_MIN);
1835 if (recwin < (int32_t)recwin_announced) {
1836 recwin = (int32_t)recwin_announced;
1837 }
1838
1839 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1840 if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) {
1841 recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
1842 }
1843 }
1844 }
1845
1846 /* One day we should have a single ROUNDUP macro across xnu... */
1847 #ifndef ROUNDUP
1848 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
1849 #endif
1850 recwin = ROUNDUP(recwin, (1 << tp->rcv_scale));
1851 #undef ROUNDUP
1852
1853 /*
1854 * Sender silly window avoidance. We transmit under the following
1855 * conditions when len is non-zero:
1856 *
1857 * - we've timed out (e.g. persist timer)
1858 * - we need to retransmit
1859 * - We have a full segment (or more with TSO)
1860 * - This is the last buffer in a write()/send() and we are
1861 * either idle or running NODELAY
1862 * - we have more than 1/2 the maximum send window's worth of
1863 * data (receiver may be limiting the window size)
1864 */
1865 if (len) {
1866 if (tp->t_flagsext & TF_FORCE) {
1867 goto send;
1868 }
1869 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1870 if (TCP_RACK_ENABLED(tp) && rack_sack_rxmit == 0) {
1871 len = min(len, tp->snd_max - tp->snd_nxt);
1872 }
1873 goto send;
1874 }
1875 if (rack_sack_rxmit) {
1876 goto send;
1877 }
1878
1879 /*
1880 * If this here is the first segment after SYN/ACK and TFO
1881 * is being used, then we always send it, regardless of Nagle,...
1882 */
1883 if (tp->t_state == TCPS_SYN_RECEIVED &&
1884 TFO_ENABLED(tp) &&
1885 (tp->t_tfo_flags & TFO_F_COOKIE_VALID) &&
1886 tp->snd_nxt == tp->iss + 1) {
1887 goto send;
1888 }
1889
1890 /*
1891 * Send new data on the connection only if it is
1892 * not flow controlled
1893 */
1894 if (!INP_WAIT_FOR_IF_FEEDBACK(inp) ||
1895 tp->t_state != TCPS_ESTABLISHED) {
1896 if (off + len == tp->snd_wnd) {
1897 /* We are limited by the receiver's window... */
1898 if (tp->t_rcvwnd_limited_start_time == 0) {
1899 tp->t_rcvwnd_limited_start_time = net_uptime_us();
1900 }
1901 } else {
1902 /* We are no more limited by the receiver's window... */
1903 if (tp->t_rcvwnd_limited_start_time != 0) {
1904 uint64_t now = net_uptime_us();
1905
1906 ASSERT(now >= tp->t_rcvwnd_limited_start_time);
1907
1908 tp->t_rcvwnd_limited_total_time += (now - tp->t_rcvwnd_limited_start_time);
1909
1910 tp->t_rcvwnd_limited_start_time = 0;
1911 }
1912 }
1913
1914 if (len >= tp->t_maxseg) {
1915 goto send;
1916 }
1917
1918 if (!(tp->t_flags & TF_MORETOCOME) &&
1919 (idle || tp->t_flags & TF_NODELAY ||
1920 (tp->t_flags & TF_MAXSEGSNT) ||
1921 ALLOW_LIMITED_TRANSMIT(tp)) &&
1922 (tp->t_flags & TF_NOPUSH) == 0 &&
1923 (len + off >= so->so_snd.sb_cc ||
1924 /*
1925 * MPTCP needs to respect the DSS-mappings. So, it
1926 * may be sending data that *could* have been
1927 * coalesced, but cannot because of
1928 * mptcp_adj_sendlen().
1929 */
1930 so->so_flags & SOF_MP_SUBFLOW)) {
1931 goto send;
1932 }
1933 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
1934 goto send;
1935 }
1936 } else {
1937 tcpstat.tcps_fcholdpacket++;
1938 }
1939 }
1940
1941 if (recwin > 0) {
1942 /*
1943 * "adv" is the amount we can increase the window,
1944 * taking into account that we are limited by
1945 * TCP_MAXWIN << tp->rcv_scale.
1946 */
1947 int32_t adv, oldwin = 0;
1948 adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
1949 (tp->rcv_adv - tp->rcv_nxt);
1950
1951 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
1952 oldwin = tp->rcv_adv - tp->rcv_nxt;
1953 }
1954
1955 if (adv >= (int32_t) (2 * tp->t_maxseg)) {
1956 /*
1957 * ACK every second full-sized segment, if the
1958 * ACK is advancing or the window becomes bigger
1959 */
1960 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1961 (tp->last_ack_sent != tp->rcv_nxt ||
1962 ((oldwin + adv) >> tp->rcv_scale) >
1963 (oldwin >> tp->rcv_scale))) {
1964 goto send;
1965 }
1966 } else if (tp->t_flags & TF_DELACK) {
1967 /*
1968 * If we delayed the ACK and the window
1969 * is not advancing by a lot (< 2MSS), ACK
1970 * immediately if the last incoming packet had
1971 * the push flag set and we emptied the buffer.
1972 *
1973 * This takes care of a sender doing small
1974 * repeated writes with Nagle enabled.
1975 */
1976 if (so->so_rcv.sb_cc == 0 &&
1977 tp->last_ack_sent != tp->rcv_nxt &&
1978 (tp->t_flagsext & TF_LAST_IS_PSH)) {
1979 goto send;
1980 }
1981 }
1982
1983 if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
1984 goto send;
1985 }
1986 }
1987
1988 /*
1989 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
1990 * is also a catch-all for the retransmit timer timeout case.
1991 */
1992 if (tp->t_flags & TF_ACKNOW) {
1993 if (tp->t_forced_acks > 0) {
1994 tp->t_forced_acks--;
1995 }
1996 goto send;
1997 }
1998 if ((flags & TH_RST) || (flags & TH_SYN)) {
1999 goto send;
2000 }
2001 if (SEQ_GT(tp->snd_up, tp->snd_una)) {
2002 goto send;
2003 }
2004 #if MPTCP
2005 if (mptcp_acknow) {
2006 goto send;
2007 }
2008 #endif /* MPTCP */
2009 /*
2010 * If our state indicates that FIN should be sent
2011 * and we have not yet done so, then we need to send.
2012 */
2013 if ((flags & TH_FIN) &&
2014 (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) {
2015 goto send;
2016 }
2017 /*
2018 * In SACK, it is possible for tcp_output to fail to send a segment
2019 * after the retransmission timer has been turned off. Make sure
2020 * that the retransmission timer is set.
2021 */
2022 if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
2023 SEQ_GT(tp->snd_max, tp->snd_una) &&
2024 tp->t_timer[TCPT_REXMT] == 0 &&
2025 tp->t_timer[TCPT_PERSIST] == 0) {
2026 tcp_set_rto(tp);
2027 goto just_return;
2028 }
2029 /*
2030 * TCP window updates are not reliable, rather a polling protocol
2031 * using ``persist'' packets is used to ensure receipt of window
2032 * updates. The three ``states'' for the output side are:
2033 * idle not doing retransmits or persists
2034 * persisting to move a small or zero window
2035 * (re)transmitting and thereby not persisting
2036 *
2037 * tp->t_timer[TCPT_PERSIST]
2038 * is set when we are in persist state.
2039 * tp->t_force
2040 * is set when we are called to send a persist packet.
2041 * tp->t_timer[TCPT_REXMT]
2042 * is set when we are retransmitting
2043 * The output side is idle when both timers are zero.
2044 *
2045 * If send window is too small, there is data to transmit, and no
2046 * retransmit or persist is pending, then go to persist state.
2047 * If nothing happens soon, send when timer expires:
2048 * if window is nonzero, transmit what we can,
2049 * otherwise force out a byte.
2050 */
2051 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
2052 tp->t_timer[TCPT_PERSIST] == 0) {
2053 TCP_RESET_REXMT_STATE(tp);
2054 tcp_setpersist(tp);
2055 }
2056 just_return:
2057 /*
2058 * If there is no reason to send a segment, just return.
2059 * But if there are some packets left in the packet list, send them now.
2060 */
2061 while (inp->inp_sndinprog_cnt == 0 &&
2062 tp->t_pktlist_head != NULL) {
2063 packetlist = tp->t_pktlist_head;
2064 packchain_listadd = tp->t_lastchain;
2065 packchain_sent++;
2066 TCP_PKTLIST_CLEAR(tp);
2067
2068 error = tcp_ip_output(so, tp, packetlist,
2069 packchain_listadd,
2070 tp_inp_options, (so_options & SO_DONTROUTE),
2071 (rack_sack_rxmit || (sack_bytes_rxmt != 0)), isipv6);
2072 }
2073 /* tcp was closed while we were in ip; resume close */
2074 if (inp->inp_sndinprog_cnt == 0 &&
2075 (tp->t_flags & TF_CLOSING)) {
2076 tp->t_flags &= ~TF_CLOSING;
2077 (void) tcp_close(tp);
2078 } else {
2079 tcp_check_timer_state(tp);
2080 }
2081 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
2082 return 0;
2083
2084 send:
2085 /*
2086 * Set TF_MAXSEGSNT flag if the segment size is greater than or equal to
2087 * the max segment size; clear it for a smaller non-empty segment.
2088 */
2089 if (len > 0) {
2090 do_not_compress = TRUE;
2091
2092 if (len >= tp->t_maxseg) {
2093 tp->t_flags |= TF_MAXSEGSNT;
2094 } else {
2095 tp->t_flags &= ~TF_MAXSEGSNT;
2096 }
2097 }
2098 /*
2099 * If we are connected and no segment has been ACKed or SACKed yet and we
2100 * hit a retransmission timeout, then we should disable AccECN option
2101 * for the rest of the connection.
2102 */
2103 if (tp->accurate_ecn_on && tp->t_state == TCPS_ESTABLISHED &&
2104 tp->snd_una == tp->iss + 1 && (tp->snd_fack == tp->iss)
2105 && tp->t_rxtshift > 0) {
2106 if ((tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == 0) {
2107 tp->ecn_flags |= TE_RETRY_WITHOUT_ACO;
2108 }
2109 }
2110
2111 /*
2112 * Before ESTABLISHED, force sending of initial options
2113 * unless TCP set not to do any options.
2114 * NOTE: we assume that the IP/TCP header plus TCP options
2115 * always fit in a single mbuf, leaving room for a maximum
2116 * link header, i.e.
2117 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
2118 */
2119 optlen = 0;
2120 if (isipv6) {
2121 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
2122 } else {
2123 hdrlen = sizeof(struct tcpiphdr);
2124 }
2125 if (flags & TH_SYN) {
2126 tp->snd_nxt = tp->iss;
2127 tp->snd_fack = tp->iss;
2128 if ((tp->t_flags & TF_NOOPT) == 0) {
2129 u_short mss;
2130
2131 opt[0] = TCPOPT_MAXSEG;
2132 opt[1] = TCPOLEN_MAXSEG;
2133 mss = htons((u_short) tcp_mssopt(tp));
2134 (void)memcpy(opt + 2, &mss, sizeof(mss));
2135 optlen = TCPOLEN_MAXSEG;
2136
2137 if ((tp->t_flags & TF_REQ_SCALE) &&
2138 ((flags & TH_ACK) == 0 ||
2139 (tp->t_flags & TF_RCVD_SCALE))) {
2140 *((u_int32_t *)(void *)(opt + optlen)) = htonl(
2141 TCPOPT_NOP << 24 |
2142 TCPOPT_WINDOW << 16 |
2143 TCPOLEN_WINDOW << 8 |
2144 tp->request_r_scale);
2145 optlen += 4;
2146 }
2147 /* Check if L4S is enabled after outifp has been set and update the CC */
2148 if (tp->l4s_enabled && tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX) {
2149 tcp_set_foreground_cc(so);
2150 }
2151 #if MPTCP
2152 if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) {
2153 optlen = mptcp_setup_syn_opts(so, opt, opt + sizeof(opt), optlen);
2154 }
2155 #endif /* MPTCP */
2156 }
2157 }
2158
2159 /*
2160 * Send a timestamp and echo-reply if this is a SYN and our side
2161 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
2162 * and our peer have sent timestamps in our SYN's.
2163 */
2164 if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
2165 (flags & TH_RST) == 0 &&
2166 ((flags & TH_ACK) == 0 ||
2167 (tp->t_flags & TF_RCVD_TSTMP))) {
2168 uint32_t *lp = (u_int32_t *)(void *)(opt + optlen);
2169
2170 /* Form timestamp option as shown in appendix A of RFC 1323. */
2171 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
2172
2173 tsvalptr = lp;
2174 lp++; /* tsval will be set later (see access to tsvalptr) */
2175 *lp = htonl(tp->ts_recent);
2176 optlen += TCPOLEN_TSTAMP_APPA;
2177 }
2178
2179 if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
2180 /*
2181 * Tack on the SACK permitted option *last*.
2182 * And do padding of options after tacking this on.
2183 * This is because if MSS, TS, WinScale and Signatures are
2184 * all present, we have just 2 bytes left for the SACK
2185 * permitted option, which is just enough.
2186 */
2187 /*
2188 * If this is the first SYN of connection (not a SYN
2189 * ACK), include SACK permitted option. If this is a
2190 * SYN ACK, include SACK permitted option if peer has
2191 * already done so. This is only for active connect,
2192 * since the syncookie takes care of the passive connect.
2193 */
2194 if ((flags & TH_SYN) &&
2195 (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
2196 u_char *bp;
2197 bp = (u_char *)opt + optlen;
2198
2199 *bp++ = TCPOPT_SACK_PERMITTED;
2200 *bp++ = TCPOLEN_SACK_PERMITTED;
2201 optlen += TCPOLEN_SACK_PERMITTED;
2202 }
2203 }
2204 #if MPTCP
2205 if (so->so_flags & SOF_MP_SUBFLOW) {
2206 /*
2207 * It's important to piggyback acks with data as ack only packets
2208 * may get lost and data packets that don't send Data ACKs
2209 * still advance the subflow level ACK and therefore make it
2210 * hard for the remote end to recover in low cwnd situations.
2211 */
2212 if (len != 0) {
2213 tp->t_mpflags |= (TMPF_SEND_DSN |
2214 TMPF_MPTCP_ACKNOW);
2215 } else {
2216 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
2217 }
2218 optlen = mptcp_setup_opts(tp, off, opt, opt + TCP_MAXOLEN, optlen, flags,
2219 len, &mptcp_acknow, &do_not_compress);
2220 tp->t_mpflags &= ~TMPF_SEND_DSN;
2221 }
2222 #endif /* MPTCP */
2223
2224 if (TFO_ENABLED(tp) && !(tp->t_flags & TF_NOOPT) &&
2225 (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
2226 optlen += tcp_tfo_write_cookie(tp, optlen, len, opt);
2227 }
2228
2229 if (TFO_ENABLED(tp) &&
2230 (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
2231 (tp->t_tfo_flags & TFO_F_OFFER_COOKIE)) {
2232 optlen += tcp_tfo_write_cookie_rep(tp, optlen, opt);
2233 }
2234
2235 if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
2236 /*
2237 * Send SACKs if necessary. This should be the last
2238 * option processed. Only as many SACKs are sent as
2239 * are permitted by the maximum options size.
2240 *
2241 * In general, SACK blocks consume 8*n+2 bytes.
2242 * So a full size SACK blocks option is 34 bytes
2243 * (to generate 4 SACK blocks). At a minimum,
2244 * we need 10 bytes (to generate 1 SACK block).
2245 * If TCP Timestamps (12 bytes) and TCP Signatures
2246 * (18 bytes) are both present, we'll just have
2247 * 10 bytes for SACK options 40 - (12 + 18).
2248 */
2249 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
2250 (tp->t_flags & TF_SACK_PERMIT) &&
2251 (tp->rcv_numsacks > 0 || TCP_SEND_DSACK_OPT(tp)) &&
2252 MAX_TCPOPTLEN - optlen >= TCPOLEN_SACK + 2) {
2253 unsigned int sackoptlen = 0;
2254 int nsack, padlen;
2255 u_char *bp = (u_char *)opt + optlen;
2256 u_int32_t *lp;
2257
2258 nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
2259 /*
2260 * Send fewer SACK blocks when we want
2261 * to send the smallest recommended AccECN Option
2262 * if the space wouldn't permit sending all blocks.
2263 */
2264 if (nsack > 2 && tp->accurate_ecn_on &&
2265 (tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == 0 &&
2266 tp->ecn_flags & (TE_ACO_ECT1 | TE_ACO_ECT0)) {
2267 nsack--;
2268 }
2269 nsack = min(nsack, (tp->rcv_numsacks +
2270 (TCP_SEND_DSACK_OPT(tp) ? 1 : 0)));
2271 sackoptlen = (2 + nsack * TCPOLEN_SACK);
2272 VERIFY(sackoptlen < UINT8_MAX);
2273
2274 /*
2275 * First we need to pad options so that the
2276 * SACK blocks can start at a 4-byte boundary
2277 * (sack option and length are at a 2 byte offset).
2278 */
2279 padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
2280 optlen += padlen;
2281 while (padlen-- > 0) {
2282 *bp++ = TCPOPT_NOP;
2283 }
2284
2285 tcpstat.tcps_sack_send_blocks++;
2286 *bp++ = TCPOPT_SACK;
2287 *bp++ = (uint8_t)sackoptlen;
2288 lp = (u_int32_t *)(void *)bp;
2289
2290 /*
2291 * First block of SACK option should represent
2292 * DSACK. Prefer to send SACK information if there
2293 * is space for only one SACK block. This will
2294 * allow for faster recovery.
2295 */
2296 if (TCP_SEND_DSACK_OPT(tp) && nsack > 0 &&
2297 (tp->rcv_numsacks == 0 || nsack > 1)) {
2298 *lp++ = htonl(tp->t_dsack_lseq);
2299 *lp++ = htonl(tp->t_dsack_rseq);
2300 tcpstat.tcps_dsack_sent++;
2301 tp->t_dsack_sent++;
2302 nsack--;
2303 }
2304 VERIFY(nsack == 0 || tp->rcv_numsacks >= nsack);
2305 for (i = 0; i < nsack; i++) {
2306 struct sackblk sack = tp->sackblks[i];
2307 *lp++ = htonl(sack.start);
2308 *lp++ = htonl(sack.end);
2309 }
2310 optlen += sackoptlen;
2311
2312 /* Make sure we didn't write too much */
2313 VERIFY((u_char *)lp - opt <= MAX_TCPOPTLEN);
2314 }
2315 }
2316
2317 /*
2318 * AccECN option - after SACK
2319 * Don't send on <SYN>,
2320 * send only on <SYN,ACK> before ACCECN is negotiated when
2321 * the client requests it or
2322 * when doing an AccECN session. Don't send AccECN option
2323 * if retransmitting a SYN-ACK or a data segment
2324 */
2325 if ((tp->accurate_ecn_on ||
2326 (tp->l4s_enabled && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
2327 (tp->ecn_flags & TE_ACE_SETUPRECEIVED))) &&
2328 (tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == 0) {
2329 uint32_t *lp = (uint32_t *)(void *)(opt + optlen);
2330 /* lp will become outdated after options are added */
2331 tcp_add_accecn_option(tp, flags, lp, (uint8_t *)&optlen);
2332 /* Make sure we didn't write more than 40 bytes */
2333 ASSERT((u_char *)lp - opt <= MAX_TCPOPTLEN);
2334 }
2335 /* Pad TCP options to a 4 byte boundary */
2336 if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
2337 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
2338 u_char *bp = (u_char *)opt + optlen;
2339
2340 optlen += pad;
2341 while (pad) {
2342 *bp++ = TCPOPT_EOL;
2343 pad--;
2344 }
2345 }
2346
2347 /*
2348 * For Accurate ECN, send ACE flag based on r.cep, if
2349 * We have completed handshake and are in ESTABLISHED state, and
2350 * This is not the final ACK of 3WHS.
2351 */
2352 if (tp->accurate_ecn_on && TCPS_HAVEESTABLISHED(tp->t_state) &&
2353 (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) == 0) {
2354 uint8_t ace = tp->t_aecn.t_rcv_ce_packets & TCP_ACE_MASK;
2355 if (ace & 0x01) {
2356 flags |= TH_ECE;
2357 } else {
2358 flags &= ~TH_ECE;
2359 }
2360 if (ace & 0x02) {
2361 flags |= TH_CWR;
2362 } else {
2363 flags &= ~TH_CWR;
2364 }
2365 if (ace & 0x04) {
2366 flags |= TH_AE;
2367 } else {
2368 flags &= ~TH_AE;
2369 }
2370 }
2371
2372 /*
2373 * RFC 3168 states that:
2374 * - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
2375 * to handle the TCP ECE flag, even if you also later send a
2376 * non-ECN-setup SYN/SYN-ACK.
2377 * - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
2378 * the ip ECT flag.
2379 *
2380 * It is not clear how the ECE flag would ever be set if you never
2381 * set the IP ECT flag on outbound packets. All the same, we use
2382 * the TE_SETUPSENT to indicate that we have committed to handling
2383 * the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
2384 * whether or not we should set the IP ECT flag on outbound packet
2385 *
2386 * For a SYN-ACK, send an ECN setup SYN-ACK
2387 *
2388 * Below we send ECN for three different handshake states:
2389 * 1. Server received SYN and is sending a SYN-ACK (state->TCPS_SYN_RECEIVED)
2390 * - both classic and Accurate ECN have special encoding
2391 * 2. Client is sending SYN packet (state->SYN_SENT)
2392 * - both classic and Accurate ECN have special encoding
2393 * 3. Client is sending final ACK of 3WHS (state->ESTABLISHED)
2394 * - Only Accurate ECN has special encoding
2395 */
2396 if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
2397 (tp->ecn_flags & TE_ENABLE_ECN)) {
2398 flags = tcp_accecn_synack_respond(tp, flags);
2399 } else if ((flags & (TH_SYN | TH_ACK)) == TH_SYN &&
2400 (tp->ecn_flags & TE_ENABLE_ECN)) {
2401 if (tcp_send_ecn_flags_on_syn(tp)) {
2402 if (tp->l4s_enabled) {
2403 /*
2404 * We are negotiating AccECN in SYN.
2405 * We only set TE_SENDIPECT after the handshake
2406 * is complete.
2407 */
2408 flags |= TH_ACE;
2409 tp->ecn_flags |= (TE_ACE_SETUPSENT);
2410 } else {
2411 /*
2412 * Setting TH_ECE and TH_CWR makes this an
2413 * ECN-setup SYN
2414 */
2415 flags |= (TH_ECE | TH_CWR);
2416 /*
2417 * Record that we sent the ECN-setup and default to
2418 * setting IP ECT.
2419 */
2420 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
2421 }
2422 tcpstat.tcps_ecn_client_setup++;
2423 tp->ecn_flags |= TE_CLIENT_SETUP;
2424 } else {
2425 /*
2426 * We sent an ECN-setup SYN but it was dropped.
2427 * Fall back to non-ECN and clear flag indicating
2428 * we should send data with IP ECT set.
2429 */
2430 if (tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) {
2431 tcpstat.tcps_ecn_lost_syn++;
2432 tp->ecn_flags |= TE_LOST_SYN;
2433 }
2434 tp->ecn_flags &= ~TE_SENDIPECT;
2435 }
2436 } else if (tp->accurate_ecn_on && (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) &&
2437 len == 0 && (flags & (TH_FLAGS_ALL)) == TH_ACK) {
2438 /*
2439 * Client has processed SYN-ACK and moved to ESTABLISHED.
2440 * This is the final ACK of 3WHS. If ACC_ECN has been negotiated,
2441 * then send the handshake encoding as per Table 3 of Accurate ECN draft.
2442 * We are clearing the ACE flags just in case if they were set before.
2443 * TODO: if client has to carry data in the 3WHS ACK, then we need to send a pure ACK first
2444 */
2445 flags &= ~(TH_AE | TH_CWR | TH_ECE);
2446 if (tp->ecn_flags & TE_ACE_SETUP_NON_ECT) {
2447 flags |= TH_CWR;
2448 tp->ecn_flags &= ~TE_ACE_SETUP_NON_ECT;
2449 } else if (tp->ecn_flags & TE_ACE_SETUP_ECT1) {
2450 flags |= (TH_CWR | TH_ECE);
2451 tp->ecn_flags &= ~TE_ACE_SETUP_ECT1;
2452 } else if (tp->ecn_flags & TE_ACE_SETUP_ECT0) {
2453 flags |= TH_AE;
2454 tp->ecn_flags &= ~TE_ACE_SETUP_ECT0;
2455 } else if (tp->ecn_flags & TE_ACE_SETUP_CE) {
2456 flags |= (TH_AE | TH_CWR);
2457 tp->ecn_flags &= ~TE_ACE_SETUP_CE;
2458 }
2459 tp->ecn_flags &= ~(TE_ACE_FINAL_ACK_3WHS);
2460 }
2461
2462 /*
2463 * Check if we should set the TCP CWR flag.
2464 * CWR flag is sent when we reduced the congestion window because
2465 * we received a TCP ECE or we performed a fast retransmit. We
2466 * never set the CWR flag on retransmitted packets. We only set
2467 * the CWR flag on data packets. Pure acks don't have this set.
2468 */
2469 if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
2470 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !rack_sack_rxmit) {
2471 flags |= TH_CWR;
2472 tp->ecn_flags &= ~TE_SENDCWR;
2473 }
2474
2475 /*
2476 * Check if we should set the TCP ECE flag.
2477 */
2478 if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
2479 flags |= TH_ECE;
2480 tcpstat.tcps_ecn_sent_ece++;
2481 }
2482
2483 hdrlen += optlen;
2484
2485 /* Reset DSACK sequence numbers */
2486 tp->t_dsack_lseq = 0;
2487 tp->t_dsack_rseq = 0;
2488
2489 if (isipv6) {
2490 ipoptlen = ip6_optlen(inp);
2491 } else {
2492 if (tp_inp_options) {
2493 ipoptlen = tp_inp_options->m_len -
2494 offsetof(struct ipoption, ipopt_list);
2495 } else {
2496 ipoptlen = 0;
2497 }
2498 }
2499 #if IPSEC
2500 ipoptlen += ipsec_optlen;
2501 #endif
2502
2503 /*
2504 * Adjust data length if insertion of options will
2505 * bump the packet length beyond the t_maxopd length.
2506 * Clear the FIN bit because we cut off the tail of
2507 * the segment.
2508 *
2509 * When doing TSO limit a burst to TCP_MAXWIN minus the
2510 * IP, TCP and Options length to keep ip->ip_len from
2511 * overflowing. Prevent the last segment from being
2512 * fractional thus making them all equal sized and set
2513 * the flag to continue sending. TSO is disabled when
2514 * IP options or IPSEC are present.
2515 */
2516 if (len + optlen + ipoptlen > tp->t_maxopd) {
2517 /*
2518 * If there is still more to send,
2519 * don't close the connection.
2520 */
2521 flags &= ~TH_FIN;
2522 if (tso) {
2523 int32_t tso_maxlen;
2524
2525 tso_maxlen = tp->tso_max_segment_size ?
2526 tp->tso_max_segment_size : TCP_MAXWIN;
2527
2528 /* hdrlen includes optlen */
2529 if (len > tso_maxlen - hdrlen) {
2530 len = tso_maxlen - hdrlen;
2531 sendalot = 1;
2532 } else if (tp->t_flags & TF_NEEDFIN) {
2533 sendalot = 1;
2534 }
2535
2536 if (len % (tp->t_maxopd - optlen) != 0) {
2537 len = len - (len % (tp->t_maxopd - optlen));
2538 sendalot = 1;
2539 }
2540 } else {
2541 len = tp->t_maxopd - optlen - ipoptlen;
2542 sendalot = 1;
2543 }
2544 }
2545
2546 if (!(flags & TH_SYN) &&
2547 ((tp->accurate_ecn_on && (tp->ecn_flags & TE_SENDIPECT) != 0) ||
2548 inp->inp_max_pacing_rate != UINT64_MAX)) {
2549 uint32_t pacing_delay;
2550
2551 pacing_delay = tcp_pacer_get_packet_tx_time(tp, len, &pacing_tx_time);
2552
2553 if (TSTMP_GT(tcp_now_local + pacing_delay, tp->t_latest_tx)) {
2554 /*
2555 * We need to make sure that time never moves backwards. This is
2556 * needed because `tcp_now` is not the same as `microuptime`
2557 * and thus two threads trying to send (one from the app, one
2558 * from dlil_input) may end up with different views on the time
2559 * and thus we may end up going backwards...
2560 * So, make sure t_latest_tx is strictly increasing.
2561 */
2562 tp->t_latest_tx = tcp_now_local + pacing_delay;
2563 }
2564 } else {
2565 if (TSTMP_GT(tcp_now_local, tp->t_latest_tx)) {
2566 tp->t_latest_tx = tcp_now_local;
2567 }
2568 }
2569
2570 if (tsvalptr != NULL) {
2571 uint32_t tsval;
2572
2573 /*
2574 * pacing_delay is folded into t_latest_tx, so that our
2575 * RTT-estimate is not artificially inflated.
2576 */
2577 tsval = tp->t_ts_offset + tp->t_latest_tx;
2578 *tsvalptr = htonl(tsval);
2579 }
2580
2581 if (max_linkhdr + hdrlen > MCLBYTES) {
2582 panic("tcphdr too big");
2583 }
2584
2585 /* Check if there is enough data in the send socket
2586 * buffer to start measuring bandwidth
2587 */
2588 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2589 (tp->t_bwmeas != NULL) &&
2590 (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0) {
2591 tp->t_bwmeas->bw_size = min(min(
2592 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)),
2593 tp->snd_cwnd), tp->snd_wnd);
2594 if (tp->t_bwmeas->bw_minsize > 0 &&
2595 tp->t_bwmeas->bw_size < tp->t_bwmeas->bw_minsize) {
2596 tp->t_bwmeas->bw_size = 0;
2597 }
2598 if (tp->t_bwmeas->bw_maxsize > 0) {
2599 tp->t_bwmeas->bw_size = min(tp->t_bwmeas->bw_size,
2600 tp->t_bwmeas->bw_maxsize);
2601 }
2602 if (tp->t_bwmeas->bw_size > 0) {
2603 tp->t_flagsext |= TF_BWMEAS_INPROGRESS;
2604 tp->t_bwmeas->bw_start = tp->snd_max;
2605 tp->t_bwmeas->bw_ts = tcp_now_local;
2606 }
2607 }
2608
2609 VERIFY(inp->inp_flowhash != 0);
2610 /*
2611 * Grab a header mbuf, attaching a copy of data to
2612 * be transmitted, and initialize the header from
2613 * the template for sends on this connection.
2614 */
2615 if (len) {
2616 /* Remember what the last head-of-line packet-size was */
2617 if (tp->t_pmtud_lastseg_size == 0 && tp->snd_nxt == tp->snd_una) {
2618 ASSERT(len + optlen + ipoptlen <= IP_MAXPACKET);
2619 tp->t_pmtud_lastseg_size = (uint16_t)(len + optlen + ipoptlen);
2620 }
2621 if ((tp->t_flagsext & TF_FORCE) && len == 1) {
2622 tcpstat.tcps_sndprobe++;
2623 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || rack_sack_rxmit) {
2624 tcpstat.tcps_sndrexmitpack++;
2625 tcpstat.tcps_sndrexmitbyte += len;
2626 if (nstat_collect) {
2627 nstat_route_tx(inp->inp_route.ro_rt, 1,
2628 len, NSTAT_TX_FLAG_RETRANSMIT);
2629 INP_ADD_TXSTAT(inp, ifnet_count_type, 1, len);
2630 tp->t_stat.txretransmitbytes += len;
2631 tp->t_stat.rxmitpkts++;
2632 }
2633 if (tp->ecn_flags & TE_SENDIPECT) {
2634 tp->t_ecn_capable_packets_lost++;
2635 }
2636 } else {
2637 tcpstat.tcps_sndpack++;
2638 tcpstat.tcps_sndbyte += len;
2639
2640 if (nstat_collect) {
2641 INP_ADD_TXSTAT(inp, ifnet_count_type, 1, len);
2642 }
2643 if (tp->ecn_flags & TE_SENDIPECT) {
2644 tp->t_ecn_capable_packets_sent++;
2645 }
2646 inp_decr_sndbytes_unsent(so, len);
2647 }
2648 #if MPTCP
2649 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
2650 tcpstat.tcps_mp_sndpacks++;
2651 tcpstat.tcps_mp_sndbytes += len;
2652 }
2653 #endif /* MPTCP */
2654 /*
2655 * try to use the new interface that allocates all
2656 * the necessary mbuf hdrs under 1 mbuf lock and
2657 * avoids rescanning the socket mbuf list if
2658 * certain conditions are met. This routine can't
2659 * be used in the following cases...
2660 * 1) the protocol headers exceed the capacity of
2661 * of a single mbuf header's data area (no cluster attached)
2662 * 2) the length of the data being transmitted plus
2663 * the protocol headers fits into a single mbuf header's
2664 * data area (no cluster attached)
2665 */
2666 m = NULL;
2667
2668 /* minimum length we are going to allocate */
2669 allocated_len = MHLEN;
2670 if (MHLEN < hdrlen + max_linkhdr) {
2671 MGETHDR(m, M_DONTWAIT, MT_HEADER);
2672 if (m == NULL) {
2673 error = ENOBUFS;
2674 TCP_LOG(tp, "MGETHDR error ENOBUFS");
2675 goto out;
2676 }
2677 MCLGET(m, M_DONTWAIT);
2678 if ((m->m_flags & M_EXT) == 0) {
2679 m_freem(m);
2680 error = ENOBUFS;
2681 TCP_LOG(tp, "MCLGET error ENOBUFS");
2682 goto out;
2683 }
2684 m->m_data += max_linkhdr;
2685 m->m_len = hdrlen;
2686 allocated_len = MCLBYTES;
2687 }
2688 if (len <= allocated_len - hdrlen - max_linkhdr) {
2689 if (m == NULL) {
2690 VERIFY(allocated_len <= MHLEN);
2691 MGETHDR(m, M_DONTWAIT, MT_HEADER);
2692 if (m == NULL) {
2693 error = ENOBUFS;
2694 TCP_LOG(tp, "MGETHDR error ENOBUFS");
2695 goto out;
2696 }
2697 m->m_data += max_linkhdr;
2698 m->m_len = hdrlen;
2699 }
2700 /* makes sure we still have data left to be sent at this point */
2701 if (so->so_snd.sb_mb == NULL || off < 0) {
2702 if (m != NULL) {
2703 m_freem(m);
2704 }
2705 error = 0; /* should we return an error? */
2706 goto out;
2707 }
2708 m_copydata(so->so_snd.sb_mb, off, (int) len,
2709 mtod(m, caddr_t) + hdrlen);
2710 m->m_len += len;
2711 } else {
2712 uint32_t copymode;
2713 /*
2714 * Retain packet header metadata at the socket
2715 * buffer if this is an MPTCP subflow,
2716 * otherwise move it.
2717 */
2718 copymode = M_COPYM_MOVE_HDR;
2719 #if MPTCP
2720 if (so->so_flags & SOF_MP_SUBFLOW) {
2721 copymode = M_COPYM_NOOP_HDR;
2722 }
2723 #endif /* MPTCP */
2724 if (m != NULL) {
2725 if (so->so_snd.sb_flags & SB_SENDHEAD) {
2726 VERIFY(so->so_snd.sb_flags & SB_SENDHEAD);
2727 VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2728
2729 m->m_next = m_copym_mode(so->so_snd.sb_mb,
2730 off, (int)len, M_DONTWAIT,
2731 &so->so_snd.sb_sendhead,
2732 &so->so_snd.sb_sendoff, copymode);
2733
2734 VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2735 } else {
2736 m->m_next = m_copym_mode(so->so_snd.sb_mb,
2737 off, (int)len, M_DONTWAIT,
2738 NULL, NULL, copymode);
2739 }
2740 if (m->m_next == NULL) {
2741 (void) m_free(m);
2742 error = ENOBUFS;
2743 TCP_LOG(tp, "m_copym_mode error ENOBUFS");
2744 goto out;
2745 }
2746 } else {
2747 /*
2748 * make sure we still have data left
2749 * to be sent at this point
2750 */
2751 if (so->so_snd.sb_mb == NULL) {
2752 error = 0; /* should we return an error? */
2753 goto out;
2754 }
2755
2756 /*
2757 * m_copym_with_hdrs will always return the
2758 * last mbuf pointer and the offset into it that
2759 * it acted on to fulfill the current request,
2760 * whether a valid 'hint' was passed in or not.
2761 */
2762 if (so->so_snd.sb_flags & SB_SENDHEAD) {
2763 VERIFY(so->so_snd.sb_flags & SB_SENDHEAD);
2764 VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2765
2766 m = m_copym_with_hdrs(so->so_snd.sb_mb,
2767 off, len, M_DONTWAIT, &so->so_snd.sb_sendhead,
2768 &so->so_snd.sb_sendoff, copymode);
2769
2770 VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2771 } else {
2772 m = m_copym_with_hdrs(so->so_snd.sb_mb,
2773 off, len, M_DONTWAIT, NULL,
2774 NULL, copymode);
2775 }
2776 if (m == NULL) {
2777 error = ENOBUFS;
2778 TCP_LOG(tp, "m_copym_with_hdrs error ENOBUFS");
2779 goto out;
2780 }
2781 m->m_data += max_linkhdr;
2782 m->m_len = hdrlen;
2783 }
2784 }
2785 /*
2786 * If we're sending everything we've got, set PUSH.
2787 * (This will keep happy those implementations which only
2788 * give data to the user when a buffer fills or
2789 * a PUSH comes in.)
2790 *
2791 * On SYN-segments we should not add the PUSH-flag.
2792 */
2793 if (off + len == so->so_snd.sb_cc && !(flags & TH_SYN)) {
2794 flags |= TH_PUSH;
2795 }
2796 } else {
2797 if (tp->t_flags & TF_ACKNOW) {
2798 tcpstat.tcps_sndacks++;
2799 } else if (flags & (TH_SYN | TH_FIN | TH_RST)) {
2800 tcpstat.tcps_sndctrl++;
2801 } else if (SEQ_GT(tp->snd_up, tp->snd_una)) {
2802 tcpstat.tcps_sndurg++;
2803 } else {
2804 tcpstat.tcps_sndwinup++;
2805 }
2806
2807 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2808 if (m == NULL) {
2809 error = ENOBUFS;
2810 TCP_LOG(tp, "MGETHDR error ENOBUFS");
2811 goto out;
2812 }
2813 if (MHLEN < (hdrlen + max_linkhdr)) {
2814 MCLGET(m, M_DONTWAIT);
2815 if ((m->m_flags & M_EXT) == 0) {
2816 m_freem(m);
2817 error = ENOBUFS;
2818 TCP_LOG(tp, "MCLGET error ENOBUFS");
2819 goto out;
2820 }
2821 }
2822 m->m_data += max_linkhdr;
2823 m->m_len = hdrlen;
2824 }
2825 m->m_pkthdr.rcvif = 0;
2826 m_add_crumb(m, PKT_CRUMB_TCP_OUTPUT);
2827
2828 /* Any flag other than pure-ACK: Do not compress! */
2829 if (flags & ~(TH_ACK)) {
2830 do_not_compress = TRUE;
2831 }
2832
2833 if (tp->rcv_scale == 0) {
2834 do_not_compress = TRUE;
2835 }
2836
2837 if (do_not_compress) {
2838 m->m_pkthdr.comp_gencnt = 0;
2839 } else {
2840 if (TSTMP_LT(tp->t_comp_ack_lastinc + tcp_ack_compression_rate, tcp_now_local)) {
2841 tp->t_comp_ack_gencnt++;
2842 /*
2843 * 0 means no compression, and ACK gencnt is encoded on 31 bits
2844 */
2845 if (tp->t_comp_ack_gencnt <= TCP_ACK_COMPRESSION_DUMMY ||
2846 tp->t_comp_ack_gencnt > INT_MAX) {
2847 tp->t_comp_ack_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1;
2848 }
2849 tp->t_comp_ack_lastinc = tcp_now_local;
2850 }
2851 m->m_pkthdr.comp_gencnt = tp->t_comp_ack_gencnt;
2852 }
2853
2854 if (isipv6) {
2855 ip6 = mtod(m, struct ip6_hdr *);
2856 th = (struct tcphdr *)(void *)(ip6 + 1);
2857 tcp_fillheaders(m, tp, ip6, th, NULL, NULL);
2858
2859 if (tp->accurate_ecn_on) {
2860 /* We send ECT1 for ALL packets (data, control, fast retransmits, RTO) */
2861 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && !(flags & TH_SYN)) {
2862 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT1 << 20);
2863 }
2864 } else {
2865 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
2866 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !rack_sack_rxmit) {
2867 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
2868 }
2869 }
2870 svc_flags |= PKT_SCF_IPV6;
2871 #if PF_ECN
2872 m_pftag(m)->pftag_hdr = (void *)ip6;
2873 m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET6;
2874 #endif /* PF_ECN */
2875 } else {
2876 ip = mtod(m, struct ip *);
2877 th = (struct tcphdr *)(void *)(ip + 1);
2878 /* this picks up the pseudo header (w/o the length) */
2879 tcp_fillheaders(m, tp, ip, th, NULL, NULL);
2880
2881 if (tp->accurate_ecn_on) {
2882 /* We send ECT1 for ALL packets (data, control, fast retransmits, RTO) */
2883 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && !(flags & TH_SYN)) {
2884 ip->ip_tos |= IPTOS_ECN_ECT1;
2885 }
2886 } else {
2887 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
2888 !SEQ_LT(tp->snd_nxt, tp->snd_max) &&
2889 !rack_sack_rxmit && !(flags & TH_SYN)) {
2890 ip->ip_tos |= IPTOS_ECN_ECT0;
2891 }
2892 }
2893 #if PF_ECN
2894 m_pftag(m)->pftag_hdr = (void *)ip;
2895 m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET;
2896 #endif /* PF_ECN */
2897 }
2898
2899 if (pacing_tx_time) {
2900 mbuf_set_tx_time(m, pacing_tx_time);
2901 }
2902
2903 /*
2904 * Fill in fields, remembering maximum advertised
2905 * window for use in delaying messages about window sizes.
2906 * If resending a FIN, be sure not to use a new sequence number.
2907 */
2908 if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
2909 tp->snd_nxt == tp->snd_max) {
2910 tp->snd_nxt--;
2911 }
2912 /*
2913 * If we are doing retransmissions, then snd_nxt will
2914 * not reflect the first unsent octet. For ACK only
2915 * packets, we do not want the sequence number of the
2916 * retransmitted packet, we want the sequence number
2917 * of the next unsent octet. So, if there is no data
2918 * (and no SYN or FIN), use snd_max instead of snd_nxt
2919 * when filling in ti_seq. But if we are in persist
2920 * state, snd_max might reflect one byte beyond the
2921 * right edge of the window, so use snd_nxt in that
2922 * case, since we know we aren't doing a retransmission.
2923 * (retransmit and persist are mutually exclusive...)
2924 *
2925 * Note the state of this retransmit segment to detect spurious
2926 * retransmissions.
2927 */
2928 if (rack_sack_rxmit == 0) {
2929 if (len || (flags & (TH_SYN | TH_FIN)) ||
2930 tp->t_timer[TCPT_PERSIST]) {
2931 th->th_seq = htonl(tp->snd_nxt);
2932 if (len > 0) {
2933 m->m_pkthdr.tx_start_seq = tp->snd_nxt;
2934 m->m_pkthdr.pkt_flags |= PKTF_START_SEQ;
2935 }
2936 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
2937 if (SACK_ENABLED(tp) && len > 1 &&
2938 !(tp->t_flagsext & TF_SENT_TLPROBE)) {
2939 tcp_rxtseg_insert(tp, tp->snd_nxt,
2940 (tp->snd_nxt + len - 1));
2941 }
2942 if (len > 0) {
2943 m->m_pkthdr.pkt_flags |=
2944 PKTF_TCP_REXMT;
2945 }
2946 }
2947 } else {
2948 th->th_seq = htonl(tp->snd_max);
2949 }
2950 } else {
2951 /* Use RACK if enabled otherwise use SACK */
2952 if (TCP_RACK_ENABLED(tp)) {
2953 th->th_seq = htonl(seg->start_seq);
2954 tcp_rxtseg_insert(tp, seg->start_seq, (seg->start_seq + len - 1));
2955 } else {
2956 th->th_seq = htonl(p->rxmit);
2957 tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1));
2958 p->rxmit += len;
2959 tp->sackhint.sack_bytes_rexmit += len;
2960 }
2961 if (len > 0) {
2962 m->m_pkthdr.tx_start_seq = ntohl(th->th_seq);
2963 m->m_pkthdr.pkt_flags |=
2964 (PKTF_TCP_REXMT | PKTF_START_SEQ);
2965 }
2966 }
2967 th->th_ack = htonl(tp->rcv_nxt);
2968 tp->last_ack_sent = tp->rcv_nxt;
2969 if (optlen) {
2970 bcopy(opt, th + 1, optlen);
2971 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
2972 }
2973 /* Separate AE from flags */
2974 tcp_set_flags(th, flags);
2975 th->th_win = htons((u_short) (recwin >> tp->rcv_scale));
2976 tp->t_last_recwin = recwin;
2977 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2978 if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) {
2979 tp->rcv_adv = tp->rcv_nxt + recwin;
2980 }
2981 } else {
2982 struct mptcb *mp_tp = tptomptp(tp);
2983 if (recwin > 0) {
2984 tp->rcv_adv = tp->rcv_nxt + recwin;
2985 }
2986
2987 if (recwin > 0 && MPTCP_SEQ_LT(mp_tp->mpt_rcvadv, mp_tp->mpt_rcvnxt + recwin)) {
2988 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + recwin;
2989 }
2990 }
2991
2992 /*
2993 * Adjust the RXWIN0SENT flag - indicate that we have advertised
2994 * a 0 window. This may cause the remote transmitter to stall. This
2995 * flag tells soreceive() to disable delayed acknowledgements when
2996 * draining the buffer. This can occur if the receiver is attempting
2997 * to read more data then can be buffered prior to transmitting on
2998 * the connection.
2999 */
3000 if (th->th_win == 0) {
3001 tp->t_flags |= TF_RXWIN0SENT;
3002 } else {
3003 tp->t_flags &= ~TF_RXWIN0SENT;
3004 }
3005
3006 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
3007 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
3008 th->th_flags |= TH_URG;
3009 } else {
3010 /*
3011 * If no urgent pointer to send, then we pull
3012 * the urgent pointer to the left edge of the send window
3013 * so that it doesn't drift into the send window on sequence
3014 * number wraparound.
3015 */
3016 tp->snd_up = tp->snd_una; /* drag it along */
3017 }
3018
3019 /*
3020 * Put TCP length in extended header, and then
3021 * checksum extended header and data.
3022 */
3023 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
3024
3025 /*
3026 * If this is potentially the last packet on the stream, then mark
3027 * it in order to enable some optimizations in the underlying
3028 * layers
3029 */
3030 if (tp->t_state != TCPS_ESTABLISHED &&
3031 (tp->t_state == TCPS_CLOSING || tp->t_state == TCPS_TIME_WAIT
3032 || tp->t_state == TCPS_LAST_ACK || (th->th_flags & TH_RST))) {
3033 m->m_pkthdr.pkt_flags |= PKTF_LAST_PKT;
3034 }
3035
3036 if (isipv6) {
3037 /*
3038 * ip6_plen is not need to be filled now, and will be filled
3039 * in ip6_output.
3040 */
3041 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
3042 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
3043 if (len + optlen) {
3044 th->th_sum = in_addword(th->th_sum,
3045 htons((u_short)(optlen + len)));
3046 }
3047 } else {
3048 m->m_pkthdr.csum_flags = CSUM_TCP;
3049 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
3050 if (len + optlen) {
3051 th->th_sum = in_addword(th->th_sum,
3052 htons((u_short)(optlen + len)));
3053 }
3054 }
3055
3056 /*
3057 * Enable TSO and specify the size of the segments.
3058 * The TCP pseudo header checksum is always provided.
3059 */
3060 if (tso) {
3061 if (isipv6) {
3062 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV6;
3063 } else {
3064 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
3065 }
3066
3067 m->m_pkthdr.tso_segsz = (uint16_t)(tp->t_maxopd - optlen);
3068 m->m_pkthdr.tx_hdr_len = (uint16_t)hdrlen;
3069 } else {
3070 m->m_pkthdr.tso_segsz = 0;
3071 }
3072
3073 /*
3074 * In transmit state, time the transmission and arrange for
3075 * the retransmit. In persist state, just set snd_max.
3076 */
3077 if (!(tp->t_flagsext & TF_FORCE)
3078 || tp->t_timer[TCPT_PERSIST] == 0) {
3079 tcp_seq startseq = tp->snd_nxt;
3080
3081 /*
3082 * Advance snd_nxt over sequence space of this segment.
3083 */
3084 if (flags & (TH_SYN | TH_FIN)) {
3085 if (flags & TH_SYN) {
3086 tp->snd_nxt++;
3087 }
3088 if ((flags & TH_FIN) &&
3089 !(tp->t_flags & TF_SENTFIN)) {
3090 tp->snd_nxt++;
3091 tp->t_flags |= TF_SENTFIN;
3092 }
3093 }
3094 if (rack_sack_rxmit) {
3095 goto timer;
3096 }
3097 if (sack_rescue_rxt == true) {
3098 tp->snd_nxt = old_snd_nxt;
3099 sack_rescue_rxt = false;
3100 tcpstat.tcps_pto_in_recovery++;
3101 } else {
3102 tp->snd_nxt += len;
3103 }
3104 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
3105 tp->snd_max = tp->snd_nxt;
3106 tp->t_sndtime = tp->t_latest_tx;
3107 /*
3108 * Time this transmission if not a retransmission and
3109 * not currently timing anything.
3110 */
3111 if (tp->t_rtttime == 0) {
3112 tp->t_rtttime = tp->t_latest_tx;
3113 tp->t_rtseq = startseq;
3114 tcpstat.tcps_segstimed++;
3115
3116 /* update variables related to pipe ack */
3117 tp->t_pipeack_lastuna = tp->snd_una;
3118 }
3119 }
3120
3121 /*
3122 * Set retransmit timer if not currently set,
3123 * and not doing an ack or a keep-alive probe.
3124 */
3125 timer:
3126 if (tp->t_timer[TCPT_REXMT] == 0 &&
3127 ((rack_sack_rxmit && tp->snd_nxt != tp->snd_max) ||
3128 tp->snd_nxt != tp->snd_una || (flags & TH_FIN))) {
3129 if (tp->t_timer[TCPT_PERSIST]) {
3130 tp->t_timer[TCPT_PERSIST] = 0;
3131 tp->t_persist_stop = 0;
3132 TCP_RESET_REXMT_STATE(tp);
3133 }
3134 tcp_set_rto(tp);
3135 }
3136
3137 if (tcp_enable_tlp && len != 0) {
3138 tcp_set_pto(tp);
3139 }
3140 } else {
3141 /*
3142 * Persist case, update snd_max but since we are in
3143 * persist mode (no window) we do not update snd_nxt.
3144 */
3145 int xlen = len;
3146 if (flags & TH_SYN) {
3147 ++xlen;
3148 }
3149 if ((flags & TH_FIN) &&
3150 !(tp->t_flags & TF_SENTFIN)) {
3151 ++xlen;
3152 tp->t_flags |= TF_SENTFIN;
3153 }
3154 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
3155 tp->snd_max = tp->snd_nxt + len;
3156 tp->t_sndtime = tp->t_latest_tx;
3157 }
3158 }
3159
3160 /*
3161 * Fill in IP length and desired time to live and
3162 * send to IP level. There should be a better way
3163 * to handle ttl and tos; we could keep them in
3164 * the template, but need a way to checksum without them.
3165 */
3166 /*
3167 * m->m_pkthdr.len should have been set before cksum calcuration,
3168 * because in6_cksum() need it.
3169 */
3170 if (isipv6) {
3171 /*
3172 * we separately set hoplimit for every segment, since the
3173 * user might want to change the value via setsockopt.
3174 * Also, desired default hop limit might be changed via
3175 * Neighbor Discovery.
3176 */
3177 ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
3178 inp->in6p_route.ro_rt->rt_ifp : NULL);
3179
3180 /* Don't set ECT bit if requested by an app */
3181
3182 /* Set ECN bits for testing purposes */
3183 if (tp->ecn_flags & TE_FORCE_ECT1) {
3184 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT1 << 20);
3185 } else if (tp->ecn_flags & TE_FORCE_ECT0) {
3186 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
3187 }
3188
3189 KERNEL_DEBUG(DBG_LAYER_BEG,
3190 ((inp->inp_fport << 16) | inp->inp_lport),
3191 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
3192 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
3193 sendalot, 0, 0);
3194 } else {
3195 ASSERT(m->m_pkthdr.len <= IP_MAXPACKET);
3196 ip->ip_len = (u_short)m->m_pkthdr.len;
3197 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
3198
3199 /* Don't set ECN bit if requested by an app */
3200 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
3201
3202 /* Set ECN bits for testing purposes */
3203 if (tp->ecn_flags & TE_FORCE_ECT1) {
3204 ip->ip_tos |= IPTOS_ECN_ECT1;
3205 } else if (tp->ecn_flags & TE_FORCE_ECT0) {
3206 ip->ip_tos |= IPTOS_ECN_ECT0;
3207 }
3208
3209 KERNEL_DEBUG(DBG_LAYER_BEG,
3210 ((inp->inp_fport << 16) | inp->inp_lport),
3211 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
3212 (inp->inp_faddr.s_addr & 0xffff)), 0, 0, 0);
3213 }
3214
3215 /*
3216 * See if we should do MTU discovery.
3217 * Look at the flag updated on the following criterias:
3218 * 1) Path MTU discovery is authorized by the sysctl
3219 * 2) The route isn't set yet (unlikely but could happen)
3220 * 3) The route is up
3221 * 4) the MTU is not locked (if it is, then discovery has been
3222 * disabled for that route)
3223 */
3224 if (!isipv6) {
3225 if (path_mtu_discovery && (tp->t_flags & TF_PMTUD)) {
3226 ip->ip_off |= IP_DF;
3227 }
3228 }
3229
3230 #if NECP
3231 {
3232 necp_kernel_policy_id policy_id;
3233 necp_kernel_policy_id skip_policy_id;
3234 u_int32_t route_rule_id;
3235 u_int32_t pass_flags;
3236 if (!necp_socket_is_allowed_to_send_recv(inp, NULL, 0, &policy_id, &route_rule_id, &skip_policy_id, &pass_flags)) {
3237 TCP_LOG_DROP_NECP(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true);
3238 m_drop_if(m, outifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_NECP, NULL, 0);
3239 error = EHOSTUNREACH;
3240 goto out;
3241 }
3242 necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id, pass_flags);
3243
3244 if (net_qos_policy_restricted != 0) {
3245 necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, route_rule_id);
3246 }
3247 }
3248 #endif /* NECP */
3249
3250 #if IPSEC
3251 if (inp->inp_sp != NULL) {
3252 ipsec_setsocket(m, so);
3253 }
3254 #endif /*IPSEC*/
3255
3256 /*
3257 * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
3258 */
3259 lost = 0;
3260
3261 /*
3262 * Embed the flow hash in pkt hdr and mark the packet as
3263 * capable of flow controlling
3264 */
3265 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
3266 m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
3267 m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
3268 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
3269 m->m_pkthdr.tx_tcp_pid = so->last_pid;
3270 if (so->so_flags & SOF_DELEGATED) {
3271 m->m_pkthdr.tx_tcp_e_pid = so->e_pid;
3272 } else {
3273 m->m_pkthdr.tx_tcp_e_pid = 0;
3274 }
3275
3276 m->m_nextpkt = NULL;
3277
3278 if (outifp != NULL &&
3279 !(outifp->if_flags & IFF_LOOPBACK)) {
3280 /* Hint to prioritize this packet if
3281 * 1. if the packet has no data
3282 * 2. the interface supports transmit-start model and did
3283 * not disable ACK prioritization.
3284 * 3. Only ACK flag is set.
3285 * 4. there is no outstanding data on this connection.
3286 * 5. Link heuristics are not enabled for the interface
3287 */
3288 if (len == 0 && (outifp->if_eflags & (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) {
3289 if (link_heuristics_enabled && (tcp_link_heuristics_flags & TCP_LINK_HEUR_NOACKPRI) != 0) {
3290 IF_TCP_STATINC(outifp, linkheur_noackpri);
3291 } else {
3292 if (th->th_flags == TH_ACK &&
3293 tp->snd_una == tp->snd_max &&
3294 tp->t_timer[TCPT_REXMT] == 0) {
3295 svc_flags |= PKT_SCF_TCP_ACK;
3296 }
3297 if (th->th_flags & TH_SYN) {
3298 svc_flags |= PKT_SCF_TCP_SYN;
3299 }
3300 }
3301 }
3302 set_packet_service_class(m, so, sotc, svc_flags);
3303 } else {
3304 /*
3305 * Optimization for loopback just set the mbuf
3306 * service class
3307 */
3308 (void) m_set_service_class(m, so_tc2msc(sotc));
3309 }
3310
3311 tp->t_pktlist_sentlen += len;
3312 tp->t_lastchain++;
3313
3314 if (isipv6) {
3315 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp,
3316 struct ip6 *, ip6, struct tcpcb *, tp, struct tcphdr *,
3317 th);
3318 } else {
3319 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp,
3320 struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th);
3321 }
3322
3323 if (tp->t_pktlist_head != NULL) {
3324 tp->t_pktlist_tail->m_nextpkt = m;
3325 tp->t_pktlist_tail = m;
3326 } else {
3327 packchain_newlist++;
3328 tp->t_pktlist_head = tp->t_pktlist_tail = m;
3329 }
3330
3331 /* Append segment to time-ordered list and RB tree used for RACK */
3332 if (TCP_RACK_ENABLED(tp) && len != 0) {
3333 uint8_t retransmit_flag = 0;
3334 if (tp->t_flagsext & TF_SENT_TLPROBE) {
3335 /* Only set the at least once retransmitted flag */
3336 retransmit_flag = (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) ? TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE : 0;
3337 } else {
3338 /* Set both RACK and EVER retransmitted flags */
3339 retransmit_flag = (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) ? TCP_SEGMENT_RETRANSMITTED : 0;
3340 }
3341 tcp_seg_sent_insert(tp, seg, ntohl(th->th_seq),
3342 ntohl(th->th_seq) + len, tp->t_latest_tx, retransmit_flag);
3343 }
3344
3345 if ((th->th_flags & TH_SYN) != 0) {
3346 (void)os_add_overflow(tp->t_syn_sent, 1, &tp->t_syn_sent);
3347 if (tp->t_rxtshift > 0) {
3348 m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT;
3349 }
3350 }
3351 if ((th->th_flags & TH_FIN) != 0) {
3352 (void)os_add_overflow(tp->t_fin_sent, 1, &tp->t_fin_sent);
3353 if (tp->t_rxtshift > 0) {
3354 m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT;
3355 }
3356 }
3357 if ((th->th_flags & TH_RST) != 0) {
3358 (void)os_add_overflow(tp->t_rst_sent, 1, &tp->t_rst_sent);
3359 if (tp->t_rxtshift > 0) {
3360 m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT;
3361 }
3362 }
3363 TCP_LOG_TH_FLAGS(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true,
3364 outifp != NULL ? outifp : inp->inp_boundifp);
3365
3366 if (__improbable((th->th_flags & TH_RST) != 0 && inp->inp_sndinprog_cnt == 0 &&
3367 sendalot == 0 && tp->t_pktlist_head == m)) {
3368 if (tcp_rst_rlc_compress(mtod(m, void *), m->m_len, th) == true) {
3369 error = 0;
3370 goto out;
3371 }
3372 }
3373
3374 if (link_heuristics_enabled && (tcp_link_heuristics_flags & TCP_LINK_HEUR_RXMT_COMP) != 0 &&
3375 (len != 0 || (th->th_flags & TH_FIN) != 0)) {
3376 /*
3377 * Set compression flag if gencnt of segment is the same as the last sent segment
3378 * otherwise record the gencnt of the segment that we are sending
3379 */
3380 uint32_t gencnt = ntohl(th->th_seq) & TCP_COMP_RXMT_GENCNT_MASK;
3381
3382 if ((m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) != 0 && gencnt == tp->t_comp_rxmt_gencnt) {
3383 IF_TCP_STATINC(outifp, linkheur_comprxmt);
3384 m->m_pkthdr.comp_gencnt = gencnt;
3385 } else {
3386 tp->t_comp_rxmt_gencnt = gencnt;
3387 }
3388 }
3389
3390 if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
3391 (tp->t_flags & TF_ACKNOW) ||
3392 (tp->t_flagsext & TF_FORCE) ||
3393 tp->t_lastchain >= tcp_packet_chaining) {
3394 error = 0;
3395 while (inp->inp_sndinprog_cnt == 0 &&
3396 tp->t_pktlist_head != NULL) {
3397 packetlist = tp->t_pktlist_head;
3398 packchain_listadd = tp->t_lastchain;
3399 packchain_sent++;
3400 lost = tp->t_pktlist_sentlen;
3401 TCP_PKTLIST_CLEAR(tp);
3402
3403 error = tcp_ip_output(so, tp, packetlist,
3404 packchain_listadd, tp_inp_options,
3405 (so_options & SO_DONTROUTE),
3406 (rack_sack_rxmit || (sack_bytes_rxmt != 0)), isipv6);
3407 if (error) {
3408 /*
3409 * Take into account the rest of unsent
3410 * packets in the packet list for this tcp
3411 * into "lost", since we're about to free
3412 * the whole list below.
3413 */
3414 lost += tp->t_pktlist_sentlen;
3415 break;
3416 } else {
3417 lost = 0;
3418 }
3419 }
3420 /* tcp was closed while we were in ip; resume close */
3421 if (inp->inp_sndinprog_cnt == 0 &&
3422 (tp->t_flags & TF_CLOSING)) {
3423 tp->t_flags &= ~TF_CLOSING;
3424 (void) tcp_close(tp);
3425 return 0;
3426 }
3427 } else {
3428 error = 0;
3429 packchain_looped++;
3430 tcpstat.tcps_sndtotal++;
3431
3432 goto again;
3433 }
3434 if (error) {
3435 /*
3436 * Assume that the packets were lost, so back out the
3437 * sequence number advance, if any. Note that the "lost"
3438 * variable represents the amount of user data sent during
3439 * the recent call to ip_output_list() plus the amount of
3440 * user data in the packet list for this tcp at the moment.
3441 */
3442 if (!(tp->t_flagsext & TF_FORCE)
3443 || tp->t_timer[TCPT_PERSIST] == 0) {
3444 /*
3445 * No need to check for TH_FIN here because
3446 * the TF_SENTFIN flag handles that case.
3447 */
3448 if ((flags & TH_SYN) == 0) {
3449 /*
3450 * RACK will mark these segments lost on its own
3451 * when new ACK arrives, no need to adjust anything here.
3452 * In fact doing so would be wrong, as RACK segments are
3453 * ordered in time (not sequence number).
3454 */
3455 if (rack_sack_rxmit && !TCP_RACK_ENABLED(tp)) {
3456 if (SEQ_GT((p->rxmit - lost),
3457 tp->snd_una)) {
3458 p->rxmit -= lost;
3459
3460 if (SEQ_LT(p->rxmit, p->start)) {
3461 p->rxmit = p->start;
3462 }
3463 } else {
3464 lost = p->rxmit - tp->snd_una;
3465 p->rxmit = tp->snd_una;
3466
3467 if (SEQ_LT(p->rxmit, p->start)) {
3468 p->rxmit = p->start;
3469 }
3470 }
3471 tp->sackhint.sack_bytes_rexmit -= lost;
3472 if (tp->sackhint.sack_bytes_rexmit < 0) {
3473 tp->sackhint.sack_bytes_rexmit = 0;
3474 }
3475 } else {
3476 if (SEQ_GT((tp->snd_nxt - lost),
3477 tp->snd_una)) {
3478 tp->snd_nxt -= lost;
3479 } else {
3480 tp->snd_nxt = tp->snd_una;
3481 }
3482 }
3483 }
3484 }
3485 out:
3486 if (tp->t_pktlist_head != NULL) {
3487 m_drop_list(tp->t_pktlist_head, outifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_PKT_UNSENT, NULL, 0);
3488 }
3489 TCP_PKTLIST_CLEAR(tp);
3490
3491 if (error == ENOBUFS) {
3492 /*
3493 * Set retransmit timer if not currently set
3494 * when we failed to send a segment that can be
3495 * retransmitted (i.e. not pure ack or rst)
3496 */
3497 if (tp->t_timer[TCPT_REXMT] == 0 &&
3498 tp->t_timer[TCPT_PERSIST] == 0 &&
3499 (len != 0 || (flags & (TH_SYN | TH_FIN)) != 0 ||
3500 so->so_snd.sb_cc > 0)) {
3501 tcp_set_rto(tp);
3502 }
3503 tp->snd_cwnd = tp->t_maxseg;
3504 tp->t_bytes_acked = 0;
3505 tcp_check_timer_state(tp);
3506 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3507
3508 TCP_LOG_OUTPUT(tp, "error ENOBUFS silently handled");
3509
3510 tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR);
3511 return 0;
3512 }
3513 if (error == EMSGSIZE) {
3514 /*
3515 * ip_output() will have already fixed the route
3516 * for us. tcp_mtudisc() will, as its last action,
3517 * initiate retransmission, so it is important to
3518 * not do so here.
3519 *
3520 * If TSO was active we either got an interface
3521 * without TSO capabilits or TSO was turned off.
3522 * Disable it for this connection as too and
3523 * immediatly retry with MSS sized segments generated
3524 * by this function.
3525 */
3526 if (tso) {
3527 tp->t_flags &= ~TF_TSO;
3528 }
3529
3530 tcp_mtudisc(inp, 0);
3531 tcp_check_timer_state(tp);
3532
3533 TCP_LOG_OUTPUT(tp, "error EMSGSIZE silently handled");
3534
3535 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3536 return 0;
3537 }
3538 /*
3539 * Unless this is due to interface restriction policy,
3540 * treat EHOSTUNREACH/ENETDOWN/EADDRNOTAVAIL as a soft error.
3541 */
3542 if ((error == EHOSTUNREACH || error == ENETDOWN || error == EADDRNOTAVAIL) &&
3543 TCPS_HAVERCVDSYN(tp->t_state) &&
3544 !inp_restricted_send(inp, outifp)) {
3545 tp->t_softerror = error;
3546 TCP_LOG_OUTPUT(tp, "soft error %d silently handled", error);
3547 error = 0;
3548 } else {
3549 TCP_LOG_OUTPUT(tp, "error %d", error);
3550 }
3551 tcp_check_timer_state(tp);
3552 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3553 return error;
3554 }
3555
3556 tcpstat.tcps_sndtotal++;
3557
3558 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3559 if (sendalot) {
3560 goto again;
3561 }
3562
3563 tcp_check_timer_state(tp);
3564
3565 return 0;
3566 }
3567
3568 static int
tcp_set_rto_deadline(struct tcpcb * tp,struct mbuf * pkt,int isipv6,uint32_t tcp_now_local)3569 tcp_set_rto_deadline(struct tcpcb *tp, struct mbuf *pkt, int isipv6, uint32_t tcp_now_local)
3570 {
3571 struct tcphdr *th;
3572 int th_offset = isipv6 ? sizeof(struct ip6_hdr) : sizeof(struct ip);
3573
3574 /*
3575 * Determine whether the RTO should be set
3576 */
3577 if (tp->t_timer[TCPT_REXMT] == 0) {
3578 /*
3579 * RTO is not set for the TP, can not set
3580 * the deadline.
3581 */
3582 return ENOENT;
3583 }
3584
3585 /*
3586 * RTO deadline is not applicable to pure ACK packets,
3587 * as well as to the SYN/RST/FIN packets.
3588 */
3589 th = (struct tcphdr *)(void*)(mtod(pkt, caddr_t) + th_offset);
3590
3591 /*
3592 * Check whether the packet has any TCP payload, or is a pure ACK.
3593 */
3594 if (pkt->m_pkthdr.len <= th_offset + (th->th_off << 2)) {
3595 return ENOENT;
3596 }
3597
3598 /*
3599 * Check whether this is an SYN/RST/FIN packet
3600 */
3601 if ((th->th_flags & (TH_SYN | TH_RST | TH_FIN)) != 0) {
3602 return ENOENT;
3603 }
3604
3605 /*
3606 * Set the deadline.
3607 */
3608 pkt->m_pkthdr.pkt_deadline = tcp_calculate_rto_deadline(tp, tcp_now_local);
3609
3610 return 0;
3611 }
3612
3613 static int
tcp_ip_output(struct socket * so,struct tcpcb * tp,struct mbuf * pkt,int cnt,struct mbuf * opt,int flags,int sack_in_progress,boolean_t isipv6)3614 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
3615 int cnt, struct mbuf *opt, int flags, int sack_in_progress, boolean_t isipv6)
3616 {
3617 int error = 0;
3618 boolean_t chain;
3619 boolean_t unlocked = FALSE;
3620 boolean_t ifdenied = FALSE;
3621 struct inpcb *__single inp = tp->t_inpcb;
3622 struct ifnet *__single outif = NULL;
3623 bool check_qos_marking_again = (so->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE) ? FALSE : TRUE;
3624 bool fadv_congested = FALSE;
3625 uint32_t tcp_now_local = os_access_once(tcp_now);
3626
3627 union {
3628 struct route _ro;
3629 struct route_in6 _ro6;
3630 } route_u_ = {};
3631 #define ro route_u_._ro
3632 #define ro6 route_u_._ro6
3633
3634 union {
3635 struct ip_out_args _ipoa;
3636 struct ip6_out_args _ip6oa;
3637 } out_args_u_ = {};
3638 #define ipoa out_args_u_._ipoa
3639 #define ip6oa out_args_u_._ip6oa
3640
3641 if (isipv6) {
3642 ip6oa.ip6oa_boundif = IFSCOPE_NONE;
3643 ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
3644 ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
3645 ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3646 } else {
3647 ipoa.ipoa_boundif = IFSCOPE_NONE;
3648 ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
3649 ipoa.ipoa_sotc = SO_TC_UNSPEC;
3650 ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3651 }
3652
3653 struct flowadv *__single adv =
3654 (isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv);
3655
3656 /* If socket was bound to an ifindex, tell ip_output about it */
3657 if (inp->inp_flags & INP_BOUND_IF) {
3658 if (isipv6) {
3659 ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index;
3660 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
3661 } else {
3662 ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
3663 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
3664 }
3665 } else if (!in6_embedded_scope && isipv6 && (IN6_IS_SCOPE_EMBED(&inp->in6p_faddr))) {
3666 ip6oa.ip6oa_boundif = inp->inp_fifscope;
3667 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
3668 }
3669
3670 if (INP_NO_CELLULAR(inp)) {
3671 if (isipv6) {
3672 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
3673 } else {
3674 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
3675 }
3676 }
3677 if (INP_NO_EXPENSIVE(inp)) {
3678 if (isipv6) {
3679 ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
3680 } else {
3681 ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
3682 }
3683 }
3684 if (INP_NO_CONSTRAINED(inp)) {
3685 if (isipv6) {
3686 ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
3687 } else {
3688 ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
3689 }
3690 }
3691 if (INP_AWDL_UNRESTRICTED(inp)) {
3692 if (isipv6) {
3693 ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
3694 } else {
3695 ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
3696 }
3697 }
3698 if (INP_INTCOPROC_ALLOWED(inp) && isipv6) {
3699 ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
3700 }
3701 if (INP_MANAGEMENT_ALLOWED(inp)) {
3702 if (isipv6) {
3703 ip6oa.ip6oa_flags |= IP6OAF_MANAGEMENT_ALLOWED;
3704 } else {
3705 ipoa.ipoa_flags |= IPOAF_MANAGEMENT_ALLOWED;
3706 }
3707 }
3708 if (INP_ULTRA_CONSTRAINED_ALLOWED(inp)) {
3709 if (isipv6) {
3710 ip6oa.ip6oa_flags |= IP6OAF_ULTRA_CONSTRAINED_ALLOWED;
3711 } else {
3712 ipoa.ipoa_flags |= IPOAF_ULTRA_CONSTRAINED_ALLOWED;
3713 }
3714 }
3715 if (isipv6) {
3716 ip6oa.ip6oa_sotc = so->so_traffic_class;
3717 ip6oa.ip6oa_netsvctype = so->so_netsvctype;
3718 ip6oa.qos_marking_gencount = inp->inp_policyresult.results.qos_marking_gencount;
3719 } else {
3720 ipoa.ipoa_sotc = so->so_traffic_class;
3721 ipoa.ipoa_netsvctype = so->so_netsvctype;
3722 ipoa.qos_marking_gencount = inp->inp_policyresult.results.qos_marking_gencount;
3723 }
3724 if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
3725 if (isipv6) {
3726 ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
3727 } else {
3728 ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
3729 }
3730 }
3731 if (check_qos_marking_again) {
3732 if (isipv6) {
3733 ip6oa.ip6oa_flags |= IP6OAF_REDO_QOSMARKING_POLICY;
3734 } else {
3735 ipoa.ipoa_flags |= IPOAF_REDO_QOSMARKING_POLICY;
3736 }
3737 }
3738 if (isipv6) {
3739 flags |= IPV6_OUTARGS;
3740 } else {
3741 flags |= IP_OUTARGS;
3742 }
3743
3744 /* Copy the cached route and take an extra reference */
3745 if (isipv6) {
3746 in6p_route_copyout(inp, &ro6);
3747 } else {
3748 inp_route_copyout(inp, &ro);
3749 }
3750 #if (DEBUG || DEVELOPMENT)
3751 if ((so->so_flags & SOF_MARK_WAKE_PKT) && pkt != NULL) {
3752 so->so_flags &= ~SOF_MARK_WAKE_PKT;
3753 pkt->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
3754 }
3755 #endif /* (DEBUG || DEVELOPMENT) */
3756
3757 /*
3758 * Make sure ACK/DELACK conditions are cleared before
3759 * we unlock the socket.
3760 */
3761 tp->last_ack_sent = tp->rcv_nxt;
3762 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
3763 tp->t_timer[TCPT_DELACK] = 0;
3764 tp->t_unacksegs = 0;
3765 tp->t_unacksegs_ce = 0;
3766
3767 /* Increment the count of outstanding send operations */
3768 inp->inp_sndinprog_cnt++;
3769
3770 /*
3771 * If allowed, unlock TCP socket while in IP
3772 * but only if the connection is established and
3773 * in a normal mode where reentrancy on the tcpcb won't be
3774 * an issue:
3775 * - there is no SACK episode
3776 * - we're not in Fast Recovery mode
3777 * - if we're not sending from an upcall.
3778 */
3779 if (tcp_output_unlocked && !so->so_upcallusecount &&
3780 (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
3781 !IN_FASTRECOVERY(tp) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3782 unlocked = TRUE;
3783 socket_unlock(so, 0);
3784 }
3785
3786 /*
3787 * Don't send down a chain of packets when:
3788 * - TCP chaining is disabled
3789 * - there is an IPsec rule set
3790 * - there is a non default rule set for the firewall
3791 */
3792
3793 chain = tcp_packet_chaining > 1
3794 #if IPSEC
3795 && ipsec_bypass
3796 #endif
3797 ; // I'm important, not extraneous
3798
3799 while (pkt != NULL) {
3800 struct mbuf *npkt = pkt->m_nextpkt;
3801
3802 /*
3803 * If enabled, set the RTO deadline for the packet.
3804 */
3805 if (tcp_use_rto_deadline) {
3806 tcp_set_rto_deadline(tp, pkt, isipv6, tcp_now_local);
3807 }
3808
3809 if (!chain) {
3810 pkt->m_nextpkt = NULL;
3811 /*
3812 * If we are not chaining, make sure to set the packet
3813 * list count to 0 so that IP takes the right path;
3814 * this is important for cases such as IPsec where a
3815 * single mbuf might result in multiple mbufs as part
3816 * of the encapsulation. If a non-zero count is passed
3817 * down to IP, the head of the chain might change and
3818 * we could end up skipping it (thus generating bogus
3819 * packets). Fixing it in IP would be desirable, but
3820 * for now this would do it.
3821 */
3822 cnt = 0;
3823 }
3824 if (isipv6) {
3825 error = ip6_output_list(pkt, cnt,
3826 inp->in6p_outputopts, &ro6, flags, NULL, NULL,
3827 &ip6oa);
3828 ifdenied = (ip6oa.ip6oa_flags & IP6OAF_R_IFDENIED);
3829 } else {
3830 error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL,
3831 &ipoa);
3832 ifdenied = (ipoa.ipoa_flags & IPOAF_R_IFDENIED);
3833 }
3834
3835 if (adv->code == FADV_CONGESTED) {
3836 fadv_congested = TRUE;
3837 }
3838
3839 if (chain || error) {
3840 /*
3841 * If we sent down a chain then we are done since
3842 * the callee had taken care of everything; else
3843 * we need to free the rest of the chain ourselves.
3844 */
3845 if (!chain) {
3846 m_drop_list(npkt, NULL, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_PKT_UNSENT, NULL, 0);
3847 }
3848 break;
3849 }
3850 pkt = npkt;
3851 }
3852
3853 if (unlocked) {
3854 socket_lock(so, 0);
3855 }
3856
3857 /*
3858 * Enter flow controlled state if the connection is established
3859 * and is not in recovery. Flow control is allowed only if there
3860 * is outstanding data.
3861 *
3862 * A connection will enter suspended state even if it is in
3863 * recovery.
3864 */
3865 if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) ||
3866 adv->code == FADV_SUSPENDED) &&
3867 !(tp->t_flags & TF_CLOSING) &&
3868 tp->t_state == TCPS_ESTABLISHED &&
3869 SEQ_GT(tp->snd_max, tp->snd_una)) {
3870 int rc;
3871 rc = inp_set_fc_state(inp, adv->code);
3872
3873 if (rc == 1) {
3874 tcp_ccdbg_trace(tp, NULL,
3875 ((adv->code == FADV_FLOW_CONTROLLED) ?
3876 TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND));
3877 if (adv->code == FADV_FLOW_CONTROLLED) {
3878 TCP_LOG_OUTPUT(tp, "flow controlled");
3879 } else {
3880 TCP_LOG_OUTPUT(tp, "flow suspended");
3881 }
3882 }
3883 }
3884
3885 if (fadv_congested && !IN_FASTRECOVERY(tp) && !(tp->t_flags & TF_CLOSING) &&
3886 tp->t_state == TCPS_ESTABLISHED) {
3887 TCP_LOG_OUTPUT(tp, "flow congestion notified");
3888 tcp_local_congestion_notification(tp);
3889 tcp_ccdbg_trace(tp, NULL, TCP_CC_FLOW_CONGESTION_NOTIFIED);
3890 }
3891
3892 /*
3893 * When an interface queue gets suspended, some of the
3894 * packets are dropped. Return ENOBUFS, to update the
3895 * pcb state.
3896 */
3897 if (adv->code == FADV_SUSPENDED) {
3898 error = ENOBUFS;
3899 }
3900
3901 VERIFY(inp->inp_sndinprog_cnt > 0);
3902 if (--inp->inp_sndinprog_cnt == 0) {
3903 inp->inp_flags &= ~(INP_FC_FEEDBACK);
3904 if (inp->inp_sndingprog_waiters > 0) {
3905 wakeup(&inp->inp_sndinprog_cnt);
3906 }
3907 }
3908
3909 if (isipv6) {
3910 /*
3911 * When an NECP IP tunnel policy forces the outbound interface,
3912 * ip6_output_list() informs the transport layer what is the actual
3913 * outgoing interface
3914 */
3915 if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) {
3916 ifnet_head_lock_shared();
3917 outif = ifindex2ifnet[ip6oa.ip6oa_boundif];
3918 ifnet_head_done();
3919 } else if (ro6.ro_rt != NULL) {
3920 outif = ro6.ro_rt->rt_ifp;
3921 }
3922 } else {
3923 if (ro.ro_rt != NULL) {
3924 outif = ro.ro_rt->rt_ifp;
3925 }
3926 }
3927 if (check_qos_marking_again) {
3928 uint32_t qos_marking_gencount;
3929 bool allow_qos_marking;
3930 if (isipv6) {
3931 qos_marking_gencount = ip6oa.qos_marking_gencount;
3932 allow_qos_marking = ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED ? TRUE : FALSE;
3933 } else {
3934 qos_marking_gencount = ipoa.qos_marking_gencount;
3935 allow_qos_marking = ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE;
3936 }
3937 inp->inp_policyresult.results.qos_marking_gencount = qos_marking_gencount;
3938 if (allow_qos_marking == TRUE) {
3939 inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
3940 } else {
3941 inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
3942 }
3943 }
3944
3945 if (outif != NULL && outif != inp->inp_last_outifp) {
3946 /* Update the send byte count */
3947 if (so->so_snd.sb_cc > 0 && so->so_snd.sb_flags & SB_SNDBYTE_CNT) {
3948 inp_decr_sndbytes_total(so, so->so_snd.sb_cc);
3949 inp_decr_sndbytes_allunsent(so, tp->snd_una);
3950 so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT;
3951 }
3952 inp->inp_last_outifp = outif;
3953 #if SKYWALK
3954 if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3955 netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
3956 }
3957 #endif /* SKYWALK */
3958 }
3959
3960 if (error != 0 && ifdenied &&
3961 (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) {
3962 soevent(so,
3963 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED));
3964 }
3965
3966 /* Synchronize cached PCB route & options */
3967 if (isipv6) {
3968 in6p_route_copyin(inp, &ro6);
3969 } else {
3970 inp_route_copyin(inp, &ro);
3971 }
3972
3973 if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 &&
3974 tp->t_inpcb->inp_route.ro_rt != NULL) {
3975 /* If we found the route and there is an rtt on it
3976 * reset the retransmit timer
3977 */
3978 tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt);
3979 tcp_set_rto(tp);
3980 }
3981 return error;
3982 #undef ro
3983 #undef ro6
3984 #undef ipoa
3985 #undef ip6oa
3986 }
3987
3988 int tcptv_persmin_val = TCPTV_PERSMIN;
3989
3990 void
tcp_setpersist(struct tcpcb * tp)3991 tcp_setpersist(struct tcpcb *tp)
3992 {
3993 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
3994
3995 /* If a PERSIST_TIMER option was set we will limit the
3996 * time the persist timer will be active for that connection
3997 * in order to avoid DOS by using zero window probes.
3998 * see rdar://5805356
3999 */
4000
4001 if (tp->t_persist_timeout != 0 &&
4002 tp->t_timer[TCPT_PERSIST] == 0 &&
4003 tp->t_persist_stop == 0) {
4004 tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
4005 }
4006
4007 /*
4008 * Start/restart persistance timer.
4009 */
4010 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
4011 t * tcp_backoff[tp->t_rxtshift],
4012 tcptv_persmin_val, TCPTV_PERSMAX, 0);
4013 tp->t_timer[TCPT_PERSIST] = tcp_offset_from_start(tp, tp->t_timer[TCPT_PERSIST]);
4014
4015 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) {
4016 tp->t_rxtshift++;
4017 }
4018 }
4019
4020 static int
tcp_recv_throttle(struct tcpcb * tp)4021 tcp_recv_throttle(struct tcpcb *tp)
4022 {
4023 uint32_t base_rtt, newsize;
4024 struct sockbuf *__single sbrcv = &tp->t_inpcb->inp_socket->so_rcv;
4025
4026 if (tcp_use_rtt_recvbg == 1 &&
4027 TSTMP_SUPPORTED(tp)) {
4028 /*
4029 * Timestamps are supported on this connection. Use
4030 * RTT to look for an increase in latency.
4031 */
4032
4033 /*
4034 * If the connection is already being throttled, leave it
4035 * in that state until rtt comes closer to base rtt
4036 */
4037 if (tp->t_flagsext & TF_RECV_THROTTLE) {
4038 return 1;
4039 }
4040
4041 base_rtt = get_base_rtt(tp);
4042
4043 if (base_rtt != 0 && tp->t_rttcur != 0) {
4044 /*
4045 * if latency increased on a background flow,
4046 * return 1 to start throttling.
4047 */
4048 if (tp->t_rttcur > (base_rtt + target_qdelay)) {
4049 tp->t_flagsext |= TF_RECV_THROTTLE;
4050 if (tp->t_recv_throttle_ts == 0) {
4051 tp->t_recv_throttle_ts = tcp_now;
4052 }
4053 /*
4054 * Reduce the recv socket buffer size to
4055 * minimize latecy.
4056 */
4057 if (sbrcv->sb_idealsize >
4058 tcp_recv_throttle_minwin) {
4059 newsize = sbrcv->sb_idealsize >> 1;
4060 /* Set a minimum of 16 K */
4061 newsize =
4062 max(newsize,
4063 tcp_recv_throttle_minwin);
4064 sbrcv->sb_idealsize = newsize;
4065 }
4066 return 1;
4067 } else {
4068 return 0;
4069 }
4070 }
4071 }
4072
4073 /*
4074 * Timestamps are not supported or there is no good RTT
4075 * measurement. Use IPDV in this case.
4076 */
4077 if (tp->acc_iaj > tcp_acc_iaj_react_limit) {
4078 return 1;
4079 }
4080
4081 return 0;
4082 }
4083