xref: /xnu-10063.121.3/bsd/netinet/ip_output.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2000-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1988, 1990, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
61  */
62 /*
63  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64  * support for mandatory and extensible security protections.  This notice
65  * is included in support of clause 2.2 (b) of the Apple Public License,
66  * Version 2.0.
67  */
68 
69 #define _IP_VHL
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <kern/locks.h>
80 #include <sys/sysctl.h>
81 #include <sys/mcache.h>
82 #include <sys/kdebug.h>
83 
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
86 #include <mach/sdt.h>
87 
88 #include <libkern/OSAtomic.h>
89 #include <libkern/OSByteOrder.h>
90 
91 #include <net/if.h>
92 #include <net/if_dl.h>
93 #include <net/if_types.h>
94 #include <net/route.h>
95 #include <net/ntstat.h>
96 #include <net/net_osdep.h>
97 #include <net/dlil.h>
98 #include <net/net_perf.h>
99 
100 #include <netinet/in.h>
101 #include <netinet/in_systm.h>
102 #include <netinet/ip.h>
103 #include <netinet/in_pcb.h>
104 #include <netinet/in_var.h>
105 #include <netinet/ip_var.h>
106 #include <netinet/kpi_ipfilter_var.h>
107 #include <netinet/in_tclass.h>
108 #include <netinet/udp.h>
109 
110 #include <netinet6/nd6.h>
111 
112 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 1)
113 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 3)
114 #define DBG_FNC_IP_OUTPUT       NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
115 #define DBG_FNC_IPSEC4_OUTPUT   NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
116 
117 #if IPSEC
118 #include <netinet6/ipsec.h>
119 #include <netkey/key.h>
120 #if IPSEC_DEBUG
121 #include <netkey/key_debug.h>
122 #else
123 #define KEYDEBUG(lev, arg)
124 #endif
125 #endif /* IPSEC */
126 
127 #if NECP
128 #include <net/necp.h>
129 #endif /* NECP */
130 
131 
132 #if DUMMYNET
133 #include <netinet/ip_dummynet.h>
134 #endif
135 
136 #if PF
137 #include <net/pfvar.h>
138 #endif /* PF */
139 
140 #include <net/sockaddr_utils.h>
141 
142 u_short ip_id;
143 
144 static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
145 static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
146 static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
147 static void ip_out_cksum_stats(int, u_int32_t);
148 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
149 static int ip_optcopy(struct ip *, struct ip *);
150 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
151 static void imo_trace(struct ip_moptions *, int);
152 static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
153     struct sockaddr_in *, int);
154 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
155 
156 extern struct ip_linklocal_stat ip_linklocal_stat;
157 
158 /* temporary: for testing */
159 #if IPSEC
160 extern int ipsec_bypass;
161 #endif
162 
163 static int force_ipsum = 0;
164 static int ip_maxchainsent = 0;
165 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
166     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
167     "use dlil_output_list");
168 
169 SYSCTL_INT(_net_inet_ip, OID_AUTO, force_ipsum,
170     CTLFLAG_RW | CTLFLAG_LOCKED, &force_ipsum, 0,
171     "force IP checksum");
172 #if DEBUG
173 static int forge_ce = 0;
174 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
175     CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
176     "Forge ECN CE");
177 #endif /* DEBUG */
178 
179 static int ip_select_srcif_debug = 0;
180 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
181     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
182     "log source interface selection debug info");
183 
184 static int ip_output_measure = 0;
185 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
186     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
187     &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
188     "Do time measurement");
189 
190 static uint64_t ip_output_measure_bins = 0;
191 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
192     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
193     sysctl_ip_output_measure_bins, "I",
194     "bins for chaining performance data histogram");
195 
196 static net_perf_t net_perf;
197 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
198     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
199     0, 0, sysctl_ip_output_getperf, "S,net_perf",
200     "IP output performance data (struct net_perf, net/net_perf.h)");
201 
202 __private_extern__ int rfc6864 = 1;
203 SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
204     &rfc6864, 0, "updated ip id field behavior");
205 
206 #define IMO_TRACE_HIST_SIZE     32      /* size of trace history */
207 
208 /* For gdb */
209 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
210 
211 struct ip_moptions_dbg {
212 	struct ip_moptions      imo;                    /* ip_moptions */
213 	u_int16_t               imo_refhold_cnt;        /* # of IMO_ADDREF */
214 	u_int16_t               imo_refrele_cnt;        /* # of IMO_REMREF */
215 	/*
216 	 * Alloc and free callers.
217 	 */
218 	ctrace_t                imo_alloc;
219 	ctrace_t                imo_free;
220 	/*
221 	 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
222 	 */
223 	ctrace_t                imo_refhold[IMO_TRACE_HIST_SIZE];
224 	ctrace_t                imo_refrele[IMO_TRACE_HIST_SIZE];
225 };
226 
227 #if DEBUG
228 static unsigned int imo_debug = 1;      /* debugging (enabled) */
229 #else
230 static unsigned int imo_debug;          /* debugging (disabled) */
231 #endif /* !DEBUG */
232 
233 static struct zone *imo_zone;           /* zone for ip_moptions */
234 #define IMO_ZONE_NAME           "ip_moptions"   /* zone name */
235 
236 #if PF
237 __attribute__((noinline))
238 static int
ip_output_pf_dn_hook(struct ifnet * ifp,struct mbuf ** mppn,struct mbuf ** mp,struct pf_rule * dn_pf_rule,struct route * ro,struct sockaddr_in * dst,int flags,struct ip_out_args * ipoa)239 ip_output_pf_dn_hook(struct ifnet *ifp, struct mbuf **mppn, struct mbuf **mp,
240     struct pf_rule *dn_pf_rule, struct route *ro, struct sockaddr_in *dst, int flags,
241     struct ip_out_args *ipoa)
242 {
243 	int rc;
244 	struct ip_fw_args args = {};
245 
246 	args.fwa_pf_rule = dn_pf_rule;
247 	args.fwa_oif = ifp;
248 	args.fwa_ro = ro;
249 	args.fwa_dst = dst;
250 	args.fwa_oflags = flags;
251 	if (flags & IP_OUTARGS) {
252 		args.fwa_ipoa = ipoa;
253 	}
254 	rc = pf_af_hook(ifp, mppn, mp, AF_INET, FALSE, &args);
255 
256 	return rc;
257 }
258 
259 #endif /* PF */
260 
261 
/*
 * IP output for a single packet.
 *
 * The packet in mbuf chain m0 contains a skeletal IP header (with len,
 * off, ttl, proto, tos, src, dst).  The mbuf chain containing the packet
 * will be freed.  The mbuf opt, if present, will not be freed.
 *
 * This is a thin wrapper: it calls ip_output_list() with packetchain set
 * to 0 and every other argument passed through unchanged, and returns
 * that function's result.
 */
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct ip_out_args *ipoa)
{
	return ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
}
274 
275 /*
276  * IP output.  The packet in mbuf chain m contains a skeletal IP
277  * header (with len, off, ttl, proto, tos, src, dst).
278  * The mbuf chain containing the packet will be freed.
279  * The mbuf opt, if present, will not be freed.
280  *
281  * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
282  * skipped and ro->ro_rt would be used.  Otherwise the result of route
283  * lookup is stored in ro->ro_rt.
284  *
285  * In the IP forwarding case, the packet will arrive with options already
286  * inserted, so must have a NULL opt pointer.
287  */
288 int
ip_output_list(struct mbuf * m0,int packetchain,struct mbuf * opt,struct route * ro,int flags,struct ip_moptions * imo,struct ip_out_args * ipoa)289 ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
290     struct route *ro, int flags, struct ip_moptions *imo,
291     struct ip_out_args *ipoa)
292 {
293 	struct ip *ip;
294 	struct ifnet *ifp = NULL;               /* not refcnt'd */
295 	struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
296 	int hlen = sizeof(struct ip);
297 	int len = 0, error = 0;
298 	struct sockaddr_in *dst = NULL;
299 	struct in_ifaddr *ia = NULL, *src_ia = NULL;
300 	struct in_addr pkt_dst;
301 	struct ipf_pktopts *ippo = NULL;
302 	ipfilter_t inject_filter_ref = NULL;
303 	struct mbuf *packetlist;
304 	uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
305 	uint32_t packets_processed = 0;
306 	unsigned int ifscope = IFSCOPE_NONE;
307 	struct flowadv *adv = NULL;
308 	struct timeval start_tv;
309 #if IPSEC
310 	struct socket *so = NULL;
311 	struct secpolicy *sp = NULL;
312 #endif /* IPSEC */
313 #if NECP
314 	necp_kernel_policy_result necp_result = 0;
315 	necp_kernel_policy_result_parameter necp_result_parameter;
316 	necp_kernel_policy_id necp_matched_policy_id = 0;
317 #endif /* NECP */
318 #if DUMMYNET
319 	struct m_tag *tag;
320 	struct ip_out_args saved_ipoa;
321 	struct sockaddr_in dst_buf;
322 #endif /* DUMMYNET */
323 	struct {
324 #if IPSEC
325 		struct ipsec_output_state ipsec_state;
326 #endif /* IPSEC */
327 #if NECP
328 		struct route necp_route;
329 #endif /* NECP */
330 #if DUMMYNET
331 		struct route saved_route;
332 #endif /* DUMMYNET */
333 		struct ipf_pktopts ipf_pktopts;
334 	} ipobz;
335 #define ipsec_state     ipobz.ipsec_state
336 #define necp_route      ipobz.necp_route
337 #define sro_fwd         ipobz.sro_fwd
338 #define saved_route     ipobz.saved_route
339 #define ipf_pktopts     ipobz.ipf_pktopts
340 	union {
341 		struct {
342 			boolean_t select_srcif : 1;     /* set once */
343 			boolean_t srcbound : 1;         /* set once */
344 			boolean_t nocell : 1;           /* set once */
345 			boolean_t isbroadcast : 1;
346 			boolean_t didfilter : 1;
347 			boolean_t noexpensive : 1;      /* set once */
348 			boolean_t noconstrained : 1;      /* set once */
349 			boolean_t awdl_unrestricted : 1;        /* set once */
350 			boolean_t management_allowed : 1;        /* set once */
351 		};
352 		uint32_t raw;
353 	} ipobf = { .raw = 0 };
354 
355 	int interface_mtu = 0;
356 	struct pf_rule *dn_pf_rule = NULL;
357 /*
358  * Here we check for restrictions when sending frames.
359  * N.B.: IPv4 over internal co-processor interfaces is not allowed.
360  */
361 #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf)                                 \
362 	(((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) ||                    \
363 	 ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) ||              \
364 	 ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) ||          \
365 	  (IFNET_IS_INTCOPROC(_ifp)) ||                                     \
366 	 (!(_ipobf).management_allowed && IFNET_IS_MANAGEMENT(_ifp)) ||     \
367 	 (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
368 
369 	if (ip_output_measure) {
370 		net_perf_start_time(&net_perf, &start_tv);
371 	}
372 	KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
373 
374 	VERIFY(m0->m_flags & M_PKTHDR);
375 	packetlist = m0;
376 
377 	/* zero out ipobz: {ipsec_state, necp_route, saved_route, ipf_pktopts} */
378 	bzero(&ipobz, sizeof(ipobz));
379 	ippo = &ipf_pktopts;
380 
381 #if DUMMYNET
382 	if (SLIST_EMPTY(&m0->m_pkthdr.tags)) {
383 		goto ipfw_tags_done;
384 	}
385 
386 	/* Grab info from mtags prepended to the chain */
387 	if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
388 	    KERNEL_TAG_TYPE_DUMMYNET)) != NULL) {
389 		struct dn_pkt_tag       *dn_tag;
390 
391 		dn_tag = (struct dn_pkt_tag *)(tag->m_tag_data);
392 		dn_pf_rule = dn_tag->dn_pf_rule;
393 		opt = NULL;
394 		saved_route = dn_tag->dn_ro;
395 		ro = &saved_route;
396 
397 		imo = NULL;
398 		SOCKADDR_COPY(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
399 		dst = &dst_buf;
400 		ifp = dn_tag->dn_ifp;
401 		flags = dn_tag->dn_flags;
402 		if ((dn_tag->dn_flags & IP_OUTARGS)) {
403 			saved_ipoa = dn_tag->dn_ipoa;
404 			ipoa = &saved_ipoa;
405 		}
406 
407 		m_tag_delete(m0, tag);
408 	}
409 ipfw_tags_done:
410 #endif /* DUMMYNET */
411 
412 	m = m0;
413 	m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP | PKTF_IFAINFO);
414 
415 #if IPSEC
416 	if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
417 		/* If packet is bound to an interface, check bound policies */
418 		if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
419 		    (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
420 		    ipoa->ipoa_boundif != IFSCOPE_NONE) {
421 			if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
422 			    &flags, ipoa, &sp) != 0) {
423 				goto bad;
424 			}
425 		}
426 	}
427 #endif /* IPSEC */
428 
429 	VERIFY(ro != NULL);
430 
431 	if (flags & IP_OUTARGS) {
432 		/*
433 		 * In the forwarding case, only the ifscope value is used,
434 		 * as source interface selection doesn't take place.
435 		 */
436 		if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
437 		    (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
438 			ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
439 		}
440 
441 		if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
442 		    ipoa->ipoa_boundif != IFSCOPE_NONE) {
443 			ifscope = ipoa->ipoa_boundif;
444 			ipf_pktopts.ippo_flags |=
445 			    (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
446 		}
447 
448 		/* double negation needed for bool bit field */
449 		ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
450 		if (ipobf.srcbound) {
451 			ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
452 		}
453 	} else {
454 		ipobf.select_srcif = FALSE;
455 		ipobf.srcbound = FALSE;
456 		ifscope = IFSCOPE_NONE;
457 		if (flags & IP_OUTARGS) {
458 			ipoa->ipoa_boundif = IFSCOPE_NONE;
459 			ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
460 			    IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
461 		}
462 	}
463 
464 	if (flags & IP_OUTARGS) {
465 		if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) {
466 			ipobf.nocell = true;
467 			ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
468 		}
469 		if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) {
470 			ipobf.noexpensive = true;
471 			ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
472 		}
473 		if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) {
474 			ipobf.noconstrained = true;
475 			ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
476 		}
477 		if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) {
478 			ipobf.awdl_unrestricted = true;
479 		}
480 		if (ipoa->ipoa_flags & IPOAF_MANAGEMENT_ALLOWED) {
481 			ipobf.management_allowed = true;
482 		}
483 		adv = &ipoa->ipoa_flowadv;
484 		adv->code = FADV_SUCCESS;
485 		ipoa->ipoa_flags &= ~IPOAF_RET_MASK;
486 	}
487 
488 #if IPSEC
489 	if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
490 		so = ipsec_getsocket(m);
491 		if (so != NULL) {
492 			(void) ipsec_setsocket(m, NULL);
493 		}
494 	}
495 #endif /* IPSEC */
496 
497 #if DUMMYNET
498 	if (dn_pf_rule != NULL) {
499 		/* dummynet already saw us */
500 		ip = mtod(m, struct ip *);
501 		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
502 		pkt_dst = ip->ip_dst;
503 		if (ro->ro_rt != NULL) {
504 			RT_LOCK_SPIN(ro->ro_rt);
505 			ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
506 			if (ia) {
507 				/* Become a regular mutex */
508 				RT_CONVERT_LOCK(ro->ro_rt);
509 				ifa_addref(&ia->ia_ifa);
510 			}
511 			RT_UNLOCK(ro->ro_rt);
512 		}
513 
514 		goto sendit;
515 	}
516 #endif /* DUMMYNET */
517 
518 loopit:
519 	packets_processed++;
520 	ipobf.isbroadcast = FALSE;
521 	ipobf.didfilter = FALSE;
522 
523 	VERIFY(m->m_flags & M_PKTHDR);
524 	/*
525 	 * No need to process packet twice if we've already seen it.
526 	 */
527 	if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
528 		inject_filter_ref = ipf_get_inject_filter(m);
529 	} else {
530 		inject_filter_ref = NULL;
531 	}
532 
533 	if (opt) {
534 		m = ip_insertoptions(m, opt, &len);
535 		hlen = len;
536 		/* Update the chain */
537 		if (m != m0) {
538 			if (m0 == packetlist) {
539 				packetlist = m;
540 			}
541 			m0 = m;
542 		}
543 	}
544 	ip = mtod(m, struct ip *);
545 
546 	pkt_dst = ip->ip_dst;
547 
548 	/*
549 	 * We must not send if the packet is destined to network zero.
550 	 * RFC1122 3.2.1.3 (a) and (b).
551 	 */
552 	if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
553 		error = EHOSTUNREACH;
554 		goto bad;
555 	}
556 
557 	/*
558 	 * Fill in IP header.
559 	 */
560 	if (!(flags & (IP_FORWARDING | IP_RAWOUTPUT))) {
561 		ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
562 		ip->ip_off &= IP_DF;
563 		if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
564 			// Per RFC6864, value of ip_id is undefined for atomic ip packets
565 			ip->ip_id = 0;
566 		} else {
567 			ip->ip_id = ip_randomid((uint64_t)m);
568 		}
569 		OSAddAtomic(1, &ipstat.ips_localout);
570 	} else {
571 		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
572 	}
573 
574 #if DEBUG
575 	/* For debugging, we let the stack forge congestion */
576 	if (forge_ce != 0 &&
577 	    ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
578 	    (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
579 		ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
580 		forge_ce--;
581 	}
582 #endif /* DEBUG */
583 
584 	if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1) {
585 		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
586 	}
587 
588 	KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
589 	    ip->ip_p, ip->ip_off, ip->ip_len);
590 
591 	dst = SIN(&ro->ro_dst);
592 
593 	/*
594 	 * If there is a cached route,
595 	 * check that it is to the same destination
596 	 * and is still up.  If not, free it and try again.
597 	 * The address family should also be checked in case of sharing the
598 	 * cache with IPv6.
599 	 */
600 
601 	if (ro->ro_rt != NULL) {
602 		if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
603 		    !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
604 			src_ia = ifa_foraddr(ip->ip_src.s_addr);
605 			if (src_ia == NULL) {
606 				error = EADDRNOTAVAIL;
607 				goto bad;
608 			}
609 			ifa_remref(&src_ia->ia_ifa);
610 			src_ia = NULL;
611 		}
612 		/*
613 		 * Test rt_flags without holding rt_lock for performance
614 		 * reasons; if the route is down it will hopefully be
615 		 * caught by the layer below (since it uses this route
616 		 * as a hint) or during the next transmit.
617 		 */
618 		if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
619 		    dst->sin_addr.s_addr != pkt_dst.s_addr) {
620 			ROUTE_RELEASE(ro);
621 		}
622 
623 		/*
624 		 * If we're doing source interface selection, we may not
625 		 * want to use this route; only synch up the generation
626 		 * count otherwise.
627 		 */
628 		if (!ipobf.select_srcif && ro->ro_rt != NULL &&
629 		    RT_GENID_OUTOFSYNC(ro->ro_rt)) {
630 			RT_GENID_SYNC(ro->ro_rt);
631 		}
632 	}
633 	if (ro->ro_rt == NULL) {
634 		SOCKADDR_ZERO(dst, sizeof(*dst));
635 		dst->sin_family = AF_INET;
636 		dst->sin_len = sizeof(*dst);
637 		dst->sin_addr = pkt_dst;
638 	}
639 	/*
640 	 * If routing to interface only,
641 	 * short circuit routing lookup.
642 	 */
643 	if (flags & IP_ROUTETOIF) {
644 		if (ia != NULL) {
645 			ifa_remref(&ia->ia_ifa);
646 		}
647 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
648 			ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
649 			if (ia == NULL) {
650 				OSAddAtomic(1, &ipstat.ips_noroute);
651 				error = ENETUNREACH;
652 				/* XXX IPv6 APN fallback notification?? */
653 				goto bad;
654 			}
655 		}
656 		ifp = ia->ia_ifp;
657 		ip->ip_ttl = 1;
658 		ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
659 		/*
660 		 * For consistency with other cases below.  Loopback
661 		 * multicast case is handled separately by ip_mloopback().
662 		 */
663 		if ((ifp->if_flags & IFF_LOOPBACK) &&
664 		    !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
665 			m->m_pkthdr.rcvif = ifp;
666 			ip_setsrcifaddr_info(m, ifp->if_index, NULL);
667 			ip_setdstifaddr_info(m, ifp->if_index, NULL);
668 		}
669 	} else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
670 	    imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
671 		/*
672 		 * Bypass the normal routing lookup for multicast
673 		 * packets if the interface is specified.
674 		 */
675 		ipobf.isbroadcast = FALSE;
676 		if (ia != NULL) {
677 			ifa_remref(&ia->ia_ifa);
678 		}
679 
680 		/* Macro takes reference on ia */
681 		IFP_TO_IA(ifp, ia);
682 	} else {
683 		struct ifaddr *ia0 = NULL;
684 		boolean_t cloneok = FALSE;
685 		/*
686 		 * Perform source interface selection; the source IP address
687 		 * must belong to one of the addresses of the interface used
688 		 * by the route.  For performance reasons, do this only if
689 		 * there is no route, or if the routing table has changed,
690 		 * or if we haven't done source interface selection on this
691 		 * route (for this PCB instance) before.
692 		 */
693 		if (ipobf.select_srcif &&
694 		    ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
695 		    !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
696 			/* Find the source interface */
697 			ia0 = in_selectsrcif(ip, ro, ifscope);
698 
699 			/*
700 			 * If the source address belongs to a restricted
701 			 * interface and the caller forbids our using
702 			 * interfaces of such type, pretend that there is no
703 			 * route.
704 			 */
705 			if (ia0 != NULL &&
706 			    IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
707 				ifa_remref(ia0);
708 				ia0 = NULL;
709 				error = EHOSTUNREACH;
710 				if (flags & IP_OUTARGS) {
711 					ipoa->ipoa_flags |= IPOAF_R_IFDENIED;
712 				}
713 				goto bad;
714 			}
715 
716 			/*
717 			 * If the source address is spoofed (in the case of
718 			 * IP_RAWOUTPUT on an unbounded socket), or if this
719 			 * is destined for local/loopback, just let it go out
720 			 * using the interface of the route.  Otherwise,
721 			 * there's no interface having such an address,
722 			 * so bail out.
723 			 */
724 			if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
725 			    ipobf.srcbound) && ifscope != lo_ifp->if_index) {
726 				error = EADDRNOTAVAIL;
727 				goto bad;
728 			}
729 
730 			/*
731 			 * If the caller didn't explicitly specify the scope,
732 			 * pick it up from the source interface.  If the cached
733 			 * route was wrong and was blown away as part of source
734 			 * interface selection, don't mask out RTF_PRCLONING
735 			 * since that route may have been allocated by the ULP,
736 			 * unless the IP header was created by the caller or
737 			 * the destination is IPv4 LLA.  The check for the
738 			 * latter is needed because IPv4 LLAs are never scoped
739 			 * in the current implementation, and we don't want to
740 			 * replace the resolved IPv4 LLA route with one whose
741 			 * gateway points to that of the default gateway on
742 			 * the primary interface of the system.
743 			 */
744 			if (ia0 != NULL) {
745 				if (ifscope == IFSCOPE_NONE) {
746 					ifscope = ia0->ifa_ifp->if_index;
747 				}
748 				cloneok = (!(flags & IP_RAWOUTPUT) &&
749 				    !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
750 			}
751 		}
752 
753 		/*
754 		 * If this is the case, we probably don't want to allocate
755 		 * a protocol-cloned route since we didn't get one from the
756 		 * ULP.  This lets TCP do its thing, while not burdening
757 		 * forwarding or ICMP with the overhead of cloning a route.
758 		 * Of course, we still want to do any cloning requested by
759 		 * the link layer, as this is probably required in all cases
760 		 * for correct operation (as it is for ARP).
761 		 */
762 		if (ro->ro_rt == NULL) {
763 			uint32_t ign = RTF_PRCLONING;
764 			/*
765 			 * We make an exception here: if the destination
766 			 * address is INADDR_BROADCAST, allocate a protocol-
767 			 * cloned host route so that we end up with a route
768 			 * marked with the RTF_BROADCAST flag.  Otherwise,
769 			 * we would end up referring to the default route,
770 			 * instead of creating a cloned host route entry.
771 			 * That would introduce inconsistencies between ULPs
772 			 * that allocate a route and those that don't.  The
773 			 * RTF_BROADCAST route is important since we'd want
774 			 * to send out undirected IP broadcast packets using
775 			 * link-level broadcast address. Another exception
776 			 * is for ULP-created routes that got blown away by
777 			 * source interface selection (see above).
778 			 *
779 			 * These exceptions will no longer be necessary when
780 			 * the RTF_PRCLONING scheme is no longer present.
781 			 */
782 			if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) {
783 				ign &= ~RTF_PRCLONING;
784 			}
785 
786 			/*
787 			 * Loosen the route lookup criteria if the ifscope
788 			 * corresponds to the loopback interface; this is
789 			 * needed to support Application Layer Gateways
790 			 * listening on loopback, in conjunction with packet
791 			 * filter redirection rules.  The final source IP
792 			 * address will be rewritten by the packet filter
793 			 * prior to the RFC1122 loopback check below.
794 			 */
795 			if (ifscope == lo_ifp->if_index) {
796 				rtalloc_ign(ro, ign);
797 			} else {
798 				rtalloc_scoped_ign(ro, ign, ifscope);
799 			}
800 
801 			/*
802 			 * If the route points to a cellular/expensive interface
803 			 * and the caller forbids our using interfaces of such type,
804 			 * pretend that there is no route.
805 			 */
806 			if (ro->ro_rt != NULL) {
807 				RT_LOCK_SPIN(ro->ro_rt);
808 				if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp,
809 				    ipobf)) {
810 					RT_UNLOCK(ro->ro_rt);
811 					ROUTE_RELEASE(ro);
812 					if (flags & IP_OUTARGS) {
813 						ipoa->ipoa_flags |=
814 						    IPOAF_R_IFDENIED;
815 					}
816 				} else {
817 					RT_UNLOCK(ro->ro_rt);
818 				}
819 			}
820 		}
821 
822 		if (ro->ro_rt == NULL) {
823 			OSAddAtomic(1, &ipstat.ips_noroute);
824 			error = EHOSTUNREACH;
825 			if (ia0 != NULL) {
826 				ifa_remref(ia0);
827 				ia0 = NULL;
828 			}
829 			goto bad;
830 		}
831 
832 		if (ia != NULL) {
833 			ifa_remref(&ia->ia_ifa);
834 		}
835 		RT_LOCK_SPIN(ro->ro_rt);
836 		ia = ifatoia(ro->ro_rt->rt_ifa);
837 		if (ia != NULL) {
838 			/* Become a regular mutex */
839 			RT_CONVERT_LOCK(ro->ro_rt);
840 			ifa_addref(&ia->ia_ifa);
841 		}
842 		/*
843 		 * Note: ia_ifp may not be the same as rt_ifp; the latter
844 		 * is what we use for determining outbound i/f, mtu, etc.
845 		 */
846 		ifp = ro->ro_rt->rt_ifp;
847 		ro->ro_rt->rt_use++;
848 		if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
849 			dst = SIN(ro->ro_rt->rt_gateway);
850 		}
851 		if (ro->ro_rt->rt_flags & RTF_HOST) {
852 			/* double negation needed for bool bit field */
853 			ipobf.isbroadcast =
854 			    !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
855 		} else {
856 			/* Become a regular mutex */
857 			RT_CONVERT_LOCK(ro->ro_rt);
858 			ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
859 		}
860 		/*
861 		 * For consistency with IPv6, as well as to ensure that
862 		 * IP_RECVIF is set correctly for packets that are sent
863 		 * to one of the local addresses.  ia (rt_ifa) would have
864 		 * been fixed up by rt_setif for local routes.  This
865 		 * would make it appear as if the packet arrives on the
866 		 * interface which owns the local address.  Loopback
867 		 * multicast case is handled separately by ip_mloopback().
868 		 */
869 		if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
870 		    !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
871 			uint16_t srcidx;
872 
873 			m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
874 
875 			if (ia0 != NULL) {
876 				srcidx = ia0->ifa_ifp->if_index;
877 			} else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
878 			    ro->ro_srcia != NULL) {
879 				srcidx = ro->ro_srcia->ifa_ifp->if_index;
880 			} else {
881 				srcidx = 0;
882 			}
883 
884 			ip_setsrcifaddr_info(m, srcidx, NULL);
885 			ip_setdstifaddr_info(m, 0, ia);
886 		}
887 		RT_UNLOCK(ro->ro_rt);
888 		if (ia0 != NULL) {
889 			ifa_remref(ia0);
890 			ia0 = NULL;
891 		}
892 	}
893 
894 	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
895 		struct ifnet *srcifp = NULL;
896 		struct in_multi *inm;
897 		u_int32_t vif = 0;
898 		u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
899 		u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
900 
901 		m->m_flags |= M_MCAST;
902 		/*
903 		 * IP destination address is multicast.  Make sure "dst"
904 		 * still points to the address in "ro".  (It may have been
905 		 * changed to point to a gateway address, above.)
906 		 */
907 		dst = SIN(&ro->ro_dst);
908 		/*
909 		 * See if the caller provided any multicast options
910 		 */
911 		if (imo != NULL) {
912 			IMO_LOCK(imo);
913 			vif = imo->imo_multicast_vif;
914 			ttl = imo->imo_multicast_ttl;
915 			loop = imo->imo_multicast_loop;
916 			if (!(flags & IP_RAWOUTPUT)) {
917 				ip->ip_ttl = ttl;
918 			}
919 			if (imo->imo_multicast_ifp != NULL) {
920 				ifp = imo->imo_multicast_ifp;
921 			}
922 			IMO_UNLOCK(imo);
923 		} else if (!(flags & IP_RAWOUTPUT)) {
924 			vif = -1;
925 			ip->ip_ttl = ttl;
926 		}
927 		/*
928 		 * Confirm that the outgoing interface supports multicast.
929 		 */
930 		if (imo == NULL || vif == -1) {
931 			if (!(ifp->if_flags & IFF_MULTICAST)) {
932 				OSAddAtomic(1, &ipstat.ips_noroute);
933 				error = ENETUNREACH;
934 				goto bad;
935 			}
936 		}
937 		/*
938 		 * If source address not specified yet, use address
939 		 * of outgoing interface.
940 		 */
941 		if (ip->ip_src.s_addr == INADDR_ANY) {
942 			struct in_ifaddr *ia1;
943 			lck_rw_lock_shared(&in_ifaddr_rwlock);
944 			TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
945 				IFA_LOCK_SPIN(&ia1->ia_ifa);
946 				if (ia1->ia_ifp == ifp) {
947 					ip->ip_src = IA_SIN(ia1)->sin_addr;
948 					srcifp = ifp;
949 					IFA_UNLOCK(&ia1->ia_ifa);
950 					break;
951 				}
952 				IFA_UNLOCK(&ia1->ia_ifa);
953 			}
954 			lck_rw_done(&in_ifaddr_rwlock);
955 			if (ip->ip_src.s_addr == INADDR_ANY) {
956 				error = ENETUNREACH;
957 				goto bad;
958 			}
959 		}
960 
961 		in_multihead_lock_shared();
962 		IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
963 		in_multihead_lock_done();
964 		if (inm != NULL && (imo == NULL || loop)) {
965 			/*
966 			 * If we belong to the destination multicast group
967 			 * on the outgoing interface, and the caller did not
968 			 * forbid loopback, loop back a copy.
969 			 */
970 			if (!TAILQ_EMPTY(&ipv4_filters)
971 #if NECP
972 			    && !necp_packet_should_skip_filters(m)
973 #endif // NECP
974 			    ) {
975 				struct ipfilter *filter;
976 				int seen = (inject_filter_ref == NULL);
977 
978 				if (imo != NULL) {
979 					ipf_pktopts.ippo_flags |=
980 					    IPPOF_MCAST_OPTS;
981 					ipf_pktopts.ippo_mcast_ifnet = ifp;
982 					ipf_pktopts.ippo_mcast_ttl = ttl;
983 					ipf_pktopts.ippo_mcast_loop = loop;
984 				}
985 
986 				ipf_ref();
987 
988 				/*
989 				 * 4135317 - always pass network byte
990 				 * order to filter
991 				 */
992 #if BYTE_ORDER != BIG_ENDIAN
993 				HTONS(ip->ip_len);
994 				HTONS(ip->ip_off);
995 #endif
996 				TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
997 					if (seen == 0) {
998 						if ((struct ipfilter *)
999 						    inject_filter_ref == filter) {
1000 							seen = 1;
1001 						}
1002 					} else if (filter->ipf_filter.
1003 					    ipf_output != NULL) {
1004 						errno_t result;
1005 						result = filter->ipf_filter.
1006 						    ipf_output(filter->
1007 						    ipf_filter.cookie,
1008 						    (mbuf_t *)&m, ippo);
1009 						if (result == EJUSTRETURN) {
1010 							ipf_unref();
1011 							INM_REMREF(inm);
1012 							goto done;
1013 						}
1014 						if (result != 0) {
1015 							ipf_unref();
1016 							INM_REMREF(inm);
1017 							goto bad;
1018 						}
1019 					}
1020 				}
1021 
1022 				/* set back to host byte order */
1023 				ip = mtod(m, struct ip *);
1024 #if BYTE_ORDER != BIG_ENDIAN
1025 				NTOHS(ip->ip_len);
1026 				NTOHS(ip->ip_off);
1027 #endif
1028 				ipf_unref();
1029 				ipobf.didfilter = true;
1030 			}
1031 			ip_mloopback(srcifp, ifp, m, dst, hlen);
1032 		}
1033 		if (inm != NULL) {
1034 			INM_REMREF(inm);
1035 		}
1036 		/*
1037 		 * Multicasts with a time-to-live of zero may be looped-
1038 		 * back, above, but must not be transmitted on a network.
1039 		 * Also, multicasts addressed to the loopback interface
1040 		 * are not sent -- the above call to ip_mloopback() will
1041 		 * loop back a copy if this host actually belongs to the
1042 		 * destination group on the loopback interface.
1043 		 */
1044 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1045 			m_freem(m);
1046 			goto done;
1047 		}
1048 
1049 		goto sendit;
1050 	}
1051 	/*
1052 	 * If source address not specified yet, use address
1053 	 * of outgoing interface.
1054 	 */
1055 	if (ip->ip_src.s_addr == INADDR_ANY) {
1056 		IFA_LOCK_SPIN(&ia->ia_ifa);
1057 		ip->ip_src = IA_SIN(ia)->sin_addr;
1058 		IFA_UNLOCK(&ia->ia_ifa);
1059 	}
1060 
1061 	/*
1062 	 * Look for broadcast address and
1063 	 * verify user is allowed to send
1064 	 * such a packet.
1065 	 */
1066 	if (ipobf.isbroadcast) {
1067 		if (!(ifp->if_flags & IFF_BROADCAST)) {
1068 			error = EADDRNOTAVAIL;
1069 			goto bad;
1070 		}
1071 		if (!(flags & IP_ALLOWBROADCAST)) {
1072 			error = EACCES;
1073 			goto bad;
1074 		}
1075 		/* don't allow broadcast messages to be fragmented */
1076 		if ((u_short)ip->ip_len > ifp->if_mtu) {
1077 			error = EMSGSIZE;
1078 			goto bad;
1079 		}
1080 		m->m_flags |= M_BCAST;
1081 	} else {
1082 		m->m_flags &= ~M_BCAST;
1083 	}
1084 
1085 sendit:
1086 #if PF
1087 	/* Invoke outbound packet filter */
1088 	if (PF_IS_ENABLED) {
1089 		int rc;
1090 
1091 		m0 = m; /* Save for later */
1092 #if DUMMYNET
1093 		rc = ip_output_pf_dn_hook(ifp, mppn, &m, dn_pf_rule, ro, dst, flags, ipoa);
1094 #else /* DUMMYNET */
1095 		rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1096 #endif /* DUMMYNET */
1097 		if (rc != 0 || m == NULL) {
1098 			/* Move to the next packet */
1099 			m = *mppn;
1100 
1101 			/* Skip ahead if first packet in list got dropped */
1102 			if (packetlist == m0) {
1103 				packetlist = m;
1104 			}
1105 
1106 			if (m != NULL) {
1107 				m0 = m;
1108 				/* Next packet in the chain */
1109 				goto loopit;
1110 			} else if (packetlist != NULL) {
1111 				/* No more packet; send down the chain */
1112 				goto sendchain;
1113 			}
1114 			/* Nothing left; we're done */
1115 			goto done;
1116 		}
1117 		m0 = m;
1118 		ip = mtod(m, struct ip *);
1119 		pkt_dst = ip->ip_dst;
1120 		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1121 	}
1122 #endif /* PF */
1123 	/*
1124 	 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1125 	 */
1126 	if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1127 	    IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1128 		ip_linklocal_stat.iplls_out_total++;
1129 		if (ip->ip_ttl != MAXTTL) {
1130 			ip_linklocal_stat.iplls_out_badttl++;
1131 			ip->ip_ttl = MAXTTL;
1132 		}
1133 	}
1134 
1135 	if (!ipobf.didfilter &&
1136 	    !TAILQ_EMPTY(&ipv4_filters)
1137 #if NECP
1138 	    && !necp_packet_should_skip_filters(m)
1139 #endif // NECP
1140 	    ) {
1141 		struct ipfilter *filter;
1142 		int seen = (inject_filter_ref == NULL);
1143 		ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1144 
1145 		/*
1146 		 * Check that a TSO frame isn't passed to a filter.
1147 		 * This could happen if a filter is inserted while
1148 		 * TCP is sending the TSO packet.
1149 		 */
1150 		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1151 			error = EMSGSIZE;
1152 			goto bad;
1153 		}
1154 
1155 		ipf_ref();
1156 
1157 		/* 4135317 - always pass network byte order to filter */
1158 #if BYTE_ORDER != BIG_ENDIAN
1159 		HTONS(ip->ip_len);
1160 		HTONS(ip->ip_off);
1161 #endif
1162 		TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1163 			if (seen == 0) {
1164 				if ((struct ipfilter *)inject_filter_ref ==
1165 				    filter) {
1166 					seen = 1;
1167 				}
1168 			} else if (filter->ipf_filter.ipf_output) {
1169 				errno_t result;
1170 				result = filter->ipf_filter.
1171 				    ipf_output(filter->ipf_filter.cookie,
1172 				    (mbuf_t *)&m, ippo);
1173 				if (result == EJUSTRETURN) {
1174 					ipf_unref();
1175 					goto done;
1176 				}
1177 				if (result != 0) {
1178 					ipf_unref();
1179 					goto bad;
1180 				}
1181 			}
1182 		}
1183 		/* set back to host byte order */
1184 		ip = mtod(m, struct ip *);
1185 #if BYTE_ORDER != BIG_ENDIAN
1186 		NTOHS(ip->ip_len);
1187 		NTOHS(ip->ip_off);
1188 #endif
1189 		ipf_unref();
1190 	}
1191 
1192 #if NECP
1193 	/* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1194 	necp_matched_policy_id = necp_ip_output_find_policy_match(m,
1195 	    flags, (flags & IP_OUTARGS) ? ipoa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter);
1196 	if (necp_matched_policy_id) {
1197 		necp_mark_packet_from_ip(m, necp_matched_policy_id);
1198 		switch (necp_result) {
1199 		case NECP_KERNEL_POLICY_RESULT_PASS:
1200 			if (necp_result_parameter.pass_flags & NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC) {
1201 				break;
1202 			}
1203 			/* Check if the interface is allowed */
1204 			if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1205 				error = EHOSTUNREACH;
1206 				OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1207 				goto bad;
1208 			}
1209 			goto skip_ipsec;
1210 		case NECP_KERNEL_POLICY_RESULT_DROP:
1211 		case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
1212 			/* Flow divert packets should be blocked at the IP layer */
1213 			error = EHOSTUNREACH;
1214 			OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1215 			goto bad;
1216 		case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
1217 			/* Verify that the packet is being routed to the tunnel */
1218 			struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
1219 			if (policy_ifp == ifp) {
1220 				/* Check if the interface is allowed */
1221 				if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1222 					error = EHOSTUNREACH;
1223 					OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1224 					goto bad;
1225 				}
1226 				goto skip_ipsec;
1227 			} else {
1228 				if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
1229 					/* Check if the interface is allowed */
1230 					if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
1231 						error = EHOSTUNREACH;
1232 						OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1233 						goto bad;
1234 					}
1235 
1236 					/*
1237 					 * Update the QOS marking policy if
1238 					 * 1. up layer asks it to do so
1239 					 * 2. net_qos_policy_restricted is not set
1240 					 * 3. qos_marking_gencount doesn't match necp_kernel_socket_policies_gencount (checked in necp_lookup_current_qos_marking)
1241 					 */
1242 					if (ipoa != NULL &&
1243 					    (ipoa->ipoa_flags & IPOAF_REDO_QOSMARKING_POLICY) &&
1244 					    net_qos_policy_restricted != 0) {
1245 						bool qos_marking = (ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED) ? TRUE : FALSE;
1246 						qos_marking = necp_lookup_current_qos_marking(&ipoa->qos_marking_gencount, NULL, policy_ifp, necp_result_parameter.route_rule_id, qos_marking);
1247 						if (qos_marking) {
1248 							ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1249 						} else {
1250 							ipoa->ipoa_flags &= ~IPOAF_QOSMARKING_ALLOWED;
1251 						}
1252 					}
1253 
1254 					/* Set ifp to the tunnel interface, since it is compatible with the packet */
1255 					ifp = policy_ifp;
1256 					ro = &necp_route;
1257 					goto skip_ipsec;
1258 				} else {
1259 					error = ENETUNREACH;
1260 					OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1261 					goto bad;
1262 				}
1263 			}
1264 		}
1265 		default:
1266 			break;
1267 		}
1268 	}
1269 	/* Catch-all to check if the interface is allowed */
1270 	if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1271 		error = EHOSTUNREACH;
1272 		OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1273 		goto bad;
1274 	}
1275 #endif /* NECP */
1276 
1277 #if IPSEC
1278 	if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) {
1279 		goto skip_ipsec;
1280 	}
1281 
1282 	KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1283 
1284 	if (sp == NULL) {
1285 		/* get SP for this packet */
1286 		if (so != NULL) {
1287 			sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1288 			    so, &error);
1289 		} else {
1290 			sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1291 			    flags, &error);
1292 		}
1293 		if (sp == NULL) {
1294 			IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1295 			KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1296 			    0, 0, 0, 0, 0);
1297 			goto bad;
1298 		}
1299 	}
1300 
1301 	error = 0;
1302 
1303 	/* check policy */
1304 	switch (sp->policy) {
1305 	case IPSEC_POLICY_DISCARD:
1306 	case IPSEC_POLICY_GENERATE:
1307 		/*
1308 		 * This packet is just discarded.
1309 		 */
1310 		IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1311 		KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1312 		    1, 0, 0, 0, 0);
1313 		goto bad;
1314 
1315 	case IPSEC_POLICY_BYPASS:
1316 	case IPSEC_POLICY_NONE:
1317 		/* no need to do IPsec. */
1318 		KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1319 		    2, 0, 0, 0, 0);
1320 		goto skip_ipsec;
1321 
1322 	case IPSEC_POLICY_IPSEC:
1323 		if (sp->req == NULL) {
1324 			/* acquire a policy */
1325 			error = key_spdacquire(sp);
1326 			KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1327 			    3, 0, 0, 0, 0);
1328 			goto bad;
1329 		}
1330 		if (sp->ipsec_if) {
1331 			/* Verify the redirect to ipsec interface */
1332 			if (sp->ipsec_if == ifp) {
1333 				goto skip_ipsec;
1334 			}
1335 			goto bad;
1336 		}
1337 		break;
1338 
1339 	case IPSEC_POLICY_ENTRUST:
1340 	default:
1341 		printf("ip_output: Invalid policy found. %d\n", sp->policy);
1342 	}
1343 	{
1344 		ipsec_state.m = m;
1345 		if (flags & IP_ROUTETOIF) {
1346 			bzero(&ipsec_state.ro, sizeof(ipsec_state.ro));
1347 		} else {
1348 			route_copyout((struct route *)&ipsec_state.ro, ro, sizeof(struct route));
1349 		}
1350 		ipsec_state.dst = SA(dst);
1351 
1352 		ip->ip_sum = 0;
1353 
1354 		/*
1355 		 * XXX
1356 		 * delayed checksums are not currently compatible with IPsec
1357 		 */
1358 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1359 			in_delayed_cksum(m);
1360 		}
1361 
1362 #if BYTE_ORDER != BIG_ENDIAN
1363 		HTONS(ip->ip_len);
1364 		HTONS(ip->ip_off);
1365 #endif
1366 
1367 		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1368 		    struct ip *, ip, struct ifnet *, ifp,
1369 		    struct ip *, ip, struct ip6_hdr *, NULL);
1370 
1371 		error = ipsec4_output(&ipsec_state, sp, flags);
1372 		if (ipsec_state.tunneled == 6) {
1373 			m0 = m = NULL;
1374 			error = 0;
1375 			goto bad;
1376 		}
1377 
1378 		m0 = m = ipsec_state.m;
1379 
1380 #if DUMMYNET
1381 		/*
1382 		 * If we're about to use the route in ipsec_state
1383 		 * and this came from dummynet, clean up now.
1384 		 */
1385 		if (ro == &saved_route &&
1386 		    (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) {
1387 			ROUTE_RELEASE(ro);
1388 		}
1389 #endif /* DUMMYNET */
1390 
1391 		if (flags & IP_ROUTETOIF) {
1392 			/*
1393 			 * if we have tunnel mode SA, we may need to ignore
1394 			 * IP_ROUTETOIF.
1395 			 */
1396 			if (ipsec_state.tunneled) {
1397 				flags &= ~IP_ROUTETOIF;
1398 				ro = (struct route *)&ipsec_state.ro;
1399 			}
1400 		} else {
1401 			ro = (struct route *)&ipsec_state.ro;
1402 		}
1403 		dst = SIN(ipsec_state.dst);
1404 		if (error) {
1405 			/* mbuf is already reclaimed in ipsec4_output. */
1406 			m0 = NULL;
1407 			switch (error) {
1408 			case EHOSTUNREACH:
1409 			case ENETUNREACH:
1410 			case EMSGSIZE:
1411 			case ENOBUFS:
1412 			case ENOMEM:
1413 				break;
1414 			default:
1415 				printf("ip4_output (ipsec): error code %d\n", error);
1416 				OS_FALLTHROUGH;
1417 			case ENOENT:
1418 				/* don't show these error codes to the user */
1419 				error = 0;
1420 				break;
1421 			}
1422 			KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1423 			    4, 0, 0, 0, 0);
1424 			goto bad;
1425 		}
1426 	}
1427 
1428 	/* be sure to update variables that are affected by ipsec4_output() */
1429 	ip = mtod(m, struct ip *);
1430 
1431 #ifdef _IP_VHL
1432 	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1433 #else /* !_IP_VHL */
1434 	hlen = ip->ip_hl << 2;
1435 #endif /* !_IP_VHL */
1436 	/* Check that there wasn't a route change and src is still valid */
1437 	if (ROUTE_UNUSABLE(ro)) {
1438 		ROUTE_RELEASE(ro);
1439 		VERIFY(src_ia == NULL);
1440 		if (ip->ip_src.s_addr != INADDR_ANY &&
1441 		    !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1442 		    (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1443 			error = EADDRNOTAVAIL;
1444 			KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1445 			    5, 0, 0, 0, 0);
1446 			goto bad;
1447 		}
1448 		if (src_ia != NULL) {
1449 			ifa_remref(&src_ia->ia_ifa);
1450 			src_ia = NULL;
1451 		}
1452 	}
1453 
1454 	if (ro->ro_rt == NULL) {
1455 		if (!(flags & IP_ROUTETOIF)) {
1456 			printf("%s: can't update route after "
1457 			    "IPsec processing\n", __func__);
1458 			error = EHOSTUNREACH;   /* XXX */
1459 			KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1460 			    6, 0, 0, 0, 0);
1461 			goto bad;
1462 		}
1463 	} else {
1464 		if (ia != NULL) {
1465 			ifa_remref(&ia->ia_ifa);
1466 		}
1467 		RT_LOCK_SPIN(ro->ro_rt);
1468 		ia = ifatoia(ro->ro_rt->rt_ifa);
1469 		if (ia != NULL) {
1470 			/* Become a regular mutex */
1471 			RT_CONVERT_LOCK(ro->ro_rt);
1472 			ifa_addref(&ia->ia_ifa);
1473 		}
1474 		ifp = ro->ro_rt->rt_ifp;
1475 		RT_UNLOCK(ro->ro_rt);
1476 	}
1477 
1478 	/* make it flipped, again. */
1479 #if BYTE_ORDER != BIG_ENDIAN
1480 	NTOHS(ip->ip_len);
1481 	NTOHS(ip->ip_off);
1482 #endif
1483 	KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1484 	    7, 0xff, 0xff, 0xff, 0xff);
1485 
1486 	/* Pass to filters again */
1487 	if (!TAILQ_EMPTY(&ipv4_filters)
1488 #if NECP
1489 	    && !necp_packet_should_skip_filters(m)
1490 #endif // NECP
1491 	    ) {
1492 		struct ipfilter *filter;
1493 
1494 		ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1495 
1496 		/*
1497 		 * Check that a TSO frame isn't passed to a filter.
1498 		 * This could happen if a filter is inserted while
1499 		 * TCP is sending the TSO packet.
1500 		 */
1501 		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1502 			error = EMSGSIZE;
1503 			goto bad;
1504 		}
1505 
1506 		ipf_ref();
1507 
1508 		/* 4135317 - always pass network byte order to filter */
1509 #if BYTE_ORDER != BIG_ENDIAN
1510 		HTONS(ip->ip_len);
1511 		HTONS(ip->ip_off);
1512 #endif
1513 		TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1514 			if (filter->ipf_filter.ipf_output) {
1515 				errno_t result;
1516 				result = filter->ipf_filter.
1517 				    ipf_output(filter->ipf_filter.cookie,
1518 				    (mbuf_t *)&m, ippo);
1519 				if (result == EJUSTRETURN) {
1520 					ipf_unref();
1521 					goto done;
1522 				}
1523 				if (result != 0) {
1524 					ipf_unref();
1525 					goto bad;
1526 				}
1527 			}
1528 		}
1529 		/* set back to host byte order */
1530 		ip = mtod(m, struct ip *);
1531 #if BYTE_ORDER != BIG_ENDIAN
1532 		NTOHS(ip->ip_len);
1533 		NTOHS(ip->ip_off);
1534 #endif
1535 		ipf_unref();
1536 	}
1537 skip_ipsec:
1538 #endif /* IPSEC */
1539 
1540 
1541 	/* 127/8 must not appear on wire - RFC1122 */
1542 	if (!(ifp->if_flags & IFF_LOOPBACK) &&
1543 	    ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1544 	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1545 		OSAddAtomic(1, &ipstat.ips_badaddr);
1546 		error = EADDRNOTAVAIL;
1547 		goto bad;
1548 	}
1549 
1550 	if (ipoa != NULL) {
1551 		u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
1552 
1553 		error = set_packet_qos(m, ifp,
1554 		    ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
1555 		    ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
1556 		if (error == 0) {
1557 			ip->ip_tos &= IPTOS_ECN_MASK;
1558 			ip->ip_tos |= (u_char)(dscp << IPTOS_DSCP_SHIFT);
1559 		} else {
1560 			printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
1561 			error = 0;
1562 		}
1563 	}
1564 
1565 	ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1566 	    ip->ip_len, &sw_csum);
1567 
1568 	interface_mtu = ifp->if_mtu;
1569 
1570 	if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
1571 		interface_mtu = IN6_LINKMTU(ifp);
1572 		/* Further adjust the size for CLAT46 expansion */
1573 		interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
1574 	}
1575 
1576 	/*
1577 	 * If small enough for interface, or the interface will take
1578 	 * care of the fragmentation for us, can just send directly.
1579 	 */
1580 	if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
1581 	    (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1582 #if BYTE_ORDER != BIG_ENDIAN
1583 		HTONS(ip->ip_len);
1584 		HTONS(ip->ip_off);
1585 #endif
1586 
1587 		ip->ip_sum = 0;
1588 		if ((sw_csum & CSUM_DELAY_IP) || __improbable(force_ipsum != 0)) {
1589 			ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1590 			sw_csum &= ~CSUM_DELAY_IP;
1591 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1592 		}
1593 
1594 #if IPSEC
1595 		/* clean ipsec history once it goes out of the node */
1596 		if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1597 			ipsec_delaux(m);
1598 		}
1599 #endif /* IPSEC */
1600 		if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1601 		    (m->m_pkthdr.tso_segsz > 0)) {
1602 			scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1603 		} else {
1604 			scnt++;
1605 		}
1606 
1607 		if (packetchain == 0) {
1608 			if (ro->ro_rt != NULL && nstat_collect) {
1609 				nstat_route_tx(ro->ro_rt, scnt,
1610 				    m->m_pkthdr.len, 0);
1611 			}
1612 
1613 			error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1614 			    SA(dst), 0, adv);
1615 			if (dlil_verbose && error) {
1616 				printf("dlil_output error on interface %s: %d\n",
1617 				    ifp->if_xname, error);
1618 			}
1619 			scnt = 0;
1620 			goto done;
1621 		} else {
1622 			/*
1623 			 * packet chaining allows us to reuse the
1624 			 * route for all packets
1625 			 */
1626 			bytecnt += m->m_pkthdr.len;
1627 			mppn = &m->m_nextpkt;
1628 			m = m->m_nextpkt;
1629 			if (m == NULL) {
1630 #if PF
1631 sendchain:
1632 #endif /* PF */
1633 				if (pktcnt > ip_maxchainsent) {
1634 					ip_maxchainsent = pktcnt;
1635 				}
1636 				if (ro->ro_rt != NULL && nstat_collect) {
1637 					nstat_route_tx(ro->ro_rt, scnt,
1638 					    bytecnt, 0);
1639 				}
1640 
1641 				error = dlil_output(ifp, PF_INET, packetlist,
1642 				    ro->ro_rt, SA(dst), 0, adv);
1643 				if (dlil_verbose && error) {
1644 					printf("dlil_output error on interface %s: %d\n",
1645 					    ifp->if_xname, error);
1646 				}
1647 				pktcnt = 0;
1648 				scnt = 0;
1649 				bytecnt = 0;
1650 				goto done;
1651 			}
1652 			m0 = m;
1653 			pktcnt++;
1654 			goto loopit;
1655 		}
1656 	}
1657 
1658 	VERIFY(interface_mtu != 0);
1659 	/*
1660 	 * Too large for interface; fragment if possible.
1661 	 * Must be able to put at least 8 bytes per fragment.
1662 	 * Balk when DF bit is set or the interface didn't support TSO.
1663 	 */
1664 	if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1665 	    (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1666 		error = EMSGSIZE;
1667 		/*
1668 		 * This case can happen if the user changed the MTU
1669 		 * of an interface after enabling IP on it.  Because
1670 		 * most netifs don't keep track of routes pointing to
1671 		 * them, there is no way for one to update all its
1672 		 * routes when the MTU is changed.
1673 		 */
1674 		if (ro->ro_rt) {
1675 			RT_LOCK_SPIN(ro->ro_rt);
1676 			if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1677 			    !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1678 			    (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
1679 				ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
1680 			}
1681 			RT_UNLOCK(ro->ro_rt);
1682 		}
1683 		if (pktcnt > 0) {
1684 			m0 = packetlist;
1685 		}
1686 		OSAddAtomic(1, &ipstat.ips_cantfrag);
1687 		goto bad;
1688 	}
1689 
1690 	/*
1691 	 * XXX Only TCP seems to be passing a list of packets here.
1692 	 * The following issue is limited to UDP datagrams with 0 checksum.
1693 	 * For now limit it to the case when single packet is passed down.
1694 	 */
1695 	if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
1696 		/*
1697 		 * If it is a UDP packet that has checksum set to 0
1698 		 * and is also not being offloaded, compute a full checksum
1699 		 * and update the UDP checksum.
1700 		 */
1701 		if (ip->ip_p == IPPROTO_UDP &&
1702 		    !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
1703 			struct udphdr *uh = NULL;
1704 
1705 			if (m->m_len < hlen + sizeof(struct udphdr)) {
1706 				m = m_pullup(m, hlen + sizeof(struct udphdr));
1707 				if (m == NULL) {
1708 					error = ENOBUFS;
1709 					m0 = m;
1710 					goto bad;
1711 				}
1712 				m0 = m;
1713 				ip = mtod(m, struct ip *);
1714 			}
1715 			/*
1716 			 * Get UDP header and if checksum is 0, then compute the full
1717 			 * checksum.
1718 			 */
1719 			uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
1720 			if (uh->uh_sum == 0) {
1721 				uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
1722 				    ip->ip_len - hlen);
1723 				if (uh->uh_sum == 0) {
1724 					uh->uh_sum = 0xffff;
1725 				}
1726 			}
1727 		}
1728 	}
1729 
1730 	error = ip_fragment(m, ifp, interface_mtu, sw_csum);
1731 	if (error != 0) {
1732 		m0 = m = NULL;
1733 		goto bad;
1734 	}
1735 
1736 	KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1737 	    ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1738 
1739 	for (m = m0; m; m = m0) {
1740 		m0 = m->m_nextpkt;
1741 		m->m_nextpkt = 0;
1742 #if IPSEC
1743 		/* clean ipsec history once it goes out of the node */
1744 		if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1745 			ipsec_delaux(m);
1746 		}
1747 #endif /* IPSEC */
1748 		if (error == 0) {
1749 			if ((packetchain != 0) && (pktcnt > 0)) {
1750 				panic("%s: mix of packet in packetlist is "
1751 				    "wrong=%p", __func__, packetlist);
1752 				/* NOTREACHED */
1753 			}
1754 			if (ro->ro_rt != NULL && nstat_collect) {
1755 				nstat_route_tx(ro->ro_rt, 1,
1756 				    m->m_pkthdr.len, 0);
1757 			}
1758 			error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1759 			    SA(dst), 0, adv);
1760 			if (dlil_verbose && error) {
1761 				printf("dlil_output error on interface %s: %d\n",
1762 				    ifp->if_xname, error);
1763 			}
1764 		} else {
1765 			m_freem(m);
1766 		}
1767 	}
1768 
1769 	if (error == 0) {
1770 		OSAddAtomic(1, &ipstat.ips_fragmented);
1771 	}
1772 
1773 done:
1774 	if (ia != NULL) {
1775 		ifa_remref(&ia->ia_ifa);
1776 		ia = NULL;
1777 	}
1778 #if IPSEC
1779 	ROUTE_RELEASE(&ipsec_state.ro);
1780 	if (sp != NULL) {
1781 		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1782 		    printf("DP ip_output call free SP:%x\n", sp));
1783 		key_freesp(sp, KEY_SADB_UNLOCKED);
1784 	}
1785 #endif /* IPSEC */
1786 #if NECP
1787 	ROUTE_RELEASE(&necp_route);
1788 #endif /* NECP */
1789 #if DUMMYNET
1790 	ROUTE_RELEASE(&saved_route);
1791 #endif /* DUMMYNET */
1792 
1793 	KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
1794 	if (ip_output_measure) {
1795 		net_perf_measure_time(&net_perf, &start_tv, packets_processed);
1796 		net_perf_histogram(&net_perf, packets_processed);
1797 	}
1798 	return error;
1799 bad:
1800 	if (pktcnt > 0) {
1801 		m0 = packetlist;
1802 	}
1803 	m_freem_list(m0);
1804 	goto done;
1805 
1806 #undef ipsec_state
1807 #undef args
1808 #undef sro_fwd
1809 #undef saved_route
1810 #undef ipf_pktopts
1811 #undef IP_CHECK_RESTRICTIONS
1812 }
1813 
1814 int
ip_fragment(struct mbuf * m,struct ifnet * ifp,uint32_t mtu,int sw_csum)1815 ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum)
1816 {
1817 	struct ip *ip, *mhip;
1818 	int len, hlen, mhlen, firstlen, off, error = 0;
1819 	struct mbuf **mnext = &m->m_nextpkt, *m0;
1820 	int nfrags = 1;
1821 
1822 	ip = mtod(m, struct ip *);
1823 #ifdef _IP_VHL
1824 	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1825 #else /* !_IP_VHL */
1826 	hlen = ip->ip_hl << 2;
1827 #endif /* !_IP_VHL */
1828 
1829 	/*
1830 	 * We need to adjust the fragment sizes to account
1831 	 * for IPv6 fragment header if it needs to be translated
1832 	 * from IPv4 to IPv6.
1833 	 */
1834 	if (IS_INTF_CLAT46(ifp)) {
1835 		mtu -= sizeof(struct ip6_frag);
1836 	}
1837 
1838 	firstlen = len = (mtu - hlen) & ~7;
1839 	if (len < 8) {
1840 		m_freem(m);
1841 		return EMSGSIZE;
1842 	}
1843 
1844 	/*
1845 	 * if the interface will not calculate checksums on
1846 	 * fragmented packets, then do it here.
1847 	 */
1848 	if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
1849 	    !(ifp->if_hwassist & CSUM_IP_FRAGS)) {
1850 		in_delayed_cksum(m);
1851 	}
1852 
1853 	/*
1854 	 * Loop through length of segment after first fragment,
1855 	 * make new header and copy data of each part and link onto chain.
1856 	 */
1857 	m0 = m;
1858 	mhlen = sizeof(struct ip);
1859 	for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1860 		MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1861 		if (m == NULL) {
1862 			error = ENOBUFS;
1863 			OSAddAtomic(1, &ipstat.ips_odropped);
1864 			goto sendorfree;
1865 		}
1866 		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1867 		m->m_data += max_linkhdr;
1868 		mhip = mtod(m, struct ip *);
1869 		*mhip = *ip;
1870 		if (hlen > sizeof(struct ip)) {
1871 			mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
1872 			mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1873 		}
1874 		m->m_len = mhlen;
1875 		mhip->ip_off = (u_short)(((off - hlen) >> 3) + (ip->ip_off & ~IP_MF));
1876 		if (ip->ip_off & IP_MF) {
1877 			mhip->ip_off |= IP_MF;
1878 		}
1879 		if (off + len >= (u_short)ip->ip_len) {
1880 			len = (u_short)ip->ip_len - off;
1881 		} else {
1882 			mhip->ip_off |= IP_MF;
1883 		}
1884 		mhip->ip_len = htons((u_short)(len + mhlen));
1885 		m->m_next = m_copy(m0, off, len);
1886 		if (m->m_next == NULL) {
1887 			(void) m_free(m);
1888 			error = ENOBUFS;        /* ??? */
1889 			OSAddAtomic(1, &ipstat.ips_odropped);
1890 			goto sendorfree;
1891 		}
1892 		m->m_pkthdr.len = mhlen + len;
1893 		m->m_pkthdr.rcvif = NULL;
1894 		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1895 
1896 		M_COPY_CLASSIFIER(m, m0);
1897 		M_COPY_PFTAG(m, m0);
1898 		M_COPY_NECPTAG(m, m0);
1899 
1900 #if BYTE_ORDER != BIG_ENDIAN
1901 		HTONS(mhip->ip_off);
1902 #endif
1903 
1904 		mhip->ip_sum = 0;
1905 		if (sw_csum & CSUM_DELAY_IP) {
1906 			mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
1907 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1908 		}
1909 		*mnext = m;
1910 		mnext = &m->m_nextpkt;
1911 		nfrags++;
1912 	}
1913 	OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1914 
1915 	/* set first/last markers for fragment chain */
1916 	m->m_flags |= M_LASTFRAG;
1917 	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1918 	m0->m_pkthdr.csum_data = nfrags;
1919 
1920 	/*
1921 	 * Update first fragment by trimming what's been copied out
1922 	 * and updating header, then send each fragment (in order).
1923 	 */
1924 	m = m0;
1925 	m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1926 	m->m_pkthdr.len = hlen + firstlen;
1927 	ip->ip_len = htons((u_short)m->m_pkthdr.len);
1928 	ip->ip_off |= IP_MF;
1929 
1930 #if BYTE_ORDER != BIG_ENDIAN
1931 	HTONS(ip->ip_off);
1932 #endif
1933 
1934 	ip->ip_sum = 0;
1935 	if (sw_csum & CSUM_DELAY_IP) {
1936 		ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1937 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1938 	}
1939 sendorfree:
1940 	if (error) {
1941 		m_freem_list(m0);
1942 	}
1943 
1944 	return error;
1945 }
1946 
1947 static void
ip_out_cksum_stats(int proto,u_int32_t len)1948 ip_out_cksum_stats(int proto, u_int32_t len)
1949 {
1950 	switch (proto) {
1951 	case IPPROTO_TCP:
1952 		tcp_out_cksum_stats(len);
1953 		break;
1954 	case IPPROTO_UDP:
1955 		udp_out_cksum_stats(len);
1956 		break;
1957 	default:
1958 		/* keep only TCP or UDP stats for now */
1959 		break;
1960 	}
1961 }
1962 
/*
 * Process a delayed payload checksum calculation (outbound path.)
 *
 * m must be a packet-header mbuf (this is verified below).
 *
 * hoff is the number of bytes beyond the mbuf data pointer which
 * points to the IP header.
 *
 * csum_flags selects the delayed checksums the caller wants finalized.
 * Only the CSUM_DELAY_IP and CSUM_DELAY_DATA bits that are also set in
 * m->m_pkthdr.csum_flags are acted upon; if none remain, nothing is done.
 *
 * For each checksum computed here, the result is written into the packet
 * and the corresponding request bits are cleared from
 * m->m_pkthdr.csum_flags.
 *
 * Returns a bitmask representing all the work done in software.
 */
uint32_t
in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
{
	/* scratch copy of the IP header; 15 << 2 == 60 bytes, the largest hlen */
	unsigned char buf[15 << 2] __attribute__((aligned(8)));
	struct ip *ip;
	uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
	uint16_t csum, ip_len;

	_CASSERT(sizeof(csum) == sizeof(uint16_t));
	VERIFY(m->m_flags & M_PKTHDR);

	/* only consider what is both requested and still pending on the mbuf */
	sw_csum = (csum_flags & m->m_pkthdr.csum_flags);

	if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) {
		goto done;
	}

	mlen = m->m_pkthdr.len;                         /* total mbuf len */

	/* sanity check (need at least simple IP header) */
	if (mlen < (hoff + sizeof(*ip))) {
		panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
		    "(%u+%u)\n", __func__, m, mlen, hoff,
		    (uint32_t)sizeof(*ip));
		/* NOTREACHED */
	}

	/*
	 * In case the IP header is not contiguous, or not 32-bit aligned,
	 * or if we're computing the IP header checksum, copy it to a local
	 * buffer.  Copy only the simple IP header here (IP options case
	 * is handled below.)
	 *
	 * _hlen records how many header bytes currently live in buf;
	 * 0 means ip points straight into the mbuf data.
	 */
	if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof(*ip)) > m->m_len ||
	    !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
		m_copydata(m, hoff, sizeof(*ip), (caddr_t)buf);
		ip = (struct ip *)(void *)buf;
		_hlen = sizeof(*ip);
	} else {
		ip = (struct ip *)(void *)(m->m_data + hoff);
		_hlen = 0;
	}

	hlen = IP_VHL_HL(ip->ip_vhl) << 2;              /* IP header len */

	/* sanity check */
	if (mlen < (hoff + hlen)) {
		panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
		    "hoff %u", __func__, m, mlen, hlen, hoff);
		/* NOTREACHED */
	}

	/*
	 * We could be in the context of an IP or interface filter; in the
	 * former case, ip_len would be in host (correct) order while for
	 * the latter it would be in network order.  Because of this, we
	 * attempt to interpret the length field by comparing it against
	 * the actual packet length.  If the comparison fails, byte swap
	 * the length and check again.  If it still fails, use the actual
	 * packet length.  This also covers the trailing bytes case.
	 */
	ip_len = ip->ip_len;
	if (ip_len != (mlen - hoff)) {
		ip_len = OSSwapInt16(ip_len);
		if (ip_len != (mlen - hoff)) {
			printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
			    "[swapped %d (%x)] doesn't match actual packet "
			    "length; %d is used instead\n", __func__,
			    (uint64_t)VM_KERNEL_ADDRHASH(m), ip->ip_p,
			    ip->ip_len, ip->ip_len, ip_len, ip_len,
			    (mlen - hoff));
			/* the fallback length must still fit the 16-bit field */
			if (mlen - hoff > UINT16_MAX) {
				panic("%s: mlen %u - hoff %u > 65535",
				    __func__, mlen, hoff);
			}
			ip_len = (uint16_t)(mlen - hoff);
		}
	}

	len = ip_len - hlen;                            /* csum span */

	if (sw_csum & CSUM_DELAY_DATA) {
		uint16_t ulpoff;

		/*
		 * offset is added to the lower 16-bit value of csum_data,
		 * which is expected to contain the ULP offset; therefore
		 * CSUM_PARTIAL offset adjustment must be undone.
		 */
		if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL | CSUM_DATA_VALID)) ==
		    (CSUM_PARTIAL | CSUM_DATA_VALID)) {
			/*
			 * Get back the original ULP offset (this will
			 * undo the CSUM_PARTIAL logic in ip_output.)
			 */
			m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
			    m->m_pkthdr.csum_tx_start);
		}

		ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
		offset = hoff + hlen;                   /* ULP header */

		/*
		 * NOTE(review): this bound compares the packet length with
		 * ulpoff alone, whereas the checksum is stored further below
		 * at (hoff + hlen + ulpoff); confirm whether the looser
		 * check is intentional.
		 */
		if (mlen < (ulpoff + sizeof(csum))) {
			panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
			    "cksum offset (%u) cksum flags 0x%x\n", __func__,
			    m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
			/* NOTREACHED */
		}

		/* checksum len bytes of the packet starting at the ULP header */
		csum = inet_cksum(m, 0, offset, len);

		/* Update stats */
		ip_out_cksum_stats(ip->ip_p, len);

		/* RFC1122 4.1.3.4 */
		if (csum == 0 &&
		    (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_ZERO_INVERT))) {
			csum = 0xffff;
		}

		/*
		 * Insert the checksum in the ULP csum field.  Three ways to
		 * store it: through the mbuf chain when the field is not
		 * within the first mbuf, a direct 16-bit store when the
		 * header is aligned, or a byte copy otherwise.
		 */
		offset += ulpoff;
		if (offset + sizeof(csum) > m->m_len) {
			m_copyback(m, offset, sizeof(csum), &csum);
		} else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
			*(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
		} else {
			bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
		}
		m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
		    CSUM_PARTIAL | CSUM_ZERO_INVERT);
	}

	if (sw_csum & CSUM_DELAY_IP) {
		/* IP header must be in the local buffer */
		VERIFY(_hlen == sizeof(*ip));
		if (_hlen != hlen) {
			/* header carries options: re-copy the full header */
			VERIFY(hlen <= sizeof(buf));
			m_copydata(m, hoff, hlen, (caddr_t)buf);
			ip = (struct ip *)(void *)buf;
			_hlen = hlen;
		}

		/*
		 * Compute the IP header checksum as if the IP length
		 * is the length which we believe is "correct"; see
		 * how ip_len gets calculated above.  Note that this
		 * is done on the local copy and not on the real one.
		 */
		ip->ip_len = htons(ip_len);
		ip->ip_sum = 0;
		csum = in_cksum_hdr_opt(ip);

		/* Update stats */
		ipstat.ips_snd_swcsum++;
		ipstat.ips_snd_swcsum_bytes += hlen;

		/*
		 * Insert only the checksum in the existing IP header
		 * csum field; all other fields are left unchanged.
		 */
		offset = hoff + offsetof(struct ip, ip_sum);
		if (offset + sizeof(csum) > m->m_len) {
			m_copyback(m, offset, sizeof(csum), &csum);
		} else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
			*(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
		} else {
			bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
		}
		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
	}

done:
	return sw_csum;
}
2146 
/*
 * Insert IP options into preformed packet.
 * Adjust IP destination as required for IP source routing,
 * as indicated by a non-zero in_addr at the start of the options.
 *
 * m is the packet, starting with a plain IP header.  opt holds a
 * struct ipoption: a first-hop destination (ipopt_dst) followed by the
 * option bytes (ipopt_list).  On success *phlen is set to the new IP
 * header length (base header plus options).
 *
 * Returns the mbuf that now heads the packet; this is either m itself or
 * a newly allocated header mbuf chained in front of m.  When the options
 * would push the packet past IP_MAXPACKET, or when a needed header mbuf
 * cannot be allocated, the packet is returned without the options and
 * *phlen is left untouched.
 *
 * XXX This routine assumes that the packet has no options in place.
 */
static struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
	struct ipoption *p = mtod(opt, struct ipoption *);
	struct mbuf *n;
	struct ip *ip = mtod(m, struct ip *);
	unsigned optlen;

	/* option bytes proper, i.e. without the leading first-hop address */
	optlen = opt->m_len - sizeof(p->ipopt_dst);
	if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
		return m;             /* XXX should fail */
	}
	/* a non-zero first hop replaces the packet's destination */
	if (p->ipopt_dst.s_addr) {
		ip->ip_dst = p->ipopt_dst;
	}
	if (m->m_flags & M_EXT || m_mtod_current(m) - optlen < m->m_pktdat) {
		/*
		 * The options cannot be prepended in place (external
		 * storage, or moving the data pointer back by optlen would
		 * fall before m_pktdat): put IP header + options in a new
		 * header mbuf and strip the old IP header from m.
		 */
		MGETHDR(n, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
		if (n == NULL) {
			return m;
		}
		n->m_pkthdr.rcvif = 0;
		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
		m->m_len -= sizeof(struct ip);
		m->m_data += sizeof(struct ip);
		n->m_next = m;
		m = n;
		m->m_len = optlen + sizeof(struct ip);
		/* leave max_linkhdr bytes of leading space in the new mbuf */
		m->m_data += max_linkhdr;
		/* ip still points at the old header bytes inside the old mbuf */
		(void) memcpy(mtod(m, void *), ip, sizeof(struct ip));
	} else {
		/* grow the mbuf at the front and slide the IP header back */
		m->m_data -= optlen;
		m->m_len += optlen;
		m->m_pkthdr.len += optlen;
		ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
	}
	ip = mtod(m, struct ip *);
	/* options go right after the base header */
	bcopy(p->ipopt_list, ip + 1, optlen);
	*phlen = sizeof(struct ip) + optlen;
	ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
	ip->ip_len += optlen;
	return m;
}
2196 
2197 /*
2198  * Copy options from ip to jp,
2199  * omitting those not copied during fragmentation.
2200  */
2201 static int
ip_optcopy(struct ip * ip,struct ip * jp)2202 ip_optcopy(struct ip *ip, struct ip *jp)
2203 {
2204 	u_char *cp, *dp;
2205 	int opt, optlen, cnt;
2206 
2207 	cp = (u_char *)(ip + 1);
2208 	dp = (u_char *)(jp + 1);
2209 	cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
2210 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
2211 		opt = cp[0];
2212 		if (opt == IPOPT_EOL) {
2213 			break;
2214 		}
2215 		if (opt == IPOPT_NOP) {
2216 			/* Preserve for IP mcast tunnel's LSRR alignment. */
2217 			*dp++ = IPOPT_NOP;
2218 			optlen = 1;
2219 			continue;
2220 		}
2221 #if DIAGNOSTIC
2222 		if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2223 			panic("malformed IPv4 option passed to ip_optcopy");
2224 			/* NOTREACHED */
2225 		}
2226 #endif
2227 		optlen = cp[IPOPT_OLEN];
2228 #if DIAGNOSTIC
2229 		if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2230 			panic("malformed IPv4 option passed to ip_optcopy");
2231 			/* NOTREACHED */
2232 		}
2233 #endif
2234 		/* bogus lengths should have been caught by ip_dooptions */
2235 		if (optlen > cnt) {
2236 			optlen = cnt;
2237 		}
2238 		if (IPOPT_COPIED(opt)) {
2239 			bcopy(cp, dp, optlen);
2240 			dp += optlen;
2241 		}
2242 	}
2243 	for (optlen = (int)(dp - (u_char *)(jp + 1)); optlen & 0x3; optlen++) {
2244 		*dp++ = IPOPT_EOL;
2245 	}
2246 	return optlen;
2247 }
2248 
/*
 * IP socket option processing.
 *
 * Handles get and set requests at level IPPROTO_IP for socket so; the
 * request (direction, option name, value buffer) is described by sopt.
 * Any other level is rejected with EINVAL.
 *
 * Returns 0 on success or an errno value.  Note that the IP_OPTIONS set
 * path and the IP_TRAFFIC_MGT_BACKGROUND get path return directly from
 * inside the switch instead of going through the common exit.
 */
int
ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct  inpcb *inp = sotoinpcb(so);
	int     error, optval;
	lck_mtx_t *mutex_held = NULL;

	error = optval = 0;
	if (sopt->sopt_level != IPPROTO_IP) {
		return EINVAL;
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
		/*
		 *  Wait if we are in the middle of ip_output
		 *  as we unlocked the socket there and don't
		 *  want to overwrite the IP options
		 */
		if (inp->inp_sndinprog_cnt > 0) {
			inp->inp_sndingprog_waiters++;

			/*
			 * The return value of msleep() is ignored; the loop
			 * only ends once the in-progress count is zero.
			 */
			while (inp->inp_sndinprog_cnt > 0) {
				msleep(&inp->inp_sndinprog_cnt, mutex_held,
				    PSOCK | PCATCH, "inp_sndinprog_cnt", NULL);
			}
			inp->inp_sndingprog_waiters--;
		}
		switch (sopt->sopt_name) {
#ifdef notyet
		case IP_RETOPTS:
#endif
		case IP_OPTIONS: {
			struct mbuf *m;

			/* the options must fit in a single mbuf */
			if (sopt->sopt_valsize > MLEN) {
				error = EMSGSIZE;
				break;
			}
			/* do not block on allocation for in-kernel callers */
			MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
			    MT_HEADER);
			if (m == NULL) {
				error = ENOBUFS;
				break;
			}
			m->m_len = (int32_t)sopt->sopt_valsize;
			error = sooptcopyin(sopt, mtod(m, char *),
			    m->m_len, m->m_len);
			if (error) {
				m_freem(m);
				break;
			}

			/* m is handed over to ip_pcbopts() from here on */
			return ip_pcbopts(sopt->sopt_name,
			           &inp->inp_options, m);
		}

		/* options whose value is a single int */
		case IP_TOS:
		case IP_TTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_RECVPKTINFO:
		case IP_RECVTOS:
		case IP_DONTFRAG:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error) {
				break;
			}

			switch (sopt->sopt_name) {
			case IP_TOS:
				/*
				 * NOTE(review): only the upper bound is
				 * checked; a negative optval passes and is
				 * truncated by the cast below.
				 */
				if (optval > UINT8_MAX) {
					error = EINVAL;
					break;
				}
				inp->inp_ip_tos = (uint8_t)optval;
				break;

			case IP_TTL:
				/* NOTE(review): same bound check as IP_TOS */
				if (optval > UINT8_MAX) {
					error = EINVAL;
					break;
				}
				inp->inp_ip_ttl = (uint8_t)optval;
				break;
/* set or clear `bit' in inp_flags depending on optval being non-zero */
#define OPTSET(bit) do {                                                \
	if (optval) {                                                   \
	    inp->inp_flags |= bit;                                      \
	} else {                                                        \
	    inp->inp_flags &= ~bit;                                     \
	}                                                               \
} while (0)

/* same as OPTSET, but operating on inp_flags2 */
#define OPTSET2(bit) do {                                               \
	if (optval) {                                                   \
	    inp->inp_flags2 |= bit;                                     \
	} else {                                                        \
	    inp->inp_flags2 &= ~bit;                                    \
	}                                                               \
} while (0)

			case IP_RECVOPTS:
				OPTSET(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				OPTSET(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				OPTSET(INP_RECVDSTADDR);
				break;

			case IP_RECVIF:
				OPTSET(INP_RECVIF);
				break;

			case IP_RECVTTL:
				OPTSET(INP_RECVTTL);
				break;

			case IP_RECVPKTINFO:
				OPTSET(INP_PKTINFO);
				break;

			case IP_RECVTOS:
				OPTSET(INP_RECVTOS);
				break;

			case IP_DONTFRAG:
				/* This option is settable only for IPv4 */
				if (!(inp->inp_vflag & INP_IPV4)) {
					error = EINVAL;
					break;
				}
				OPTSET2(INP2_DONTFRAG);
				break;
#undef OPTSET
#undef OPTSET2
			}
			break;
		/*
		 * Multicast socket options are processed by the in_mcast
		 * module.
		 */
		case IP_MULTICAST_IF:
		case IP_MULTICAST_IFINDEX:
		case IP_MULTICAST_VIF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_ADD_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
		case IP_ADD_SOURCE_MEMBERSHIP:
		case IP_DROP_SOURCE_MEMBERSHIP:
		case IP_BLOCK_SOURCE:
		case IP_UNBLOCK_SOURCE:
		case IP_MSFILTER:
		case MCAST_JOIN_GROUP:
		case MCAST_LEAVE_GROUP:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP:
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
			error = inp_setmoptions(inp, sopt);
			break;

		case IP_PORTRANGE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error) {
				break;
			}

			/* INP_LOWPORT and INP_HIGHPORT are kept mutually exclusive */
			switch (optval) {
			case IP_PORTRANGE_DEFAULT:
				inp->inp_flags &= ~(INP_LOWPORT);
				inp->inp_flags &= ~(INP_HIGHPORT);
				break;

			case IP_PORTRANGE_HIGH:
				inp->inp_flags &= ~(INP_LOWPORT);
				inp->inp_flags |= INP_HIGHPORT;
				break;

			case IP_PORTRANGE_LOW:
				inp->inp_flags &= ~(INP_HIGHPORT);
				inp->inp_flags |= INP_LOWPORT;
				break;

			default:
				error = EINVAL;
				break;
			}
			break;

#if IPSEC
		case IP_IPSEC_POLICY: {
			caddr_t req = NULL;
			size_t len = 0;
			int priv;
			struct mbuf *m;
			int optname;

			if ((error = soopt_getm(sopt, &m)) != 0) { /* XXX */
				break;
			}
			/*
			 * NOTE(review): m is not freed here on failure;
			 * presumably soopt_mcopyin() releases it -- confirm.
			 */
			if ((error = soopt_mcopyin(sopt, m)) != 0) { /* XXX */
				break;
			}
			/* priv is non-zero when proc_suser() returns 0 */
			priv = (proc_suser(sopt->sopt_p) == 0);
			if (m) {
				req = mtod(m, caddr_t);
				len = m->m_len;
			}
			optname = sopt->sopt_name;
			error = ipsec4_set_policy(inp, optname, req, len, priv);
			m_freem(m);
			break;
		}
#endif /* IPSEC */

#if TRAFFIC_MGT
		case IP_TRAFFIC_MGT_BACKGROUND: {
			unsigned background = 0;

			error = sooptcopyin(sopt, &background,
			    sizeof(background), sizeof(background));
			if (error) {
				break;
			}

			if (background) {
				socket_set_traffic_mgt_flags_locked(so,
				    TRAFFIC_MGT_SO_BACKGROUND);
			} else {
				socket_clear_traffic_mgt_flags_locked(so,
				    TRAFFIC_MGT_SO_BACKGROUND);
			}

			break;
		}
#endif /* TRAFFIC_MGT */

		/*
		 * On a multihomed system, scoped routing can be used to
		 * restrict the source interface used for sending packets.
		 * The socket option IP_BOUND_IF binds a particular AF_INET
		 * socket to an interface such that data sent on the socket
		 * is restricted to that interface.  This is unlike the
		 * SO_DONTROUTE option where the routing table is bypassed;
		 * therefore it allows for a greater flexibility and control
		 * over the system behavior, and does not place any restriction
		 * on the destination address type (e.g.  unicast, multicast,
		 * or broadcast if applicable) or whether or not the host is
		 * directly reachable.  Note that in the multicast transmit
		 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
		 * IP_BOUND_IF, since the former practically bypasses the
		 * routing table; in this case, IP_BOUND_IF sets the default
		 * interface used for sending multicast packets in the absence
		 * of an explicit multicast transmit interface.
		 */
		case IP_BOUND_IF:
			/* This option is settable only for IPv4 */
			if (!(inp->inp_vflag & INP_IPV4)) {
				error = EINVAL;
				break;
			}

			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));

			if (error) {
				break;
			}

			error = inp_bindif(inp, optval, NULL);
			break;

		case IP_NO_IFT_CELLULAR:
			/* This option is settable only for IPv4 */
			if (!(inp->inp_vflag & INP_IPV4)) {
				error = EINVAL;
				break;
			}

			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));

			if (error) {
				break;
			}

			/* once set, it cannot be unset */
			if (!optval && INP_NO_CELLULAR(inp)) {
				error = EINVAL;
				break;
			}

			/*
			 * NOTE(review): the restriction is requested whatever
			 * the value of optval once the check above passes,
			 * i.e. also for optval == 0 on a socket that does not
			 * have it yet -- confirm this is intended.
			 */
			error = so_set_restrictions(so,
			    SO_RESTRICT_DENY_CELLULAR);
			break;

		case IP_OUT_IF:
			/* This option is not settable */
			error = EINVAL;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_OPTIONS:
		case IP_RETOPTS:
			/* report the stored options mbuf, or an empty value */
			if (inp->inp_options) {
				error = sooptcopyout(sopt,
				    mtod(inp->inp_options, char *),
				    inp->inp_options->m_len);
			} else {
				sopt->sopt_valsize = 0;
			}
			break;

		/* options reported as a single int */
		case IP_TOS:
		case IP_TTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_PORTRANGE:
		case IP_RECVPKTINFO:
		case IP_RECVTOS:
		case IP_DONTFRAG:
			switch (sopt->sopt_name) {
			case IP_TOS:
				optval = inp->inp_ip_tos;
				break;

			case IP_TTL:
				optval = inp->inp_ip_ttl;
				break;

/* 1 when `bit' is set in inp_flags (resp. inp_flags2), else 0 */
#define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
#define OPTBIT2(bit)    (inp->inp_flags2 & bit ? 1 : 0)
			case IP_RECVOPTS:
				optval = OPTBIT(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				optval = OPTBIT(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				optval = OPTBIT(INP_RECVDSTADDR);
				break;

			case IP_RECVIF:
				optval = OPTBIT(INP_RECVIF);
				break;

			case IP_RECVTTL:
				optval = OPTBIT(INP_RECVTTL);
				break;

			case IP_PORTRANGE:
				/* INP_HIGHPORT is tested first */
				if (inp->inp_flags & INP_HIGHPORT) {
					optval = IP_PORTRANGE_HIGH;
				} else if (inp->inp_flags & INP_LOWPORT) {
					optval = IP_PORTRANGE_LOW;
				} else {
					optval = 0;
				}
				break;

			case IP_RECVPKTINFO:
				optval = OPTBIT(INP_PKTINFO);
				break;

			case IP_RECVTOS:
				optval = OPTBIT(INP_RECVTOS);
				break;
			case IP_DONTFRAG:
				optval = OPTBIT2(INP2_DONTFRAG);
				break;
			}
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case IP_MULTICAST_IF:
		case IP_MULTICAST_IFINDEX:
		case IP_MULTICAST_VIF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_MSFILTER:
			error = inp_getmoptions(inp, sopt);
			break;

#if IPSEC
		case IP_IPSEC_POLICY: {
			error = 0; /* This option is no longer supported */
			break;
		}
#endif /* IPSEC */

#if TRAFFIC_MGT
		case IP_TRAFFIC_MGT_BACKGROUND: {
			unsigned background = (so->so_flags1 &
			    SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
			return sooptcopyout(sopt, &background,
			           sizeof(background));
		}
#endif /* TRAFFIC_MGT */

		case IP_BOUND_IF:
			/* optval keeps its initial 0 when no interface is bound */
			if (inp->inp_flags & INP_BOUND_IF) {
				optval = inp->inp_boundifp->if_index;
			}
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case IP_NO_IFT_CELLULAR:
			optval = INP_NO_CELLULAR(inp) ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case IP_OUT_IF:
			/* index of the last output interface, 0 if none recorded */
			optval = (inp->inp_last_outifp != NULL) ?
			    inp->inp_last_outifp->if_index : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return error;
}
2700 
/*
 * Set up IP options in pcb for insertion in output packets.
 * Store in mbuf with pointer in pcbopt, adding pseudo-option
 * with destination address if source routed.
 *
 * optname is unused.  Any options previously stored in *pcbopt are
 * released first.  m carries the raw option bytes and is always consumed:
 * it is freed when it is empty or malformed, and stored in *pcbopt
 * otherwise.  A NULL or zero-length m simply turns the options off.
 *
 * On success the stored mbuf starts with a struct in_addr (the first-hop
 * destination, zero when there is no source route) followed by the
 * options.
 *
 * Returns 0 on success, EINVAL when the options are malformed or do not
 * fit (in which case *pcbopt is left NULL).
 */
static int
ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
{
#pragma unused(optname)
	int cnt, optlen;
	u_char *cp;
	u_char opt;

	/* turn off any old options */
	if (*pcbopt) {
		(void) m_free(*pcbopt);
	}
	*pcbopt = 0;
	if (m == (struct mbuf *)0 || m->m_len == 0) {
		/*
		 * Only turning off any previous options.
		 */
		if (m) {
			(void) m_free(m);
		}
		return 0;
	}

	/* the option area must be a whole number of 32-bit words */
	if (m->m_len % sizeof(int32_t)) {
		goto bad;
	}

	/*
	 * IP first-hop destination address will be stored before
	 * actual options; move other options back
	 * and clear it when none present.
	 */
	/* need trailing room in the mbuf for the extra in_addr */
	if (m_mtod_upper_bound(m) - m_mtod_end(m) < sizeof(struct in_addr)) {
		goto bad;
	}
	cnt = m->m_len;
	m->m_len += sizeof(struct in_addr);
	cp = mtod(m, u_char *) + sizeof(struct in_addr);
	/* regions overlap, hence ovbcopy */
	ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
	bzero(mtod(m, caddr_t), sizeof(struct in_addr));

	/* walk and validate the options; cnt is the number of bytes left */
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL) {
			break;
		}
		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			/* must have a length octet, and a sane length */
			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
				goto bad;
			}
			optlen = cp[IPOPT_OLEN];
			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
				goto bad;
			}
		}
		switch (opt) {
		default:
			break;

		case IPOPT_LSRR:
		case IPOPT_SSRR:
			/*
			 * user process specifies route as:
			 *	->A->B->C->D
			 * D must be our final destination (but we can't
			 * check that since we may not have connected yet).
			 * A is first hop destination, which doesn't appear in
			 * actual IP option, but is stored before the options.
			 */
			/* the option must hold at least one address */
			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) {
				goto bad;
			}
			if (optlen > UINT8_MAX) {
				goto bad;
			}
			/* the first address leaves the option: shrink everything */
			m->m_len -= sizeof(struct in_addr);
			cnt -= sizeof(struct in_addr);
			optlen -= sizeof(struct in_addr);
			cp[IPOPT_OLEN] = (uint8_t)optlen;
			/*
			 * Move first hop before start of options.
			 */
			bcopy((caddr_t)&cp[IPOPT_OFFSET + 1], mtod(m, caddr_t),
			    sizeof(struct in_addr));
			/*
			 * Then copy rest of options back
			 * to close up the deleted entry.
			 */
			ovbcopy((caddr_t)(&cp[IPOPT_OFFSET + 1] +
			    sizeof(struct in_addr)),
			    (caddr_t)&cp[IPOPT_OFFSET + 1],
			    (unsigned)cnt - (IPOPT_MINOFF - 1));
			break;
		}
	}
	/* final size check: options plus the leading first-hop address */
	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) {
		goto bad;
	}
	*pcbopt = m;
	return 0;

bad:
	(void) m_free(m);
	return EINVAL;
}
2813 
2814 void
ip_moptions_init(void)2815 ip_moptions_init(void)
2816 {
2817 	PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof(imo_debug));
2818 
2819 	vm_size_t imo_size = (imo_debug == 0) ? sizeof(struct ip_moptions) :
2820 	    sizeof(struct ip_moptions_dbg);
2821 
2822 	imo_zone = zone_create(IMO_ZONE_NAME, imo_size, ZC_ZFREE_CLEARMEM);
2823 }
2824 
2825 void
imo_addref(struct ip_moptions * imo,int locked)2826 imo_addref(struct ip_moptions *imo, int locked)
2827 {
2828 	if (!locked) {
2829 		IMO_LOCK(imo);
2830 	} else {
2831 		IMO_LOCK_ASSERT_HELD(imo);
2832 	}
2833 
2834 	if (++imo->imo_refcnt == 0) {
2835 		panic("%s: imo %p wraparound refcnt", __func__, imo);
2836 		/* NOTREACHED */
2837 	} else if (imo->imo_trace != NULL) {
2838 		(*imo->imo_trace)(imo, TRUE);
2839 	}
2840 
2841 	if (!locked) {
2842 		IMO_UNLOCK(imo);
2843 	}
2844 }
2845 
2846 void
imo_remref(struct ip_moptions * imo)2847 imo_remref(struct ip_moptions *imo)
2848 {
2849 	IMO_LOCK(imo);
2850 	if (imo->imo_refcnt == 0) {
2851 		panic("%s: imo %p negative refcnt", __func__, imo);
2852 		/* NOTREACHED */
2853 	} else if (imo->imo_trace != NULL) {
2854 		(*imo->imo_trace)(imo, FALSE);
2855 	}
2856 
2857 	--imo->imo_refcnt;
2858 	if (imo->imo_refcnt > 0) {
2859 		IMO_UNLOCK(imo);
2860 		return;
2861 	}
2862 
2863 	IMO_PURGE_LOCKED(imo);
2864 
2865 	IMO_UNLOCK(imo);
2866 
2867 	kfree_type(struct in_multi *, imo->imo_max_memberships, imo->imo_membership);
2868 	kfree_type(struct in_mfilter, imo->imo_max_memberships, imo->imo_mfilters);
2869 	lck_mtx_destroy(&imo->imo_lock, &ifa_mtx_grp);
2870 
2871 	if (!(imo->imo_debug & IFD_ALLOC)) {
2872 		panic("%s: imo %p cannot be freed", __func__, imo);
2873 		/* NOTREACHED */
2874 	}
2875 	zfree(imo_zone, imo);
2876 }
2877 
2878 static void
imo_trace(struct ip_moptions * imo,int refhold)2879 imo_trace(struct ip_moptions *imo, int refhold)
2880 {
2881 	struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2882 	ctrace_t *tr;
2883 	u_int32_t idx;
2884 	u_int16_t *cnt;
2885 
2886 	if (!(imo->imo_debug & IFD_DEBUG)) {
2887 		panic("%s: imo %p has no debug structure", __func__, imo);
2888 		/* NOTREACHED */
2889 	}
2890 	if (refhold) {
2891 		cnt = &imo_dbg->imo_refhold_cnt;
2892 		tr = imo_dbg->imo_refhold;
2893 	} else {
2894 		cnt = &imo_dbg->imo_refrele_cnt;
2895 		tr = imo_dbg->imo_refrele;
2896 	}
2897 
2898 	idx = os_atomic_inc_orig(cnt, relaxed) % IMO_TRACE_HIST_SIZE;
2899 	ctrace_record(&tr[idx]);
2900 }
2901 
2902 struct ip_moptions *
ip_allocmoptions(zalloc_flags_t how)2903 ip_allocmoptions(zalloc_flags_t how)
2904 {
2905 	struct ip_moptions *imo;
2906 
2907 	imo = zalloc_flags(imo_zone, how | Z_ZERO);
2908 	if (imo != NULL) {
2909 		lck_mtx_init(&imo->imo_lock, &ifa_mtx_grp, &ifa_mtx_attr);
2910 		imo->imo_debug |= IFD_ALLOC;
2911 		if (imo_debug != 0) {
2912 			imo->imo_debug |= IFD_DEBUG;
2913 			imo->imo_trace = imo_trace;
2914 		}
2915 		IMO_ADDREF(imo);
2916 	}
2917 
2918 	return imo;
2919 }
2920 
2921 /*
2922  * Routine called from ip_output() to loop back a copy of an IP multicast
2923  * packet to the input queue of a specified interface.  Note that this
2924  * calls the output routine of the loopback "driver", but with an interface
2925  * pointer that might NOT be a loopback interface -- evil, but easier than
2926  * replicating that code here.
2927  */
2928 static void
ip_mloopback(struct ifnet * srcifp,struct ifnet * origifp,struct mbuf * m,struct sockaddr_in * dst,int hlen)2929 ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
2930     struct sockaddr_in *dst, int hlen)
2931 {
2932 	struct mbuf *copym;
2933 	struct ip *ip;
2934 
2935 	if (lo_ifp == NULL) {
2936 		return;
2937 	}
2938 
2939 	/*
2940 	 * Copy the packet header as it's needed for the checksum
2941 	 * Make sure to deep-copy IP header portion in case the data
2942 	 * is in an mbuf cluster, so that we can safely override the IP
2943 	 * header portion later.
2944 	 */
2945 	copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, NULL, NULL, M_COPYM_COPY_HDR);
2946 	if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) {
2947 		copym = m_pullup(copym, hlen);
2948 	}
2949 
2950 	if (copym == NULL) {
2951 		return;
2952 	}
2953 
2954 	/*
2955 	 * We don't bother to fragment if the IP length is greater
2956 	 * than the interface's MTU.  Can this possibly matter?
2957 	 */
2958 	ip = mtod(copym, struct ip *);
2959 #if BYTE_ORDER != BIG_ENDIAN
2960 	HTONS(ip->ip_len);
2961 	HTONS(ip->ip_off);
2962 #endif
2963 	ip->ip_sum = 0;
2964 	ip->ip_sum = ip_cksum_hdr_out(copym, hlen);
2965 
2966 	/*
2967 	 * Mark checksum as valid unless receive checksum offload is
2968 	 * disabled; if so, compute checksum in software.  If the
2969 	 * interface itself is lo0, this will be overridden by if_loop.
2970 	 */
2971 	if (hwcksum_rx) {
2972 		copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL | CSUM_ZERO_INVERT);
2973 		copym->m_pkthdr.csum_flags |=
2974 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2975 		copym->m_pkthdr.csum_data = 0xffff;
2976 	} else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2977 #if BYTE_ORDER != BIG_ENDIAN
2978 		NTOHS(ip->ip_len);
2979 #endif
2980 		in_delayed_cksum(copym);
2981 #if BYTE_ORDER != BIG_ENDIAN
2982 		HTONS(ip->ip_len);
2983 #endif
2984 	}
2985 
2986 	/*
2987 	 * Stuff the 'real' ifp into the pkthdr, to be used in matching
2988 	 * in ip_input(); we need the loopback ifp/dl_tag passed as args
2989 	 * to make the loopback driver compliant with the data link
2990 	 * requirements.
2991 	 */
2992 	copym->m_pkthdr.rcvif = origifp;
2993 
2994 	/*
2995 	 * Also record the source interface (which owns the source address).
2996 	 * This is basically a stripped down version of ifa_foraddr().
2997 	 */
2998 	if (srcifp == NULL) {
2999 		struct in_ifaddr *ia;
3000 
3001 		lck_rw_lock_shared(&in_ifaddr_rwlock);
3002 		TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
3003 			IFA_LOCK_SPIN(&ia->ia_ifa);
3004 			if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
3005 				srcifp = ia->ia_ifp;
3006 				IFA_UNLOCK(&ia->ia_ifa);
3007 				break;
3008 			}
3009 			IFA_UNLOCK(&ia->ia_ifa);
3010 		}
3011 		lck_rw_done(&in_ifaddr_rwlock);
3012 	}
3013 	if (srcifp != NULL) {
3014 		ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
3015 	}
3016 	ip_setdstifaddr_info(copym, origifp->if_index, NULL);
3017 
3018 	dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
3019 }
3020 
3021 /*
3022  * Given a source IP address (and route, if available), determine the best
3023  * interface to send the packet from.  Checking for (and updating) the
3024  * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3025  * without any locks based on the assumption that ip_output() is single-
3026  * threaded per-pcb, i.e. for any given pcb there can only be one thread
3027  * performing output at the IP layer.
3028  *
3029  * This routine is analogous to in6_selectroute() for IPv6.
3030  */
3031 static struct ifaddr *
in_selectsrcif(struct ip * ip,struct route * ro,unsigned int ifscope)3032 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3033 {
3034 	struct ifaddr *ifa = NULL;
3035 	struct in_addr src = ip->ip_src;
3036 	struct in_addr dst = ip->ip_dst;
3037 	struct ifnet *rt_ifp;
3038 	char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3039 
3040 	VERIFY(src.s_addr != INADDR_ANY);
3041 
3042 	if (ip_select_srcif_debug) {
3043 		(void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof(s_src));
3044 		(void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof(s_dst));
3045 	}
3046 
3047 	if (ro->ro_rt != NULL) {
3048 		RT_LOCK(ro->ro_rt);
3049 	}
3050 
3051 	rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3052 
3053 	/*
3054 	 * Given the source IP address, find a suitable source interface
3055 	 * to use for transmission; if the caller has specified a scope,
3056 	 * optimize the search by looking at the addresses only for that
3057 	 * interface.  This is still suboptimal, however, as we need to
3058 	 * traverse the per-interface list.
3059 	 */
3060 	if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3061 		unsigned int scope = ifscope;
3062 
3063 		/*
3064 		 * If no scope is specified and the route is stale (pointing
3065 		 * to a defunct interface) use the current primary interface;
3066 		 * this happens when switching between interfaces configured
3067 		 * with the same IP address.  Otherwise pick up the scope
3068 		 * information from the route; the ULP may have looked up a
3069 		 * correct route and we just need to verify it here and mark
3070 		 * it with the ROF_SRCIF_SELECTED flag below.
3071 		 */
3072 		if (scope == IFSCOPE_NONE) {
3073 			scope = rt_ifp->if_index;
3074 			if (scope != get_primary_ifscope(AF_INET) &&
3075 			    ROUTE_UNUSABLE(ro)) {
3076 				scope = get_primary_ifscope(AF_INET);
3077 			}
3078 		}
3079 
3080 		ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3081 
3082 		if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3083 		    ip->ip_p != IPPROTO_TCP && ipforwarding) {
3084 			/*
3085 			 * If forwarding is enabled, and if the packet isn't
3086 			 * TCP or UDP, check if the source address belongs
3087 			 * to one of our own interfaces; if so, demote the
3088 			 * interface scope and do a route lookup right below.
3089 			 */
3090 			ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3091 			if (ifa != NULL) {
3092 				ifa_remref(ifa);
3093 				ifa = NULL;
3094 				ifscope = IFSCOPE_NONE;
3095 			}
3096 		}
3097 
3098 		if (ip_select_srcif_debug && ifa != NULL) {
3099 			if (ro->ro_rt != NULL) {
3100 				printf("%s->%s ifscope %d->%d ifa_if %s "
3101 				    "ro_if %s\n", s_src, s_dst, ifscope,
3102 				    scope, if_name(ifa->ifa_ifp),
3103 				    if_name(rt_ifp));
3104 			} else {
3105 				printf("%s->%s ifscope %d->%d ifa_if %s\n",
3106 				    s_src, s_dst, ifscope, scope,
3107 				    if_name(ifa->ifa_ifp));
3108 			}
3109 		}
3110 	}
3111 
3112 	/*
3113 	 * Slow path; search for an interface having the corresponding source
3114 	 * IP address if the scope was not specified by the caller, and:
3115 	 *
3116 	 *   1) There currently isn't any route, or,
3117 	 *   2) The interface used by the route does not own that source
3118 	 *	IP address; in this case, the route will get blown away
3119 	 *	and we'll do a more specific scoped search using the newly
3120 	 *	found interface.
3121 	 */
3122 	if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3123 		ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3124 
3125 		/*
3126 		 * If we have the IP address, but not the route, we don't
3127 		 * really know whether or not it belongs to the correct
3128 		 * interface (it could be shared across multiple interfaces.)
3129 		 * The only way to find out is to do a route lookup.
3130 		 */
3131 		if (ifa != NULL && ro->ro_rt == NULL) {
3132 			struct rtentry *rt;
3133 			struct sockaddr_in sin;
3134 			struct ifaddr *oifa = NULL;
3135 
3136 			SOCKADDR_ZERO(&sin, sizeof(sin));
3137 			sin.sin_family = AF_INET;
3138 			sin.sin_len = sizeof(sin);
3139 			sin.sin_addr = dst;
3140 
3141 			lck_mtx_lock(rnh_lock);
3142 			if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3143 			    rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3144 				RT_LOCK(rt);
3145 				/*
3146 				 * If the route uses a different interface,
3147 				 * use that one instead.  The IP address of
3148 				 * the ifaddr that we pick up here is not
3149 				 * relevant.
3150 				 */
3151 				if (ifa->ifa_ifp != rt->rt_ifp) {
3152 					oifa = ifa;
3153 					ifa = rt->rt_ifa;
3154 					ifa_addref(ifa);
3155 					RT_UNLOCK(rt);
3156 				} else {
3157 					RT_UNLOCK(rt);
3158 				}
3159 				rtfree_locked(rt);
3160 			}
3161 			lck_mtx_unlock(rnh_lock);
3162 
3163 			if (oifa != NULL) {
3164 				struct ifaddr *iifa;
3165 
3166 				/*
3167 				 * See if the interface pointed to by the
3168 				 * route is configured with the source IP
3169 				 * address of the packet.
3170 				 */
3171 				iifa = (struct ifaddr *)ifa_foraddr_scoped(
3172 					src.s_addr, ifa->ifa_ifp->if_index);
3173 
3174 				if (iifa != NULL) {
3175 					/*
3176 					 * Found it; drop the original one
3177 					 * as well as the route interface
3178 					 * address, and use this instead.
3179 					 */
3180 					ifa_remref(oifa);
3181 					ifa_remref(ifa);
3182 					ifa = iifa;
3183 				} else if (!ipforwarding ||
3184 				    (rt->rt_flags & RTF_GATEWAY)) {
3185 					/*
3186 					 * This interface doesn't have that
3187 					 * source IP address; drop the route
3188 					 * interface address and just use the
3189 					 * original one, and let the caller
3190 					 * do a scoped route lookup.
3191 					 */
3192 					ifa_remref(ifa);
3193 					ifa = oifa;
3194 				} else {
3195 					/*
3196 					 * Forwarding is enabled and the source
3197 					 * address belongs to one of our own
3198 					 * interfaces which isn't the outgoing
3199 					 * interface, and we have a route, and
3200 					 * the destination is on a network that
3201 					 * is directly attached (onlink); drop
3202 					 * the original one and use the route
3203 					 * interface address instead.
3204 					 */
3205 					ifa_remref(oifa);
3206 				}
3207 			}
3208 		} else if (ifa != NULL && ro->ro_rt != NULL &&
3209 		    !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3210 		    ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3211 			/*
3212 			 * Forwarding is enabled and the source address belongs
3213 			 * to one of our own interfaces which isn't the same
3214 			 * as the interface used by the known route; drop the
3215 			 * original one and use the route interface address.
3216 			 */
3217 			ifa_remref(ifa);
3218 			ifa = ro->ro_rt->rt_ifa;
3219 			ifa_addref(ifa);
3220 		}
3221 
3222 		if (ip_select_srcif_debug && ifa != NULL) {
3223 			printf("%s->%s ifscope %d ifa_if %s\n",
3224 			    s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3225 		}
3226 	}
3227 
3228 	if (ro->ro_rt != NULL) {
3229 		RT_LOCK_ASSERT_HELD(ro->ro_rt);
3230 	}
3231 	/*
3232 	 * If there is a non-loopback route with the wrong interface, or if
3233 	 * there is no interface configured with such an address, blow it
3234 	 * away.  Except for local/loopback, we look for one with a matching
3235 	 * interface scope/index.
3236 	 */
3237 	if (ro->ro_rt != NULL &&
3238 	    (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3239 	    !(ro->ro_rt->rt_flags & RTF_UP))) {
3240 		if (ip_select_srcif_debug) {
3241 			if (ifa != NULL) {
3242 				printf("%s->%s ifscope %d ro_if %s != "
3243 				    "ifa_if %s (cached route cleared)\n",
3244 				    s_src, s_dst, ifscope, if_name(rt_ifp),
3245 				    if_name(ifa->ifa_ifp));
3246 			} else {
3247 				printf("%s->%s ifscope %d ro_if %s "
3248 				    "(no ifa_if found)\n",
3249 				    s_src, s_dst, ifscope, if_name(rt_ifp));
3250 			}
3251 		}
3252 
3253 		RT_UNLOCK(ro->ro_rt);
3254 		ROUTE_RELEASE(ro);
3255 
3256 		/*
3257 		 * If the destination is IPv4 LLA and the route's interface
3258 		 * doesn't match the source interface, then the source IP
3259 		 * address is wrong; it most likely belongs to the primary
3260 		 * interface associated with the IPv4 LL subnet.  Drop the
3261 		 * packet rather than letting it go out and return an error
3262 		 * to the ULP.  This actually applies not only to IPv4 LL
3263 		 * but other shared subnets; for now we explicitly test only
3264 		 * for the former case and save the latter for future.
3265 		 */
3266 		if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3267 		    !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3268 			ifa_remref(ifa);
3269 			ifa = NULL;
3270 		}
3271 	}
3272 
3273 	if (ip_select_srcif_debug && ifa == NULL) {
3274 		printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3275 		    s_src, s_dst, ifscope);
3276 	}
3277 
3278 	/*
3279 	 * If there is a route, mark it accordingly.  If there isn't one,
3280 	 * we'll get here again during the next transmit (possibly with a
3281 	 * route) and the flag will get set at that point.  For IPv4 LLA
3282 	 * destination, mark it only if the route has been fully resolved;
3283 	 * otherwise we want to come back here again when the route points
3284 	 * to the interface over which the ARP reply arrives on.
3285 	 */
3286 	if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3287 	    (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3288 	    SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3289 		if (ifa != NULL) {
3290 			ifa_addref(ifa);        /* for route */
3291 		}
3292 		if (ro->ro_srcia != NULL) {
3293 			ifa_remref(ro->ro_srcia);
3294 		}
3295 		ro->ro_srcia = ifa;
3296 		ro->ro_flags |= ROF_SRCIF_SELECTED;
3297 		RT_GENID_SYNC(ro->ro_rt);
3298 	}
3299 
3300 	if (ro->ro_rt != NULL) {
3301 		RT_UNLOCK(ro->ro_rt);
3302 	}
3303 
3304 	return ifa;
3305 }
3306 
3307 /*
3308  * @brief	Given outgoing interface it determines what checksum needs
3309  *      to be computed in software and what needs to be offloaded to the
3310  *      interface.
3311  *
3312  * @param	ifp Pointer to the outgoing interface
3313  * @param	m Pointer to the packet
3314  * @param	hlen IP header length
3315  * @param	ip_len Total packet size i.e. headers + data payload
3316  * @param	sw_csum Pointer to a software checksum flag set
3317  *
3318  * @return	void
3319  */
3320 void
ip_output_checksum(struct ifnet * ifp,struct mbuf * m,int hlen,int ip_len,uint32_t * sw_csum)3321 ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
3322     uint32_t *sw_csum)
3323 {
3324 	uint32_t hwcap = ifp->if_hwassist;
3325 
3326 	m->m_pkthdr.csum_flags |= CSUM_IP;
3327 
3328 	if (!hwcksum_tx) {
3329 		/* do all in software; hardware checksum offload is disabled */
3330 		*sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3331 		    m->m_pkthdr.csum_flags;
3332 	} else {
3333 		/* do in software what the hardware cannot */
3334 		*sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(hwcap);
3335 	}
3336 
3337 	if (hlen != sizeof(struct ip)) {
3338 		*sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3339 		    m->m_pkthdr.csum_flags);
3340 	} else if ((*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
3341 		/*
3342 		 * If the explicitly required data csum offload is not supported by hardware,
3343 		 * do it by partial checksum. Here we assume TSO implies support for IP
3344 		 * and data sum.
3345 		 */
3346 		int interface_mtu = ifp->if_mtu;
3347 
3348 		if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3349 			interface_mtu = IN6_LINKMTU(ifp);
3350 			/* Further adjust the size for CLAT46 expansion */
3351 			interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
3352 		}
3353 
3354 		/*
3355 		 * Partial checksum offload, if non-IP fragment, and TCP only
3356 		 * (no UDP support, as the hardware may not be able to convert
3357 		 * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
3358 		 * supports "invert zero" capability.)
3359 		 */
3360 		if (hwcksum_tx &&
3361 		    ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
3362 		    ((hwcap & CSUM_ZERO_INVERT) &&
3363 		    (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
3364 		    ip_len <= interface_mtu) {
3365 			uint16_t start = sizeof(struct ip);
3366 			uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
3367 			m->m_pkthdr.csum_flags |=
3368 			    (CSUM_DATA_VALID | CSUM_PARTIAL);
3369 			m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
3370 			m->m_pkthdr.csum_tx_start = start;
3371 			/* do IP hdr chksum in software */
3372 			*sw_csum = CSUM_DELAY_IP;
3373 		} else {
3374 			*sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
3375 		}
3376 	}
3377 
3378 	if (*sw_csum & CSUM_DELAY_DATA) {
3379 		in_delayed_cksum(m);
3380 		*sw_csum &= ~CSUM_DELAY_DATA;
3381 	}
3382 
3383 	if (hwcksum_tx) {
3384 		uint32_t delay_data = m->m_pkthdr.csum_flags & CSUM_DELAY_DATA;
3385 		uint32_t hw_csum = IF_HWASSIST_CSUM_FLAGS(hwcap);
3386 
3387 		/*
3388 		 * Drop off bits that aren't supported by hardware;
3389 		 * also make sure to preserve non-checksum related bits.
3390 		 */
3391 		m->m_pkthdr.csum_flags =
3392 		    ((m->m_pkthdr.csum_flags & (hw_csum | CSUM_DATA_VALID)) |
3393 		    (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));
3394 
3395 		/*
3396 		 * If hardware supports partial checksum but not delay_data,
3397 		 * add back delay_data.
3398 		 */
3399 		if ((hw_csum & CSUM_PARTIAL) != 0 &&
3400 		    (hw_csum & delay_data) == 0) {
3401 			m->m_pkthdr.csum_flags |= delay_data;
3402 		}
3403 	} else {
3404 		/* drop all bits; hardware checksum offload is disabled */
3405 		m->m_pkthdr.csum_flags = 0;
3406 	}
3407 }
3408 
3409 /*
3410  * GRE protocol output for PPP/PPTP
3411  */
3412 int
ip_gre_output(struct mbuf * m)3413 ip_gre_output(struct mbuf *m)
3414 {
3415 	struct route ro;
3416 	int error;
3417 
3418 	bzero(&ro, sizeof(ro));
3419 
3420 	error = ip_output(m, NULL, &ro, 0, NULL, NULL);
3421 
3422 	ROUTE_RELEASE(&ro);
3423 
3424 	return error;
3425 }
3426 
3427 static int
3428 sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3429 {
3430 #pragma unused(arg1, arg2)
3431 	int error, i;
3432 
3433 	i = ip_output_measure;
3434 	error = sysctl_handle_int(oidp, &i, 0, req);
3435 	if (error || req->newptr == USER_ADDR_NULL) {
3436 		goto done;
3437 	}
3438 	/* impose bounds */
3439 	if (i < 0 || i > 1) {
3440 		error = EINVAL;
3441 		goto done;
3442 	}
3443 	if (ip_output_measure != i && i == 1) {
3444 		net_perf_initialize(&net_perf, ip_output_measure_bins);
3445 	}
3446 	ip_output_measure = i;
3447 done:
3448 	return error;
3449 }
3450 
3451 static int
3452 sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3453 {
3454 #pragma unused(arg1, arg2)
3455 	int error;
3456 	uint64_t i;
3457 
3458 	i = ip_output_measure_bins;
3459 	error = sysctl_handle_quad(oidp, &i, 0, req);
3460 	if (error || req->newptr == USER_ADDR_NULL) {
3461 		goto done;
3462 	}
3463 	/* validate data */
3464 	if (!net_perf_validate_bins(i)) {
3465 		error = EINVAL;
3466 		goto done;
3467 	}
3468 	ip_output_measure_bins = i;
3469 done:
3470 	return error;
3471 }
3472 
3473 static int
3474 sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3475 {
3476 #pragma unused(oidp, arg1, arg2)
3477 	if (req->oldptr == USER_ADDR_NULL) {
3478 		req->oldlen = (size_t)sizeof(struct ipstat);
3479 	}
3480 
3481 	return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3482 }
3483