1 /*
2 * Copyright (c) 2000-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 */
62 /*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69 #define _IP_VHL
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <kern/locks.h>
80 #include <sys/sysctl.h>
81 #include <sys/mcache.h>
82 #include <sys/kdebug.h>
83
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
86 #include <mach/sdt.h>
87
88 #include <libkern/OSAtomic.h>
89 #include <libkern/OSByteOrder.h>
90
91 #include <net/if.h>
92 #include <net/if_dl.h>
93 #include <net/if_types.h>
94 #include <net/route.h>
95 #include <net/ntstat.h>
96 #include <net/net_osdep.h>
97 #include <net/dlil.h>
98 #include <net/net_perf.h>
99
100 #include <netinet/in.h>
101 #include <netinet/in_systm.h>
102 #include <netinet/ip.h>
103 #include <netinet/in_pcb.h>
104 #include <netinet/in_var.h>
105 #include <netinet/ip_var.h>
106 #include <netinet/kpi_ipfilter_var.h>
107 #include <netinet/in_tclass.h>
108 #include <netinet/udp.h>
109
110 #include <netinet6/nd6.h>
111
112 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
113 #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
114 #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
115 #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
116
117 #if IPSEC
118 #include <netinet6/ipsec.h>
119 #include <netkey/key.h>
120 #if IPSEC_DEBUG
121 #include <netkey/key_debug.h>
122 #else
123 #define KEYDEBUG(lev, arg)
124 #endif
125 #endif /* IPSEC */
126
127 #if NECP
128 #include <net/necp.h>
129 #endif /* NECP */
130
131
132 #if DUMMYNET
133 #include <netinet/ip_dummynet.h>
134 #endif
135
136 #if PF
137 #include <net/pfvar.h>
138 #endif /* PF */
139
140
141 u_short ip_id;
142
143 static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
144 static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
145 static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
146 static void ip_out_cksum_stats(int, u_int32_t);
147 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
148 static int ip_optcopy(struct ip *, struct ip *);
149 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
150 static void imo_trace(struct ip_moptions *, int);
151 static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
152 struct sockaddr_in *, int);
153 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
154
155 extern struct ip_linklocal_stat ip_linklocal_stat;
156
157 /* temporary: for testing */
158 #if IPSEC
159 extern int ipsec_bypass;
160 #endif
161
162 static int force_ipsum = 0;
163 static int ip_maxchainsent = 0;
164 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
165 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
166 "use dlil_output_list");
167
168 SYSCTL_INT(_net_inet_ip, OID_AUTO, force_ipsum,
169 CTLFLAG_RW | CTLFLAG_LOCKED, &force_ipsum, 0,
170 "force IP checksum");
171 #if DEBUG
172 static int forge_ce = 0;
173 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
174 CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
175 "Forge ECN CE");
176 #endif /* DEBUG */
177
178 static int ip_select_srcif_debug = 0;
179 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
180 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
181 "log source interface selection debug info");
182
183 static int ip_output_measure = 0;
184 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
185 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
186 &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
187 "Do time measurement");
188
189 static uint64_t ip_output_measure_bins = 0;
190 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
191 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
192 sysctl_ip_output_measure_bins, "I",
193 "bins for chaining performance data histogram");
194
195 static net_perf_t net_perf;
196 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
197 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
198 0, 0, sysctl_ip_output_getperf, "S,net_perf",
199 "IP output performance data (struct net_perf, net/net_perf.h)");
200
201 __private_extern__ int rfc6864 = 1;
202 SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
203 &rfc6864, 0, "updated ip id field behavior");
204
205 #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
206
207 /* For gdb */
208 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
209
/*
 * Debug wrapper around ip_moptions: embeds the real structure followed by
 * reference-count bookkeeping and caller backtraces for leak hunting.
 * NOTE(review): presumably allocated in place of plain ip_moptions when
 * imo_debug is set — confirm against the imo allocation path.
 */
struct ip_moptions_dbg {
	struct ip_moptions imo;         /* ip_moptions */
	u_int16_t imo_refhold_cnt;      /* # of IMO_ADDREF */
	u_int16_t imo_refrele_cnt;      /* # of IMO_REMREF */
	/*
	 * Alloc and free callers.
	 */
	ctrace_t imo_alloc;
	ctrace_t imo_free;
	/*
	 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
	 */
	ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE];
	ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE];
};
225
226 #if DEBUG
227 static unsigned int imo_debug = 1; /* debugging (enabled) */
228 #else
229 static unsigned int imo_debug; /* debugging (disabled) */
230 #endif /* !DEBUG */
231
232 static struct zone *imo_zone; /* zone for ip_moptions */
233 #define IMO_ZONE_NAME "ip_moptions" /* zone name */
234
#if PF
/*
 * Hand an outbound packet (chain) to the PF/dummynet hook.
 *
 * Assembles the firewall argument structure on the stack from the
 * dummynet rule and output parameters, then invokes pf_af_hook(),
 * which may consume, modify, or re-queue *mp.  Returns the hook's
 * result code (0 to continue output).
 */
__attribute__((noinline))
static int
ip_output_pf_dn_hook(struct ifnet *ifp, struct mbuf **mppn, struct mbuf **mp,
    struct pf_rule *dn_pf_rule, struct route *ro, struct sockaddr_in *dst,
    int flags, struct ip_out_args *ipoa)
{
	struct ip_fw_args args = {
		.fwa_pf_rule = dn_pf_rule,
		.fwa_oif = ifp,
		.fwa_ro = ro,
		.fwa_dst = dst,
		.fwa_oflags = flags,
	};

	/* ipoa is only meaningful when the caller passed IP_OUTARGS */
	if (flags & IP_OUTARGS) {
		args.fwa_ipoa = ipoa;
	}

	return pf_af_hook(ifp, mppn, mp, AF_INET, FALSE, &args);
}

#endif /* PF */
259
260
/*
 * IP output, single-packet entry point.  The packet in mbuf chain m0
 * contains a skeletal IP header (with len, off, ttl, proto, tos, src,
 * dst).  The mbuf chain containing the packet will be freed; the mbuf
 * opt, if present, will not be freed.
 */
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct ip_out_args *ipoa)
{
	/* Convenience wrapper: delegate with a packetchain count of 0 */
	return ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
}
273
274 /*
275 * IP output. The packet in mbuf chain m contains a skeletal IP
276 * header (with len, off, ttl, proto, tos, src, dst).
277 * The mbuf chain containing the packet will be freed.
278 * The mbuf opt, if present, will not be freed.
279 *
280 * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
281 * skipped and ro->ro_rt would be used. Otherwise the result of route
282 * lookup is stored in ro->ro_rt.
283 *
284 * In the IP forwarding case, the packet will arrive with options already
285 * inserted, so must have a NULL opt pointer.
286 */
287 int
ip_output_list(struct mbuf * m0,int packetchain,struct mbuf * opt,struct route * ro,int flags,struct ip_moptions * imo,struct ip_out_args * ipoa)288 ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
289 struct route *ro, int flags, struct ip_moptions *imo,
290 struct ip_out_args *ipoa)
291 {
292 struct ip *ip;
293 struct ifnet *ifp = NULL; /* not refcnt'd */
294 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
295 int hlen = sizeof(struct ip);
296 int len = 0, error = 0;
297 struct sockaddr_in *dst = NULL;
298 struct in_ifaddr *ia = NULL, *src_ia = NULL;
299 struct in_addr pkt_dst;
300 struct ipf_pktopts *ippo = NULL;
301 ipfilter_t inject_filter_ref = NULL;
302 struct mbuf *packetlist;
303 uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
304 uint32_t packets_processed = 0;
305 unsigned int ifscope = IFSCOPE_NONE;
306 struct flowadv *adv = NULL;
307 struct timeval start_tv;
308 #if IPSEC
309 struct socket *so = NULL;
310 struct secpolicy *sp = NULL;
311 #endif /* IPSEC */
312 #if NECP
313 necp_kernel_policy_result necp_result = 0;
314 necp_kernel_policy_result_parameter necp_result_parameter;
315 necp_kernel_policy_id necp_matched_policy_id = 0;
316 #endif /* NECP */
317 #if DUMMYNET
318 struct m_tag *tag;
319 struct ip_out_args saved_ipoa;
320 struct sockaddr_in dst_buf;
321 #endif /* DUMMYNET */
322 struct {
323 #if IPSEC
324 struct ipsec_output_state ipsec_state;
325 #endif /* IPSEC */
326 #if NECP
327 struct route necp_route;
328 #endif /* NECP */
329 #if DUMMYNET
330 struct route saved_route;
331 #endif /* DUMMYNET */
332 struct ipf_pktopts ipf_pktopts;
333 } ipobz;
334 #define ipsec_state ipobz.ipsec_state
335 #define necp_route ipobz.necp_route
336 #define sro_fwd ipobz.sro_fwd
337 #define saved_route ipobz.saved_route
338 #define ipf_pktopts ipobz.ipf_pktopts
339 union {
340 struct {
341 boolean_t select_srcif : 1; /* set once */
342 boolean_t srcbound : 1; /* set once */
343 boolean_t nocell : 1; /* set once */
344 boolean_t isbroadcast : 1;
345 boolean_t didfilter : 1;
346 boolean_t noexpensive : 1; /* set once */
347 boolean_t noconstrained : 1; /* set once */
348 boolean_t awdl_unrestricted : 1; /* set once */
349 boolean_t management_allowed : 1; /* set once */
350 };
351 uint32_t raw;
352 } ipobf = { .raw = 0 };
353
354 int interface_mtu = 0;
355 struct pf_rule *dn_pf_rule = NULL;
356 /*
357 * Here we check for restrictions when sending frames.
358 * N.B.: IPv4 over internal co-processor interfaces is not allowed.
359 */
360 #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \
361 (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \
362 ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \
363 ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) || \
364 (IFNET_IS_INTCOPROC(_ifp)) || \
365 (!(_ipobf).management_allowed && IFNET_IS_MANAGEMENT(_ifp)) || \
366 (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
367
368 if (ip_output_measure) {
369 net_perf_start_time(&net_perf, &start_tv);
370 }
371 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
372
373 VERIFY(m0->m_flags & M_PKTHDR);
374 packetlist = m0;
375
376 /* zero out {ipsec_state, args, sro_fwd, saved_route, ipf_pktops} */
377 bzero(&ipobz, sizeof(ipobz));
378 ippo = &ipf_pktopts;
379
380 #if DUMMYNET
381 if (SLIST_EMPTY(&m0->m_pkthdr.tags)) {
382 goto ipfw_tags_done;
383 }
384
385 /* Grab info from mtags prepended to the chain */
386 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
387 KERNEL_TAG_TYPE_DUMMYNET)) != NULL) {
388 struct dn_pkt_tag *dn_tag;
389
390 dn_tag = (struct dn_pkt_tag *)(tag->m_tag_data);
391 dn_pf_rule = dn_tag->dn_pf_rule;
392 opt = NULL;
393 saved_route = dn_tag->dn_ro;
394 ro = &saved_route;
395
396 imo = NULL;
397 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
398 dst = &dst_buf;
399 ifp = dn_tag->dn_ifp;
400 flags = dn_tag->dn_flags;
401 if ((dn_tag->dn_flags & IP_OUTARGS)) {
402 saved_ipoa = dn_tag->dn_ipoa;
403 ipoa = &saved_ipoa;
404 }
405
406 m_tag_delete(m0, tag);
407 }
408 ipfw_tags_done:
409 #endif /* DUMMYNET */
410
411 m = m0;
412 m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP | PKTF_IFAINFO);
413
414 #if IPSEC
415 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
416 /* If packet is bound to an interface, check bound policies */
417 if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
418 (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
419 ipoa->ipoa_boundif != IFSCOPE_NONE) {
420 if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
421 &flags, ipoa, &sp) != 0) {
422 goto bad;
423 }
424 }
425 }
426 #endif /* IPSEC */
427
428 VERIFY(ro != NULL);
429
430 if (flags & IP_OUTARGS) {
431 /*
432 * In the forwarding case, only the ifscope value is used,
433 * as source interface selection doesn't take place.
434 */
435 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
436 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
437 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
438 }
439
440 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
441 ipoa->ipoa_boundif != IFSCOPE_NONE) {
442 ifscope = ipoa->ipoa_boundif;
443 ipf_pktopts.ippo_flags |=
444 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
445 }
446
447 /* double negation needed for bool bit field */
448 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
449 if (ipobf.srcbound) {
450 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
451 }
452 } else {
453 ipobf.select_srcif = FALSE;
454 ipobf.srcbound = FALSE;
455 ifscope = IFSCOPE_NONE;
456 if (flags & IP_OUTARGS) {
457 ipoa->ipoa_boundif = IFSCOPE_NONE;
458 ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
459 IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
460 }
461 }
462
463 if (flags & IP_OUTARGS) {
464 if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) {
465 ipobf.nocell = true;
466 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
467 }
468 if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) {
469 ipobf.noexpensive = true;
470 ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
471 }
472 if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) {
473 ipobf.noconstrained = true;
474 ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
475 }
476 if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) {
477 ipobf.awdl_unrestricted = true;
478 }
479 if (ipoa->ipoa_flags & IPOAF_MANAGEMENT_ALLOWED) {
480 ipobf.management_allowed = true;
481 }
482 adv = &ipoa->ipoa_flowadv;
483 adv->code = FADV_SUCCESS;
484 ipoa->ipoa_flags &= ~IPOAF_RET_MASK;
485 }
486
487 #if IPSEC
488 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
489 so = ipsec_getsocket(m);
490 if (so != NULL) {
491 (void) ipsec_setsocket(m, NULL);
492 }
493 }
494 #endif /* IPSEC */
495
496 #if DUMMYNET
497 if (dn_pf_rule != NULL) {
498 /* dummynet already saw us */
499 ip = mtod(m, struct ip *);
500 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
501 pkt_dst = ip->ip_dst;
502 if (ro->ro_rt != NULL) {
503 RT_LOCK_SPIN(ro->ro_rt);
504 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
505 if (ia) {
506 /* Become a regular mutex */
507 RT_CONVERT_LOCK(ro->ro_rt);
508 IFA_ADDREF(&ia->ia_ifa);
509 }
510 RT_UNLOCK(ro->ro_rt);
511 }
512
513 goto sendit;
514 }
515 #endif /* DUMMYNET */
516
517 loopit:
518 packets_processed++;
519 ipobf.isbroadcast = FALSE;
520 ipobf.didfilter = FALSE;
521
522 VERIFY(m->m_flags & M_PKTHDR);
523 /*
	 * No need to process packet twice if we've already seen it.
525 */
526 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
527 inject_filter_ref = ipf_get_inject_filter(m);
528 } else {
529 inject_filter_ref = NULL;
530 }
531
532 if (opt) {
533 m = ip_insertoptions(m, opt, &len);
534 hlen = len;
535 /* Update the chain */
536 if (m != m0) {
537 if (m0 == packetlist) {
538 packetlist = m;
539 }
540 m0 = m;
541 }
542 }
543 ip = mtod(m, struct ip *);
544
545 pkt_dst = ip->ip_dst;
546
547 /*
548 * We must not send if the packet is destined to network zero.
549 * RFC1122 3.2.1.3 (a) and (b).
550 */
551 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
552 error = EHOSTUNREACH;
553 goto bad;
554 }
555
556 /*
557 * Fill in IP header.
558 */
559 if (!(flags & (IP_FORWARDING | IP_RAWOUTPUT))) {
560 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
561 ip->ip_off &= IP_DF;
562 if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
563 // Per RFC6864, value of ip_id is undefined for atomic ip packets
564 ip->ip_id = 0;
565 } else {
566 ip->ip_id = ip_randomid((uint64_t)m);
567 }
568 OSAddAtomic(1, &ipstat.ips_localout);
569 } else {
570 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
571 }
572
573 #if DEBUG
574 /* For debugging, we let the stack forge congestion */
575 if (forge_ce != 0 &&
576 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
577 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
578 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
579 forge_ce--;
580 }
581 #endif /* DEBUG */
582
583 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1) {
584 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
585 }
586
587 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
588 ip->ip_p, ip->ip_off, ip->ip_len);
589
590 dst = SIN(&ro->ro_dst);
591
592 /*
593 * If there is a cached route,
594 * check that it is to the same destination
595 * and is still up. If not, free it and try again.
596 * The address family should also be checked in case of sharing the
597 * cache with IPv6.
598 */
599
600 if (ro->ro_rt != NULL) {
601 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
602 !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
603 src_ia = ifa_foraddr(ip->ip_src.s_addr);
604 if (src_ia == NULL) {
605 error = EADDRNOTAVAIL;
606 goto bad;
607 }
608 IFA_REMREF(&src_ia->ia_ifa);
609 src_ia = NULL;
610 }
611 /*
612 * Test rt_flags without holding rt_lock for performance
613 * reasons; if the route is down it will hopefully be
614 * caught by the layer below (since it uses this route
615 * as a hint) or during the next transmit.
616 */
617 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
618 dst->sin_addr.s_addr != pkt_dst.s_addr) {
619 ROUTE_RELEASE(ro);
620 }
621
622 /*
623 * If we're doing source interface selection, we may not
624 * want to use this route; only synch up the generation
625 * count otherwise.
626 */
627 if (!ipobf.select_srcif && ro->ro_rt != NULL &&
628 RT_GENID_OUTOFSYNC(ro->ro_rt)) {
629 RT_GENID_SYNC(ro->ro_rt);
630 }
631 }
632 if (ro->ro_rt == NULL) {
633 bzero(dst, sizeof(*dst));
634 dst->sin_family = AF_INET;
635 dst->sin_len = sizeof(*dst);
636 dst->sin_addr = pkt_dst;
637 }
638 /*
639 * If routing to interface only,
640 * short circuit routing lookup.
641 */
642 if (flags & IP_ROUTETOIF) {
643 if (ia != NULL) {
644 IFA_REMREF(&ia->ia_ifa);
645 }
646 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
647 ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
648 if (ia == NULL) {
649 OSAddAtomic(1, &ipstat.ips_noroute);
650 error = ENETUNREACH;
651 /* XXX IPv6 APN fallback notification?? */
652 goto bad;
653 }
654 }
655 ifp = ia->ia_ifp;
656 ip->ip_ttl = 1;
657 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
658 /*
659 * For consistency with other cases below. Loopback
660 * multicast case is handled separately by ip_mloopback().
661 */
662 if ((ifp->if_flags & IFF_LOOPBACK) &&
663 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
664 m->m_pkthdr.rcvif = ifp;
665 ip_setsrcifaddr_info(m, ifp->if_index, NULL);
666 ip_setdstifaddr_info(m, ifp->if_index, NULL);
667 }
668 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
669 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
670 /*
671 * Bypass the normal routing lookup for multicast
672 * packets if the interface is specified.
673 */
674 ipobf.isbroadcast = FALSE;
675 if (ia != NULL) {
676 IFA_REMREF(&ia->ia_ifa);
677 }
678
679 /* Macro takes reference on ia */
680 IFP_TO_IA(ifp, ia);
681 } else {
682 struct ifaddr *ia0 = NULL;
683 boolean_t cloneok = FALSE;
684 /*
685 * Perform source interface selection; the source IP address
686 * must belong to one of the addresses of the interface used
687 * by the route. For performance reasons, do this only if
688 * there is no route, or if the routing table has changed,
689 * or if we haven't done source interface selection on this
690 * route (for this PCB instance) before.
691 */
692 if (ipobf.select_srcif &&
693 ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
694 !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
695 /* Find the source interface */
696 ia0 = in_selectsrcif(ip, ro, ifscope);
697
698 /*
699 * If the source address belongs to a restricted
700 * interface and the caller forbids our using
701 * interfaces of such type, pretend that there is no
702 * route.
703 */
704 if (ia0 != NULL &&
705 IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
706 IFA_REMREF(ia0);
707 ia0 = NULL;
708 error = EHOSTUNREACH;
709 if (flags & IP_OUTARGS) {
710 ipoa->ipoa_flags |= IPOAF_R_IFDENIED;
711 }
712 goto bad;
713 }
714
715 /*
716 * If the source address is spoofed (in the case of
717 * IP_RAWOUTPUT on an unbounded socket), or if this
718 * is destined for local/loopback, just let it go out
719 * using the interface of the route. Otherwise,
720 * there's no interface having such an address,
721 * so bail out.
722 */
723 if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
724 ipobf.srcbound) && ifscope != lo_ifp->if_index) {
725 error = EADDRNOTAVAIL;
726 goto bad;
727 }
728
729 /*
730 * If the caller didn't explicitly specify the scope,
731 * pick it up from the source interface. If the cached
732 * route was wrong and was blown away as part of source
733 * interface selection, don't mask out RTF_PRCLONING
734 * since that route may have been allocated by the ULP,
735 * unless the IP header was created by the caller or
736 * the destination is IPv4 LLA. The check for the
737 * latter is needed because IPv4 LLAs are never scoped
738 * in the current implementation, and we don't want to
739 * replace the resolved IPv4 LLA route with one whose
740 * gateway points to that of the default gateway on
741 * the primary interface of the system.
742 */
743 if (ia0 != NULL) {
744 if (ifscope == IFSCOPE_NONE) {
745 ifscope = ia0->ifa_ifp->if_index;
746 }
747 cloneok = (!(flags & IP_RAWOUTPUT) &&
748 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
749 }
750 }
751
752 /*
753 * If this is the case, we probably don't want to allocate
754 * a protocol-cloned route since we didn't get one from the
755 * ULP. This lets TCP do its thing, while not burdening
756 * forwarding or ICMP with the overhead of cloning a route.
757 * Of course, we still want to do any cloning requested by
758 * the link layer, as this is probably required in all cases
759 * for correct operation (as it is for ARP).
760 */
761 if (ro->ro_rt == NULL) {
762 uint32_t ign = RTF_PRCLONING;
763 /*
764 * We make an exception here: if the destination
765 * address is INADDR_BROADCAST, allocate a protocol-
766 * cloned host route so that we end up with a route
767 * marked with the RTF_BROADCAST flag. Otherwise,
768 * we would end up referring to the default route,
769 * instead of creating a cloned host route entry.
770 * That would introduce inconsistencies between ULPs
771 * that allocate a route and those that don't. The
772 * RTF_BROADCAST route is important since we'd want
773 * to send out undirected IP broadcast packets using
774 * link-level broadcast address. Another exception
775 * is for ULP-created routes that got blown away by
776 * source interface selection (see above).
777 *
778 * These exceptions will no longer be necessary when
779 * the RTF_PRCLONING scheme is no longer present.
780 */
781 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) {
782 ign &= ~RTF_PRCLONING;
783 }
784
785 /*
786 * Loosen the route lookup criteria if the ifscope
787 * corresponds to the loopback interface; this is
788 * needed to support Application Layer Gateways
789 * listening on loopback, in conjunction with packet
790 * filter redirection rules. The final source IP
791 * address will be rewritten by the packet filter
792 * prior to the RFC1122 loopback check below.
793 */
794 if (ifscope == lo_ifp->if_index) {
795 rtalloc_ign(ro, ign);
796 } else {
797 rtalloc_scoped_ign(ro, ign, ifscope);
798 }
799
800 /*
801 * If the route points to a cellular/expensive interface
802 * and the caller forbids our using interfaces of such type,
803 * pretend that there is no route.
804 */
805 if (ro->ro_rt != NULL) {
806 RT_LOCK_SPIN(ro->ro_rt);
807 if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp,
808 ipobf)) {
809 RT_UNLOCK(ro->ro_rt);
810 ROUTE_RELEASE(ro);
811 if (flags & IP_OUTARGS) {
812 ipoa->ipoa_flags |=
813 IPOAF_R_IFDENIED;
814 }
815 } else {
816 RT_UNLOCK(ro->ro_rt);
817 }
818 }
819 }
820
821 if (ro->ro_rt == NULL) {
822 OSAddAtomic(1, &ipstat.ips_noroute);
823 error = EHOSTUNREACH;
824 if (ia0 != NULL) {
825 IFA_REMREF(ia0);
826 ia0 = NULL;
827 }
828 goto bad;
829 }
830
831 if (ia != NULL) {
832 IFA_REMREF(&ia->ia_ifa);
833 }
834 RT_LOCK_SPIN(ro->ro_rt);
835 ia = ifatoia(ro->ro_rt->rt_ifa);
836 if (ia != NULL) {
837 /* Become a regular mutex */
838 RT_CONVERT_LOCK(ro->ro_rt);
839 IFA_ADDREF(&ia->ia_ifa);
840 }
841 /*
842 * Note: ia_ifp may not be the same as rt_ifp; the latter
843 * is what we use for determining outbound i/f, mtu, etc.
844 */
845 ifp = ro->ro_rt->rt_ifp;
846 ro->ro_rt->rt_use++;
847 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
848 dst = SIN(ro->ro_rt->rt_gateway);
849 }
850 if (ro->ro_rt->rt_flags & RTF_HOST) {
851 /* double negation needed for bool bit field */
852 ipobf.isbroadcast =
853 !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
854 } else {
855 /* Become a regular mutex */
856 RT_CONVERT_LOCK(ro->ro_rt);
857 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
858 }
859 /*
860 * For consistency with IPv6, as well as to ensure that
861 * IP_RECVIF is set correctly for packets that are sent
862 * to one of the local addresses. ia (rt_ifa) would have
863 * been fixed up by rt_setif for local routes. This
864 * would make it appear as if the packet arrives on the
865 * interface which owns the local address. Loopback
866 * multicast case is handled separately by ip_mloopback().
867 */
868 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
869 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
870 uint16_t srcidx;
871
872 m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
873
874 if (ia0 != NULL) {
875 srcidx = ia0->ifa_ifp->if_index;
876 } else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
877 ro->ro_srcia != NULL) {
878 srcidx = ro->ro_srcia->ifa_ifp->if_index;
879 } else {
880 srcidx = 0;
881 }
882
883 ip_setsrcifaddr_info(m, srcidx, NULL);
884 ip_setdstifaddr_info(m, 0, ia);
885 }
886 RT_UNLOCK(ro->ro_rt);
887 if (ia0 != NULL) {
888 IFA_REMREF(ia0);
889 ia0 = NULL;
890 }
891 }
892
893 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
894 struct ifnet *srcifp = NULL;
895 struct in_multi *inm;
896 u_int32_t vif = 0;
897 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
898 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
899
900 m->m_flags |= M_MCAST;
901 /*
902 * IP destination address is multicast. Make sure "dst"
903 * still points to the address in "ro". (It may have been
904 * changed to point to a gateway address, above.)
905 */
906 dst = SIN(&ro->ro_dst);
907 /*
908 * See if the caller provided any multicast options
909 */
910 if (imo != NULL) {
911 IMO_LOCK(imo);
912 vif = imo->imo_multicast_vif;
913 ttl = imo->imo_multicast_ttl;
914 loop = imo->imo_multicast_loop;
915 if (!(flags & IP_RAWOUTPUT)) {
916 ip->ip_ttl = ttl;
917 }
918 if (imo->imo_multicast_ifp != NULL) {
919 ifp = imo->imo_multicast_ifp;
920 }
921 IMO_UNLOCK(imo);
922 } else if (!(flags & IP_RAWOUTPUT)) {
923 vif = -1;
924 ip->ip_ttl = ttl;
925 }
926 /*
927 * Confirm that the outgoing interface supports multicast.
928 */
929 if (imo == NULL || vif == -1) {
930 if (!(ifp->if_flags & IFF_MULTICAST)) {
931 OSAddAtomic(1, &ipstat.ips_noroute);
932 error = ENETUNREACH;
933 goto bad;
934 }
935 }
936 /*
937 * If source address not specified yet, use address
938 * of outgoing interface.
939 */
940 if (ip->ip_src.s_addr == INADDR_ANY) {
941 struct in_ifaddr *ia1;
942 lck_rw_lock_shared(&in_ifaddr_rwlock);
943 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
944 IFA_LOCK_SPIN(&ia1->ia_ifa);
945 if (ia1->ia_ifp == ifp) {
946 ip->ip_src = IA_SIN(ia1)->sin_addr;
947 srcifp = ifp;
948 IFA_UNLOCK(&ia1->ia_ifa);
949 break;
950 }
951 IFA_UNLOCK(&ia1->ia_ifa);
952 }
953 lck_rw_done(&in_ifaddr_rwlock);
954 if (ip->ip_src.s_addr == INADDR_ANY) {
955 error = ENETUNREACH;
956 goto bad;
957 }
958 }
959
960 in_multihead_lock_shared();
961 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
962 in_multihead_lock_done();
963 if (inm != NULL && (imo == NULL || loop)) {
964 /*
965 * If we belong to the destination multicast group
966 * on the outgoing interface, and the caller did not
967 * forbid loopback, loop back a copy.
968 */
969 if (!TAILQ_EMPTY(&ipv4_filters)
970 #if NECP
971 && !necp_packet_should_skip_filters(m)
972 #endif // NECP
973 ) {
974 struct ipfilter *filter;
975 int seen = (inject_filter_ref == NULL);
976
977 if (imo != NULL) {
978 ipf_pktopts.ippo_flags |=
979 IPPOF_MCAST_OPTS;
980 ipf_pktopts.ippo_mcast_ifnet = ifp;
981 ipf_pktopts.ippo_mcast_ttl = ttl;
982 ipf_pktopts.ippo_mcast_loop = loop;
983 }
984
985 ipf_ref();
986
987 /*
988 * 4135317 - always pass network byte
989 * order to filter
990 */
991 #if BYTE_ORDER != BIG_ENDIAN
992 HTONS(ip->ip_len);
993 HTONS(ip->ip_off);
994 #endif
995 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
996 if (seen == 0) {
997 if ((struct ipfilter *)
998 inject_filter_ref == filter) {
999 seen = 1;
1000 }
1001 } else if (filter->ipf_filter.
1002 ipf_output != NULL) {
1003 errno_t result;
1004 result = filter->ipf_filter.
1005 ipf_output(filter->
1006 ipf_filter.cookie,
1007 (mbuf_t *)&m, ippo);
1008 if (result == EJUSTRETURN) {
1009 ipf_unref();
1010 INM_REMREF(inm);
1011 goto done;
1012 }
1013 if (result != 0) {
1014 ipf_unref();
1015 INM_REMREF(inm);
1016 goto bad;
1017 }
1018 }
1019 }
1020
1021 /* set back to host byte order */
1022 ip = mtod(m, struct ip *);
1023 #if BYTE_ORDER != BIG_ENDIAN
1024 NTOHS(ip->ip_len);
1025 NTOHS(ip->ip_off);
1026 #endif
1027 ipf_unref();
1028 ipobf.didfilter = true;
1029 }
1030 ip_mloopback(srcifp, ifp, m, dst, hlen);
1031 }
1032 if (inm != NULL) {
1033 INM_REMREF(inm);
1034 }
1035 /*
1036 * Multicasts with a time-to-live of zero may be looped-
1037 * back, above, but must not be transmitted on a network.
1038 * Also, multicasts addressed to the loopback interface
1039 * are not sent -- the above call to ip_mloopback() will
1040 * loop back a copy if this host actually belongs to the
1041 * destination group on the loopback interface.
1042 */
1043 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1044 m_freem(m);
1045 goto done;
1046 }
1047
1048 goto sendit;
1049 }
1050 /*
1051 * If source address not specified yet, use address
1052 * of outgoing interface.
1053 */
1054 if (ip->ip_src.s_addr == INADDR_ANY) {
1055 IFA_LOCK_SPIN(&ia->ia_ifa);
1056 ip->ip_src = IA_SIN(ia)->sin_addr;
1057 IFA_UNLOCK(&ia->ia_ifa);
1058 }
1059
1060 /*
1061 * Look for broadcast address and
1062 * and verify user is allowed to send
1063 * such a packet.
1064 */
1065 if (ipobf.isbroadcast) {
1066 if (!(ifp->if_flags & IFF_BROADCAST)) {
1067 error = EADDRNOTAVAIL;
1068 goto bad;
1069 }
1070 if (!(flags & IP_ALLOWBROADCAST)) {
1071 error = EACCES;
1072 goto bad;
1073 }
1074 /* don't allow broadcast messages to be fragmented */
1075 if ((u_short)ip->ip_len > ifp->if_mtu) {
1076 error = EMSGSIZE;
1077 goto bad;
1078 }
1079 m->m_flags |= M_BCAST;
1080 } else {
1081 m->m_flags &= ~M_BCAST;
1082 }
1083
1084 sendit:
1085 #if PF
1086 /* Invoke outbound packet filter */
1087 if (PF_IS_ENABLED) {
1088 int rc;
1089
1090 m0 = m; /* Save for later */
1091 #if DUMMYNET
1092 rc = ip_output_pf_dn_hook(ifp, mppn, &m, dn_pf_rule, ro, dst, flags, ipoa);
1093 #else /* DUMMYNET */
1094 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1095 #endif /* DUMMYNET */
1096 if (rc != 0 || m == NULL) {
1097 /* Move to the next packet */
1098 m = *mppn;
1099
1100 /* Skip ahead if first packet in list got dropped */
1101 if (packetlist == m0) {
1102 packetlist = m;
1103 }
1104
1105 if (m != NULL) {
1106 m0 = m;
1107 /* Next packet in the chain */
1108 goto loopit;
1109 } else if (packetlist != NULL) {
1110 /* No more packet; send down the chain */
1111 goto sendchain;
1112 }
1113 /* Nothing left; we're done */
1114 goto done;
1115 }
1116 m0 = m;
1117 ip = mtod(m, struct ip *);
1118 pkt_dst = ip->ip_dst;
1119 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1120 }
1121 #endif /* PF */
1122 /*
1123 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1124 */
1125 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1126 IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1127 ip_linklocal_stat.iplls_out_total++;
1128 if (ip->ip_ttl != MAXTTL) {
1129 ip_linklocal_stat.iplls_out_badttl++;
1130 ip->ip_ttl = MAXTTL;
1131 }
1132 }
1133
1134 if (!ipobf.didfilter &&
1135 !TAILQ_EMPTY(&ipv4_filters)
1136 #if NECP
1137 && !necp_packet_should_skip_filters(m)
1138 #endif // NECP
1139 ) {
1140 struct ipfilter *filter;
1141 int seen = (inject_filter_ref == NULL);
1142 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1143
1144 /*
1145 * Check that a TSO frame isn't passed to a filter.
1146 * This could happen if a filter is inserted while
1147 * TCP is sending the TSO packet.
1148 */
1149 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1150 error = EMSGSIZE;
1151 goto bad;
1152 }
1153
1154 ipf_ref();
1155
1156 /* 4135317 - always pass network byte order to filter */
1157 #if BYTE_ORDER != BIG_ENDIAN
1158 HTONS(ip->ip_len);
1159 HTONS(ip->ip_off);
1160 #endif
1161 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1162 if (seen == 0) {
1163 if ((struct ipfilter *)inject_filter_ref ==
1164 filter) {
1165 seen = 1;
1166 }
1167 } else if (filter->ipf_filter.ipf_output) {
1168 errno_t result;
1169 result = filter->ipf_filter.
1170 ipf_output(filter->ipf_filter.cookie,
1171 (mbuf_t *)&m, ippo);
1172 if (result == EJUSTRETURN) {
1173 ipf_unref();
1174 goto done;
1175 }
1176 if (result != 0) {
1177 ipf_unref();
1178 goto bad;
1179 }
1180 }
1181 }
1182 /* set back to host byte order */
1183 ip = mtod(m, struct ip *);
1184 #if BYTE_ORDER != BIG_ENDIAN
1185 NTOHS(ip->ip_len);
1186 NTOHS(ip->ip_off);
1187 #endif
1188 ipf_unref();
1189 }
1190
1191 #if NECP
1192 /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1193 necp_matched_policy_id = necp_ip_output_find_policy_match(m,
1194 flags, (flags & IP_OUTARGS) ? ipoa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter);
1195 if (necp_matched_policy_id) {
1196 necp_mark_packet_from_ip(m, necp_matched_policy_id);
1197 switch (necp_result) {
1198 case NECP_KERNEL_POLICY_RESULT_PASS:
1199 if (necp_result_parameter.pass_flags & NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC) {
1200 break;
1201 }
1202 /* Check if the interface is allowed */
1203 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1204 error = EHOSTUNREACH;
1205 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1206 goto bad;
1207 }
1208 goto skip_ipsec;
1209 case NECP_KERNEL_POLICY_RESULT_DROP:
1210 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
1211 /* Flow divert packets should be blocked at the IP layer */
1212 error = EHOSTUNREACH;
1213 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1214 goto bad;
1215 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
1216 /* Verify that the packet is being routed to the tunnel */
1217 struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
1218 if (policy_ifp == ifp) {
1219 /* Check if the interface is allowed */
1220 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1221 error = EHOSTUNREACH;
1222 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1223 goto bad;
1224 }
1225 goto skip_ipsec;
1226 } else {
1227 if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
1228 /* Check if the interface is allowed */
1229 if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
1230 error = EHOSTUNREACH;
1231 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1232 goto bad;
1233 }
1234
1235 /*
1236 * Update the QOS marking policy if
1237 * 1. up layer asks it to do so
1238 * 2. net_qos_policy_restricted is not set
1239 * 3. qos_marking_gencount doesn't match necp_kernel_socket_policies_gencount (checked in necp_lookup_current_qos_marking)
1240 */
1241 if (ipoa != NULL &&
1242 (ipoa->ipoa_flags & IPOAF_REDO_QOSMARKING_POLICY) &&
1243 net_qos_policy_restricted != 0) {
1244 bool qos_marking = (ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED) ? TRUE : FALSE;
1245 qos_marking = necp_lookup_current_qos_marking(&ipoa->qos_marking_gencount, NULL, policy_ifp, necp_result_parameter.route_rule_id, qos_marking);
1246 if (qos_marking) {
1247 ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1248 } else {
1249 ipoa->ipoa_flags &= ~IPOAF_QOSMARKING_ALLOWED;
1250 }
1251 }
1252
1253 /* Set ifp to the tunnel interface, since it is compatible with the packet */
1254 ifp = policy_ifp;
1255 ro = &necp_route;
1256 goto skip_ipsec;
1257 } else {
1258 error = ENETUNREACH;
1259 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1260 goto bad;
1261 }
1262 }
1263 }
1264 default:
1265 break;
1266 }
1267 }
1268 /* Catch-all to check if the interface is allowed */
1269 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1270 error = EHOSTUNREACH;
1271 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1272 goto bad;
1273 }
1274 #endif /* NECP */
1275
1276 #if IPSEC
1277 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) {
1278 goto skip_ipsec;
1279 }
1280
1281 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1282
1283 if (sp == NULL) {
1284 /* get SP for this packet */
1285 if (so != NULL) {
1286 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1287 so, &error);
1288 } else {
1289 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1290 flags, &error);
1291 }
1292 if (sp == NULL) {
1293 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1294 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1295 0, 0, 0, 0, 0);
1296 goto bad;
1297 }
1298 }
1299
1300 error = 0;
1301
1302 /* check policy */
1303 switch (sp->policy) {
1304 case IPSEC_POLICY_DISCARD:
1305 case IPSEC_POLICY_GENERATE:
1306 /*
1307 * This packet is just discarded.
1308 */
1309 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1310 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1311 1, 0, 0, 0, 0);
1312 goto bad;
1313
1314 case IPSEC_POLICY_BYPASS:
1315 case IPSEC_POLICY_NONE:
1316 /* no need to do IPsec. */
1317 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1318 2, 0, 0, 0, 0);
1319 goto skip_ipsec;
1320
1321 case IPSEC_POLICY_IPSEC:
1322 if (sp->req == NULL) {
1323 /* acquire a policy */
1324 error = key_spdacquire(sp);
1325 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1326 3, 0, 0, 0, 0);
1327 goto bad;
1328 }
1329 if (sp->ipsec_if) {
1330 /* Verify the redirect to ipsec interface */
1331 if (sp->ipsec_if == ifp) {
1332 goto skip_ipsec;
1333 }
1334 goto bad;
1335 }
1336 break;
1337
1338 case IPSEC_POLICY_ENTRUST:
1339 default:
1340 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1341 }
1342 {
1343 ipsec_state.m = m;
1344 if (flags & IP_ROUTETOIF) {
1345 bzero(&ipsec_state.ro, sizeof(ipsec_state.ro));
1346 } else {
1347 route_copyout((struct route *)&ipsec_state.ro, ro, sizeof(struct route));
1348 }
1349 ipsec_state.dst = SA(dst);
1350
1351 ip->ip_sum = 0;
1352
1353 /*
1354 * XXX
1355 * delayed checksums are not currently compatible with IPsec
1356 */
1357 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1358 in_delayed_cksum(m);
1359 }
1360
1361 #if BYTE_ORDER != BIG_ENDIAN
1362 HTONS(ip->ip_len);
1363 HTONS(ip->ip_off);
1364 #endif
1365
1366 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1367 struct ip *, ip, struct ifnet *, ifp,
1368 struct ip *, ip, struct ip6_hdr *, NULL);
1369
1370 error = ipsec4_output(&ipsec_state, sp, flags);
1371 if (ipsec_state.tunneled == 6) {
1372 m0 = m = NULL;
1373 error = 0;
1374 goto bad;
1375 }
1376
1377 m0 = m = ipsec_state.m;
1378
1379 #if DUMMYNET
1380 /*
1381 * If we're about to use the route in ipsec_state
1382 * and this came from dummynet, cleaup now.
1383 */
1384 if (ro == &saved_route &&
1385 (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) {
1386 ROUTE_RELEASE(ro);
1387 }
1388 #endif /* DUMMYNET */
1389
1390 if (flags & IP_ROUTETOIF) {
1391 /*
1392 * if we have tunnel mode SA, we may need to ignore
1393 * IP_ROUTETOIF.
1394 */
1395 if (ipsec_state.tunneled) {
1396 flags &= ~IP_ROUTETOIF;
1397 ro = (struct route *)&ipsec_state.ro;
1398 }
1399 } else {
1400 ro = (struct route *)&ipsec_state.ro;
1401 }
1402 dst = SIN(ipsec_state.dst);
1403 if (error) {
1404 /* mbuf is already reclaimed in ipsec4_output. */
1405 m0 = NULL;
1406 switch (error) {
1407 case EHOSTUNREACH:
1408 case ENETUNREACH:
1409 case EMSGSIZE:
1410 case ENOBUFS:
1411 case ENOMEM:
1412 break;
1413 default:
1414 printf("ip4_output (ipsec): error code %d\n", error);
1415 OS_FALLTHROUGH;
1416 case ENOENT:
1417 /* don't show these error codes to the user */
1418 error = 0;
1419 break;
1420 }
1421 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1422 4, 0, 0, 0, 0);
1423 goto bad;
1424 }
1425 }
1426
1427 /* be sure to update variables that are affected by ipsec4_output() */
1428 ip = mtod(m, struct ip *);
1429
1430 #ifdef _IP_VHL
1431 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1432 #else /* !_IP_VHL */
1433 hlen = ip->ip_hl << 2;
1434 #endif /* !_IP_VHL */
1435 /* Check that there wasn't a route change and src is still valid */
1436 if (ROUTE_UNUSABLE(ro)) {
1437 ROUTE_RELEASE(ro);
1438 VERIFY(src_ia == NULL);
1439 if (ip->ip_src.s_addr != INADDR_ANY &&
1440 !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1441 (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1442 error = EADDRNOTAVAIL;
1443 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1444 5, 0, 0, 0, 0);
1445 goto bad;
1446 }
1447 if (src_ia != NULL) {
1448 IFA_REMREF(&src_ia->ia_ifa);
1449 src_ia = NULL;
1450 }
1451 }
1452
1453 if (ro->ro_rt == NULL) {
1454 if (!(flags & IP_ROUTETOIF)) {
1455 printf("%s: can't update route after "
1456 "IPsec processing\n", __func__);
1457 error = EHOSTUNREACH; /* XXX */
1458 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1459 6, 0, 0, 0, 0);
1460 goto bad;
1461 }
1462 } else {
1463 if (ia != NULL) {
1464 IFA_REMREF(&ia->ia_ifa);
1465 }
1466 RT_LOCK_SPIN(ro->ro_rt);
1467 ia = ifatoia(ro->ro_rt->rt_ifa);
1468 if (ia != NULL) {
1469 /* Become a regular mutex */
1470 RT_CONVERT_LOCK(ro->ro_rt);
1471 IFA_ADDREF(&ia->ia_ifa);
1472 }
1473 ifp = ro->ro_rt->rt_ifp;
1474 RT_UNLOCK(ro->ro_rt);
1475 }
1476
1477 /* make it flipped, again. */
1478 #if BYTE_ORDER != BIG_ENDIAN
1479 NTOHS(ip->ip_len);
1480 NTOHS(ip->ip_off);
1481 #endif
1482 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1483 7, 0xff, 0xff, 0xff, 0xff);
1484
1485 /* Pass to filters again */
1486 if (!TAILQ_EMPTY(&ipv4_filters)
1487 #if NECP
1488 && !necp_packet_should_skip_filters(m)
1489 #endif // NECP
1490 ) {
1491 struct ipfilter *filter;
1492
1493 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1494
1495 /*
1496 * Check that a TSO frame isn't passed to a filter.
1497 * This could happen if a filter is inserted while
1498 * TCP is sending the TSO packet.
1499 */
1500 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1501 error = EMSGSIZE;
1502 goto bad;
1503 }
1504
1505 ipf_ref();
1506
1507 /* 4135317 - always pass network byte order to filter */
1508 #if BYTE_ORDER != BIG_ENDIAN
1509 HTONS(ip->ip_len);
1510 HTONS(ip->ip_off);
1511 #endif
1512 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1513 if (filter->ipf_filter.ipf_output) {
1514 errno_t result;
1515 result = filter->ipf_filter.
1516 ipf_output(filter->ipf_filter.cookie,
1517 (mbuf_t *)&m, ippo);
1518 if (result == EJUSTRETURN) {
1519 ipf_unref();
1520 goto done;
1521 }
1522 if (result != 0) {
1523 ipf_unref();
1524 goto bad;
1525 }
1526 }
1527 }
1528 /* set back to host byte order */
1529 ip = mtod(m, struct ip *);
1530 #if BYTE_ORDER != BIG_ENDIAN
1531 NTOHS(ip->ip_len);
1532 NTOHS(ip->ip_off);
1533 #endif
1534 ipf_unref();
1535 }
1536 skip_ipsec:
1537 #endif /* IPSEC */
1538
1539
1540 /* 127/8 must not appear on wire - RFC1122 */
1541 if (!(ifp->if_flags & IFF_LOOPBACK) &&
1542 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1543 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1544 OSAddAtomic(1, &ipstat.ips_badaddr);
1545 error = EADDRNOTAVAIL;
1546 goto bad;
1547 }
1548
1549 if (ipoa != NULL) {
1550 u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
1551
1552 error = set_packet_qos(m, ifp,
1553 ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
1554 ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
1555 if (error == 0) {
1556 ip->ip_tos &= IPTOS_ECN_MASK;
1557 ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT;
1558 } else {
1559 printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
1560 error = 0;
1561 }
1562 }
1563
1564 ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1565 ip->ip_len, &sw_csum);
1566
1567 interface_mtu = ifp->if_mtu;
1568
1569 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
1570 interface_mtu = IN6_LINKMTU(ifp);
1571 /* Further adjust the size for CLAT46 expansion */
1572 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
1573 }
1574
1575 /*
1576 * If small enough for interface, or the interface will take
1577 * care of the fragmentation for us, can just send directly.
1578 */
1579 if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
1580 (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1581 #if BYTE_ORDER != BIG_ENDIAN
1582 HTONS(ip->ip_len);
1583 HTONS(ip->ip_off);
1584 #endif
1585
1586 ip->ip_sum = 0;
1587 if ((sw_csum & CSUM_DELAY_IP) || __improbable(force_ipsum != 0)) {
1588 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1589 sw_csum &= ~CSUM_DELAY_IP;
1590 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1591 }
1592
1593 #if IPSEC
1594 /* clean ipsec history once it goes out of the node */
1595 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1596 ipsec_delaux(m);
1597 }
1598 #endif /* IPSEC */
1599 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1600 (m->m_pkthdr.tso_segsz > 0)) {
1601 scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1602 } else {
1603 scnt++;
1604 }
1605
1606 if (packetchain == 0) {
1607 if (ro->ro_rt != NULL && nstat_collect) {
1608 nstat_route_tx(ro->ro_rt, scnt,
1609 m->m_pkthdr.len, 0);
1610 }
1611
1612 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1613 SA(dst), 0, adv);
1614 if (dlil_verbose && error) {
1615 printf("dlil_output error on interface %s: %d\n",
1616 ifp->if_xname, error);
1617 }
1618 scnt = 0;
1619 goto done;
1620 } else {
1621 /*
1622 * packet chaining allows us to reuse the
1623 * route for all packets
1624 */
1625 bytecnt += m->m_pkthdr.len;
1626 mppn = &m->m_nextpkt;
1627 m = m->m_nextpkt;
1628 if (m == NULL) {
1629 #if PF
1630 sendchain:
1631 #endif /* PF */
1632 if (pktcnt > ip_maxchainsent) {
1633 ip_maxchainsent = pktcnt;
1634 }
1635 if (ro->ro_rt != NULL && nstat_collect) {
1636 nstat_route_tx(ro->ro_rt, scnt,
1637 bytecnt, 0);
1638 }
1639
1640 error = dlil_output(ifp, PF_INET, packetlist,
1641 ro->ro_rt, SA(dst), 0, adv);
1642 if (dlil_verbose && error) {
1643 printf("dlil_output error on interface %s: %d\n",
1644 ifp->if_xname, error);
1645 }
1646 pktcnt = 0;
1647 scnt = 0;
1648 bytecnt = 0;
1649 goto done;
1650 }
1651 m0 = m;
1652 pktcnt++;
1653 goto loopit;
1654 }
1655 }
1656
1657 VERIFY(interface_mtu != 0);
1658 /*
1659 * Too large for interface; fragment if possible.
1660 * Must be able to put at least 8 bytes per fragment.
1661 * Balk when DF bit is set or the interface didn't support TSO.
1662 */
1663 if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1664 (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1665 error = EMSGSIZE;
1666 /*
1667 * This case can happen if the user changed the MTU
1668 * of an interface after enabling IP on it. Because
1669 * most netifs don't keep track of routes pointing to
1670 * them, there is no way for one to update all its
1671 * routes when the MTU is changed.
1672 */
1673 if (ro->ro_rt) {
1674 RT_LOCK_SPIN(ro->ro_rt);
1675 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1676 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1677 (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
1678 ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
1679 }
1680 RT_UNLOCK(ro->ro_rt);
1681 }
1682 if (pktcnt > 0) {
1683 m0 = packetlist;
1684 }
1685 OSAddAtomic(1, &ipstat.ips_cantfrag);
1686 goto bad;
1687 }
1688
1689 /*
1690 * XXX Only TCP seems to be passing a list of packets here.
1691 * The following issue is limited to UDP datagrams with 0 checksum.
1692 * For now limit it to the case when single packet is passed down.
1693 */
1694 if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
1695 /*
1696 * If it is a UDP packet that has checksum set to 0
1697 * and is also not being offloaded, compute a full checksum
1698 * and update the UDP checksum.
1699 */
1700 if (ip->ip_p == IPPROTO_UDP &&
1701 !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
1702 struct udphdr *uh = NULL;
1703
1704 if (m->m_len < hlen + sizeof(struct udphdr)) {
1705 m = m_pullup(m, hlen + sizeof(struct udphdr));
1706 if (m == NULL) {
1707 error = ENOBUFS;
1708 m0 = m;
1709 goto bad;
1710 }
1711 m0 = m;
1712 ip = mtod(m, struct ip *);
1713 }
1714 /*
1715 * Get UDP header and if checksum is 0, then compute the full
1716 * checksum.
1717 */
1718 uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
1719 if (uh->uh_sum == 0) {
1720 uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
1721 ip->ip_len - hlen);
1722 if (uh->uh_sum == 0) {
1723 uh->uh_sum = 0xffff;
1724 }
1725 }
1726 }
1727 }
1728
1729 error = ip_fragment(m, ifp, interface_mtu, sw_csum);
1730 if (error != 0) {
1731 m0 = m = NULL;
1732 goto bad;
1733 }
1734
1735 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1736 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1737
1738 for (m = m0; m; m = m0) {
1739 m0 = m->m_nextpkt;
1740 m->m_nextpkt = 0;
1741 #if IPSEC
1742 /* clean ipsec history once it goes out of the node */
1743 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1744 ipsec_delaux(m);
1745 }
1746 #endif /* IPSEC */
1747 if (error == 0) {
1748 if ((packetchain != 0) && (pktcnt > 0)) {
1749 panic("%s: mix of packet in packetlist is "
1750 "wrong=%p", __func__, packetlist);
1751 /* NOTREACHED */
1752 }
1753 if (ro->ro_rt != NULL && nstat_collect) {
1754 nstat_route_tx(ro->ro_rt, 1,
1755 m->m_pkthdr.len, 0);
1756 }
1757 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1758 SA(dst), 0, adv);
1759 if (dlil_verbose && error) {
1760 printf("dlil_output error on interface %s: %d\n",
1761 ifp->if_xname, error);
1762 }
1763 } else {
1764 m_freem(m);
1765 }
1766 }
1767
1768 if (error == 0) {
1769 OSAddAtomic(1, &ipstat.ips_fragmented);
1770 }
1771
1772 done:
1773 if (ia != NULL) {
1774 IFA_REMREF(&ia->ia_ifa);
1775 ia = NULL;
1776 }
1777 #if IPSEC
1778 ROUTE_RELEASE(&ipsec_state.ro);
1779 if (sp != NULL) {
1780 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1781 printf("DP ip_output call free SP:%x\n", sp));
1782 key_freesp(sp, KEY_SADB_UNLOCKED);
1783 }
1784 #endif /* IPSEC */
1785 #if NECP
1786 ROUTE_RELEASE(&necp_route);
1787 #endif /* NECP */
1788 #if DUMMYNET
1789 ROUTE_RELEASE(&saved_route);
1790 #endif /* DUMMYNET */
1791
1792 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
1793 if (ip_output_measure) {
1794 net_perf_measure_time(&net_perf, &start_tv, packets_processed);
1795 net_perf_histogram(&net_perf, packets_processed);
1796 }
1797 return error;
1798 bad:
1799 if (pktcnt > 0) {
1800 m0 = packetlist;
1801 }
1802 m_freem_list(m0);
1803 goto done;
1804
1805 #undef ipsec_state
1806 #undef args
1807 #undef sro_fwd
1808 #undef saved_route
1809 #undef ipf_pktopts
1810 #undef IP_CHECK_RESTRICTIONS
1811 }
1812
/*
 * ip_fragment: split an IPv4 packet that is too large for the outgoing
 * interface into a chain of fragments linked via m_nextpkt.
 *
 * m       - packet to fragment (IP header in host byte order, as used
 *           throughout ip_output); consumed on error.
 * ifp     - outgoing interface; used for CLAT46 MTU adjustment and to
 *           decide whether delayed transport checksums must be finalized.
 * mtu     - link MTU to fragment against.
 * sw_csum - software checksum work remaining; if CSUM_DELAY_IP is set,
 *           each fragment's IP header checksum is computed here.
 *
 * Returns 0 with m->m_nextpkt chaining the fragments (first fragment is
 * the original mbuf, trimmed), or an errno after freeing the whole list.
 */
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum)
{
	struct ip *ip, *mhip;
	int len, hlen, mhlen, firstlen, off, error = 0;
	struct mbuf **mnext = &m->m_nextpkt, *m0;
	int nfrags = 1;

	ip = mtod(m, struct ip *);
#ifdef _IP_VHL
	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
#else /* !_IP_VHL */
	hlen = ip->ip_hl << 2;
#endif /* !_IP_VHL */

	/*
	 * We need to adjust the fragment sizes to account
	 * for IPv6 fragment header if it needs to be translated
	 * from IPv4 to IPv6.
	 */
	if (IS_INTF_CLAT46(ifp)) {
		mtu -= sizeof(struct ip6_frag);
	}

	/*
	 * Fragment payload length must be a multiple of 8 bytes, since
	 * the IP fragment offset field counts 8-byte units.
	 */
	firstlen = len = (mtu - hlen) & ~7;
	if (len < 8) {
		/* cannot carry even the minimum 8 bytes per fragment */
		m_freem(m);
		return EMSGSIZE;
	}

	/*
	 * if the interface will not calculate checksums on
	 * fragmented packets, then do it here.
	 */
	if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
	    !(ifp->if_hwassist & CSUM_IP_FRAGS)) {
		in_delayed_cksum(m);
	}

	/*
	 * Loop through length of segment after first fragment,
	 * make new header and copy data of each part and link onto chain.
	 */
	m0 = m;
	mhlen = sizeof(struct ip);
	for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
		MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
		if (m == NULL) {
			error = ENOBUFS;
			OSAddAtomic(1, &ipstat.ips_odropped);
			goto sendorfree;
		}
		/* inherit multicast flag; all pieces are fragments */
		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
		m->m_data += max_linkhdr;
		mhip = mtod(m, struct ip *);
		*mhip = *ip;
		if (hlen > sizeof(struct ip)) {
			/* copy only the options that should be replicated */
			mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
			mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
		}
		m->m_len = mhlen;
		/* offset in 8-byte units, preserving any original offset */
		mhip->ip_off = (u_short)(((off - hlen) >> 3) + (ip->ip_off & ~IP_MF));
		if (ip->ip_off & IP_MF) {
			mhip->ip_off |= IP_MF;
		}
		if (off + len >= (u_short)ip->ip_len) {
			/* last fragment: take whatever remains */
			len = (u_short)ip->ip_len - off;
		} else {
			mhip->ip_off |= IP_MF;
		}
		mhip->ip_len = htons((u_short)(len + mhlen));
		m->m_next = m_copy(m0, off, len);
		if (m->m_next == NULL) {
			(void) m_free(m);
			error = ENOBUFS;        /* m_copy() failed; best-guess errno */
			OSAddAtomic(1, &ipstat.ips_odropped);
			goto sendorfree;
		}
		m->m_pkthdr.len = mhlen + len;
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;

		M_COPY_CLASSIFIER(m, m0);
		M_COPY_PFTAG(m, m0);
		M_COPY_NECPTAG(m, m0);

		/* ip_off leaves here in network byte order */
#if BYTE_ORDER != BIG_ENDIAN
		HTONS(mhip->ip_off);
#endif

		mhip->ip_sum = 0;
		if (sw_csum & CSUM_DELAY_IP) {
			mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
		}
		*mnext = m;
		mnext = &m->m_nextpkt;
		nfrags++;
	}
	OSAddAtomic(nfrags, &ipstat.ips_ofragments);

	/* set first/last markers for fragment chain */
	m->m_flags |= M_LASTFRAG;
	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
	m0->m_pkthdr.csum_data = nfrags;

	/*
	 * Update first fragment by trimming what's been copied out
	 * and updating header, then send each fragment (in order).
	 */
	m = m0;
	m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
	m->m_pkthdr.len = hlen + firstlen;
	ip->ip_len = htons((u_short)m->m_pkthdr.len);
	ip->ip_off |= IP_MF;

#if BYTE_ORDER != BIG_ENDIAN
	HTONS(ip->ip_off);
#endif

	ip->ip_sum = 0;
	if (sw_csum & CSUM_DELAY_IP) {
		ip->ip_sum = ip_cksum_hdr_out(m, hlen);
		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
	}
sendorfree:
	if (error) {
		/* free the entire partially-built fragment chain */
		m_freem_list(m0);
	}

	return error;
}
1945
1946 static void
ip_out_cksum_stats(int proto,u_int32_t len)1947 ip_out_cksum_stats(int proto, u_int32_t len)
1948 {
1949 switch (proto) {
1950 case IPPROTO_TCP:
1951 tcp_out_cksum_stats(len);
1952 break;
1953 case IPPROTO_UDP:
1954 udp_out_cksum_stats(len);
1955 break;
1956 default:
1957 /* keep only TCP or UDP stats for now */
1958 break;
1959 }
1960 }
1961
1962 /*
1963 * Process a delayed payload checksum calculation (outbound path.)
1964 *
1965 * hoff is the number of bytes beyond the mbuf data pointer which
1966 * points to the IP header.
1967 *
1968 * Returns a bitmask representing all the work done in software.
1969 */
uint32_t
in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
{
	/* large enough for a maximal IP header: 15 32-bit words (60 bytes) */
	unsigned char buf[15 << 2] __attribute__((aligned(8)));
	struct ip *ip;
	uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
	uint16_t csum, ip_len;

	_CASSERT(sizeof(csum) == sizeof(uint16_t));
	VERIFY(m->m_flags & M_PKTHDR);

	/* only finalize the work both requested and still pending */
	sw_csum = (csum_flags & m->m_pkthdr.csum_flags);

	if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) {
		goto done;
	}

	mlen = m->m_pkthdr.len;                         /* total mbuf len */

	/* sanity check (need at least simple IP header) */
	if (mlen < (hoff + sizeof(*ip))) {
		panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
		    "(%u+%u)\n", __func__, m, mlen, hoff,
		    (uint32_t)sizeof(*ip));
		/* NOTREACHED */
	}

	/*
	 * In case the IP header is not contiguous, or not 32-bit aligned,
	 * or if we're computing the IP header checksum, copy it to a local
	 * buffer.  Copy only the simple IP header here (IP options case
	 * is handled below.)
	 */
	if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof(*ip)) > m->m_len ||
	    !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
		m_copydata(m, hoff, sizeof(*ip), (caddr_t)buf);
		ip = (struct ip *)(void *)buf;
		_hlen = sizeof(*ip);    /* nonzero: ip points at local copy */
	} else {
		ip = (struct ip *)(void *)(m->m_data + hoff);
		_hlen = 0;              /* zero: ip points into the mbuf */
	}

	hlen = IP_VHL_HL(ip->ip_vhl) << 2;              /* IP header len */

	/* sanity check */
	if (mlen < (hoff + hlen)) {
		panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
		    "hoff %u", __func__, m, mlen, hlen, hoff);
		/* NOTREACHED */
	}

	/*
	 * We could be in the context of an IP or interface filter; in the
	 * former case, ip_len would be in host (correct) order while for
	 * the latter it would be in network order.  Because of this, we
	 * attempt to interpret the length field by comparing it against
	 * the actual packet length.  If the comparison fails, byte swap
	 * the length and check again.  If it still fails, use the actual
	 * packet length.  This also covers the trailing bytes case.
	 */
	ip_len = ip->ip_len;
	if (ip_len != (mlen - hoff)) {
		ip_len = OSSwapInt16(ip_len);
		if (ip_len != (mlen - hoff)) {
			/* neither byte order matches; trust the mbuf length */
			printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
			    "[swapped %d (%x)] doesn't match actual packet "
			    "length; %d is used instead\n", __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p,
			    ip->ip_len, ip->ip_len, ip_len, ip_len,
			    (mlen - hoff));
			if (mlen - hoff > UINT16_MAX) {
				panic("%s: mlen %u - hoff %u > 65535",
				    __func__, mlen, hoff);
			}
			ip_len = (uint16_t)(mlen - hoff);
		}
	}

	len = ip_len - hlen;                            /* csum span */

	if (sw_csum & CSUM_DELAY_DATA) {
		uint16_t ulpoff;

		/*
		 * offset is added to the lower 16-bit value of csum_data,
		 * which is expected to contain the ULP offset; therefore
		 * CSUM_PARTIAL offset adjustment must be undone.
		 */
		if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL | CSUM_DATA_VALID)) ==
		    (CSUM_PARTIAL | CSUM_DATA_VALID)) {
			/*
			 * Get back the original ULP offset (this will
			 * undo the CSUM_PARTIAL logic in ip_output.)
			 */
			m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
			    m->m_pkthdr.csum_tx_start);
		}

		ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
		offset = hoff + hlen;                   /* ULP header */

		if (mlen < (ulpoff + sizeof(csum))) {
			panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
			    "cksum offset (%u) cksum flags 0x%x\n", __func__,
			    m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
			/* NOTREACHED */
		}

		/* one's-complement sum over the transport header + payload */
		csum = inet_cksum(m, 0, offset, len);

		/* Update stats */
		ip_out_cksum_stats(ip->ip_p, len);

		/* RFC1122 4.1.3.4: a transmitted 0 checksum becomes 0xffff */
		if (csum == 0 &&
		    (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_ZERO_INVERT))) {
			csum = 0xffff;
		}

		/* Insert the checksum in the ULP csum field */
		offset += ulpoff;
		if (offset + sizeof(csum) > m->m_len) {
			/* field spans mbufs; let m_copyback handle it */
			m_copyback(m, offset, sizeof(csum), &csum);
		} else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
			*(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
		} else {
			/* unaligned store; byte copy to avoid a fault */
			bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
		}
		m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
		    CSUM_PARTIAL | CSUM_ZERO_INVERT);
	}

	if (sw_csum & CSUM_DELAY_IP) {
		/* IP header must be in the local buffer */
		VERIFY(_hlen == sizeof(*ip));
		if (_hlen != hlen) {
			/* header has options; re-copy the full header */
			VERIFY(hlen <= sizeof(buf));
			m_copydata(m, hoff, hlen, (caddr_t)buf);
			ip = (struct ip *)(void *)buf;
			_hlen = hlen;
		}

		/*
		 * Compute the IP header checksum as if the IP length
		 * is the length which we believe is "correct"; see
		 * how ip_len gets calculated above.  Note that this
		 * is done on the local copy and not on the real one.
		 */
		ip->ip_len = htons(ip_len);
		ip->ip_sum = 0;
		csum = in_cksum_hdr_opt(ip);

		/* Update stats */
		ipstat.ips_snd_swcsum++;
		ipstat.ips_snd_swcsum_bytes += hlen;

		/*
		 * Insert only the checksum in the existing IP header
		 * csum field; all other fields are left unchanged.
		 */
		offset = hoff + offsetof(struct ip, ip_sum);
		if (offset + sizeof(csum) > m->m_len) {
			m_copyback(m, offset, sizeof(csum), &csum);
		} else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
			*(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
		} else {
			bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
		}
		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
	}

done:
	return sw_csum;
}
2145
2146 /*
2147 * Insert IP options into preformed packet.
2148 * Adjust IP destination as required for IP source routing,
2149 * as indicated by a non-zero in_addr at the start of the options.
2150 *
2151 * XXX This routine assumes that the packet has no options in place.
2152 */
static struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
	struct ipoption *p = mtod(opt, struct ipoption *);
	struct mbuf *n;
	struct ip *ip = mtod(m, struct ip *);
	unsigned optlen;

	/* ipopt_dst is stored ahead of the option list; don't count it */
	optlen = opt->m_len - sizeof(p->ipopt_dst);
	if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
		return m;               /* XXX should fail */
	}
	/* a non-zero first-hop address means source routing; send there first */
	if (p->ipopt_dst.s_addr) {
		ip->ip_dst = p->ipopt_dst;
	}
	/*
	 * If the IP header lives in a cluster, or there is not enough
	 * leading space in the mbuf to slide it back by optlen bytes,
	 * prepend a fresh header mbuf and move the IP header into it.
	 */
	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
		MGETHDR(n, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
		if (n == NULL) {
			return m;
		}
		n->m_pkthdr.rcvif = 0;
		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
		/* strip the IP header from the old head; it moves to n */
		m->m_len -= sizeof(struct ip);
		m->m_data += sizeof(struct ip);
		n->m_next = m;
		m = n;
		m->m_len = optlen + sizeof(struct ip);
		m->m_data += max_linkhdr;
		(void) memcpy(mtod(m, void *), ip, sizeof(struct ip));
	} else {
		/* room available in place: slide the IP header back by optlen */
		m->m_data -= optlen;
		m->m_len += optlen;
		m->m_pkthdr.len += optlen;
		/* regions may overlap, hence ovbcopy rather than bcopy */
		ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
	}
	ip = mtod(m, struct ip *);
	/* copy the options in right after the (possibly moved) IP header */
	bcopy(p->ipopt_list, ip + 1, optlen);
	*phlen = sizeof(struct ip) + optlen;
	ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
	/* NOTE(review): ip_len is still in host byte order at this point */
	ip->ip_len += optlen;
	return m;
}
2195
2196 /*
2197 * Copy options from ip to jp,
2198 * omitting those not copied during fragmentation.
2199 */
2200 static int
ip_optcopy(struct ip * ip,struct ip * jp)2201 ip_optcopy(struct ip *ip, struct ip *jp)
2202 {
2203 u_char *cp, *dp;
2204 int opt, optlen, cnt;
2205
2206 cp = (u_char *)(ip + 1);
2207 dp = (u_char *)(jp + 1);
2208 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
2209 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2210 opt = cp[0];
2211 if (opt == IPOPT_EOL) {
2212 break;
2213 }
2214 if (opt == IPOPT_NOP) {
2215 /* Preserve for IP mcast tunnel's LSRR alignment. */
2216 *dp++ = IPOPT_NOP;
2217 optlen = 1;
2218 continue;
2219 }
2220 #if DIAGNOSTIC
2221 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2222 panic("malformed IPv4 option passed to ip_optcopy");
2223 /* NOTREACHED */
2224 }
2225 #endif
2226 optlen = cp[IPOPT_OLEN];
2227 #if DIAGNOSTIC
2228 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2229 panic("malformed IPv4 option passed to ip_optcopy");
2230 /* NOTREACHED */
2231 }
2232 #endif
2233 /* bogus lengths should have been caught by ip_dooptions */
2234 if (optlen > cnt) {
2235 optlen = cnt;
2236 }
2237 if (IPOPT_COPIED(opt)) {
2238 bcopy(cp, dp, optlen);
2239 dp += optlen;
2240 }
2241 }
2242 for (optlen = (int)(dp - (u_char *)(jp + 1)); optlen & 0x3; optlen++) {
2243 *dp++ = IPOPT_EOL;
2244 }
2245 return optlen;
2246 }
2247
2248 /*
2249 * IP socket option processing.
2250 */
/*
 * Handle IPPROTO_IP-level socket options (both SOPT_SET and SOPT_GET)
 * for an AF_INET socket.  Returns 0 on success or an errno value.
 * Multicast options are delegated to inp_setmoptions()/inp_getmoptions().
 */
int
ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp = sotoinpcb(so);
	int error, optval;
	lck_mtx_t *mutex_held = NULL;

	error = optval = 0;
	/* only IP-level options are handled here */
	if (sopt->sopt_level != IPPROTO_IP) {
		return EINVAL;
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
		/*
		 * Wait if we are in the middle of ip_output
		 * as we unlocked the socket there and don't
		 * want to overwrite the IP options
		 */
		if (inp->inp_sndinprog_cnt > 0) {
			inp->inp_sndingprog_waiters++;

			while (inp->inp_sndinprog_cnt > 0) {
				msleep(&inp->inp_sndinprog_cnt, mutex_held,
				    PSOCK | PCATCH, "inp_sndinprog_cnt", NULL);
			}
			inp->inp_sndingprog_waiters--;
		}
		switch (sopt->sopt_name) {
#ifdef notyet
		case IP_RETOPTS:
#endif
		case IP_OPTIONS: {
			struct mbuf *m;

			/* options must fit a single small mbuf */
			if (sopt->sopt_valsize > MLEN) {
				error = EMSGSIZE;
				break;
			}
			MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
			    MT_HEADER);
			if (m == NULL) {
				error = ENOBUFS;
				break;
			}
			m->m_len = (int32_t)sopt->sopt_valsize;
			error = sooptcopyin(sopt, mtod(m, char *),
			    m->m_len, m->m_len);
			if (error) {
				m_freem(m);
				break;
			}

			/* ip_pcbopts consumes m and installs the options */
			return ip_pcbopts(sopt->sopt_name,
			           &inp->inp_options, m);
		}

		case IP_TOS:
		case IP_TTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_RECVPKTINFO:
		case IP_RECVTOS:
		case IP_DONTFRAG:
			/* all of these take a single int argument */
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error) {
				break;
			}

			switch (sopt->sopt_name) {
			case IP_TOS:
				/* TOS is a single octet on the wire */
				if (optval > UINT8_MAX) {
					error = EINVAL;
					break;
				}
				inp->inp_ip_tos = (uint8_t)optval;
				break;

			case IP_TTL:
				/* TTL is a single octet on the wire */
				if (optval > UINT8_MAX) {
					error = EINVAL;
					break;
				}
				inp->inp_ip_ttl = (uint8_t)optval;
				break;
/* set or clear a flag in inp_flags based on optval */
#define OPTSET(bit) do {                                                \
	if (optval) {                                                   \
	        inp->inp_flags |= bit;                                  \
	} else {                                                        \
	        inp->inp_flags &= ~bit;                                 \
	}                                                               \
} while (0)

/* set or clear a flag in inp_flags2 based on optval */
#define OPTSET2(bit) do {                                               \
	if (optval) {                                                   \
	        inp->inp_flags2 |= bit;                                 \
	} else {                                                        \
	        inp->inp_flags2 &= ~bit;                                \
	}                                                               \
} while (0)

			case IP_RECVOPTS:
				OPTSET(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				OPTSET(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				OPTSET(INP_RECVDSTADDR);
				break;

			case IP_RECVIF:
				OPTSET(INP_RECVIF);
				break;

			case IP_RECVTTL:
				OPTSET(INP_RECVTTL);
				break;

			case IP_RECVPKTINFO:
				OPTSET(INP_PKTINFO);
				break;

			case IP_RECVTOS:
				OPTSET(INP_RECVTOS);
				break;

			case IP_DONTFRAG:
				/* This option is settable only for IPv4 */
				if (!(inp->inp_vflag & INP_IPV4)) {
					error = EINVAL;
					break;
				}
				OPTSET2(INP2_DONTFRAG);
				break;
#undef OPTSET
#undef OPTSET2
			}
			break;
		/*
		 * Multicast socket options are processed by the in_mcast
		 * module.
		 */
		case IP_MULTICAST_IF:
		case IP_MULTICAST_IFINDEX:
		case IP_MULTICAST_VIF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_ADD_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
		case IP_ADD_SOURCE_MEMBERSHIP:
		case IP_DROP_SOURCE_MEMBERSHIP:
		case IP_BLOCK_SOURCE:
		case IP_UNBLOCK_SOURCE:
		case IP_MSFILTER:
		case MCAST_JOIN_GROUP:
		case MCAST_LEAVE_GROUP:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP:
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
			error = inp_setmoptions(inp, sopt);
			break;

		case IP_PORTRANGE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error) {
				break;
			}

			/* the two range flags are mutually exclusive */
			switch (optval) {
			case IP_PORTRANGE_DEFAULT:
				inp->inp_flags &= ~(INP_LOWPORT);
				inp->inp_flags &= ~(INP_HIGHPORT);
				break;

			case IP_PORTRANGE_HIGH:
				inp->inp_flags &= ~(INP_LOWPORT);
				inp->inp_flags |= INP_HIGHPORT;
				break;

			case IP_PORTRANGE_LOW:
				inp->inp_flags &= ~(INP_HIGHPORT);
				inp->inp_flags |= INP_LOWPORT;
				break;

			default:
				error = EINVAL;
				break;
			}
			break;

#if IPSEC
		case IP_IPSEC_POLICY: {
			caddr_t req = NULL;
			size_t len = 0;
			int priv;
			struct mbuf *m;
			int optname;

			if ((error = soopt_getm(sopt, &m)) != 0) { /* XXX */
				break;
			}
			if ((error = soopt_mcopyin(sopt, m)) != 0) { /* XXX */
				break;
			}
			/* policy installation may require superuser */
			priv = (proc_suser(sopt->sopt_p) == 0);
			if (m) {
				req = mtod(m, caddr_t);
				len = m->m_len;
			}
			optname = sopt->sopt_name;
			error = ipsec4_set_policy(inp, optname, req, len, priv);
			m_freem(m);
			break;
		}
#endif /* IPSEC */

#if TRAFFIC_MGT
		case IP_TRAFFIC_MGT_BACKGROUND: {
			unsigned background = 0;

			error = sooptcopyin(sopt, &background,
			    sizeof(background), sizeof(background));
			if (error) {
				break;
			}

			if (background) {
				socket_set_traffic_mgt_flags_locked(so,
				    TRAFFIC_MGT_SO_BACKGROUND);
			} else {
				socket_clear_traffic_mgt_flags_locked(so,
				    TRAFFIC_MGT_SO_BACKGROUND);
			}

			break;
		}
#endif /* TRAFFIC_MGT */

		/*
		 * On a multihomed system, scoped routing can be used to
		 * restrict the source interface used for sending packets.
		 * The socket option IP_BOUND_IF binds a particular AF_INET
		 * socket to an interface such that data sent on the socket
		 * is restricted to that interface.  This is unlike the
		 * SO_DONTROUTE option where the routing table is bypassed;
		 * therefore it allows for a greater flexibility and control
		 * over the system behavior, and does not place any restriction
		 * on the destination address type (e.g.  unicast, multicast,
		 * or broadcast if applicable) or whether or not the host is
		 * directly reachable.  Note that in the multicast transmit
		 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
		 * IP_BOUND_IF, since the former practically bypasses the
		 * routing table; in this case, IP_BOUND_IF sets the default
		 * interface used for sending multicast packets in the absence
		 * of an explicit multicast transmit interface.
		 */
		case IP_BOUND_IF:
			/* This option is settable only for IPv4 */
			if (!(inp->inp_vflag & INP_IPV4)) {
				error = EINVAL;
				break;
			}

			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));

			if (error) {
				break;
			}

			error = inp_bindif(inp, optval, NULL);
			break;

		case IP_NO_IFT_CELLULAR:
			/* This option is settable only for IPv4 */
			if (!(inp->inp_vflag & INP_IPV4)) {
				error = EINVAL;
				break;
			}

			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));

			if (error) {
				break;
			}

			/* once set, it cannot be unset */
			if (!optval && INP_NO_CELLULAR(inp)) {
				error = EINVAL;
				break;
			}

			/*
			 * NOTE(review): when optval == 0 and the restriction
			 * is not yet set, this still applies the DENY_CELLULAR
			 * restriction below — confirm this is intentional.
			 */
			error = so_set_restrictions(so,
			    SO_RESTRICT_DENY_CELLULAR);
			break;

		case IP_OUT_IF:
			/* This option is not settable */
			error = EINVAL;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_OPTIONS:
		case IP_RETOPTS:
			if (inp->inp_options) {
				error = sooptcopyout(sopt,
				    mtod(inp->inp_options, char *),
				    inp->inp_options->m_len);
			} else {
				/* no options installed: report zero length */
				sopt->sopt_valsize = 0;
			}
			break;

		case IP_TOS:
		case IP_TTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_PORTRANGE:
		case IP_RECVPKTINFO:
		case IP_RECVTOS:
		case IP_DONTFRAG:
			/* all of these report a single int value */
			switch (sopt->sopt_name) {
			case IP_TOS:
				optval = inp->inp_ip_tos;
				break;

			case IP_TTL:
				optval = inp->inp_ip_ttl;
				break;

/* read a flag from inp_flags / inp_flags2 as 0 or 1 */
#define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
#define OPTBIT2(bit)    (inp->inp_flags2 & bit ? 1 : 0)
			case IP_RECVOPTS:
				optval = OPTBIT(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				optval = OPTBIT(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				optval = OPTBIT(INP_RECVDSTADDR);
				break;

			case IP_RECVIF:
				optval = OPTBIT(INP_RECVIF);
				break;

			case IP_RECVTTL:
				optval = OPTBIT(INP_RECVTTL);
				break;

			case IP_PORTRANGE:
				if (inp->inp_flags & INP_HIGHPORT) {
					optval = IP_PORTRANGE_HIGH;
				} else if (inp->inp_flags & INP_LOWPORT) {
					optval = IP_PORTRANGE_LOW;
				} else {
					optval = 0;
				}
				break;

			case IP_RECVPKTINFO:
				optval = OPTBIT(INP_PKTINFO);
				break;

			case IP_RECVTOS:
				optval = OPTBIT(INP_RECVTOS);
				break;
			case IP_DONTFRAG:
				optval = OPTBIT2(INP2_DONTFRAG);
				break;
			}
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case IP_MULTICAST_IF:
		case IP_MULTICAST_IFINDEX:
		case IP_MULTICAST_VIF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_MSFILTER:
			error = inp_getmoptions(inp, sopt);
			break;

#if IPSEC
		case IP_IPSEC_POLICY: {
			error = 0; /* This option is no longer supported */
			break;
		}
#endif /* IPSEC */

#if TRAFFIC_MGT
		case IP_TRAFFIC_MGT_BACKGROUND: {
			unsigned background = (so->so_flags1 &
			    SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
			return sooptcopyout(sopt, &background,
			           sizeof(background));
		}
#endif /* TRAFFIC_MGT */

		case IP_BOUND_IF:
			/* optval stays 0 when no interface is bound */
			if (inp->inp_flags & INP_BOUND_IF) {
				optval = inp->inp_boundifp->if_index;
			}
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case IP_NO_IFT_CELLULAR:
			optval = INP_NO_CELLULAR(inp) ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case IP_OUT_IF:
			optval = (inp->inp_last_outifp != NULL) ?
			    inp->inp_last_outifp->if_index : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return error;
}
2699
2700 /*
2701 * Set up IP options in pcb for insertion in output packets.
2702 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2703 * with destination address if source routed.
2704 */
static int
ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
{
#pragma unused(optname)
	int cnt, optlen;
	u_char *cp;
	u_char opt;

	/* turn off any old options */
	if (*pcbopt) {
		(void) m_free(*pcbopt);
	}
	*pcbopt = 0;
	if (m == (struct mbuf *)0 || m->m_len == 0) {
		/*
		 * Only turning off any previous options.
		 */
		if (m) {
			(void) m_free(m);
		}
		return 0;
	}

	/* caller must supply options padded to a 32-bit multiple */
	if (m->m_len % sizeof(int32_t)) {
		goto bad;
	}

	/*
	 * IP first-hop destination address will be stored before
	 * actual options; move other options back
	 * and clear it when none present.
	 */
	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) {
		goto bad;
	}
	cnt = m->m_len;
	m->m_len += sizeof(struct in_addr);
	cp = mtod(m, u_char *) + sizeof(struct in_addr);
	/* shift the user-supplied options up; regions overlap */
	ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
	bzero(mtod(m, caddr_t), sizeof(struct in_addr));

	/* validate each option; extract the first hop from LSRR/SSRR */
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL) {
			break;
		}
		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
				goto bad;
			}
			optlen = cp[IPOPT_OLEN];
			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
				goto bad;
			}
		}
		switch (opt) {
		default:
			break;

		case IPOPT_LSRR:
		case IPOPT_SSRR:
			/*
			 * user process specifies route as:
			 *	->A->B->C->D
			 * D must be our final destination (but we can't
			 * check that since we may not have connected yet).
			 * A is first hop destination, which doesn't appear in
			 * actual IP option, but is stored before the options.
			 */
			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) {
				goto bad;
			}
			if (optlen > UINT8_MAX) {
				goto bad;
			}
			/* remove A from the option; it lives ahead of them */
			m->m_len -= sizeof(struct in_addr);
			cnt -= sizeof(struct in_addr);
			optlen -= sizeof(struct in_addr);
			cp[IPOPT_OLEN] = (uint8_t)optlen;
			/*
			 * Move first hop before start of options.
			 */
			bcopy((caddr_t)&cp[IPOPT_OFFSET + 1], mtod(m, caddr_t),
			    sizeof(struct in_addr));
			/*
			 * Then copy rest of options back
			 * to close up the deleted entry.
			 */
			ovbcopy((caddr_t)(&cp[IPOPT_OFFSET + 1] +
			    sizeof(struct in_addr)),
			    (caddr_t)&cp[IPOPT_OFFSET + 1],
			    (unsigned)cnt - (IPOPT_MINOFF - 1));
			break;
		}
	}
	/* reject option lists too long to fit in an IP header */
	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) {
		goto bad;
	}
	/* success: hand ownership of m to the pcb */
	*pcbopt = m;
	return 0;

bad:
	(void) m_free(m);
	return EINVAL;
}
2812
2813 void
ip_moptions_init(void)2814 ip_moptions_init(void)
2815 {
2816 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof(imo_debug));
2817
2818 vm_size_t imo_size = (imo_debug == 0) ? sizeof(struct ip_moptions) :
2819 sizeof(struct ip_moptions_dbg);
2820
2821 imo_zone = zone_create(IMO_ZONE_NAME, imo_size, ZC_ZFREE_CLEARMEM);
2822 }
2823
2824 void
imo_addref(struct ip_moptions * imo,int locked)2825 imo_addref(struct ip_moptions *imo, int locked)
2826 {
2827 if (!locked) {
2828 IMO_LOCK(imo);
2829 } else {
2830 IMO_LOCK_ASSERT_HELD(imo);
2831 }
2832
2833 if (++imo->imo_refcnt == 0) {
2834 panic("%s: imo %p wraparound refcnt", __func__, imo);
2835 /* NOTREACHED */
2836 } else if (imo->imo_trace != NULL) {
2837 (*imo->imo_trace)(imo, TRUE);
2838 }
2839
2840 if (!locked) {
2841 IMO_UNLOCK(imo);
2842 }
2843 }
2844
2845 void
imo_remref(struct ip_moptions * imo)2846 imo_remref(struct ip_moptions *imo)
2847 {
2848 IMO_LOCK(imo);
2849 if (imo->imo_refcnt == 0) {
2850 panic("%s: imo %p negative refcnt", __func__, imo);
2851 /* NOTREACHED */
2852 } else if (imo->imo_trace != NULL) {
2853 (*imo->imo_trace)(imo, FALSE);
2854 }
2855
2856 --imo->imo_refcnt;
2857 if (imo->imo_refcnt > 0) {
2858 IMO_UNLOCK(imo);
2859 return;
2860 }
2861
2862 IMO_PURGE_LOCKED(imo);
2863
2864 IMO_UNLOCK(imo);
2865
2866 kfree_type(struct in_multi *, imo->imo_max_memberships, imo->imo_membership);
2867 kfree_type(struct in_mfilter, imo->imo_max_memberships, imo->imo_mfilters);
2868 lck_mtx_destroy(&imo->imo_lock, &ifa_mtx_grp);
2869
2870 if (!(imo->imo_debug & IFD_ALLOC)) {
2871 panic("%s: imo %p cannot be freed", __func__, imo);
2872 /* NOTREACHED */
2873 }
2874 zfree(imo_zone, imo);
2875 }
2876
2877 static void
imo_trace(struct ip_moptions * imo,int refhold)2878 imo_trace(struct ip_moptions *imo, int refhold)
2879 {
2880 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2881 ctrace_t *tr;
2882 u_int32_t idx;
2883 u_int16_t *cnt;
2884
2885 if (!(imo->imo_debug & IFD_DEBUG)) {
2886 panic("%s: imo %p has no debug structure", __func__, imo);
2887 /* NOTREACHED */
2888 }
2889 if (refhold) {
2890 cnt = &imo_dbg->imo_refhold_cnt;
2891 tr = imo_dbg->imo_refhold;
2892 } else {
2893 cnt = &imo_dbg->imo_refrele_cnt;
2894 tr = imo_dbg->imo_refrele;
2895 }
2896
2897 idx = os_atomic_inc_orig(cnt, relaxed) % IMO_TRACE_HIST_SIZE;
2898 ctrace_record(&tr[idx]);
2899 }
2900
2901 struct ip_moptions *
ip_allocmoptions(zalloc_flags_t how)2902 ip_allocmoptions(zalloc_flags_t how)
2903 {
2904 struct ip_moptions *imo;
2905
2906 imo = zalloc_flags(imo_zone, how | Z_ZERO);
2907 if (imo != NULL) {
2908 lck_mtx_init(&imo->imo_lock, &ifa_mtx_grp, &ifa_mtx_attr);
2909 imo->imo_debug |= IFD_ALLOC;
2910 if (imo_debug != 0) {
2911 imo->imo_debug |= IFD_DEBUG;
2912 imo->imo_trace = imo_trace;
2913 }
2914 IMO_ADDREF(imo);
2915 }
2916
2917 return imo;
2918 }
2919
2920 /*
2921 * Routine called from ip_output() to loop back a copy of an IP multicast
2922 * packet to the input queue of a specified interface. Note that this
2923 * calls the output routine of the loopback "driver", but with an interface
2924 * pointer that might NOT be a loopback interface -- evil, but easier than
2925 * replicating that code here.
2926 */
static void
ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
    struct sockaddr_in *dst, int hlen)
{
	struct mbuf *copym;
	struct ip *ip;

	/* nothing to loop back to if the loopback interface isn't up yet */
	if (lo_ifp == NULL) {
		return;
	}

	/*
	 * Copy the packet header as it's needed for the checksum
	 * Make sure to deep-copy IP header portion in case the data
	 * is in an mbuf cluster, so that we can safely override the IP
	 * header portion later.
	 */
	copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR);
	if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) {
		copym = m_pullup(copym, hlen);
	}

	if (copym == NULL) {
		return;
	}

	/*
	 * We don't bother to fragment if the IP length is greater
	 * than the interface's MTU.  Can this possibly matter?
	 */
	ip = mtod(copym, struct ip *);
	/* ip_len/ip_off are in host order at this point; put on the wire */
#if BYTE_ORDER != BIG_ENDIAN
	HTONS(ip->ip_len);
	HTONS(ip->ip_off);
#endif
	ip->ip_sum = 0;
	ip->ip_sum = ip_cksum_hdr_out(copym, hlen);

	/*
	 * Mark checksum as valid unless receive checksum offload is
	 * disabled; if so, compute checksum in software.  If the
	 * interface itself is lo0, this will be overridden by if_loop.
	 */
	if (hwcksum_rx) {
		copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL | CSUM_ZERO_INVERT);
		copym->m_pkthdr.csum_flags |=
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
		copym->m_pkthdr.csum_data = 0xffff;
	} else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
		/* in_delayed_cksum expects ip_len in host order */
#if BYTE_ORDER != BIG_ENDIAN
		NTOHS(ip->ip_len);
#endif
		in_delayed_cksum(copym);
#if BYTE_ORDER != BIG_ENDIAN
		HTONS(ip->ip_len);
#endif
	}

	/*
	 * Stuff the 'real' ifp into the pkthdr, to be used in matching
	 * in ip_input(); we need the loopback ifp/dl_tag passed as args
	 * to make the loopback driver compliant with the data link
	 * requirements.
	 */
	copym->m_pkthdr.rcvif = origifp;

	/*
	 * Also record the source interface (which owns the source address).
	 * This is basically a stripped down version of ifa_foraddr().
	 */
	if (srcifp == NULL) {
		struct in_ifaddr *ia;

		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
			IFA_LOCK_SPIN(&ia->ia_ifa);
			if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
				srcifp = ia->ia_ifp;
				IFA_UNLOCK(&ia->ia_ifa);
				break;
			}
			IFA_UNLOCK(&ia->ia_ifa);
		}
		lck_rw_done(&in_ifaddr_rwlock);
	}
	if (srcifp != NULL) {
		ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
	}
	ip_setdstifaddr_info(copym, origifp->if_index, NULL);

	/* hand the copy to the loopback driver for re-injection */
	dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
}
3019
3020 /*
3021 * Given a source IP address (and route, if available), determine the best
3022 * interface to send the packet from. Checking for (and updating) the
3023 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3024 * without any locks based on the assumption that ip_output() is single-
3025 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3026 * performing output at the IP layer.
3027 *
3028 * This routine is analogous to in6_selectroute() for IPv6.
3029 */
3030 static struct ifaddr *
in_selectsrcif(struct ip * ip,struct route * ro,unsigned int ifscope)3031 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3032 {
3033 struct ifaddr *ifa = NULL;
3034 struct in_addr src = ip->ip_src;
3035 struct in_addr dst = ip->ip_dst;
3036 struct ifnet *rt_ifp;
3037 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3038
3039 VERIFY(src.s_addr != INADDR_ANY);
3040
3041 if (ip_select_srcif_debug) {
3042 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof(s_src));
3043 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof(s_dst));
3044 }
3045
3046 if (ro->ro_rt != NULL) {
3047 RT_LOCK(ro->ro_rt);
3048 }
3049
3050 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3051
3052 /*
3053 * Given the source IP address, find a suitable source interface
3054 * to use for transmission; if the caller has specified a scope,
3055 * optimize the search by looking at the addresses only for that
3056 * interface. This is still suboptimal, however, as we need to
3057 * traverse the per-interface list.
3058 */
3059 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3060 unsigned int scope = ifscope;
3061
3062 /*
3063 * If no scope is specified and the route is stale (pointing
3064 * to a defunct interface) use the current primary interface;
3065 * this happens when switching between interfaces configured
3066 * with the same IP address. Otherwise pick up the scope
3067 * information from the route; the ULP may have looked up a
3068 * correct route and we just need to verify it here and mark
3069 * it with the ROF_SRCIF_SELECTED flag below.
3070 */
3071 if (scope == IFSCOPE_NONE) {
3072 scope = rt_ifp->if_index;
3073 if (scope != get_primary_ifscope(AF_INET) &&
3074 ROUTE_UNUSABLE(ro)) {
3075 scope = get_primary_ifscope(AF_INET);
3076 }
3077 }
3078
3079 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3080
3081 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3082 ip->ip_p != IPPROTO_TCP && ipforwarding) {
3083 /*
3084 * If forwarding is enabled, and if the packet isn't
3085 * TCP or UDP, check if the source address belongs
3086 * to one of our own interfaces; if so, demote the
3087 * interface scope and do a route lookup right below.
3088 */
3089 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3090 if (ifa != NULL) {
3091 IFA_REMREF(ifa);
3092 ifa = NULL;
3093 ifscope = IFSCOPE_NONE;
3094 }
3095 }
3096
3097 if (ip_select_srcif_debug && ifa != NULL) {
3098 if (ro->ro_rt != NULL) {
3099 printf("%s->%s ifscope %d->%d ifa_if %s "
3100 "ro_if %s\n", s_src, s_dst, ifscope,
3101 scope, if_name(ifa->ifa_ifp),
3102 if_name(rt_ifp));
3103 } else {
3104 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3105 s_src, s_dst, ifscope, scope,
3106 if_name(ifa->ifa_ifp));
3107 }
3108 }
3109 }
3110
3111 /*
3112 * Slow path; search for an interface having the corresponding source
3113 * IP address if the scope was not specified by the caller, and:
3114 *
3115 * 1) There currently isn't any route, or,
3116 * 2) The interface used by the route does not own that source
3117 * IP address; in this case, the route will get blown away
3118 * and we'll do a more specific scoped search using the newly
3119 * found interface.
3120 */
3121 if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3122 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3123
3124 /*
3125 * If we have the IP address, but not the route, we don't
3126 * really know whether or not it belongs to the correct
3127 * interface (it could be shared across multiple interfaces.)
3128 * The only way to find out is to do a route lookup.
3129 */
3130 if (ifa != NULL && ro->ro_rt == NULL) {
3131 struct rtentry *rt;
3132 struct sockaddr_in sin;
3133 struct ifaddr *oifa = NULL;
3134
3135 bzero(&sin, sizeof(sin));
3136 sin.sin_family = AF_INET;
3137 sin.sin_len = sizeof(sin);
3138 sin.sin_addr = dst;
3139
3140 lck_mtx_lock(rnh_lock);
3141 if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3142 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3143 RT_LOCK(rt);
3144 /*
3145 * If the route uses a different interface,
3146 * use that one instead. The IP address of
3147 * the ifaddr that we pick up here is not
3148 * relevant.
3149 */
3150 if (ifa->ifa_ifp != rt->rt_ifp) {
3151 oifa = ifa;
3152 ifa = rt->rt_ifa;
3153 IFA_ADDREF(ifa);
3154 RT_UNLOCK(rt);
3155 } else {
3156 RT_UNLOCK(rt);
3157 }
3158 rtfree_locked(rt);
3159 }
3160 lck_mtx_unlock(rnh_lock);
3161
3162 if (oifa != NULL) {
3163 struct ifaddr *iifa;
3164
3165 /*
3166 * See if the interface pointed to by the
3167 * route is configured with the source IP
3168 * address of the packet.
3169 */
3170 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3171 src.s_addr, ifa->ifa_ifp->if_index);
3172
3173 if (iifa != NULL) {
3174 /*
3175 * Found it; drop the original one
3176 * as well as the route interface
3177 * address, and use this instead.
3178 */
3179 IFA_REMREF(oifa);
3180 IFA_REMREF(ifa);
3181 ifa = iifa;
3182 } else if (!ipforwarding ||
3183 (rt->rt_flags & RTF_GATEWAY)) {
3184 /*
3185 * This interface doesn't have that
3186 * source IP address; drop the route
3187 * interface address and just use the
3188 * original one, and let the caller
3189 * do a scoped route lookup.
3190 */
3191 IFA_REMREF(ifa);
3192 ifa = oifa;
3193 } else {
3194 /*
3195 * Forwarding is enabled and the source
3196 * address belongs to one of our own
3197 * interfaces which isn't the outgoing
3198 * interface, and we have a route, and
3199 * the destination is on a network that
3200 * is directly attached (onlink); drop
3201 * the original one and use the route
3202 * interface address instead.
3203 */
3204 IFA_REMREF(oifa);
3205 }
3206 }
3207 } else if (ifa != NULL && ro->ro_rt != NULL &&
3208 !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3209 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3210 /*
3211 * Forwarding is enabled and the source address belongs
3212 * to one of our own interfaces which isn't the same
3213 * as the interface used by the known route; drop the
3214 * original one and use the route interface address.
3215 */
3216 IFA_REMREF(ifa);
3217 ifa = ro->ro_rt->rt_ifa;
3218 IFA_ADDREF(ifa);
3219 }
3220
3221 if (ip_select_srcif_debug && ifa != NULL) {
3222 printf("%s->%s ifscope %d ifa_if %s\n",
3223 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3224 }
3225 }
3226
3227 if (ro->ro_rt != NULL) {
3228 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3229 }
3230 /*
3231 * If there is a non-loopback route with the wrong interface, or if
3232 * there is no interface configured with such an address, blow it
3233 * away. Except for local/loopback, we look for one with a matching
3234 * interface scope/index.
3235 */
3236 if (ro->ro_rt != NULL &&
3237 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3238 !(ro->ro_rt->rt_flags & RTF_UP))) {
3239 if (ip_select_srcif_debug) {
3240 if (ifa != NULL) {
3241 printf("%s->%s ifscope %d ro_if %s != "
3242 "ifa_if %s (cached route cleared)\n",
3243 s_src, s_dst, ifscope, if_name(rt_ifp),
3244 if_name(ifa->ifa_ifp));
3245 } else {
3246 printf("%s->%s ifscope %d ro_if %s "
3247 "(no ifa_if found)\n",
3248 s_src, s_dst, ifscope, if_name(rt_ifp));
3249 }
3250 }
3251
3252 RT_UNLOCK(ro->ro_rt);
3253 ROUTE_RELEASE(ro);
3254
3255 /*
3256 * If the destination is IPv4 LLA and the route's interface
3257 * doesn't match the source interface, then the source IP
3258 * address is wrong; it most likely belongs to the primary
3259 * interface associated with the IPv4 LL subnet. Drop the
3260 * packet rather than letting it go out and return an error
3261 * to the ULP. This actually applies not only to IPv4 LL
3262 * but other shared subnets; for now we explicitly test only
3263 * for the former case and save the latter for future.
3264 */
3265 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3266 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3267 IFA_REMREF(ifa);
3268 ifa = NULL;
3269 }
3270 }
3271
3272 if (ip_select_srcif_debug && ifa == NULL) {
3273 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3274 s_src, s_dst, ifscope);
3275 }
3276
3277 /*
3278 * If there is a route, mark it accordingly. If there isn't one,
3279 * we'll get here again during the next transmit (possibly with a
3280 * route) and the flag will get set at that point. For IPv4 LLA
3281 * destination, mark it only if the route has been fully resolved;
3282 * otherwise we want to come back here again when the route points
3283 * to the interface over which the ARP reply arrives on.
3284 */
3285 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3286 (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3287 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3288 if (ifa != NULL) {
3289 IFA_ADDREF(ifa); /* for route */
3290 }
3291 if (ro->ro_srcia != NULL) {
3292 IFA_REMREF(ro->ro_srcia);
3293 }
3294 ro->ro_srcia = ifa;
3295 ro->ro_flags |= ROF_SRCIF_SELECTED;
3296 RT_GENID_SYNC(ro->ro_rt);
3297 }
3298
3299 if (ro->ro_rt != NULL) {
3300 RT_UNLOCK(ro->ro_rt);
3301 }
3302
3303 return ifa;
3304 }
3305
3306 /*
3307 * @brief Given outgoing interface it determines what checksum needs
3308 * to be computed in software and what needs to be offloaded to the
3309 * interface.
3310 *
3311 * @param ifp Pointer to the outgoing interface
3312 * @param m Pointer to the packet
3313 * @param hlen IP header length
3314 * @param ip_len Total packet size i.e. headers + data payload
3315 * @param sw_csum Pointer to a software checksum flag set
3316 *
3317 * @return void
3318 */
void
ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
    uint32_t *sw_csum)
{
	uint32_t hwcap = ifp->if_hwassist;

	/* The IP header checksum is always (at least initially) requested. */
	m->m_pkthdr.csum_flags |= CSUM_IP;

	if (!hwcksum_tx) {
		/* do all in software; hardware checksum offload is disabled */
		*sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
		    m->m_pkthdr.csum_flags;
	} else {
		/* do in software what the hardware cannot */
		*sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(hwcap);
	}

	if (hlen != sizeof(struct ip)) {
		/*
		 * IP options are present; hardware offload engines generally
		 * cannot handle a variable-length IP header, so force both
		 * the IP and transport checksums into software.
		 */
		*sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
		    m->m_pkthdr.csum_flags);
	} else if ((*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
		/*
		 * If the explicitly required data csum offload is not supported by hardware,
		 * do it by partial checksum. Here we assume TSO implies support for IP
		 * and data sum.
		 */
		int interface_mtu = ifp->if_mtu;

		if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
			interface_mtu = IN6_LINKMTU(ifp);
			/* Further adjust the size for CLAT46 expansion */
			interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
		}

		/*
		 * Partial checksum offload, if non-IP fragment, and TCP only
		 * (no UDP support, as the hardware may not be able to convert
		 * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
		 * supports "invert zero" capability.)
		 */
		if (hwcksum_tx &&
		    ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
		    ((hwcap & CSUM_ZERO_INVERT) &&
		    (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
		    ip_len <= interface_mtu) {
			/*
			 * Convert the request to a partial (16-bit ones'
			 * complement) sum: the driver/hardware folds from
			 * csum_tx_start and stores the result at the ULP
			 * checksum offset (csum_tx_stuff).
			 */
			uint16_t start = sizeof(struct ip);
			uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
			m->m_pkthdr.csum_flags |=
			    (CSUM_DATA_VALID | CSUM_PARTIAL);
			m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
			m->m_pkthdr.csum_tx_start = start;
			/* do IP hdr chksum in software */
			*sw_csum = CSUM_DELAY_IP;
		} else {
			/* Partial offload not applicable; checksum data in software. */
			*sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
		}
	}

	if (*sw_csum & CSUM_DELAY_DATA) {
		/* Compute the transport checksum now and clear the request. */
		in_delayed_cksum(m);
		*sw_csum &= ~CSUM_DELAY_DATA;
	}

	if (hwcksum_tx) {
		uint32_t delay_data = m->m_pkthdr.csum_flags & CSUM_DELAY_DATA;
		uint32_t hw_csum = IF_HWASSIST_CSUM_FLAGS(hwcap);

		/*
		 * Drop off bits that aren't supported by hardware;
		 * also make sure to preserve non-checksum related bits.
		 */
		m->m_pkthdr.csum_flags =
		    ((m->m_pkthdr.csum_flags & (hw_csum | CSUM_DATA_VALID)) |
		    (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));

		/*
		 * If hardware supports partial checksum but not delay_data,
		 * add back delay_data.
		 */
		if ((hw_csum & CSUM_PARTIAL) != 0 &&
		    (hw_csum & delay_data) == 0) {
			m->m_pkthdr.csum_flags |= delay_data;
		}
	} else {
		/* drop all bits; hardware checksum offload is disabled */
		m->m_pkthdr.csum_flags = 0;
	}
}
3407
3408 /*
3409 * GRE protocol output for PPP/PPTP
3410 */
3411 int
ip_gre_output(struct mbuf * m)3412 ip_gre_output(struct mbuf *m)
3413 {
3414 struct route ro;
3415 int error;
3416
3417 bzero(&ro, sizeof(ro));
3418
3419 error = ip_output(m, NULL, &ro, 0, NULL, NULL);
3420
3421 ROUTE_RELEASE(&ro);
3422
3423 return error;
3424 }
3425
3426 static int
3427 sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3428 {
3429 #pragma unused(arg1, arg2)
3430 int error, i;
3431
3432 i = ip_output_measure;
3433 error = sysctl_handle_int(oidp, &i, 0, req);
3434 if (error || req->newptr == USER_ADDR_NULL) {
3435 goto done;
3436 }
3437 /* impose bounds */
3438 if (i < 0 || i > 1) {
3439 error = EINVAL;
3440 goto done;
3441 }
3442 if (ip_output_measure != i && i == 1) {
3443 net_perf_initialize(&net_perf, ip_output_measure_bins);
3444 }
3445 ip_output_measure = i;
3446 done:
3447 return error;
3448 }
3449
3450 static int
3451 sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3452 {
3453 #pragma unused(arg1, arg2)
3454 int error;
3455 uint64_t i;
3456
3457 i = ip_output_measure_bins;
3458 error = sysctl_handle_quad(oidp, &i, 0, req);
3459 if (error || req->newptr == USER_ADDR_NULL) {
3460 goto done;
3461 }
3462 /* validate data */
3463 if (!net_perf_validate_bins(i)) {
3464 error = EINVAL;
3465 goto done;
3466 }
3467 ip_output_measure_bins = i;
3468 done:
3469 return error;
3470 }
3471
3472 static int
3473 sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3474 {
3475 #pragma unused(oidp, arg1, arg2)
3476 if (req->oldptr == USER_ADDR_NULL) {
3477 req->oldlen = (size_t)sizeof(struct ipstat);
3478 }
3479
3480 return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3481 }
3482