xref: /xnu-11215.41.3/bsd/netinet/raw_ip.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2000-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1988, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
61  */
62 /*
63  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64  * support for mandatory and extensible security protections.  This notice
65  * is included in support of clause 2.2 (b) of the Apple Public License,
66  * Version 2.0.
67  */
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/kernel.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/mcache.h>
75 #include <sys/proc.h>
76 #include <sys/domain.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/sysctl.h>
81 #include <libkern/OSAtomic.h>
82 #include <kern/zalloc.h>
83 
84 #include <pexpert/pexpert.h>
85 
86 #include <net/if.h>
87 #include <net/net_api_stats.h>
88 #include <net/route.h>
89 #include <net/content_filter.h>
90 #include <net/sockaddr_utils.h>
91 
92 #define _IP_VHL
93 #include <netinet/in.h>
94 #include <netinet/in_systm.h>
95 #include <netinet/in_tclass.h>
96 #include <netinet/ip.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_var.h>
99 #include <netinet/ip_var.h>
100 
101 #include <netinet6/in6_pcb.h>
102 
103 
104 #if IPSEC
105 #include <netinet6/ipsec.h>
106 #endif /*IPSEC*/
107 
108 #if DUMMYNET
109 #include <netinet/ip_dummynet.h>
110 #endif /* DUMMYNET */
111 
112 int rip_detach(struct socket *);
113 int rip_abort(struct socket *);
114 int rip_disconnect(struct socket *);
115 int rip_bind(struct socket *, struct sockaddr *, struct proc *);
116 int rip_connect(struct socket *, struct sockaddr *, struct proc *);
117 int rip_shutdown(struct socket *);
118 
119 struct  inpcbhead ripcb;
120 struct  inpcbinfo ripcbinfo;
121 
122 /* control hooks for dummynet */
123 #if DUMMYNET
124 ip_dn_ctl_t *ip_dn_ctl_ptr;
125 #endif /* DUMMYNET */
126 
127 /*
128  * Nominal space allocated to a raw ip socket.
129  */
130 #define RIPSNDQ         8192
131 #define RIPRCVQ         8192
132 
133 static KALLOC_TYPE_DEFINE(ripzone, struct inpcb, NET_KT_DEFAULT);
134 
135 /*
136  * Raw interface to IP protocol.
137  */
138 
139 /*
140  * Initialize raw connection block q.
141  */
142 void
rip_init(struct protosw * pp,struct domain * dp)143 rip_init(struct protosw *pp, struct domain *dp)
144 {
145 #pragma unused(dp)
146 	static int rip_initialized = 0;
147 	struct inpcbinfo *pcbinfo;
148 
149 	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
150 
151 	if (rip_initialized) {
152 		return;
153 	}
154 	rip_initialized = 1;
155 
156 	LIST_INIT(&ripcb);
157 	ripcbinfo.ipi_listhead = &ripcb;
158 	/*
159 	 * XXX We don't use the hash list for raw IP, but it's easier
160 	 * to allocate a one entry hash list than it is to check all
161 	 * over the place for ipi_hashbase == NULL.
162 	 */
163 	hashinit_counted_by(1, ripcbinfo.ipi_hashbase,
164 	    ripcbinfo.ipi_hashbase_count);
165 	ripcbinfo.ipi_hashmask = ripcbinfo.ipi_hashbase_count - 1;
166 	hashinit_counted_by(1, ripcbinfo.ipi_porthashbase,
167 	    ripcbinfo.ipi_porthashbase_count);
168 	ripcbinfo.ipi_porthashmask = ripcbinfo.ipi_porthashbase_count - 1;
169 	ripcbinfo.ipi_zone = ripzone;
170 
171 	pcbinfo = &ripcbinfo;
172 	/*
173 	 * allocate lock group attribute and group for udp pcb mutexes
174 	 */
175 	pcbinfo->ipi_lock_grp = lck_grp_alloc_init("ripcb", LCK_GRP_ATTR_NULL);
176 
177 	/*
178 	 * allocate the lock attribute for udp pcb mutexes
179 	 */
180 	lck_attr_setdefault(&pcbinfo->ipi_lock_attr);
181 	lck_rw_init(&pcbinfo->ipi_lock, pcbinfo->ipi_lock_grp,
182 	    &pcbinfo->ipi_lock_attr);
183 
184 	in_pcbinfo_attach(&ripcbinfo);
185 }
186 
187 static uint32_t
rip_inp_input(struct inpcb * inp,struct mbuf * m,int iphlen)188 rip_inp_input(struct inpcb *inp, struct mbuf *m, int iphlen)
189 {
190 	struct ip *ip = mtod(m, struct ip *);
191 	struct ifnet *ifp = m->m_pkthdr.rcvif;
192 	struct sockaddr_in ripsrc = {
193 		.sin_len = sizeof(ripsrc),
194 		.sin_family = AF_INET,
195 		.sin_port = 0,
196 		.sin_addr = { .s_addr = 0 },
197 		.sin_zero = {0, 0, 0, 0, 0, 0, 0, 0, }
198 	};
199 	mbuf_ref_t opts = NULL;
200 	boolean_t is_wake_pkt = false;
201 	uint32_t num_delivered = 0;
202 
203 #if NECP
204 	if (!necp_socket_is_allowed_to_send_recv_v4(inp, 0, 0,
205 	    &ip->ip_dst, &ip->ip_src, ifp, 0, NULL, NULL, NULL, NULL)) {
206 		/* do not inject data to pcb */
207 		goto done;
208 	}
209 #endif /* NECP */
210 
211 	ripsrc.sin_addr = ip->ip_src;
212 
213 	if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
214 		is_wake_pkt = true;
215 	}
216 
217 	if ((inp->inp_flags & INP_CONTROLOPTS) != 0 ||
218 	    SOFLOW_ENABLED(inp->inp_socket) ||
219 	    SO_RECV_CONTROL_OPTS(inp->inp_socket)) {
220 		if (ip_savecontrol(inp, &opts, ip, m) != 0) {
221 			m_freem(opts);
222 			goto done;
223 		}
224 	}
225 	if (inp->inp_flags & INP_STRIPHDR
226 #if CONTENT_FILTER
227 	    /*
228 	     * If socket is subject to Content Filter, delay stripping until reinject
229 	     */
230 	    && (!CFIL_DGRAM_FILTERED(inp->inp_socket))
231 #endif
232 	    ) {
233 		m->m_len -= iphlen;
234 		m->m_pkthdr.len -= iphlen;
235 		m->m_data += iphlen;
236 	}
237 	so_recv_data_stat(inp->inp_socket, m, 0);
238 	if (sbappendaddr(&inp->inp_socket->so_rcv,
239 	    (struct sockaddr *)&ripsrc, m, opts, NULL) != 0) {
240 		num_delivered = 1;
241 		sorwakeup(inp->inp_socket);
242 		if (is_wake_pkt) {
243 			soevent(inp->in6p_socket,
244 			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_WAKE_PKT);
245 		}
246 	} else {
247 		ipstat.ips_raw_sappend_fail++;
248 	}
249 done:
250 	return num_delivered;
251 }
252 
253 /*
254  * The first pass is for IPv4 socket and the second pass for IPv6
255  */
256 static bool
rip_input_inner(struct mbuf * m,int iphlen,bool is_ipv4_pass,uint32_t * total_delivered)257 rip_input_inner(struct mbuf *m, int iphlen, bool is_ipv4_pass, uint32_t *total_delivered)
258 {
259 	struct inpcb *inp;
260 	struct inpcb *last = NULL;
261 	struct ip *ip = mtod(m, struct ip *);
262 	struct ifnet *ifp = m->m_pkthdr.rcvif;
263 	bool need_ipv6_pass = false;
264 	uint32_t num_delivered = 0;
265 
266 	lck_rw_lock_shared(&ripcbinfo.ipi_lock);
267 	LIST_FOREACH(inp, &ripcb, inp_list) {
268 		if (is_ipv4_pass) {
269 			if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) != INP_IPV4) {
270 				/* Tell if we need to an IPv6 pass */
271 				need_ipv6_pass = true;
272 				continue;
273 			}
274 		} else {
275 			if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) != (INP_IPV4 | INP_IPV6)) {
276 				continue;
277 			}
278 		}
279 		if (inp->inp_ip_p && (inp->inp_ip_p != ip->ip_p)) {
280 			continue;
281 		}
282 		if (inp->inp_laddr.s_addr &&
283 		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr) {
284 			continue;
285 		}
286 		if (inp->inp_faddr.s_addr &&
287 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr) {
288 			continue;
289 		}
290 		if (inp_restricted_recv(inp, ifp)) {
291 			continue;
292 		}
293 		if (last != NULL) {
294 			struct mbuf *n = m_copym_mode(m, 0, (int)M_COPYALL, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
295 
296 			if (n == NULL) {
297 				continue;
298 			}
299 			num_delivered += rip_inp_input(last, n, iphlen);
300 		}
301 		last = inp;
302 	}
303 
304 	/*
305 	 * Consume the orignal mbuf 'm' if:
306 	 * - it is the first pass and there is no IPv6 raw socket
307 	 * - it is the second pass for IPv6
308 	 */
309 	if (need_ipv6_pass == false || is_ipv4_pass == false) {
310 		if (last != NULL) {
311 			num_delivered += rip_inp_input(last, m, iphlen);
312 		} else {
313 			m_freem(m);
314 		}
315 	} else {
316 		if (last != NULL) {
317 			struct mbuf *n = m_copym_mode(m, 0, (int)M_COPYALL, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
318 
319 			if (n != NULL) {
320 				num_delivered += rip_inp_input(last, n, iphlen);
321 			}
322 		}
323 	}
324 	/*
325 	 * Keep the list locked because socket filter may force the socket lock
326 	 * to be released when calling sbappendaddr() -- see rdar://7627704
327 	 */
328 	lck_rw_done(&ripcbinfo.ipi_lock);
329 
330 	*total_delivered += num_delivered;
331 
332 	return need_ipv6_pass;
333 }
334 
335 
336 /*
337  * Setup generic address and protocol structures
338  * for raw_input routine, then pass them along with
339  * mbuf chain.
340  */
341 void
rip_input(struct mbuf * m,int iphlen)342 rip_input(struct mbuf *m, int iphlen)
343 {
344 	uint32_t num_delivered = 0;
345 	bool need_v6_pass = false;
346 
347 	/* Expect 32-bit aligned data pointer on strict-align platforms */
348 	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
349 
350 	/*
351 	 * First pass for raw IPv4 sockets that are protected by the inet_domain_mutex lock
352 	 */
353 	need_v6_pass = rip_input_inner(m, iphlen, true, &num_delivered);
354 
355 	/*
356 	 * For the IPv6 pass we need to switch to the inet6_domain_mutex lock
357 	 * to protect the raw IPv6 sockets
358 	 */
359 	if (need_v6_pass) {
360 		lck_mtx_unlock(inet_domain_mutex);
361 
362 		lck_mtx_lock(inet6_domain_mutex);
363 		rip_input_inner(m, iphlen, false, &num_delivered);
364 		lck_mtx_unlock(inet6_domain_mutex);
365 
366 		lck_mtx_lock(inet_domain_mutex);
367 	}
368 
369 	if (num_delivered > 0) {
370 		OSAddAtomic(1, &ipstat.ips_delivered);
371 	} else {
372 		OSAddAtomic(1, &ipstat.ips_noproto);
373 	}
374 }
375 
376 /*
377  * Generate IP header and pass packet to ip_output.
378  * Tack on options user may have setup with control call.
379  */
380 int
rip_output(struct mbuf * m,struct socket * so,u_int32_t dst,struct mbuf * control)381 rip_output(
382 	struct mbuf *m,
383 	struct socket *so,
384 	u_int32_t dst,
385 	struct mbuf *control)
386 {
387 	struct ip *ip;
388 	struct inpcb *inp = sotoinpcb(so);
389 	int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
390 	int inp_flags = inp ? inp->inp_flags : 0;
391 	struct ip_out_args ipoa;
392 	struct ip_moptions *imo;
393 	int tos = IPTOS_UNSPEC;
394 	int error = 0;
395 #if CONTENT_FILTER
396 	struct m_tag *cfil_tag = NULL;
397 	bool cfil_faddr_use = false;
398 	uint32_t cfil_so_state_change_cnt = 0;
399 	uint32_t cfil_so_options = 0;
400 	int cfil_inp_flags = 0;
401 	struct sockaddr *__single cfil_faddr = NULL;
402 	struct sockaddr_in *__single cfil_sin;
403 	u_int32_t cfil_dst = 0;
404 #endif
405 
406 #if CONTENT_FILTER
407 	/*
408 	 * If socket is subject to Content Filter and no addr is passed in,
409 	 * retrieve CFIL saved state from mbuf and use it if necessary.
410 	 */
411 	if (CFIL_DGRAM_FILTERED(so) && dst == INADDR_ANY) {
412 		cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, &cfil_inp_flags);
413 		if (cfil_tag) {
414 			cfil_sin = SIN(cfil_faddr);
415 			flags = (cfil_so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
416 			inp_flags = cfil_inp_flags;
417 			if (inp && inp->inp_faddr.s_addr == INADDR_ANY) {
418 				/*
419 				 * Socket is unconnected, simply use the saved faddr as 'addr' to go through
420 				 * the connect/disconnect logic.
421 				 */
422 				dst = cfil_sin->sin_addr.s_addr;
423 			} else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
424 			    (inp->inp_fport != cfil_sin->sin_port ||
425 			    inp->inp_faddr.s_addr != cfil_sin->sin_addr.s_addr)) {
426 				/*
427 				 * Socket is connected but socket state and dest addr/port changed.
428 				 * We need to use the saved faddr and socket options.
429 				 */
430 				cfil_faddr_use = true;
431 				cfil_dst = cfil_sin->sin_addr.s_addr;
432 			}
433 			m_tag_free(cfil_tag);
434 		}
435 	}
436 #endif
437 
438 	if (so->so_state & SS_ISCONNECTED) {
439 		if (dst != INADDR_ANY) {
440 			if (m != NULL) {
441 				m_freem(m);
442 			}
443 			if (control != NULL) {
444 				m_freem(control);
445 			}
446 			return EISCONN;
447 		}
448 		dst = cfil_faddr_use ? cfil_dst : inp->inp_faddr.s_addr;
449 	} else {
450 		if (dst == INADDR_ANY) {
451 			if (m != NULL) {
452 				m_freem(m);
453 			}
454 			if (control != NULL) {
455 				m_freem(control);
456 			}
457 			return ENOTCONN;
458 		}
459 	}
460 
461 	bzero(&ipoa, sizeof(ipoa));
462 	ipoa.ipoa_boundif = IFSCOPE_NONE;
463 	ipoa.ipoa_flags = IPOAF_SELECT_SRCIF;
464 
465 	int sotc = SO_TC_UNSPEC;
466 	int netsvctype = _NET_SERVICE_TYPE_UNSPEC;
467 
468 
469 	if (control != NULL) {
470 		tos = so_tos_from_control(control);
471 		sotc = so_tc_from_control(control, &netsvctype);
472 
473 		m_freem(control);
474 		control = NULL;
475 	}
476 	if (sotc == SO_TC_UNSPEC) {
477 		sotc = so->so_traffic_class;
478 		netsvctype = so->so_netsvctype;
479 	}
480 
481 	if (inp == NULL
482 #if NECP
483 	    || (necp_socket_should_use_flow_divert(inp))
484 #endif /* NECP */
485 	    ) {
486 		if (m != NULL) {
487 			m_freem(m);
488 		}
489 		VERIFY(control == NULL);
490 		return inp == NULL ? EINVAL : EPROTOTYPE;
491 	}
492 
493 	flags |= IP_OUTARGS;
494 	/* If socket was bound to an ifindex, tell ip_output about it */
495 	if (inp->inp_flags & INP_BOUND_IF) {
496 		ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
497 		ipoa.ipoa_flags |= IPOAF_BOUND_IF;
498 	}
499 	if (INP_NO_CELLULAR(inp)) {
500 		ipoa.ipoa_flags |=  IPOAF_NO_CELLULAR;
501 	}
502 	if (INP_NO_EXPENSIVE(inp)) {
503 		ipoa.ipoa_flags |=  IPOAF_NO_EXPENSIVE;
504 	}
505 	if (INP_NO_CONSTRAINED(inp)) {
506 		ipoa.ipoa_flags |=  IPOAF_NO_CONSTRAINED;
507 	}
508 	if (INP_AWDL_UNRESTRICTED(inp)) {
509 		ipoa.ipoa_flags |=  IPOAF_AWDL_UNRESTRICTED;
510 	}
511 	if (INP_MANAGEMENT_ALLOWED(inp)) {
512 		ipoa.ipoa_flags |=  IPOAF_MANAGEMENT_ALLOWED;
513 	}
514 	ipoa.ipoa_sotc = sotc;
515 	ipoa.ipoa_netsvctype = netsvctype;
516 
517 	if (inp->inp_flowhash == 0) {
518 		inp_calc_flowhash(inp);
519 		ASSERT(inp->inp_flowhash != 0);
520 	}
521 
522 	/*
523 	 * If the user handed us a complete IP packet, use it.
524 	 * Otherwise, allocate an mbuf for a header and fill it in.
525 	 */
526 	if ((inp_flags & INP_HDRINCL) == 0) {
527 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
528 			m_freem(m);
529 			return EMSGSIZE;
530 		}
531 		M_PREPEND(m, sizeof(struct ip), M_WAIT, 1);
532 		if (m == NULL) {
533 			return ENOBUFS;
534 		}
535 		ip = mtod(m, struct ip *);
536 		if (tos != IPTOS_UNSPEC) {
537 			ip->ip_tos = (uint8_t)(tos & IPTOS_MASK);
538 		} else {
539 			ip->ip_tos = inp->inp_ip_tos;
540 		}
541 		if (inp->inp_flags2 & INP2_DONTFRAG) {
542 			ip->ip_off = IP_DF;
543 		} else {
544 			ip->ip_off = 0;
545 		}
546 		ip->ip_p = inp->inp_ip_p;
547 		ip->ip_len = (uint16_t)m->m_pkthdr.len;
548 		ip->ip_src = inp->inp_laddr;
549 		ip->ip_dst.s_addr = dst;
550 		ip->ip_ttl = inp->inp_ip_ttl;
551 	} else {
552 		if (m->m_pkthdr.len > IP_MAXPACKET) {
553 			m_freem(m);
554 			return EMSGSIZE;
555 		}
556 		ip = mtod(m, struct ip *);
557 		/*
558 		 * don't allow both user specified and setsockopt options,
559 		 * and don't allow packet length sizes that will crash
560 		 */
561 		if (m->m_pkthdr.len < sizeof(struct ip) ||
562 		    ((IP_VHL_HL(ip->ip_vhl) != (sizeof(*ip) >> 2)) && inp->inp_options) ||
563 		    (ip->ip_len > m->m_pkthdr.len) ||
564 		    (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
565 			m_freem(m);
566 			return EINVAL;
567 		}
568 		if (ip->ip_id == 0 && !(rfc6864 && IP_OFF_IS_ATOMIC(ntohs(ip->ip_off)))) {
569 			ip->ip_id = ip_randomid((uint64_t)m);
570 		}
571 		/* XXX prevent ip_output from overwriting header fields */
572 		flags |= IP_RAWOUTPUT;
573 		OSAddAtomic(1, &ipstat.ips_rawout);
574 	}
575 
576 	if (inp->inp_laddr.s_addr != INADDR_ANY) {
577 		ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR;
578 	}
579 
580 #if NECP
581 	{
582 		necp_kernel_policy_id policy_id;
583 		necp_kernel_policy_id skip_policy_id;
584 		u_int32_t route_rule_id;
585 		u_int32_t pass_flags;
586 
587 		/*
588 		 * We need a route to perform NECP route rule checks
589 		 */
590 		if ((net_qos_policy_restricted != 0 &&
591 		    ROUTE_UNUSABLE(&inp->inp_route))
592 #if CONTENT_FILTER
593 		    || cfil_faddr_use
594 #endif
595 		    ) {
596 			struct sockaddr_in to;
597 			struct sockaddr_in from;
598 			struct in_addr laddr = ip->ip_src;
599 
600 			ROUTE_RELEASE(&inp->inp_route);
601 
602 			bzero(&from, sizeof(struct sockaddr_in));
603 			from.sin_family = AF_INET;
604 			from.sin_len = sizeof(struct sockaddr_in);
605 			from.sin_addr = laddr;
606 
607 			bzero(&to, sizeof(struct sockaddr_in));
608 			to.sin_family = AF_INET;
609 			to.sin_len = sizeof(struct sockaddr_in);
610 			to.sin_addr.s_addr = ip->ip_dst.s_addr;
611 
612 			if ((error = in_pcbladdr(inp, (struct sockaddr *)&to,
613 			    &laddr, ipoa.ipoa_boundif, NULL, 1)) != 0) {
614 				printf("%s in_pcbladdr(%p) error %d\n",
615 				    __func__, inp, error);
616 				m_freem(m);
617 				return error;
618 			}
619 
620 			inp_update_necp_policy(inp, (struct sockaddr *)&from,
621 			    (struct sockaddr *)&to, ipoa.ipoa_boundif);
622 			inp->inp_policyresult.results.qos_marking_gencount = 0;
623 		}
624 
625 		if (!necp_socket_is_allowed_to_send_recv_v4(inp, 0, 0,
626 		    &ip->ip_src, &ip->ip_dst, NULL, 0, &policy_id, &route_rule_id, &skip_policy_id, &pass_flags)) {
627 			m_freem(m);
628 			return EHOSTUNREACH;
629 		}
630 
631 		necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id, pass_flags);
632 
633 		if (net_qos_policy_restricted != 0) {
634 			struct ifnet *rt_ifp = NULL;
635 
636 			if (inp->inp_route.ro_rt != NULL) {
637 				rt_ifp = inp->inp_route.ro_rt->rt_ifp;
638 			}
639 
640 			necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, route_rule_id);
641 		}
642 	}
643 #endif /* NECP */
644 	if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
645 		ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
646 	}
647 #if IPSEC
648 	if (inp->inp_sp != NULL && ipsec_setsocket(m, so) != 0) {
649 		m_freem(m);
650 		return ENOBUFS;
651 	}
652 #endif /*IPSEC*/
653 
654 	if (ROUTE_UNUSABLE(&inp->inp_route)) {
655 		ROUTE_RELEASE(&inp->inp_route);
656 	}
657 
658 	set_packet_service_class(m, so, sotc, 0);
659 	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
660 	m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
661 	m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC |
662 	    PKTF_FLOW_RAWSOCK);
663 	m->m_pkthdr.pkt_proto = inp->inp_ip_p;
664 	m->m_pkthdr.tx_rawip_pid = so->last_pid;
665 	m->m_pkthdr.tx_rawip_e_pid = so->e_pid;
666 	if (so->so_flags & SOF_DELEGATED) {
667 		m->m_pkthdr.tx_rawip_e_pid = so->e_pid;
668 	} else {
669 		m->m_pkthdr.tx_rawip_e_pid = 0;
670 	}
671 #if (DEBUG || DEVELOPMENT)
672 	if (so->so_flags & SOF_MARK_WAKE_PKT) {
673 		so->so_flags &= ~SOF_MARK_WAKE_PKT;
674 		m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
675 	}
676 #endif /* (DEBUG || DEVELOPMENT) */
677 
678 	imo = inp->inp_moptions;
679 	if (imo != NULL) {
680 		IMO_ADDREF(imo);
681 	}
682 	/*
683 	 * The domain lock is held across ip_output, so it is okay
684 	 * to pass the PCB cached route pointer directly to IP and
685 	 * the modules beneath it.
686 	 */
687 	// TODO: PASS DOWN ROUTE RULE ID
688 	error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
689 	    imo, &ipoa);
690 
691 	if (imo != NULL) {
692 		IMO_REMREF(imo);
693 	}
694 
695 	if (inp->inp_route.ro_rt != NULL) {
696 		struct rtentry *rt = inp->inp_route.ro_rt;
697 		struct ifnet *outif;
698 
699 		if ((rt->rt_flags & (RTF_MULTICAST | RTF_BROADCAST)) ||
700 		    inp->inp_socket == NULL ||
701 #if CONTENT_FILTER
702 		    /* Discard temporary route for cfil case */
703 		    cfil_faddr_use ||
704 #endif
705 		    !(inp->inp_socket->so_state & SS_ISCONNECTED)) {
706 			rt = NULL;      /* unusable */
707 		}
708 		/*
709 		 * Always discard the cached route for unconnected
710 		 * socket or if it is a multicast route.
711 		 */
712 		if (rt == NULL) {
713 			ROUTE_RELEASE(&inp->inp_route);
714 		}
715 
716 		/*
717 		 * If this is a connected socket and the destination
718 		 * route is unicast, update outif with that of the
719 		 * route interface used by IP.
720 		 */
721 		if (rt != NULL &&
722 		    (outif = rt->rt_ifp) != inp->inp_last_outifp) {
723 			inp->inp_last_outifp = outif;
724 		}
725 	} else {
726 		ROUTE_RELEASE(&inp->inp_route);
727 	}
728 
729 	/*
730 	 * If output interface was cellular/expensive/constrained, and this socket is
731 	 * denied access to it, generate an event.
732 	 */
733 	if (error != 0 && (ipoa.ipoa_flags & IPOAF_R_IFDENIED) &&
734 	    (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) {
735 		soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED));
736 	}
737 
738 	return error;
739 }
740 
741 
742 /*
743  * Raw IP socket option processing.
744  */
745 int
rip_ctloutput(struct socket * so,struct sockopt * sopt)746 rip_ctloutput(struct socket *so, struct sockopt *sopt)
747 {
748 	struct  inpcb *inp = sotoinpcb(so);
749 	int     error, optval;
750 
751 	/* Allow <SOL_SOCKET,SO_BINDTODEVICE> at this level */
752 	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_BINDTODEVICE) {
753 		return ip_ctloutput(so, sopt);
754 	}
755 
756 	if (sopt->sopt_level != IPPROTO_IP) {
757 		return EINVAL;
758 	}
759 
760 	error = 0;
761 
762 	switch (sopt->sopt_dir) {
763 	case SOPT_GET:
764 		switch (sopt->sopt_name) {
765 		case IP_HDRINCL:
766 			optval = inp->inp_flags & INP_HDRINCL;
767 			error = sooptcopyout(sopt, &optval, sizeof optval);
768 			break;
769 
770 		case IP_STRIPHDR:
771 			optval = inp->inp_flags & INP_STRIPHDR;
772 			error = sooptcopyout(sopt, &optval, sizeof optval);
773 			break;
774 
775 
776 #if DUMMYNET
777 		case IP_DUMMYNET_GET:
778 			if (!DUMMYNET_LOADED) {
779 				ip_dn_init();
780 			}
781 			if (DUMMYNET_LOADED) {
782 				error = ip_dn_ctl_ptr(sopt);
783 			} else {
784 				error = ENOPROTOOPT;
785 			}
786 			break;
787 #endif /* DUMMYNET */
788 
789 		default:
790 			error = ip_ctloutput(so, sopt);
791 			break;
792 		}
793 		break;
794 
795 	case SOPT_SET:
796 		switch (sopt->sopt_name) {
797 		case IP_HDRINCL:
798 			error = sooptcopyin(sopt, &optval, sizeof optval,
799 			    sizeof optval);
800 			if (error) {
801 				break;
802 			}
803 			if (optval) {
804 				inp->inp_flags |= INP_HDRINCL;
805 			} else {
806 				inp->inp_flags &= ~INP_HDRINCL;
807 			}
808 			break;
809 
810 		case IP_STRIPHDR:
811 			error = sooptcopyin(sopt, &optval, sizeof optval,
812 			    sizeof optval);
813 			if (error) {
814 				break;
815 			}
816 			if (optval) {
817 				inp->inp_flags |= INP_STRIPHDR;
818 			} else {
819 				inp->inp_flags &= ~INP_STRIPHDR;
820 			}
821 			break;
822 
823 
824 #if DUMMYNET
825 		case IP_DUMMYNET_CONFIGURE:
826 		case IP_DUMMYNET_DEL:
827 		case IP_DUMMYNET_FLUSH:
828 			if (!DUMMYNET_LOADED) {
829 				ip_dn_init();
830 			}
831 			if (DUMMYNET_LOADED) {
832 				error = ip_dn_ctl_ptr(sopt);
833 			} else {
834 				error = ENOPROTOOPT;
835 			}
836 			break;
837 #endif /* DUMMYNET */
838 
839 		case SO_FLUSH:
840 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
841 			    sizeof(optval))) != 0) {
842 				break;
843 			}
844 
845 			error = inp_flush(inp, optval);
846 			break;
847 
848 		default:
849 			error = ip_ctloutput(so, sopt);
850 			break;
851 		}
852 		break;
853 	}
854 
855 	return error;
856 }
857 
858 /*
859  * This function exists solely to receive the PRC_IFDOWN messages which
860  * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
861  * and calls in_ifadown() to remove all routes corresponding to that address.
862  * It also receives the PRC_IFUP messages from if_up() and reinstalls the
863  * interface routes.
864  */
865 void
rip_ctlinput(int cmd,struct sockaddr * sa,__unused void * vip,__unused struct ifnet * ifp)866 rip_ctlinput(
867 	int cmd,
868 	struct sockaddr *sa,
869 	__unused void *vip,
870 	__unused struct ifnet *ifp)
871 {
872 	struct in_ifaddr *ia = NULL;
873 	struct ifnet *iaifp = NULL;
874 	int err = 0;
875 	int flags, done = 0;
876 
877 	switch (cmd) {
878 	case PRC_IFDOWN:
879 		lck_rw_lock_shared(&in_ifaddr_rwlock);
880 		for (ia = in_ifaddrhead.tqh_first; ia;
881 		    ia = ia->ia_link.tqe_next) {
882 			IFA_LOCK(&ia->ia_ifa);
883 			if (ia->ia_ifa.ifa_addr == sa &&
884 			    (ia->ia_flags & IFA_ROUTE)) {
885 				done = 1;
886 				ifa_addref(&ia->ia_ifa);
887 				IFA_UNLOCK(&ia->ia_ifa);
888 				lck_rw_done(&in_ifaddr_rwlock);
889 				lck_mtx_lock(rnh_lock);
890 				/*
891 				 * in_ifscrub kills the interface route.
892 				 */
893 				in_ifscrub(ia->ia_ifp, ia, 1);
894 				/*
895 				 * in_ifadown gets rid of all the rest of
896 				 * the routes.  This is not quite the right
897 				 * thing to do, but at least if we are running
898 				 * a routing process they will come back.
899 				 */
900 				in_ifadown(&ia->ia_ifa, 1);
901 				lck_mtx_unlock(rnh_lock);
902 				ifa_remref(&ia->ia_ifa);
903 				break;
904 			}
905 			IFA_UNLOCK(&ia->ia_ifa);
906 		}
907 		if (!done) {
908 			lck_rw_done(&in_ifaddr_rwlock);
909 		}
910 		break;
911 
912 	case PRC_IFUP:
913 		lck_rw_lock_shared(&in_ifaddr_rwlock);
914 		for (ia = in_ifaddrhead.tqh_first; ia;
915 		    ia = ia->ia_link.tqe_next) {
916 			IFA_LOCK(&ia->ia_ifa);
917 			if (ia->ia_ifa.ifa_addr == sa) {
918 				/* keep it locked */
919 				break;
920 			}
921 			IFA_UNLOCK(&ia->ia_ifa);
922 		}
923 		if (ia == NULL || (ia->ia_flags & IFA_ROUTE) ||
924 		    (ia->ia_ifa.ifa_debug & IFD_NOTREADY)) {
925 			if (ia != NULL) {
926 				IFA_UNLOCK(&ia->ia_ifa);
927 			}
928 			lck_rw_done(&in_ifaddr_rwlock);
929 			return;
930 		}
931 		ifa_addref(&ia->ia_ifa);
932 		IFA_UNLOCK(&ia->ia_ifa);
933 		lck_rw_done(&in_ifaddr_rwlock);
934 
935 		flags = RTF_UP;
936 		iaifp = ia->ia_ifa.ifa_ifp;
937 
938 		if ((iaifp->if_flags & IFF_LOOPBACK)
939 		    || (iaifp->if_flags & IFF_POINTOPOINT)) {
940 			flags |= RTF_HOST;
941 		}
942 
943 		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
944 		if (err == 0) {
945 			IFA_LOCK_SPIN(&ia->ia_ifa);
946 			ia->ia_flags |= IFA_ROUTE;
947 			IFA_UNLOCK(&ia->ia_ifa);
948 		}
949 		ifa_remref(&ia->ia_ifa);
950 		break;
951 	}
952 }
953 
954 u_int32_t       rip_sendspace = RIPSNDQ;
955 u_int32_t       rip_recvspace = RIPRCVQ;
956 
957 SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED,
958     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
959 SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED,
960     &rip_recvspace, 0, "Maximum incoming raw IP datagram size");
961 SYSCTL_UINT(_net_inet_raw, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
962     &ripcbinfo.ipi_count, 0, "Number of active PCBs");
963 
964 static int
rip_attach(struct socket * so,int proto,struct proc * p)965 rip_attach(struct socket *so, int proto, struct proc *p)
966 {
967 	struct inpcb *inp;
968 	int error;
969 
970 	inp = sotoinpcb(so);
971 	if (inp) {
972 		panic("rip_attach");
973 	}
974 	if ((so->so_state & SS_PRIV) == 0) {
975 		return EPERM;
976 	}
977 	if (proto > UINT8_MAX) {
978 		return EINVAL;
979 	}
980 
981 	error = soreserve(so, rip_sendspace, rip_recvspace);
982 	if (error) {
983 		return error;
984 	}
985 	error = in_pcballoc(so, &ripcbinfo, p);
986 	if (error) {
987 		return error;
988 	}
989 	inp = (struct inpcb *)so->so_pcb;
990 	inp->inp_vflag |= INP_IPV4;
991 	VERIFY(proto <= UINT8_MAX);
992 	inp->inp_ip_p = (u_char)proto;
993 	inp->inp_ip_ttl = (u_char)ip_defttl;
994 	return 0;
995 }
996 
997 __private_extern__ int
rip_detach(struct socket * so)998 rip_detach(struct socket *so)
999 {
1000 	struct inpcb *inp;
1001 
1002 	inp = sotoinpcb(so);
1003 	if (inp == 0) {
1004 		panic("rip_detach");
1005 	}
1006 	in_pcbdetach(inp);
1007 	return 0;
1008 }
1009 
1010 __private_extern__ int
rip_abort(struct socket * so)1011 rip_abort(struct socket *so)
1012 {
1013 	soisdisconnected(so);
1014 	return rip_detach(so);
1015 }
1016 
1017 __private_extern__ int
rip_disconnect(struct socket * so)1018 rip_disconnect(struct socket *so)
1019 {
1020 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1021 		return ENOTCONN;
1022 	}
1023 	return rip_abort(so);
1024 }
1025 
1026 __private_extern__ int
rip_bind(struct socket * so,struct sockaddr * nam,struct proc * p)1027 rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
1028 {
1029 #pragma unused(p)
1030 	struct inpcb *inp = sotoinpcb(so);
1031 	struct sockaddr_in sin;
1032 	struct ifaddr *ifa = NULL;
1033 	struct ifnet *outif = NULL;
1034 
1035 	if (inp == NULL
1036 #if NECP
1037 	    || (necp_socket_should_use_flow_divert(inp))
1038 #endif /* NECP */
1039 	    ) {
1040 		return inp == NULL ? EINVAL : EPROTOTYPE;
1041 	}
1042 
1043 	if (nam->sa_len != sizeof(struct sockaddr_in)) {
1044 		return EINVAL;
1045 	}
1046 
1047 	/* Sanitized local copy for interface address searches */
1048 	bzero(&sin, sizeof(sin));
1049 	sin.sin_family = AF_INET;
1050 	sin.sin_len = sizeof(struct sockaddr_in);
1051 	sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
1052 
1053 	if (TAILQ_EMPTY(&ifnet_head) ||
1054 	    (sin.sin_family != AF_INET && sin.sin_family != AF_IMPLINK) ||
1055 	    (sin.sin_addr.s_addr && (ifa = ifa_ifwithaddr(SA(&sin))) == 0)) {
1056 		return EADDRNOTAVAIL;
1057 	} else if (ifa) {
1058 		/*
1059 		 * Opportunistically determine the outbound
1060 		 * interface that may be used; this may not
1061 		 * hold true if we end up using a route
1062 		 * going over a different interface, e.g.
1063 		 * when sending to a local address.  This
1064 		 * will get updated again after sending.
1065 		 */
1066 		IFA_LOCK(ifa);
1067 		outif = ifa->ifa_ifp;
1068 		IFA_UNLOCK(ifa);
1069 		ifa_remref(ifa);
1070 	}
1071 	inp->inp_laddr = sin.sin_addr;
1072 	inp->inp_last_outifp = outif;
1073 
1074 	return 0;
1075 }
1076 
1077 __private_extern__ int
rip_connect(struct socket * so,struct sockaddr * nam,__unused struct proc * p)1078 rip_connect(struct socket *so, struct sockaddr *nam, __unused  struct proc *p)
1079 {
1080 	struct inpcb *inp = sotoinpcb(so);
1081 	struct sockaddr_in *addr = (struct sockaddr_in *)(void *)nam;
1082 
1083 	if (inp == NULL
1084 #if NECP
1085 	    || (necp_socket_should_use_flow_divert(inp))
1086 #endif /* NECP */
1087 	    ) {
1088 		return inp == NULL ? EINVAL : EPROTOTYPE;
1089 	}
1090 	if (nam->sa_len != sizeof(*addr)) {
1091 		return EINVAL;
1092 	}
1093 	if (TAILQ_EMPTY(&ifnet_head)) {
1094 		return EADDRNOTAVAIL;
1095 	}
1096 	if ((addr->sin_family != AF_INET) &&
1097 	    (addr->sin_family != AF_IMPLINK)) {
1098 		return EAFNOSUPPORT;
1099 	}
1100 
1101 	if (!(so->so_flags1 & SOF1_CONNECT_COUNTED)) {
1102 		so->so_flags1 |= SOF1_CONNECT_COUNTED;
1103 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_connected);
1104 	}
1105 
1106 	inp->inp_faddr = addr->sin_addr;
1107 	soisconnected(so);
1108 
1109 	return 0;
1110 }
1111 
1112 __private_extern__ int
rip_shutdown(struct socket * so)1113 rip_shutdown(struct socket *so)
1114 {
1115 	socantsendmore(so);
1116 	return 0;
1117 }
1118 
1119 __private_extern__ int
rip_send(struct socket * so,int flags,struct mbuf * m,struct sockaddr * nam,struct mbuf * control,struct proc * p)1120 rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
1121     struct mbuf *control, struct proc *p)
1122 {
1123 #pragma unused(flags, p)
1124 	struct inpcb *inp = sotoinpcb(so);
1125 	u_int32_t dst = INADDR_ANY;
1126 	int error = 0;
1127 
1128 	if (inp == NULL
1129 #if NECP
1130 	    || (necp_socket_should_use_flow_divert(inp) && (error = EPROTOTYPE))
1131 #endif /* NECP */
1132 	    ) {
1133 		if (inp == NULL) {
1134 			error = EINVAL;
1135 		} else {
1136 			error = EPROTOTYPE;
1137 		}
1138 		goto bad;
1139 	}
1140 	so_update_tx_data_stats(so, 1, m->m_pkthdr.len);
1141 
1142 	if (nam != NULL) {
1143 		dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr;
1144 	}
1145 	return rip_output(m, so, dst, control);
1146 
1147 bad:
1148 	VERIFY(error != 0);
1149 
1150 	if (m != NULL) {
1151 		m_freem(m);
1152 	}
1153 	if (control != NULL) {
1154 		m_freem(control);
1155 	}
1156 
1157 	return error;
1158 }
1159 
1160 /* note: rip_unlock is called from different protos  instead of the generic socket_unlock,
1161  * it will handle the socket dealloc on last reference
1162  * */
1163 int
rip_unlock(struct socket * so,int refcount,void * debug)1164 rip_unlock(struct socket *so, int refcount, void *debug)
1165 {
1166 	void *__single lr_saved;
1167 	struct inpcb *inp = sotoinpcb(so);
1168 
1169 	if (debug == NULL) {
1170 		lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
1171 	} else {
1172 		lr_saved = debug;
1173 	}
1174 
1175 	if (refcount) {
1176 		if (so->so_usecount <= 0) {
1177 			panic("rip_unlock: bad refoucnt so=%p val=%x lrh= %s",
1178 			    so, so->so_usecount, solockhistory_nr(so));
1179 			/* NOTREACHED */
1180 		}
1181 		so->so_usecount--;
1182 		if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) {
1183 			/* cleanup after last reference */
1184 			lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
1185 			lck_rw_lock_exclusive(&ripcbinfo.ipi_lock);
1186 			if (inp->inp_state != INPCB_STATE_DEAD) {
1187 				if (SOCK_CHECK_DOM(so, PF_INET6)) {
1188 					in6_pcbdetach(inp);
1189 				} else {
1190 					in_pcbdetach(inp);
1191 				}
1192 			}
1193 			in_pcbdispose(inp);
1194 			lck_rw_done(&ripcbinfo.ipi_lock);
1195 			return 0;
1196 		}
1197 	}
1198 	so->unlock_lr[so->next_unlock_lr] = lr_saved;
1199 	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
1200 	lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
1201 	return 0;
1202 }
1203 
1204 static int
1205 rip_pcblist SYSCTL_HANDLER_ARGS
1206 {
1207 #pragma unused(oidp, arg1, arg2)
1208 	int error, i, n, sz;
1209 	struct inpcb *inp, **inp_list;
1210 	inp_gen_t gencnt;
1211 	struct xinpgen xig;
1212 
1213 	/*
1214 	 * The process of preparing the TCB list is too time-consuming and
1215 	 * resource-intensive to repeat twice on every request.
1216 	 */
1217 	lck_rw_lock_exclusive(&ripcbinfo.ipi_lock);
1218 	if (req->oldptr == USER_ADDR_NULL) {
1219 		n = ripcbinfo.ipi_count;
1220 		req->oldidx = 2 * (sizeof xig)
1221 		    + (n + n / 8) * sizeof(struct xinpcb);
1222 		lck_rw_done(&ripcbinfo.ipi_lock);
1223 		return 0;
1224 	}
1225 
1226 	if (req->newptr != USER_ADDR_NULL) {
1227 		lck_rw_done(&ripcbinfo.ipi_lock);
1228 		return EPERM;
1229 	}
1230 
1231 	/*
1232 	 * OK, now we're committed to doing something.
1233 	 */
1234 	gencnt = ripcbinfo.ipi_gencnt;
1235 	sz = n = ripcbinfo.ipi_count;
1236 
1237 	bzero(&xig, sizeof(xig));
1238 	xig.xig_len = sizeof xig;
1239 	xig.xig_count = n;
1240 	xig.xig_gen = gencnt;
1241 	xig.xig_sogen = so_gencnt;
1242 	error = SYSCTL_OUT(req, &xig, sizeof xig);
1243 	if (error) {
1244 		lck_rw_done(&ripcbinfo.ipi_lock);
1245 		return error;
1246 	}
1247 	/*
1248 	 * We are done if there is no pcb
1249 	 */
1250 	if (n == 0) {
1251 		lck_rw_done(&ripcbinfo.ipi_lock);
1252 		return 0;
1253 	}
1254 
1255 	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
1256 	if (inp_list == NULL) {
1257 		lck_rw_done(&ripcbinfo.ipi_lock);
1258 		return ENOMEM;
1259 	}
1260 
1261 	for (inp = ripcbinfo.ipi_listhead->lh_first, i = 0; inp && i < n;
1262 	    inp = inp->inp_list.le_next) {
1263 		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1264 			inp_list[i++] = inp;
1265 		}
1266 	}
1267 	n = i;
1268 
1269 	error = 0;
1270 	for (i = 0; i < n; i++) {
1271 		inp = inp_list[i];
1272 		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1273 			struct xinpcb xi;
1274 
1275 			bzero(&xi, sizeof(xi));
1276 			xi.xi_len = sizeof xi;
1277 			/* XXX should avoid extra copy */
1278 			inpcb_to_compat(inp, &xi.xi_inp);
1279 			if (inp->inp_socket) {
1280 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
1281 			}
1282 			error = SYSCTL_OUT(req, &xi, sizeof xi);
1283 		}
1284 	}
1285 	if (!error) {
1286 		/*
1287 		 * Give the user an updated idea of our state.
1288 		 * If the generation differs from what we told
1289 		 * her before, she knows that something happened
1290 		 * while we were processing this request, and it
1291 		 * might be necessary to retry.
1292 		 */
1293 		bzero(&xig, sizeof(xig));
1294 		xig.xig_len = sizeof xig;
1295 		xig.xig_gen = ripcbinfo.ipi_gencnt;
1296 		xig.xig_sogen = so_gencnt;
1297 		xig.xig_count = ripcbinfo.ipi_count;
1298 		error = SYSCTL_OUT(req, &xig, sizeof xig);
1299 	}
1300 
1301 	lck_rw_done(&ripcbinfo.ipi_lock);
1302 	kfree_type(struct inpcb *, sz, inp_list);
1303 	return error;
1304 }
1305 
1306 SYSCTL_PROC(_net_inet_raw, OID_AUTO /*XXX*/, pcblist,
1307     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1308     rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
1309 
1310 #if XNU_TARGET_OS_OSX
1311 
1312 static int
1313 rip_pcblist64 SYSCTL_HANDLER_ARGS
1314 {
1315 #pragma unused(oidp, arg1, arg2)
1316 	int error, i, n, sz;
1317 	struct inpcb *inp, **inp_list;
1318 	inp_gen_t gencnt;
1319 	struct xinpgen xig;
1320 
1321 	/*
1322 	 * The process of preparing the TCB list is too time-consuming and
1323 	 * resource-intensive to repeat twice on every request.
1324 	 */
1325 	lck_rw_lock_exclusive(&ripcbinfo.ipi_lock);
1326 	if (req->oldptr == USER_ADDR_NULL) {
1327 		n = ripcbinfo.ipi_count;
1328 		req->oldidx = 2 * (sizeof xig)
1329 		    + (n + n / 8) * sizeof(struct xinpcb64);
1330 		lck_rw_done(&ripcbinfo.ipi_lock);
1331 		return 0;
1332 	}
1333 
1334 	if (req->newptr != USER_ADDR_NULL) {
1335 		lck_rw_done(&ripcbinfo.ipi_lock);
1336 		return EPERM;
1337 	}
1338 
1339 	/*
1340 	 * OK, now we're committed to doing something.
1341 	 */
1342 	gencnt = ripcbinfo.ipi_gencnt;
1343 	sz = n = ripcbinfo.ipi_count;
1344 
1345 	bzero(&xig, sizeof(xig));
1346 	xig.xig_len = sizeof xig;
1347 	xig.xig_count = n;
1348 	xig.xig_gen = gencnt;
1349 	xig.xig_sogen = so_gencnt;
1350 	error = SYSCTL_OUT(req, &xig, sizeof xig);
1351 	if (error) {
1352 		lck_rw_done(&ripcbinfo.ipi_lock);
1353 		return error;
1354 	}
1355 	/*
1356 	 * We are done if there is no pcb
1357 	 */
1358 	if (n == 0) {
1359 		lck_rw_done(&ripcbinfo.ipi_lock);
1360 		return 0;
1361 	}
1362 
1363 	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
1364 	if (inp_list == NULL) {
1365 		lck_rw_done(&ripcbinfo.ipi_lock);
1366 		return ENOMEM;
1367 	}
1368 
1369 	for (inp = ripcbinfo.ipi_listhead->lh_first, i = 0; inp && i < n;
1370 	    inp = inp->inp_list.le_next) {
1371 		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1372 			inp_list[i++] = inp;
1373 		}
1374 	}
1375 	n = i;
1376 
1377 	error = 0;
1378 	for (i = 0; i < n; i++) {
1379 		inp = inp_list[i];
1380 		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1381 			struct xinpcb64 xi;
1382 
1383 			bzero(&xi, sizeof(xi));
1384 			xi.xi_len = sizeof xi;
1385 			inpcb_to_xinpcb64(inp, &xi);
1386 			if (inp->inp_socket) {
1387 				sotoxsocket64(inp->inp_socket, &xi.xi_socket);
1388 			}
1389 			error = SYSCTL_OUT(req, &xi, sizeof xi);
1390 		}
1391 	}
1392 	if (!error) {
1393 		/*
1394 		 * Give the user an updated idea of our state.
1395 		 * If the generation differs from what we told
1396 		 * her before, she knows that something happened
1397 		 * while we were processing this request, and it
1398 		 * might be necessary to retry.
1399 		 */
1400 		bzero(&xig, sizeof(xig));
1401 		xig.xig_len = sizeof xig;
1402 		xig.xig_gen = ripcbinfo.ipi_gencnt;
1403 		xig.xig_sogen = so_gencnt;
1404 		xig.xig_count = ripcbinfo.ipi_count;
1405 		error = SYSCTL_OUT(req, &xig, sizeof xig);
1406 	}
1407 
1408 	lck_rw_done(&ripcbinfo.ipi_lock);
1409 	kfree_type(struct inpcb *, sz, inp_list);
1410 	return error;
1411 }
1412 
1413 SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64,
1414     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1415     rip_pcblist64, "S,xinpcb64", "List of active raw IP sockets");
1416 
1417 #endif /* XNU_TARGET_OS_OSX */
1418 
1419 
1420 static int
1421 rip_pcblist_n SYSCTL_HANDLER_ARGS
1422 {
1423 #pragma unused(oidp, arg1, arg2)
1424 	int error = 0;
1425 
1426 	error = get_pcblist_n(IPPROTO_IP, req, &ripcbinfo);
1427 
1428 	return error;
1429 }
1430 
1431 SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist_n,
1432     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1433     rip_pcblist_n, "S,xinpcb_n", "List of active raw IP sockets");
1434 
1435 struct pr_usrreqs rip_usrreqs = {
1436 	.pru_abort =            rip_abort,
1437 	.pru_attach =           rip_attach,
1438 	.pru_bind =             rip_bind,
1439 	.pru_connect =          rip_connect,
1440 	.pru_control =          in_control,
1441 	.pru_detach =           rip_detach,
1442 	.pru_disconnect =       rip_disconnect,
1443 	.pru_peeraddr =         in_getpeeraddr,
1444 	.pru_send =             rip_send,
1445 	.pru_shutdown =         rip_shutdown,
1446 	.pru_sockaddr =         in_getsockaddr,
1447 	.pru_sosend =           sosend,
1448 	.pru_soreceive =        soreceive,
1449 };
1450 /* DSEP Review Done pl-20051213-v02 @3253 */
1451