xref: /xnu-8796.141.3/bsd/netinet/in_pcb.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1991, 1993, 1995
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
61  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #include <sys/kernel.h>
74 #include <sys/sysctl.h>
75 #include <sys/mcache.h>
76 #include <sys/kauth.h>
77 #include <sys/priv.h>
78 #include <sys/proc_uuid_policy.h>
79 #include <sys/syslog.h>
80 #include <sys/priv.h>
81 #include <sys/file_internal.h>
82 #include <net/dlil.h>
83 
84 #include <libkern/OSAtomic.h>
85 #include <kern/locks.h>
86 
87 #include <machine/limits.h>
88 
89 #include <kern/zalloc.h>
90 
91 #include <net/if.h>
92 #include <net/if_types.h>
93 #include <net/route.h>
94 #include <net/flowhash.h>
95 #include <net/flowadv.h>
96 #include <net/nat464_utils.h>
97 #include <net/ntstat.h>
98 #include <net/nwk_wq.h>
99 #include <net/restricted_in_port.h>
100 
101 #include <netinet/in.h>
102 #include <netinet/in_pcb.h>
103 #include <netinet/in_var.h>
104 #include <netinet/ip_var.h>
105 
106 #include <netinet/ip6.h>
107 #include <netinet6/ip6_var.h>
108 
109 #include <sys/kdebug.h>
110 #include <sys/random.h>
111 
112 #include <dev/random/randomdev.h>
113 #include <mach/boolean.h>
114 
115 #include <pexpert/pexpert.h>
116 
117 #if NECP
118 #include <net/necp.h>
119 #endif
120 
121 #include <sys/stat.h>
122 #include <sys/ubc.h>
123 #include <sys/vnode.h>
124 
125 #include <os/log.h>
126 
127 #if SKYWALK
128 #include <skywalk/namespace/flowidns.h>
129 #endif /* SKYWALK */
130 
131 #include <IOKit/IOBSD.h>
132 
133 extern const char *proc_name_address(struct proc *);
134 
/* Lock group/attributes shared by all locks declared in this module */
static LCK_GRP_DECLARE(inpcb_lock_grp, "inpcb");
static LCK_ATTR_DECLARE(inpcb_lock_attr, 0, 0);
/* Protects inpcb_head, the list of registered inpcbinfo's */
static LCK_MTX_DECLARE_ATTR(inpcb_lock, &inpcb_lock_grp, &inpcb_lock_attr);
/* Protects the timer bookkeeping state directly below */
static LCK_MTX_DECLARE_ATTR(inpcb_timeout_lock, &inpcb_lock_grp, &inpcb_lock_attr);

/* All protocol inpcbinfo's that registered via in_pcbinfo_attach() */
static TAILQ_HEAD(, inpcbinfo) inpcb_head = TAILQ_HEAD_INITIALIZER(inpcb_head);

static u_int16_t inpcb_timeout_run = 0; /* INPCB timer is scheduled to run */
static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */
static boolean_t inpcb_ticking = FALSE;         /* "slow" timer is scheduled */
static boolean_t inpcb_fast_timer_on = FALSE;

/* Outstanding GC requests beyond which a GC request is promoted to "fast" */
#define INPCB_GCREQ_THRESHOLD   50000

/* Two thread calls sharing inpcb_timeout(); the fast one gets a non-NULL arg0 */
static thread_call_t inpcb_thread_call, inpcb_fast_thread_call;
static void inpcb_sched_timeout(void);
static void inpcb_sched_lazy_timeout(void);
static void _inpcb_sched_timeout(unsigned int);
static void inpcb_timeout(void *, void *);
const int inpcb_timeout_lazy = 10;      /* 10 seconds leeway for lazy timers */
extern int tvtohz(struct timeval *);

#if CONFIG_PROC_UUID_POLICY
static void inp_update_cellular_policy(struct inpcb *, boolean_t);
#if NECP
static void inp_update_necp_want_app_policy(struct inpcb *, boolean_t);
#endif /* NECP */
#endif /* CONFIG_PROC_UUID_POLICY */

/* kdebug trace codes for PCB lookups */
#define DBG_FNC_PCB_LOOKUP      NETDBG_CODE(DBG_NETTCP, (6 << 8))
#define DBG_FNC_PCB_HLOOKUP     NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))

/* Sysctl: when non-zero, skip the UDP ephemeral-port headroom check in bind */
int allow_udp_port_exhaustion = 0;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
int     ipport_lowfirstauto  = IPPORT_RESERVED - 1;     /* 1023 */
int     ipport_lowlastauto = IPPORT_RESERVEDSTART;      /* 600 */
int     ipport_firstauto = IPPORT_HIFIRSTAUTO;          /* 49152 */
int     ipport_lastauto  = IPPORT_HILASTAUTO;           /* 65535 */
int     ipport_hifirstauto = IPPORT_HIFIRSTAUTO;        /* 49152 */
int     ipport_hilastauto  = IPPORT_HILASTAUTO;         /* 65535 */

/* Clamp "var" into [min, max]; #undef'd right after the sysctl handler */
#define RANGECHK(var, min, max) \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); }
183 
184 static int
185 sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
186 {
187 #pragma unused(arg1, arg2)
188 	int error;
189 	int new_value = *(int *)oidp->oid_arg1;
190 #if (DEBUG | DEVELOPMENT)
191 	int old_value = *(int *)oidp->oid_arg1;
192 	/*
193 	 * For unit testing allow a non-superuser process with the
194 	 * proper entitlement to modify the variables
195 	 */
196 	if (req->newptr) {
197 		if (proc_suser(current_proc()) != 0 &&
198 		    (error = priv_check_cred(kauth_cred_get(),
199 		    PRIV_NETINET_RESERVEDPORT, 0))) {
200 			return EPERM;
201 		}
202 	}
203 #endif /* (DEBUG | DEVELOPMENT) */
204 
205 	error = sysctl_handle_int(oidp, &new_value, 0, req);
206 	if (!error) {
207 		if (oidp->oid_arg1 == &ipport_lowfirstauto || oidp->oid_arg1 == &ipport_lowlastauto) {
208 			RANGECHK(new_value, 1, IPPORT_RESERVED - 1);
209 		} else {
210 			RANGECHK(new_value, IPPORT_RESERVED, USHRT_MAX);
211 		}
212 		*(int *)oidp->oid_arg1 = new_value;
213 	}
214 
215 #if (DEBUG | DEVELOPMENT)
216 	os_log(OS_LOG_DEFAULT,
217 	    "%s:%u sysctl net.restricted_port.verbose: %d -> %d)",
218 	    proc_best_name(current_proc()), proc_selfpid(),
219 	    old_value, *(int *)oidp->oid_arg1);
220 #endif /* (DEBUG | DEVELOPMENT) */
221 
222 	return error;
223 }
224 
#undef RANGECHK

/* net.inet.ip.portrange: tunable reserved/ephemeral port ranges */
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IP Ports");

/*
 * On DEBUG/DEVELOPMENT builds any process may attempt a write; the handler
 * itself enforces the privilege/entitlement check.
 * NOTE(review): "CTLFAGS" looks like a typo for "CTLFLAGS"; kept as-is in
 * case the name is referenced elsewhere in the tree.
 */
#if (DEBUG | DEVELOPMENT)
#define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY)
#else
#define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED)
#endif /* (DEBUG | DEVELOPMENT) */

SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFAGS_IP_PORTRANGE,
    &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFAGS_IP_PORTRANGE,
    &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFAGS_IP_PORTRANGE,
    &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFAGS_IP_PORTRANGE,
    &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFAGS_IP_PORTRANGE,
    &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFAGS_IP_PORTRANGE,
    &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, ipport_allow_udp_port_exhaustion,
    CTLFLAG_LOCKED | CTLFLAG_RW, &allow_udp_port_exhaustion, 0, "");
256 
/* When >= 1, apn_fallbk_log() emits its messages via log() */
static uint32_t apn_fallbk_debug = 0;
#define apn_fallbk_log(x)       do { if (apn_fallbk_debug >= 1) log x; } while (0)

/* APN fallback is on by default only on non-macOS targets */
#if !XNU_TARGET_OS_OSX
static boolean_t apn_fallbk_enabled = TRUE;

SYSCTL_DECL(_net_inet);
SYSCTL_NODE(_net_inet, OID_AUTO, apn_fallback, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "APN Fallback");
SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
    &apn_fallbk_enabled, 0, "APN fallback enable");
SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &apn_fallbk_debug, 0, "APN fallback debug enable");
#else /* XNU_TARGET_OS_OSX */
static boolean_t apn_fallbk_enabled = FALSE;
#endif /* XNU_TARGET_OS_OSX */

/* Randomized ephemeral-port knobs; defined in the UDP/TCP code */
extern int      udp_use_randomport;
extern int      tcp_use_randomport;

/* Structs used for flowhash computation */
/* One address half of the flow-hash key; v4 and v6 overlay the same bytes */
struct inp_flowhash_key_addr {
	union {
		struct in_addr  v4;
		struct in6_addr v6;
		u_int8_t        addr8[16];
		u_int16_t       addr16[8];
		u_int32_t       addr32[4];
	} infha;
};

/*
 * Full flow-hash key: local/foreign address, ports, family, protocol and
 * two random values.  NOTE(review): presumably hashed as raw bytes, so
 * field order/layout matters — verify against the flowhash computation.
 */
struct inp_flowhash_key {
	struct inp_flowhash_key_addr    infh_laddr;
	struct inp_flowhash_key_addr    infh_faddr;
	u_int32_t                       infh_lport;
	u_int32_t                       infh_fport;
	u_int32_t                       infh_af;
	u_int32_t                       infh_proto;
	u_int32_t                       infh_rand1;
	u_int32_t                       infh_rand2;
};
297 
#if !SKYWALK
/* Seed for the inp flow hash — presumably initialized elsewhere in this file */
static u_int32_t inp_hash_seed = 0;
#endif /* !SKYWALK */

/* Comparator for the inp_fc_tree red-black tree (see RB_GENERATE below) */
static int infc_cmp(const struct inpcb *, const struct inpcb *);

/* Flags used by inp_fc_getinp */
#define INPFC_SOLOCKED  0x1
#define INPFC_REMOVE    0x2
static struct inpcb *inp_fc_getinp(u_int32_t, u_int32_t);

static void inp_fc_feedback(struct inpcb *);
extern void tcp_remove_from_time_wait(struct inpcb *inp);

/* Protects inp_fc_tree and key_inp */
static LCK_MTX_DECLARE_ATTR(inp_fc_lck, &inpcb_lock_grp, &inpcb_lock_attr);

/* Red-black tree of PCBs participating in flow-advisory delivery */
RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree;
RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp);
RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp);

/*
 * Use this inp as a key to find an inp in the flowhash tree.
 * Accesses to it are protected by inp_fc_lck.
 */
struct inpcb key_inp;
323 
324 /*
325  * in_pcb.c: manage the Protocol Control Blocks.
326  */
327 
/*
 * One-time initialization for the in_pcb module: allocates the two shared
 * inpcb timer thread calls and initializes the flow-advisory red-black
 * tree.  Must be called exactly once; a second call trips the VERIFY.
 */
void
in_pcbinit(void)
{
	static int inpcb_initialized = 0;

	VERIFY(!inpcb_initialized);
	inpcb_initialized = 1;

	/* arg0 == NULL identifies the lazy/regular timer in inpcb_timeout() */
	inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout,
	    NULL, THREAD_CALL_PRIORITY_KERNEL);
	/* Give it an arg so that we know that this is the fast timer */
	inpcb_fast_thread_call = thread_call_allocate_with_priority(
		inpcb_timeout, &inpcb_timeout, THREAD_CALL_PRIORITY_KERNEL);
	if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) {
		panic("unable to alloc the inpcb thread call");
	}

	/*
	 * Initialize data structures required to deliver
	 * flow advisories.
	 */
	lck_mtx_lock(&inp_fc_lck);
	RB_INIT(&inp_fc_tree);
	bzero(&key_inp, sizeof(key_inp));
	lck_mtx_unlock(&inp_fc_lck);
}
354 
/* True if any timer request (lazy, fast or nodelay) is outstanding in "req" */
#define INPCB_HAVE_TIMER_REQ(req)       (((req).intimer_lazy > 0) || \
	((req).intimer_fast > 0) || ((req).intimer_nodelay > 0))
/*
 * Thread-call work function shared by the lazy and fast inpcb timers
 * (a non-NULL arg0 marks the fast timer; see in_pcbinit).  Runs each
 * registered inpcbinfo's garbage-collection and/or timer callback when
 * requested, then re-arms itself while work remains outstanding.
 */
static void
inpcb_timeout(void *arg0, void *arg1)
{
#pragma unused(arg1)
	struct inpcbinfo *ipi;
	boolean_t t, gc;
	struct intimercount gccnt, tmcnt;

	/*
	 * Update coarse-grained networking timestamp (in sec.); the idea
	 * is to piggy-back on the timeout callout to update the counter
	 * returnable via net_uptime().
	 */
	net_update_uptime();

	bzero(&gccnt, sizeof(gccnt));
	bzero(&tmcnt, sizeof(tmcnt));

	/* Consume the pending gc/tick flags under the timeout lock */
	lck_mtx_lock_spin(&inpcb_timeout_lock);
	gc = inpcb_garbage_collecting;
	inpcb_garbage_collecting = FALSE;

	t = inpcb_ticking;
	inpcb_ticking = FALSE;

	if (gc || t) {
		/* Drop the spin lock before taking the (sleepable) list lock */
		lck_mtx_unlock(&inpcb_timeout_lock);

		lck_mtx_lock(&inpcb_lock);
		TAILQ_FOREACH(ipi, &inpcb_head, ipi_entry) {
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_gc_req)) {
				bzero(&ipi->ipi_gc_req,
				    sizeof(ipi->ipi_gc_req));
				if (gc && ipi->ipi_gc != NULL) {
					ipi->ipi_gc(ipi);
					/*
					 * The callback may have posted new
					 * requests; accumulate them to decide
					 * how to re-arm below.
					 */
					gccnt.intimer_lazy +=
					    ipi->ipi_gc_req.intimer_lazy;
					gccnt.intimer_fast +=
					    ipi->ipi_gc_req.intimer_fast;
					gccnt.intimer_nodelay +=
					    ipi->ipi_gc_req.intimer_nodelay;
				}
			}
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_timer_req)) {
				bzero(&ipi->ipi_timer_req,
				    sizeof(ipi->ipi_timer_req));
				if (t && ipi->ipi_timer != NULL) {
					ipi->ipi_timer(ipi);
					tmcnt.intimer_lazy +=
					    ipi->ipi_timer_req.intimer_lazy;
					tmcnt.intimer_fast +=
					    ipi->ipi_timer_req.intimer_fast;
					tmcnt.intimer_nodelay +=
					    ipi->ipi_timer_req.intimer_nodelay;
				}
			}
		}
		lck_mtx_unlock(&inpcb_lock);
		lck_mtx_lock_spin(&inpcb_timeout_lock);
	}

	/* lock was dropped above, so check first before overriding */
	if (!inpcb_garbage_collecting) {
		inpcb_garbage_collecting = INPCB_HAVE_TIMER_REQ(gccnt);
	}
	if (!inpcb_ticking) {
		inpcb_ticking = INPCB_HAVE_TIMER_REQ(tmcnt);
	}

	/* arg0 will be set if we are the fast timer */
	if (arg0 != NULL) {
		inpcb_fast_timer_on = FALSE;
	}
	inpcb_timeout_run--;
	/*
	 * NOTE(review): inpcb_timeout_run is unsigned, so ">= 0" is vacuous;
	 * an underflow would wrap and still be caught by the "< 2" check.
	 */
	VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2);

	/* re-arm the timer if there's work to do */
	if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) {
		inpcb_sched_timeout();
	} else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) {
		/* be lazy when idle with little activity */
		inpcb_sched_lazy_timeout();
	} else {
		inpcb_sched_timeout();
	}

	lck_mtx_unlock(&inpcb_timeout_lock);
}
445 
/* Arm the fast (1 s deadline, no leeway) instance of the shared inpcb timer */
static void
inpcb_sched_timeout(void)
{
	_inpcb_sched_timeout(0);
}
451 
/*
 * Arm the lazy instance of the shared inpcb timer, allowing
 * inpcb_timeout_lazy (10 s) of leeway so wakeups can be coalesced.
 */
static void
inpcb_sched_lazy_timeout(void)
{
	_inpcb_sched_timeout(inpcb_timeout_lazy);
}
457 
/*
 * Arm the shared inpcb timer.  offset == 0 requests the fast timer
 * (1 second deadline, no leeway); offset > 0 requests the lazy timer with
 * "offset" seconds of leeway.  Caller holds inpcb_timeout_lock as a spin
 * lock; it is converted to a full mutex before calling into thread_call.
 * At most two instances (one fast, one lazy) are in flight at a time,
 * tracked by inpcb_timeout_run.
 */
static void
_inpcb_sched_timeout(unsigned int offset)
{
	uint64_t deadline, leeway;

	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);
	LCK_MTX_ASSERT(&inpcb_timeout_lock, LCK_MTX_ASSERT_OWNED);
	if (inpcb_timeout_run == 0 &&
	    (inpcb_garbage_collecting || inpcb_ticking)) {
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		if (offset == 0) {
			inpcb_fast_timer_on = TRUE;
			thread_call_enter_delayed(inpcb_fast_thread_call,
			    deadline);
		} else {
			inpcb_fast_timer_on = FALSE;
			clock_interval_to_absolutetime_interval(offset,
			    NSEC_PER_SEC, &leeway);
			thread_call_enter_delayed_with_leeway(
				inpcb_thread_call, NULL, deadline, leeway,
				THREAD_CALL_DELAY_LEEWAY);
		}
	} else if (inpcb_timeout_run == 1 &&
	    offset == 0 && !inpcb_fast_timer_on) {
		/*
		 * Since the request was for a fast timer but the
		 * scheduled timer is a lazy timer, try to schedule
		 * another instance of fast timer also.
		 */
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		inpcb_fast_timer_on = TRUE;
		thread_call_enter_delayed(inpcb_fast_thread_call, deadline);
	}
}
494 
495 void
inpcb_gc_sched(struct inpcbinfo * ipi,u_int32_t type)496 inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type)
497 {
498 	u_int32_t gccnt;
499 
500 	lck_mtx_lock_spin(&inpcb_timeout_lock);
501 	inpcb_garbage_collecting = TRUE;
502 	gccnt = ipi->ipi_gc_req.intimer_nodelay +
503 	    ipi->ipi_gc_req.intimer_fast;
504 
505 	if (gccnt > INPCB_GCREQ_THRESHOLD) {
506 		type = INPCB_TIMER_FAST;
507 	}
508 
509 	switch (type) {
510 	case INPCB_TIMER_NODELAY:
511 		atomic_add_32(&ipi->ipi_gc_req.intimer_nodelay, 1);
512 		inpcb_sched_timeout();
513 		break;
514 	case INPCB_TIMER_FAST:
515 		atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
516 		inpcb_sched_timeout();
517 		break;
518 	default:
519 		atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
520 		inpcb_sched_lazy_timeout();
521 		break;
522 	}
523 	lck_mtx_unlock(&inpcb_timeout_lock);
524 }
525 
526 void
inpcb_timer_sched(struct inpcbinfo * ipi,u_int32_t type)527 inpcb_timer_sched(struct inpcbinfo *ipi, u_int32_t type)
528 {
529 	lck_mtx_lock_spin(&inpcb_timeout_lock);
530 	inpcb_ticking = TRUE;
531 	switch (type) {
532 	case INPCB_TIMER_NODELAY:
533 		atomic_add_32(&ipi->ipi_timer_req.intimer_nodelay, 1);
534 		inpcb_sched_timeout();
535 		break;
536 	case INPCB_TIMER_FAST:
537 		atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
538 		inpcb_sched_timeout();
539 		break;
540 	default:
541 		atomic_add_32(&ipi->ipi_timer_req.intimer_lazy, 1);
542 		inpcb_sched_lazy_timeout();
543 		break;
544 	}
545 	lck_mtx_unlock(&inpcb_timeout_lock);
546 }
547 
548 void
in_pcbinfo_attach(struct inpcbinfo * ipi)549 in_pcbinfo_attach(struct inpcbinfo *ipi)
550 {
551 	struct inpcbinfo *ipi0;
552 
553 	lck_mtx_lock(&inpcb_lock);
554 	TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
555 		if (ipi0 == ipi) {
556 			panic("%s: ipi %p already in the list",
557 			    __func__, ipi);
558 			/* NOTREACHED */
559 		}
560 	}
561 	TAILQ_INSERT_TAIL(&inpcb_head, ipi, ipi_entry);
562 	lck_mtx_unlock(&inpcb_lock);
563 }
564 
565 int
in_pcbinfo_detach(struct inpcbinfo * ipi)566 in_pcbinfo_detach(struct inpcbinfo *ipi)
567 {
568 	struct inpcbinfo *ipi0;
569 	int error = 0;
570 
571 	lck_mtx_lock(&inpcb_lock);
572 	TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
573 		if (ipi0 == ipi) {
574 			break;
575 		}
576 	}
577 	if (ipi0 != NULL) {
578 		TAILQ_REMOVE(&inpcb_head, ipi0, ipi_entry);
579 	} else {
580 		error = ENXIO;
581 	}
582 	lck_mtx_unlock(&inpcb_lock);
583 
584 	return error;
585 }
586 
587 __attribute__((noinline))
588 char *
inp_snprintf_tuple(struct inpcb * inp,char * buf,size_t buflen)589 inp_snprintf_tuple(struct inpcb *inp, char *buf, size_t buflen)
590 {
591 	char laddrstr[MAX_IPv6_STR_LEN];
592 	char faddrstr[MAX_IPv6_STR_LEN];
593 	uint16_t lport = 0;
594 	uint16_t fport = 0;
595 	uint16_t proto = IPPROTO_IP;
596 
597 	if (inp->inp_socket != NULL && inp->inp_socket->so_proto != NULL) {
598 		proto = inp->inp_socket->so_proto->pr_protocol;
599 
600 		if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) {
601 			lport  = inp->inp_lport;
602 			fport = inp->inp_fport;
603 		}
604 	}
605 	if (inp->inp_vflag & INP_IPV4) {
606 		inet_ntop(AF_INET, (void *)&inp->inp_laddr.s_addr, laddrstr, sizeof(laddrstr));
607 		inet_ntop(AF_INET, (void *)&inp->inp_faddr.s_addr, faddrstr, sizeof(faddrstr));
608 	} else if (inp->inp_vflag & INP_IPV6) {
609 		inet_ntop(AF_INET6, (void *)&inp->in6p_faddr, laddrstr, sizeof(laddrstr));
610 		inet_ntop(AF_INET6, (void *)&inp->in6p_faddr, faddrstr, sizeof(faddrstr));
611 	}
612 	snprintf(buf, buflen, "[%u %s:%u %s:%u]",
613 	    proto, laddrstr, ntohs(lport), faddrstr, ntohs(fport));
614 
615 	return buf;
616 }
617 
618 __attribute__((noinline))
619 void
in_pcb_check_management_entitled(struct inpcb * inp)620 in_pcb_check_management_entitled(struct inpcb *inp)
621 {
622 	if (inp->inp_flags2 & INP2_MANAGEMENT_CHECKED) {
623 		return;
624 	}
625 
626 	if (management_data_unrestricted) {
627 		inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
628 		inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;
629 	} else if (if_management_interface_check_needed == true) {
630 		inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;
631 		/*
632 		 * Note that soopt_cred_check check both intcoproc entitlements
633 		 * We check MANAGEMENT_DATA_ENTITLEMENT as there is no corresponding PRIV value
634 		 */
635 		if (soopt_cred_check(inp->inp_socket, PRIV_NET_RESTRICTED_INTCOPROC, false, false) == 0
636 		    || IOCurrentTaskHasEntitlement(MANAGEMENT_DATA_ENTITLEMENT) == true
637 #if DEBUG || DEVELOPMENT
638 		    || IOCurrentTaskHasEntitlement(MANAGEMENT_DATA_ENTITLEMENT_DEVELOPMENT) == true
639 #endif /* DEBUG || DEVELOPMENT */
640 		    ) {
641 			inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
642 		} else {
643 			if (__improbable(if_management_verbose > 1)) {
644 				char buf[128];
645 
646 				os_log(OS_LOG_DEFAULT, "in_pcb_check_management_entitled %s:%d not management entitled %s",
647 				    proc_best_name(current_proc()),
648 				    proc_selfpid(),
649 				    inp_snprintf_tuple(inp, buf, sizeof(buf)));
650 			}
651 		}
652 	}
653 }
654 
/*
 * Allocate a PCB and associate it with the socket.
 *
 * Returns:	0			Success
 *		ENOBUFS
 *		ENOMEM
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p)
{
#pragma unused(p)
	struct inpcb *inp;
	caddr_t temp;

	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
		/* Fresh PCB from the protocol's zone, zero-filled */
		inp = zalloc_flags(pcbinfo->ipi_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
	} else {
		/*
		 * Reuse the PCB cached by the socket layer; clear it but
		 * preserve the saved protocol PCB pointer across the bzero.
		 */
		inp = (struct inpcb *)(void *)so->so_saved_pcb;
		temp = inp->inp_saved_ppcb;
		bzero((caddr_t)inp, sizeof(*inp));
		inp->inp_saved_ppcb = temp;
	}

	/* Assigned again under ipi_lock below, once the PCB is on the list */
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	/* make sure inp_stat is always 64-bit aligned */
	inp->inp_stat = (struct inp_stat *)P2ROUNDUP(inp->inp_stat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) +
	    sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
		panic("%s: insufficient space to align inp_stat", __func__);
		/* NOTREACHED */
	}

	/* make sure inp_cstat is always 64-bit aligned */
	inp->inp_cstat = (struct inp_stat *)P2ROUNDUP(inp->inp_cstat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_cstat - (uintptr_t)inp->inp_cstat_store) +
	    sizeof(*inp->inp_cstat) > sizeof(inp->inp_cstat_store)) {
		panic("%s: insufficient space to align inp_cstat", __func__);
		/* NOTREACHED */
	}

	/* make sure inp_wstat is always 64-bit aligned */
	inp->inp_wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_wstat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_wstat - (uintptr_t)inp->inp_wstat_store) +
	    sizeof(*inp->inp_wstat) > sizeof(inp->inp_wstat_store)) {
		panic("%s: insufficient space to align inp_wstat", __func__);
		/* NOTREACHED */
	}

	/* make sure inp_Wstat is always 64-bit aligned */
	inp->inp_Wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_Wstat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_Wstat - (uintptr_t)inp->inp_Wstat_store) +
	    sizeof(*inp->inp_Wstat) > sizeof(inp->inp_Wstat_store)) {
		panic("%s: insufficient space to align inp_Wstat", __func__);
		/* NOTREACHED */
	}

	so->so_pcb = (caddr_t)inp;

	if (so->so_proto->pr_flags & PR_PCBLOCK) {
		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->ipi_lock_grp,
		    &pcbinfo->ipi_lock_attr);
	}

	/* v6-only unless v4-mapped addresses are enabled */
	if (SOCK_DOM(so) == PF_INET6 && !ip6_mapped_addr_on) {
		inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}

	if (ip6_auto_flowlabel) {
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
	}
	if (intcoproc_unrestricted) {
		inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;
	}

	(void) inp_update_policy(inp);

	/* Publish the new PCB on the protocol's global list */
	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	lck_rw_done(&pcbinfo->ipi_lock);
	return 0;
}
745 
746 /*
747  * in_pcblookup_local_and_cleanup does everything
748  * in_pcblookup_local does but it checks for a socket
749  * that's going away. Since we know that the lock is
750  * held read+write when this function is called, we
751  * can safely dispose of this socket like the slow
752  * timer would usually do and return NULL. This is
753  * great for bind.
754  */
755 struct inpcb *
in_pcblookup_local_and_cleanup(struct inpcbinfo * pcbinfo,struct in_addr laddr,u_int lport_arg,int wild_okay)756 in_pcblookup_local_and_cleanup(struct inpcbinfo *pcbinfo, struct in_addr laddr,
757     u_int lport_arg, int wild_okay)
758 {
759 	struct inpcb *inp;
760 
761 	/* Perform normal lookup */
762 	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);
763 
764 	/* Check if we found a match but it's waiting to be disposed */
765 	if (inp != NULL && inp->inp_wantcnt == WNT_STOPUSING) {
766 		struct socket *so = inp->inp_socket;
767 
768 		socket_lock(so, 0);
769 
770 		if (so->so_usecount == 0) {
771 			if (inp->inp_state != INPCB_STATE_DEAD) {
772 				in_pcbdetach(inp);
773 			}
774 			in_pcbdispose(inp);     /* will unlock & destroy */
775 			inp = NULL;
776 		} else {
777 			socket_unlock(so, 0);
778 		}
779 	}
780 
781 	return inp;
782 }
783 
784 static void
in_pcb_conflict_post_msg(u_int16_t port)785 in_pcb_conflict_post_msg(u_int16_t port)
786 {
787 	/*
788 	 * Radar 5523020 send a kernel event notification if a
789 	 * non-participating socket tries to bind the port a socket
790 	 * who has set SOF_NOTIFYCONFLICT owns.
791 	 */
792 	struct kev_msg ev_msg;
793 	struct kev_in_portinuse in_portinuse;
794 
795 	bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
796 	bzero(&ev_msg, sizeof(struct kev_msg));
797 	in_portinuse.port = ntohs(port);        /* port in host order */
798 	in_portinuse.req_pid = proc_selfpid();
799 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
800 	ev_msg.kev_class = KEV_NETWORK_CLASS;
801 	ev_msg.kev_subclass = KEV_INET_SUBCLASS;
802 	ev_msg.event_code = KEV_INET_PORTINUSE;
803 	ev_msg.dv[0].data_ptr = &in_portinuse;
804 	ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
805 	ev_msg.dv[1].data_length = 0;
806 	dlil_post_complete_msg(NULL, &ev_msg);
807 }
808 
809 /*
810  * Bind an INPCB to an address and/or port.  This routine should not alter
811  * the caller-supplied local address "nam".
812  *
813  * Returns:	0			Success
814  *		EADDRNOTAVAIL		Address not available.
815  *		EINVAL			Invalid argument
816  *		EAFNOSUPPORT		Address family not supported [notdef]
817  *		EACCES			Permission denied
818  *		EADDRINUSE		Address in use
819  *		EAGAIN			Resource unavailable, try again
820  *		priv_check_cred:EPERM	Operation not permitted
821  */
822 int
in_pcbbind(struct inpcb * inp,struct sockaddr * nam,struct proc * p)823 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
824 {
825 	struct socket *so = inp->inp_socket;
826 	unsigned short *lastport;
827 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
828 	u_short lport = 0, rand_port = 0;
829 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
830 	int error, randomport, conflict = 0;
831 	boolean_t anonport = FALSE;
832 	kauth_cred_t cred;
833 	struct in_addr laddr;
834 	struct ifnet *outif = NULL;
835 
836 	if (TAILQ_EMPTY(&in_ifaddrhead)) { /* XXX broken! */
837 		return EADDRNOTAVAIL;
838 	}
839 	if (!(so->so_options & (SO_REUSEADDR | SO_REUSEPORT))) {
840 		wild = 1;
841 	}
842 
843 	bzero(&laddr, sizeof(laddr));
844 
845 	socket_unlock(so, 0); /* keep reference on socket */
846 	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
847 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
848 		/* another thread completed the bind */
849 		lck_rw_done(&pcbinfo->ipi_lock);
850 		socket_lock(so, 0);
851 		return EINVAL;
852 	}
853 
854 	if (nam != NULL) {
855 		if (nam->sa_len != sizeof(struct sockaddr_in)) {
856 			lck_rw_done(&pcbinfo->ipi_lock);
857 			socket_lock(so, 0);
858 			return EINVAL;
859 		}
860 #if 0
861 		/*
862 		 * We should check the family, but old programs
863 		 * incorrectly fail to initialize it.
864 		 */
865 		if (nam->sa_family != AF_INET) {
866 			lck_rw_done(&pcbinfo->ipi_lock);
867 			socket_lock(so, 0);
868 			return EAFNOSUPPORT;
869 		}
870 #endif /* 0 */
871 		lport = SIN(nam)->sin_port;
872 
873 		if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr))) {
874 			/*
875 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
876 			 * allow complete duplication of binding if
877 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
878 			 * and a multicast address is bound on both
879 			 * new and duplicated sockets.
880 			 */
881 			if (so->so_options & SO_REUSEADDR) {
882 				reuseport = SO_REUSEADDR | SO_REUSEPORT;
883 			}
884 		} else if (SIN(nam)->sin_addr.s_addr != INADDR_ANY) {
885 			struct sockaddr_in sin;
886 			struct ifaddr *ifa;
887 
888 			/* Sanitized for interface address searches */
889 			bzero(&sin, sizeof(sin));
890 			sin.sin_family = AF_INET;
891 			sin.sin_len = sizeof(struct sockaddr_in);
892 			sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
893 
894 			ifa = ifa_ifwithaddr(SA(&sin));
895 			if (ifa == NULL) {
896 				lck_rw_done(&pcbinfo->ipi_lock);
897 				socket_lock(so, 0);
898 				return EADDRNOTAVAIL;
899 			} else {
900 				/*
901 				 * Opportunistically determine the outbound
902 				 * interface that may be used; this may not
903 				 * hold true if we end up using a route
904 				 * going over a different interface, e.g.
905 				 * when sending to a local address.  This
906 				 * will get updated again after sending.
907 				 */
908 				IFA_LOCK(ifa);
909 				outif = ifa->ifa_ifp;
910 				IFA_UNLOCK(ifa);
911 				IFA_REMREF(ifa);
912 			}
913 		}
914 
915 #if SKYWALK
916 		if (inp->inp_flags2 & INP2_EXTERNAL_PORT) {
917 			// Extract the external flow info
918 			struct ns_flow_info nfi = {};
919 			error = necp_client_get_netns_flow_info(inp->necp_client_uuid,
920 			    &nfi);
921 			if (error != 0) {
922 				lck_rw_done(&pcbinfo->ipi_lock);
923 				socket_lock(so, 0);
924 				return error;
925 			}
926 
927 			// Extract the reserved port
928 			u_int16_t reserved_lport = 0;
929 			if (nfi.nfi_laddr.sa.sa_family == AF_INET) {
930 				reserved_lport = nfi.nfi_laddr.sin.sin_port;
931 			} else if (nfi.nfi_laddr.sa.sa_family == AF_INET6) {
932 				reserved_lport = nfi.nfi_laddr.sin6.sin6_port;
933 			} else {
934 				lck_rw_done(&pcbinfo->ipi_lock);
935 				socket_lock(so, 0);
936 				return EINVAL;
937 			}
938 
939 			// Validate or use the reserved port
940 			if (lport == 0) {
941 				lport = reserved_lport;
942 			} else if (lport != reserved_lport) {
943 				lck_rw_done(&pcbinfo->ipi_lock);
944 				socket_lock(so, 0);
945 				return EINVAL;
946 			}
947 		}
948 
949 		/* Do not allow reserving a UDP port if remaining UDP port count is below 4096 */
950 		if (SOCK_PROTO(so) == IPPROTO_UDP && !allow_udp_port_exhaustion) {
951 			uint32_t current_reservations = 0;
952 			if (inp->inp_vflag & INP_IPV6) {
953 				current_reservations = netns_lookup_reservations_count_in6(inp->in6p_laddr, IPPROTO_UDP);
954 			} else {
955 				current_reservations = netns_lookup_reservations_count_in(inp->inp_laddr, IPPROTO_UDP);
956 			}
957 			if (USHRT_MAX - UDP_RANDOM_PORT_RESERVE < current_reservations) {
958 				log(LOG_ERR, "UDP port not available, less than 4096 UDP ports left");
959 				lck_rw_done(&pcbinfo->ipi_lock);
960 				socket_lock(so, 0);
961 				return EADDRNOTAVAIL;
962 			}
963 		}
964 
965 #endif /* SKYWALK */
966 
967 		if (lport != 0) {
968 			struct inpcb *t;
969 			uid_t u;
970 
971 #if XNU_TARGET_OS_OSX
972 			if (ntohs(lport) < IPPORT_RESERVED &&
973 			    SIN(nam)->sin_addr.s_addr != 0 &&
974 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
975 				cred = kauth_cred_proc_ref(p);
976 				error = priv_check_cred(cred,
977 				    PRIV_NETINET_RESERVEDPORT, 0);
978 				kauth_cred_unref(&cred);
979 				if (error != 0) {
980 					lck_rw_done(&pcbinfo->ipi_lock);
981 					socket_lock(so, 0);
982 					return EACCES;
983 				}
984 			}
985 #endif /* XNU_TARGET_OS_OSX */
986 			/*
987 			 * Check wether the process is allowed to bind to a restricted port
988 			 */
989 			if (!current_task_can_use_restricted_in_port(lport,
990 			    (uint8_t)so->so_proto->pr_protocol, PORT_FLAGS_BSD)) {
991 				lck_rw_done(&pcbinfo->ipi_lock);
992 				socket_lock(so, 0);
993 				return EADDRINUSE;
994 			}
995 
996 			if (!IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
997 			    (u = kauth_cred_getuid(so->so_cred)) != 0 &&
998 			    (t = in_pcblookup_local_and_cleanup(
999 				    inp->inp_pcbinfo, SIN(nam)->sin_addr, lport,
1000 				    INPLOOKUP_WILDCARD)) != NULL &&
1001 			    (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
1002 			    t->inp_laddr.s_addr != INADDR_ANY ||
1003 			    !(t->inp_socket->so_options & SO_REUSEPORT)) &&
1004 			    (u != kauth_cred_getuid(t->inp_socket->so_cred)) &&
1005 			    !(t->inp_socket->so_flags & SOF_REUSESHAREUID) &&
1006 			    (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
1007 			    t->inp_laddr.s_addr != INADDR_ANY) &&
1008 			    (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
1009 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
1010 			    uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
1011 				if ((t->inp_socket->so_flags &
1012 				    SOF_NOTIFYCONFLICT) &&
1013 				    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
1014 					conflict = 1;
1015 				}
1016 
1017 				lck_rw_done(&pcbinfo->ipi_lock);
1018 
1019 				if (conflict) {
1020 					in_pcb_conflict_post_msg(lport);
1021 				}
1022 
1023 				socket_lock(so, 0);
1024 				return EADDRINUSE;
1025 			}
1026 			t = in_pcblookup_local_and_cleanup(pcbinfo,
1027 			    SIN(nam)->sin_addr, lport, wild);
1028 			if (t != NULL &&
1029 			    (reuseport & t->inp_socket->so_options) == 0 &&
1030 			    (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
1031 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
1032 			    uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
1033 				if (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
1034 				    t->inp_laddr.s_addr != INADDR_ANY ||
1035 				    SOCK_DOM(so) != PF_INET6 ||
1036 				    SOCK_DOM(t->inp_socket) != PF_INET6) {
1037 					if ((t->inp_socket->so_flags &
1038 					    SOF_NOTIFYCONFLICT) &&
1039 					    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
1040 						conflict = 1;
1041 					}
1042 
1043 					lck_rw_done(&pcbinfo->ipi_lock);
1044 
1045 					if (conflict) {
1046 						in_pcb_conflict_post_msg(lport);
1047 					}
1048 					socket_lock(so, 0);
1049 					return EADDRINUSE;
1050 				}
1051 			}
1052 #if SKYWALK
1053 			if ((SOCK_PROTO(so) == IPPROTO_TCP ||
1054 			    SOCK_PROTO(so) == IPPROTO_UDP) &&
1055 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1056 				int res_err = 0;
1057 				if (inp->inp_vflag & INP_IPV6) {
1058 					res_err = netns_reserve_in6(
1059 						&inp->inp_netns_token,
1060 						SIN6(nam)->sin6_addr,
1061 						(uint8_t)SOCK_PROTO(so), lport, NETNS_BSD,
1062 						NULL);
1063 				} else {
1064 					res_err = netns_reserve_in(
1065 						&inp->inp_netns_token,
1066 						SIN(nam)->sin_addr, (uint8_t)SOCK_PROTO(so),
1067 						lport, NETNS_BSD, NULL);
1068 				}
1069 				if (res_err != 0) {
1070 					lck_rw_done(&pcbinfo->ipi_lock);
1071 					socket_lock(so, 0);
1072 					return EADDRINUSE;
1073 				}
1074 			}
1075 #endif /* SKYWALK */
1076 		}
1077 		laddr = SIN(nam)->sin_addr;
1078 	}
1079 	if (lport == 0) {
1080 		u_short first, last;
1081 		int count;
1082 		bool found;
1083 
1084 		/*
1085 		 * Override wild = 1 for implicit bind (mainly used by connect)
1086 		 * For implicit bind (lport == 0), we always use an unused port,
1087 		 * so REUSEADDR|REUSEPORT don't apply
1088 		 */
1089 		wild = 1;
1090 
1091 		randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
1092 		    (so->so_type == SOCK_STREAM ? tcp_use_randomport :
1093 		    udp_use_randomport);
1094 
1095 		/*
1096 		 * Even though this looks similar to the code in
1097 		 * in6_pcbsetport, the v6 vs v4 checks are different.
1098 		 */
1099 		anonport = TRUE;
1100 		if (inp->inp_flags & INP_HIGHPORT) {
1101 			first = (u_short)ipport_hifirstauto;     /* sysctl */
1102 			last  = (u_short)ipport_hilastauto;
1103 			lastport = &pcbinfo->ipi_lasthi;
1104 		} else if (inp->inp_flags & INP_LOWPORT) {
1105 			cred = kauth_cred_proc_ref(p);
1106 			error = priv_check_cred(cred,
1107 			    PRIV_NETINET_RESERVEDPORT, 0);
1108 			kauth_cred_unref(&cred);
1109 			if (error != 0) {
1110 				lck_rw_done(&pcbinfo->ipi_lock);
1111 				socket_lock(so, 0);
1112 				return error;
1113 			}
1114 			first = (u_short)ipport_lowfirstauto;    /* 1023 */
1115 			last  = (u_short)ipport_lowlastauto;     /* 600 */
1116 			lastport = &pcbinfo->ipi_lastlow;
1117 		} else {
1118 			first = (u_short)ipport_firstauto;       /* sysctl */
1119 			last  = (u_short)ipport_lastauto;
1120 			lastport = &pcbinfo->ipi_lastport;
1121 		}
1122 		/* No point in randomizing if only one port is available */
1123 
1124 		if (first == last) {
1125 			randomport = 0;
1126 		}
1127 		/*
1128 		 * Simple check to ensure all ports are not used up causing
1129 		 * a deadlock here.
1130 		 *
1131 		 * We split the two cases (up and down) so that the direction
1132 		 * is not being tested on each round of the loop.
1133 		 */
1134 		if (first > last) {
1135 			struct in_addr lookup_addr;
1136 
1137 			/*
1138 			 * counting down
1139 			 */
1140 			if (randomport) {
1141 				read_frandom(&rand_port, sizeof(rand_port));
1142 				*lastport =
1143 				    first - (rand_port % (first - last));
1144 			}
1145 			count = first - last;
1146 
1147 			lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1148 			    inp->inp_laddr;
1149 
1150 			found = false;
1151 			do {
1152 				if (count-- < 0) {      /* completely used? */
1153 					lck_rw_done(&pcbinfo->ipi_lock);
1154 					socket_lock(so, 0);
1155 					return EADDRNOTAVAIL;
1156 				}
1157 				--*lastport;
1158 				if (*lastport > first || *lastport < last) {
1159 					*lastport = first;
1160 				}
1161 				lport = htons(*lastport);
1162 
1163 				/*
1164 				 * Skip if this is a restricted port as we do not want to
1165 				 * restricted ports as ephemeral
1166 				 */
1167 				if (IS_RESTRICTED_IN_PORT(lport)) {
1168 					continue;
1169 				}
1170 
1171 				found = in_pcblookup_local_and_cleanup(pcbinfo,
1172 				    lookup_addr, lport, wild) == NULL;
1173 #if SKYWALK
1174 				if (found &&
1175 				    (SOCK_PROTO(so) == IPPROTO_TCP ||
1176 				    SOCK_PROTO(so) == IPPROTO_UDP) &&
1177 				    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1178 					int res_err;
1179 					if (inp->inp_vflag & INP_IPV6) {
1180 						res_err = netns_reserve_in6(
1181 							&inp->inp_netns_token,
1182 							inp->in6p_laddr,
1183 							(uint8_t)SOCK_PROTO(so), lport,
1184 							NETNS_BSD, NULL);
1185 					} else {
1186 						res_err = netns_reserve_in(
1187 							&inp->inp_netns_token,
1188 							lookup_addr, (uint8_t)SOCK_PROTO(so),
1189 							lport, NETNS_BSD, NULL);
1190 					}
1191 					found = res_err == 0;
1192 				}
1193 #endif /* SKYWALK */
1194 			} while (!found);
1195 		} else {
1196 			struct in_addr lookup_addr;
1197 
1198 			/*
1199 			 * counting up
1200 			 */
1201 			if (randomport) {
1202 				read_frandom(&rand_port, sizeof(rand_port));
1203 				*lastport =
1204 				    first + (rand_port % (first - last));
1205 			}
1206 			count = last - first;
1207 
1208 			lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1209 			    inp->inp_laddr;
1210 
1211 			found = false;
1212 			do {
1213 				if (count-- < 0) {      /* completely used? */
1214 					lck_rw_done(&pcbinfo->ipi_lock);
1215 					socket_lock(so, 0);
1216 					return EADDRNOTAVAIL;
1217 				}
1218 				++*lastport;
1219 				if (*lastport < first || *lastport > last) {
1220 					*lastport = first;
1221 				}
1222 				lport = htons(*lastport);
1223 
1224 				/*
1225 				 * Skip if this is a restricted port as we do not want to
1226 				 * restricted ports as ephemeral
1227 				 */
1228 				if (IS_RESTRICTED_IN_PORT(lport)) {
1229 					continue;
1230 				}
1231 
1232 				found = in_pcblookup_local_and_cleanup(pcbinfo,
1233 				    lookup_addr, lport, wild) == NULL;
1234 #if SKYWALK
1235 				if (found &&
1236 				    (SOCK_PROTO(so) == IPPROTO_TCP ||
1237 				    SOCK_PROTO(so) == IPPROTO_UDP) &&
1238 				    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1239 					int res_err;
1240 					if (inp->inp_vflag & INP_IPV6) {
1241 						res_err = netns_reserve_in6(
1242 							&inp->inp_netns_token,
1243 							inp->in6p_laddr,
1244 							(uint8_t)SOCK_PROTO(so), lport,
1245 							NETNS_BSD, NULL);
1246 					} else {
1247 						res_err = netns_reserve_in(
1248 							&inp->inp_netns_token,
1249 							lookup_addr, (uint8_t)SOCK_PROTO(so),
1250 							lport, NETNS_BSD, NULL);
1251 					}
1252 					found = res_err == 0;
1253 				}
1254 #endif /* SKYWALK */
1255 			} while (!found);
1256 		}
1257 	}
1258 	socket_lock(so, 0);
1259 
1260 	/*
1261 	 * We unlocked socket's protocol lock for a long time.
1262 	 * The socket might have been dropped/defuncted.
1263 	 * Checking if world has changed since.
1264 	 */
1265 	if (inp->inp_state == INPCB_STATE_DEAD) {
1266 #if SKYWALK
1267 		netns_release(&inp->inp_netns_token);
1268 #endif /* SKYWALK */
1269 		lck_rw_done(&pcbinfo->ipi_lock);
1270 		return ECONNABORTED;
1271 	}
1272 
1273 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
1274 #if SKYWALK
1275 		netns_release(&inp->inp_netns_token);
1276 #endif /* SKYWALK */
1277 		lck_rw_done(&pcbinfo->ipi_lock);
1278 		return EINVAL;
1279 	}
1280 
1281 	if (laddr.s_addr != INADDR_ANY) {
1282 		inp->inp_laddr = laddr;
1283 		inp->inp_last_outifp = outif;
1284 #if SKYWALK
1285 		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
1286 			netns_set_ifnet(&inp->inp_netns_token, outif);
1287 		}
1288 #endif /* SKYWALK */
1289 	}
1290 	inp->inp_lport = lport;
1291 	if (anonport) {
1292 		inp->inp_flags |= INP_ANONPORT;
1293 	}
1294 
1295 	if (in_pcbinshash(inp, 1) != 0) {
1296 		inp->inp_laddr.s_addr = INADDR_ANY;
1297 		inp->inp_last_outifp = NULL;
1298 
1299 #if SKYWALK
1300 		netns_release(&inp->inp_netns_token);
1301 #endif /* SKYWALK */
1302 		inp->inp_lport = 0;
1303 		if (anonport) {
1304 			inp->inp_flags &= ~INP_ANONPORT;
1305 		}
1306 		lck_rw_done(&pcbinfo->ipi_lock);
1307 		return EAGAIN;
1308 	}
1309 	lck_rw_done(&pcbinfo->ipi_lock);
1310 	sflt_notify(so, sock_evt_bound, NULL);
1311 
1312 	in_pcb_check_management_entitled(inp);
1313 
1314 	return 0;
1315 }
1316 
1317 #define APN_FALLBACK_IP_FILTER(a)       \
1318 	(IN_LINKLOCAL(ntohl((a)->sin_addr.s_addr)) || \
1319 	 IN_LOOPBACK(ntohl((a)->sin_addr.s_addr)) || \
1320 	 IN_ZERONET(ntohl((a)->sin_addr.s_addr)) || \
1321 	 IN_MULTICAST(ntohl((a)->sin_addr.s_addr)) || \
1322 	 IN_PRIVATE(ntohl((a)->sin_addr.s_addr)))
1323 
#define APN_FALLBACK_NOTIF_INTERVAL     2 /* minimum net_uptime() delta (seconds) between notifications */
1325 static uint64_t last_apn_fallback = 0;
1326 
/*
 * Decide whether an APN (cellular) fallback notification should be posted
 * for this connection attempt.  Returns TRUE only when every gate below
 * passes; any single failed check short-circuits to FALSE.
 *
 * Gates, in order (cheapest first):
 *   1. feature enabled (apn_fallbk_enabled sysctl/flag)
 *   2. not a kernel-originated connection
 *   3. socket has not opted out via SO_NOAPNFALLBK
 *   4. rate limit: at most one notification per APN_FALLBACK_NOTIF_INTERVAL
 *   5. destination is a "real" IPv4 address (not link-local, loopback,
 *      zeronet, multicast, or RFC1918 private)
 *   6. an unscoped IPv6 default route exists AND goes over cellular
 *   7. no unscoped IPv4 default route exists
 *   8. the calling binary is a third-party app (bundle ID with a dot,
 *      not com.apple.*)
 *   9. the binary predates the App Store IPv6 requirement cutoff
 */
static boolean_t
apn_fallback_required(proc_t proc, struct socket *so, struct sockaddr_in *p_dstv4)
{
	uint64_t timenow;
	struct sockaddr_storage lookup_default_addr;
	struct rtentry *rt = NULL;

	VERIFY(proc != NULL);

	if (apn_fallbk_enabled == FALSE) {
		return FALSE;
	}

	if (proc == kernproc) {
		return FALSE;
	}

	/* so may legitimately be NULL here; only honor the opt-out if present */
	if (so && (so->so_options & SO_NOAPNFALLBK)) {
		return FALSE;
	}

	/* Throttle: net_uptime() is monotonic, so the subtraction is safe */
	timenow = net_uptime();
	if ((timenow - last_apn_fallback) < APN_FALLBACK_NOTIF_INTERVAL) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification throttled.\n"));
		return FALSE;
	}

	/* Skip destinations that can never need an APN fallback */
	if (p_dstv4 && APN_FALLBACK_IP_FILTER(p_dstv4)) {
		return FALSE;
	}

	/* Check if we have unscoped IPv6 default route through cellular */
	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET6;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in6);

	/* rtalloc1() returns a referenced route; every exit path must rtfree() it */
	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);
	if (NULL == rt) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route.\n"));
		return FALSE;
	}

	if (!IFNET_IS_CELLULAR(rt->rt_ifp)) {
		rtfree(rt);
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route through cellular interface.\n"));
		return FALSE;
	}

	/*
	 * We have a default IPv6 route, ensure that
	 * we do not have IPv4 default route before triggering
	 * the event
	 */
	rtfree(rt);
	rt = NULL;

	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);

	if (rt) {
		rtfree(rt);
		rt = NULL;
		apn_fallbk_log((LOG_INFO, "APN fallback notification found unscoped "
		    "IPv4 default route!\n"));
		return FALSE;
	}

	{
		/*
		 * We disable APN fallback if the binary is not a third-party app.
		 * Note that platform daemons use their process name as a
		 * bundle ID so we filter out bundle IDs without dots.
		 */
		const char *bundle_id = cs_identity_get(proc);
		if (bundle_id == NULL ||
		    bundle_id[0] == '\0' ||
		    strchr(bundle_id, '.') == NULL ||
		    strncmp(bundle_id, "com.apple.", sizeof("com.apple.") - 1) == 0) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found first-"
			    "party bundle ID \"%s\"!\n", (bundle_id ? bundle_id : "NULL")));
			return FALSE;
		}
	}

	{
		/*
		 * The Apple App Store IPv6 requirement started on
		 * June 1st, 2016 at 12:00:00 AM PDT.
		 * We disable APN fallback if the binary is more recent than that.
		 * We check both atime and birthtime since birthtime is not always supported.
		 */
		static const long ipv6_start_date = 1464764400L;
		vfs_context_t context;
		struct stat64 sb;
		int vn_stat_error;

		bzero(&sb, sizeof(struct stat64));
		context = vfs_context_create(NULL);
		/* Stat the process' executable vnode to read its timestamps */
		vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, 0, context);
		(void)vfs_context_rele(context);

		if (vn_stat_error != 0 ||
		    sb.st_atimespec.tv_sec >= ipv6_start_date ||
		    sb.st_birthtimespec.tv_sec >= ipv6_start_date) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found binary "
			    "too recent! (err %d atime %ld mtime %ld ctime %ld birthtime %ld)\n",
			    vn_stat_error, sb.st_atimespec.tv_sec, sb.st_mtimespec.tv_sec,
			    sb.st_ctimespec.tv_sec, sb.st_birthtimespec.tv_sec));
			return FALSE;
		}
	}
	return TRUE;
}
1445 
1446 static void
apn_fallback_trigger(proc_t proc,struct socket * so)1447 apn_fallback_trigger(proc_t proc, struct socket *so)
1448 {
1449 	pid_t pid = 0;
1450 	struct kev_msg ev_msg;
1451 	struct kev_netevent_apnfallbk_data apnfallbk_data;
1452 
1453 	last_apn_fallback = net_uptime();
1454 	pid = proc_pid(proc);
1455 	uuid_t application_uuid;
1456 	uuid_clear(application_uuid);
1457 	proc_getexecutableuuid(proc, application_uuid,
1458 	    sizeof(application_uuid));
1459 
1460 	bzero(&ev_msg, sizeof(struct kev_msg));
1461 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1462 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1463 	ev_msg.kev_subclass     = KEV_NETEVENT_SUBCLASS;
1464 	ev_msg.event_code       = KEV_NETEVENT_APNFALLBACK;
1465 
1466 	bzero(&apnfallbk_data, sizeof(apnfallbk_data));
1467 
1468 	if (so->so_flags & SOF_DELEGATED) {
1469 		apnfallbk_data.epid = so->e_pid;
1470 		uuid_copy(apnfallbk_data.euuid, so->e_uuid);
1471 	} else {
1472 		apnfallbk_data.epid = so->last_pid;
1473 		uuid_copy(apnfallbk_data.euuid, so->last_uuid);
1474 	}
1475 
1476 	ev_msg.dv[0].data_ptr   = &apnfallbk_data;
1477 	ev_msg.dv[0].data_length = sizeof(apnfallbk_data);
1478 	kev_post_msg(&ev_msg);
1479 	apn_fallbk_log((LOG_INFO, "APN fallback notification issued.\n"));
1480 }
1481 
1482 /*
1483  * Transform old in_pcbconnect() into an inner subroutine for new
1484  * in_pcbconnect(); do some validity-checking on the remote address
1485  * (in "nam") and then determine local host address (i.e., which
1486  * interface) to use to access that remote host.
1487  *
1488  * This routine may alter the caller-supplied remote address "nam".
1489  *
1490  * The caller may override the bound-to-interface setting of the socket
1491  * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1492  *
1493  * This routine might return an ifp with a reference held if the caller
1494  * provides a non-NULL outif, even in the error case.  The caller is
1495  * responsible for releasing its reference.
1496  *
1497  * Returns:	0			Success
1498  *		EINVAL			Invalid argument
1499  *		EAFNOSUPPORT		Address family not supported
1500  *		EADDRNOTAVAIL		Address not available
1501  */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr,
    unsigned int ifscope, struct ifnet **outif, int raw)
{
	struct route *ro = &inp->inp_route;      /* per-PCB cached route */
	struct in_ifaddr *ia = NULL;             /* candidate source ifaddr (holds a ref once set) */
	struct sockaddr_in sin;                  /* sanitized copy of the destination */
	int error = 0;
	boolean_t restricted = FALSE;            /* TRUE when denied by interface policy */

	if (outif != NULL) {
		*outif = NULL;
	}
	/* Validate the caller-supplied destination before anything else */
	if (nam->sa_len != sizeof(struct sockaddr_in)) {
		return EINVAL;
	}
	if (SIN(nam)->sin_family != AF_INET) {
		return EAFNOSUPPORT;
	}
	/* raw sockets may connect to port 0; everyone else may not */
	if (raw == 0 && SIN(nam)->sin_port == 0) {
		return EADDRNOTAVAIL;
	}

	in_pcb_check_management_entitled(inp);

	/*
	 * If the destination address is INADDR_ANY,
	 * use the primary local address.
	 * If the supplied address is INADDR_BROADCAST,
	 * and the primary interface supports broadcast,
	 * choose the broadcast address for that interface.
	 *
	 * NOTE: this rewrites the caller-supplied "nam" in place
	 * (documented in the function header comment above).
	 */
	if (raw == 0 && (SIN(nam)->sin_addr.s_addr == INADDR_ANY ||
	    SIN(nam)->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST)) {
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		if (!TAILQ_EMPTY(&in_ifaddrhead)) {
			/* "primary" == first address on the global list */
			ia = TAILQ_FIRST(&in_ifaddrhead);
			IFA_LOCK_SPIN(&ia->ia_ifa);
			if (SIN(nam)->sin_addr.s_addr == INADDR_ANY) {
				SIN(nam)->sin_addr = IA_SIN(ia)->sin_addr;
			} else if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
				SIN(nam)->sin_addr =
				    SIN(&ia->ia_broadaddr)->sin_addr;
			}
			IFA_UNLOCK(&ia->ia_ifa);
			/* no ref was taken here; reset so later VERIFYs hold */
			ia = NULL;
		}
		lck_rw_done(&in_ifaddr_rwlock);
	}
	/*
	 * Otherwise, if the socket has already bound the source, just use it.
	 */
	if (inp->inp_laddr.s_addr != INADDR_ANY) {
		VERIFY(ia == NULL);
		*laddr = inp->inp_laddr;
		return 0;
	}

	/*
	 * If the ifscope is specified by the caller (e.g. IP_PKTINFO)
	 * then it overrides the sticky ifscope set for the socket.
	 */
	if (ifscope == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) {
		ifscope = inp->inp_boundifp->if_index;
	}

	/*
	 * If route is known or can be allocated now,
	 * our src addr is taken from the i/f, else punt.
	 * Note that we should check the address family of the cached
	 * destination, in case of sharing the cache with IPv6.
	 */
	if (ro->ro_rt != NULL) {
		RT_LOCK_SPIN(ro->ro_rt);
	}
	/* Drop the cached route if stale, for another family/destination,
	 * or if the socket asked to bypass routing entirely. */
	if (ROUTE_UNUSABLE(ro) || ro->ro_dst.sa_family != AF_INET ||
	    SIN(&ro->ro_dst)->sin_addr.s_addr != SIN(nam)->sin_addr.s_addr ||
	    (inp->inp_socket->so_options & SO_DONTROUTE)) {
		if (ro->ro_rt != NULL) {
			RT_UNLOCK(ro->ro_rt);
		}
		ROUTE_RELEASE(ro);
	}
	if (!(inp->inp_socket->so_options & SO_DONTROUTE) &&
	    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
		if (ro->ro_rt != NULL) {
			RT_UNLOCK(ro->ro_rt);
		}
		ROUTE_RELEASE(ro);
		/* No route yet, so try to acquire one */
		bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
		ro->ro_dst.sa_family = AF_INET;
		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
		SIN(&ro->ro_dst)->sin_addr = SIN(nam)->sin_addr;
		rtalloc_scoped(ro, ifscope);
		if (ro->ro_rt != NULL) {
			/* from here on, ro->ro_rt != NULL implies it is locked */
			RT_LOCK_SPIN(ro->ro_rt);
		}
	}
	/* Sanitized local copy for interface address searches */
	bzero(&sin, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
	/*
	 * If we did not find (or use) a route, assume dest is reachable
	 * on a directly connected network and try to find a corresponding
	 * interface to take the source address from.
	 */
	if (ro->ro_rt == NULL) {
		proc_t proc = current_proc();

		VERIFY(ia == NULL);
		/* ifa_ifwith*() return a referenced ifaddr; released in done: */
		ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
		}
		error = ((ia == NULL) ? ENETUNREACH : 0);

		/* No IPv4 path at all: possibly an IPv6-only cellular (APN) setup */
		if (apn_fallback_required(proc, inp->inp_socket,
		    (void *)nam)) {
			apn_fallback_trigger(proc, inp->inp_socket);
		}

		goto done;
	}
	RT_LOCK_ASSERT_HELD(ro->ro_rt);
	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 */
	if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
		VERIFY(ia == NULL);
		/*
		 * If the route points to a cellular interface and the
		 * caller forbids our using interfaces of such type,
		 * pretend that there is no route.
		 * Apply the same logic for expensive interfaces.
		 */
		if (inp_restricted_send(inp, ro->ro_rt->rt_ifp)) {
			RT_UNLOCK(ro->ro_rt);
			ROUTE_RELEASE(ro);
			error = EHOSTUNREACH;
			restricted = TRUE;
		} else {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(ro->ro_rt);
			ia = ifatoia(ro->ro_rt->rt_ifa);
			IFA_ADDREF(&ia->ia_ifa);

			/*
			 * Mark the control block for notification of
			 * a possible flow that might undergo clat46
			 * translation.
			 *
			 * We defer the decision to a later point when
			 * inpcb is being disposed off.
			 * The reason is that we only want to send notification
			 * if the flow was ever used to send data.
			 */
			if (IS_INTF_CLAT46(ro->ro_rt->rt_ifp)) {
				inp->inp_flags2 |= INP2_CLAT46_FLOW;
			}

			RT_UNLOCK(ro->ro_rt);
			error = 0;
		}
		goto done;
	}
	VERIFY(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK);
	RT_UNLOCK(ro->ro_rt);
	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 */
	VERIFY(ia == NULL);
	ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
	if (ia == NULL) {
		ia = ifatoia(ifa_ifwithaddr_scoped(SA(&sin), ifscope));
	}
	if (ia == NULL) {
		ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
	}
	if (ia == NULL) {
		/* last resort: fall back to the route's own ifaddr */
		RT_LOCK(ro->ro_rt);
		ia = ifatoia(ro->ro_rt->rt_ifa);
		if (ia != NULL) {
			IFA_ADDREF(&ia->ia_ifa);
		}
		RT_UNLOCK(ro->ro_rt);
	}
	error = ((ia == NULL) ? ENETUNREACH : 0);

done:
	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, use the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
	    inp->inp_moptions != NULL) {
		struct ip_moptions *imo;
		struct ifnet *ifp;

		imo = inp->inp_moptions;
		IMO_LOCK(imo);
		if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
		    ia->ia_ifp != imo->imo_multicast_ifp)) {
			ifp = imo->imo_multicast_ifp;
			/* swap the earlier candidate for one on the multicast ifp */
			if (ia != NULL) {
				IFA_REMREF(&ia->ia_ifa);
			}
			lck_rw_lock_shared(&in_ifaddr_rwlock);
			TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
				if (ia->ia_ifp == ifp) {
					break;
				}
			}
			if (ia != NULL) {
				IFA_ADDREF(&ia->ia_ifa);
			}
			lck_rw_done(&in_ifaddr_rwlock);
			if (ia == NULL) {
				error = EADDRNOTAVAIL;
			} else {
				error = 0;
			}
		}
		IMO_UNLOCK(imo);
	}
	/*
	 * Don't do pcblookup call here; return interface in laddr
	 * and exit to caller, that will do the lookup.
	 */
	if (ia != NULL) {
		/*
		 * If the source address belongs to a cellular interface
		 * and the socket forbids our using interfaces of such
		 * type, pretend that there is no source address.
		 * Apply the same logic for expensive interfaces.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if (inp_restricted_send(inp, ia->ia_ifa.ifa_ifp)) {
			IFA_UNLOCK(&ia->ia_ifa);
			error = EHOSTUNREACH;
			restricted = TRUE;
		} else if (error == 0) {
			*laddr = ia->ia_addr.sin_addr;
			if (outif != NULL) {
				struct ifnet *ifp;

				/* prefer the route's ifp over the ifaddr's */
				if (ro->ro_rt != NULL) {
					ifp = ro->ro_rt->rt_ifp;
				} else {
					ifp = ia->ia_ifp;
				}

				VERIFY(ifp != NULL);
				IFA_CONVERT_LOCK(&ia->ia_ifa);
				ifnet_reference(ifp);   /* for caller */
				if (*outif != NULL) {
					ifnet_release(*outif);
				}
				*outif = ifp;
			}
			IFA_UNLOCK(&ia->ia_ifa);
		} else {
			IFA_UNLOCK(&ia->ia_ifa);
		}
		/* drop the reference taken when ia was selected */
		IFA_REMREF(&ia->ia_ifa);
		ia = NULL;
	}

	/* Tell interested filters that interface policy denied this send */
	if (restricted && error == EHOSTUNREACH) {
		soevent(inp->inp_socket, (SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_IFDENIED));
	}

	return error;
}
1785 
1786 /*
1787  * Outer subroutine:
1788  * Connect from a socket to a specified address.
1789  * Both address and port must be specified in argument sin.
1790  * If don't have a local address for this socket yet,
1791  * then pick one.
1792  *
1793  * The caller may override the bound-to-interface setting of the socket
1794  * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1795  */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
    unsigned int ifscope, struct ifnet **outif)
{
	struct in_addr laddr;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct inpcb *pcb;
	int error;
	struct socket *so = inp->inp_socket;

#if CONTENT_FILTER
	/* Bump the generation count so content filters notice the transition */
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	/*
	 *   Call inner routine, to assign local interface address.
	 */
	if ((error = in_pcbladdr(inp, nam, &laddr, ifscope, outif, 0)) != 0) {
		return error;
	}

	/*
	 * Drop the socket lock for the hash lookup (lock ordering);
	 * the state checks below re-validate the world afterwards.
	 */
	socket_unlock(so, 0);
	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr ? inp->inp_laddr : laddr,
	    inp->inp_lport, 0, NULL);
	socket_lock(so, 0);

	/*
	 * Check if the socket is still in a valid state. When we unlock this
	 * embryonic socket, it can get aborted if another thread is closing
	 * the listener (radar 7947600).
	 */
	if ((so->so_flags & SOF_ABORTED) != 0) {
		return ECONNREFUSED;
	}

	if (pcb != NULL) {
		/* A matching 4-tuple already exists; release the lookup ref */
		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
		return EADDRINUSE;
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		if (inp->inp_lport == 0) {
			/* Implicit bind: pick an ephemeral local port */
			error = in_pcbbind(inp, NULL, p);
			if (error) {
				return error;
			}
		}
		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets.
			 */
			socket_unlock(so, 0);
			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
			socket_lock(so, 0);
		}
		inp->inp_laddr = laddr;
		/* no reference needed */
		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
#if SKYWALK
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			netns_set_ifnet(&inp->inp_netns_token,
			    inp->inp_last_outifp);
		}
#endif /* SKYWALK */
		/* remember the local address was chosen implicitly */
		inp->inp_flags |= INP_INADDR_ANY;
	} else {
		/*
		 * Usage of IP_PKTINFO, without local port already
		 * specified will cause kernel to panic,
		 * see rdar://problem/18508185.
		 * For now returning error to avoid a kernel panic
		 * This routine can be refactored and handle this better
		 * in future.
		 */
		if (inp->inp_lport == 0) {
			return EINVAL;
		}
		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets.
			 */
			socket_unlock(so, 0);
			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
			socket_lock(so, 0);
		}
	}
	/* Commit the foreign endpoint and rehash under ipi_lock */
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		nstat_pcb_invalidate_cache(inp);
	}
	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	return 0;
}
1895 
/*
 * Disassociate the PCB from its foreign address and port, returning it to
 * an unconnected state and rehashing it under the wildcard foreign address.
 * If the socket has already lost its file-descriptor reference (and is not
 * a multipath subflow), the PCB is detached here as well.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* Snapshot UDP flow statistics before the 4-tuple is invalidated */
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		nstat_pcb_cache(inp);
	}

	/* Clear the foreign half of the 4-tuple */
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

#if CONTENT_FILTER
	/* Let content filters observe the connection-state transition */
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(so, 0);
		lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
		socket_lock(so, 0);
	}

	/* Move the PCB to the hash bucket keyed by the wildcard faddr */
	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	/*
	 * A multipath subflow socket would have its SS_NOFDREF set by default,
	 * so check for SOF_MP_SUBFLOW socket flag before detaching the PCB;
	 * when the socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF)) {
		in_pcbdetach(inp);
	}
}
1932 
/*
 * Detach the PCB from its socket: release per-PCB resources (IPsec policy,
 * keepalive buffer, IP options, cached route, multicast options), mark the
 * PCB as DEAD/STOPUSING, and schedule it for garbage collection.  The
 * memory itself is freed later by in_pcbdispose() once all references drop.
 */
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == NULL) {
		/* PCB has been disposed */
		panic("%s: inp=%p so=%p proto=%d so_pcb is null!", __func__,
		    inp, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

#if IPSEC
	if (inp->inp_sp != NULL) {
		(void) ipsec4_delete_pcbpolicy(inp);
	}
#endif /* IPSEC */

	/* Account for UDP sockets that never exchanged a single packet */
	if (inp->inp_stat != NULL && SOCK_PROTO(so) == IPPROTO_UDP) {
		if (inp->inp_stat->rxpackets == 0 && inp->inp_stat->txpackets == 0) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_no_data);
		}
	}

	/*
	 * Let NetworkStatistics know this PCB is going away
	 * before we detach it.
	 */
	if (nstat_collect &&
	    (SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP)) {
		nstat_pcb_detach(inp);
	}

	/* Free memory buffer held for generating keep alives */
	if (inp->inp_keepalive_data != NULL) {
		kfree_data(inp->inp_keepalive_data, inp->inp_keepalive_datalen);
		inp->inp_keepalive_data = NULL;
	}

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING) {
		panic("%s: so=%p proto=%d couldn't set to STOPUSING",
		    __func__, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

	if (!(so->so_flags & SOF_PCBCLEARING)) {
		struct ip_moptions *imo;

		inp->inp_vflag = 0;
		if (inp->inp_options != NULL) {
			(void) m_free(inp->inp_options);
			inp->inp_options = NULL;
		}
		ROUTE_RELEASE(&inp->inp_route);
		imo = inp->inp_moptions;
		if (imo != NULL) {
			IMO_REMREF(imo);
		}
		inp->inp_moptions = NULL;
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;

		/*
		 * Enqueue an event to send kernel event notification
		 * if the flow has to CLAT46 for data packets
		 */
		if (inp->inp_flags2 & INP2_CLAT46_FLOW) {
			/*
			 * If there has been any exchange of data bytes
			 * over this flow.
			 * Schedule a notification to report that flow is
			 * using client side translation.
			 */
			if (inp->inp_stat != NULL &&
			    (inp->inp_stat->txbytes != 0 ||
			    inp->inp_stat->rxbytes != 0)) {
				if (so->so_flags & SOF_DELEGATED) {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->e_pid,
						so->e_uuid);
				} else {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->last_pid,
						so->last_uuid);
				}
			}
		}

		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;

		/* Hand the PCB to the reaper for eventual disposal */
		inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST);
	}
}
2030 
2031 
/*
 * Final disposal of a dead PCB: unlink it from the global lists, sever the
 * socket<->PCB association, and free both structures.  The PCB must already
 * be in the WNT_STOPUSING state with a zero socket use count, and the caller
 * must hold the pcbinfo lock exclusively (asserted below).
 */
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

	if (so != NULL && so->so_usecount != 0) {
		panic("%s: so %p [%d,%d] usecount %d lockhistory %s",
		    __func__, so, SOCK_DOM(so), SOCK_TYPE(so), so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		if (so != NULL) {
			panic_plain("%s: inp %p invalid wantcnt %d, so %p "
			    "[%d,%d] usecount %d retaincnt %d state 0x%x "
			    "flags 0x%x lockhistory %s\n", __func__, inp,
			    inp->inp_wantcnt, so, SOCK_DOM(so), SOCK_TYPE(so),
			    so->so_usecount, so->so_retaincnt, so->so_state,
			    so->so_flags, solockhistory_nr(so));
			/* NOTREACHED */
		} else {
			panic("%s: inp %p invalid wantcnt %d no socket",
			    __func__, inp, inp->inp_wantcnt);
			/* NOTREACHED */
		}
	}

	LCK_RW_ASSERT(&ipi->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so != NULL) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			if (so->so_rcv.sb_cc > 0 || so->so_snd.sb_cc > 0) {
				/*
				 * selthreadclear() already called
				 * during sofreelastref() above.
				 */
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL) {
				panic("%s: so=%p head still exist",
				    __func__, so);
				/* NOTREACHED */
			}
			lck_mtx_unlock(&inp->inpcb_mtx);

#if NECP
			necp_inpcb_remove_cb(inp);
#endif /* NECP */

			lck_mtx_destroy(&inp->inpcb_mtx, ipi->ipi_lock_grp);
		}
		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;
		so->so_saved_pcb = (caddr_t)inp;
		so->so_pcb = NULL;
		inp->inp_socket = NULL;
#if NECP
		necp_inpcb_dispose(inp);
#endif /* NECP */
		/*
		 * In case there is a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		ROUTE_RELEASE(&inp->inp_route);
		if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
			/* Not cached in the socket layer; return to the zone */
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
}
2109 
2110 /*
2111  * The calling convention of in_getsockaddr() and in_getpeeraddr() was
2112  * modified to match the pru_sockaddr() and pru_peeraddr() entry points
2113  * in struct pr_usrreqs, so that protocols can just reference then directly
2114  * without the need for a wrapper function.
2115  */
2116 int
in_getsockaddr(struct socket * so,struct sockaddr ** nam)2117 in_getsockaddr(struct socket *so, struct sockaddr **nam)
2118 {
2119 	struct inpcb *inp;
2120 	struct sockaddr_in *sin;
2121 
2122 	/*
2123 	 * Do the malloc first in case it blocks.
2124 	 */
2125 	sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2126 	    Z_WAITOK | Z_NOFAIL);
2127 
2128 	sin->sin_family = AF_INET;
2129 
2130 	if ((inp = sotoinpcb(so)) == NULL) {
2131 		free_sockaddr(sin);
2132 		return EINVAL;
2133 	}
2134 	sin->sin_port = inp->inp_lport;
2135 	sin->sin_addr = inp->inp_laddr;
2136 
2137 	*nam = (struct sockaddr *)sin;
2138 	return 0;
2139 }
2140 
2141 int
in_getsockaddr_s(struct socket * so,struct sockaddr_in * ss)2142 in_getsockaddr_s(struct socket *so, struct sockaddr_in *ss)
2143 {
2144 	struct sockaddr_in *sin = ss;
2145 	struct inpcb *inp;
2146 
2147 	VERIFY(ss != NULL);
2148 	bzero(ss, sizeof(*ss));
2149 
2150 	sin->sin_family = AF_INET;
2151 	sin->sin_len = sizeof(*sin);
2152 
2153 	if ((inp = sotoinpcb(so)) == NULL) {
2154 		return EINVAL;
2155 	}
2156 
2157 	sin->sin_port = inp->inp_lport;
2158 	sin->sin_addr = inp->inp_laddr;
2159 	return 0;
2160 }
2161 
2162 int
in_getpeeraddr(struct socket * so,struct sockaddr ** nam)2163 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
2164 {
2165 	struct inpcb *inp;
2166 	struct sockaddr_in *sin;
2167 
2168 	/*
2169 	 * Do the malloc first in case it blocks.
2170 	 */
2171 	sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2172 	    Z_WAITOK | Z_NOFAIL);
2173 
2174 	sin->sin_family = AF_INET;
2175 
2176 	if ((inp = sotoinpcb(so)) == NULL) {
2177 		free_sockaddr(sin);
2178 		return EINVAL;
2179 	}
2180 	sin->sin_port = inp->inp_fport;
2181 	sin->sin_addr = inp->inp_faddr;
2182 
2183 	*nam = (struct sockaddr *)sin;
2184 	return 0;
2185 }
2186 
/*
 * Invoke the given notify callback on every IPv4 PCB in the list whose
 * foreign address matches faddr, passing along errno.  Used to fan out
 * ICMP-derived errors (e.g. host unreachable) to all affected sockets.
 * Takes the pcbinfo lock shared; each matching PCB is pinned with
 * WNT_ACQUIRE and its socket locked around the callback.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL) {
			continue;
		}
		/* Skip PCBs that are already being torn down */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			continue;
		}
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		(void) in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
}
2213 
2214 /*
2215  * Check for alternatives when higher level complains
2216  * about service problems.  For now, invalidate cached
2217  * routing information.  If the route was created dynamically
2218  * (by a redirect), time to try a default gateway again.
2219  */
/*
 * Called when the transport suspects the cached route has gone bad.
 * If the route was installed dynamically (by a redirect), delete it from
 * the routing table so the default gateway is retried.  The PCB's cached
 * route is dropped only when the local address is still configured on an
 * interface; otherwise the old route is kept in the PCB.
 */
void
in_losing(struct inpcb *inp)
{
	boolean_t release = FALSE;
	struct rtentry *rt;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia = NULL;

		RT_LOCK(rt);
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			(void) rtrequest(RTM_DELETE, rt_key(rt),
			    rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if (inp->inp_laddr.s_addr != INADDR_ANY &&
		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			/*
			 * Address is around; ditch the route.  A new route
			 * can be allocated the next time output is attempted.
			 */
			release = TRUE;
		}
		/* ifa_foraddr() returned a reference; drop it */
		if (ia != NULL) {
			IFA_REMREF(&ia->ia_ifa);
		}
	}
	if (rt == NULL || release) {
		ROUTE_RELEASE(&inp->inp_route);
	}
}
2260 
2261 /*
2262  * After a routing change, flush old routing
2263  * and allocate a (hopefully) better one.
2264  */
2265 void
in_rtchange(struct inpcb * inp,int errno)2266 in_rtchange(struct inpcb *inp, int errno)
2267 {
2268 #pragma unused(errno)
2269 	boolean_t release = FALSE;
2270 	struct rtentry *rt;
2271 
2272 	if ((rt = inp->inp_route.ro_rt) != NULL) {
2273 		struct in_ifaddr *ia = NULL;
2274 
2275 		/* if address is gone, keep the old route */
2276 		if (inp->inp_laddr.s_addr != INADDR_ANY &&
2277 		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
2278 			/*
2279 			 * Address is around; ditch the route.  A new route
2280 			 * can be allocated the next time output is attempted.
2281 			 */
2282 			release = TRUE;
2283 		}
2284 		if (ia != NULL) {
2285 			IFA_REMREF(&ia->ia_ifa);
2286 		}
2287 	}
2288 	if (rt == NULL || release) {
2289 		ROUTE_RELEASE(&inp->inp_route);
2290 	}
2291 }
2292 
2293 /*
2294  * Lookup a PCB based on the local address and port.
2295  */
2296 struct inpcb *
in_pcblookup_local(struct inpcbinfo * pcbinfo,struct in_addr laddr,unsigned int lport_arg,int wild_okay)2297 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2298     unsigned int lport_arg, int wild_okay)
2299 {
2300 	struct inpcb *inp;
2301 	int matchwild = 3, wildcard;
2302 	u_short lport = (u_short)lport_arg;
2303 
2304 	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0, 0, 0, 0, 0);
2305 
2306 	if (!wild_okay) {
2307 		struct inpcbhead *head;
2308 		/*
2309 		 * Look for an unconnected (wildcard foreign addr) PCB that
2310 		 * matches the local address and port we're looking for.
2311 		 */
2312 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
2313 		    pcbinfo->ipi_hashmask)];
2314 		LIST_FOREACH(inp, head, inp_hash) {
2315 			if (!(inp->inp_vflag & INP_IPV4)) {
2316 				continue;
2317 			}
2318 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
2319 			    inp->inp_laddr.s_addr == laddr.s_addr &&
2320 			    inp->inp_lport == lport) {
2321 				/*
2322 				 * Found.
2323 				 */
2324 				return inp;
2325 			}
2326 		}
2327 		/*
2328 		 * Not found.
2329 		 */
2330 		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0, 0, 0, 0, 0);
2331 		return NULL;
2332 	} else {
2333 		struct inpcbporthead *porthash;
2334 		struct inpcbport *phd;
2335 		struct inpcb *match = NULL;
2336 		/*
2337 		 * Best fit PCB lookup.
2338 		 *
2339 		 * First see if this local port is in use by looking on the
2340 		 * port hash list.
2341 		 */
2342 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2343 		    pcbinfo->ipi_porthashmask)];
2344 		LIST_FOREACH(phd, porthash, phd_hash) {
2345 			if (phd->phd_port == lport) {
2346 				break;
2347 			}
2348 		}
2349 		if (phd != NULL) {
2350 			/*
2351 			 * Port is in use by one or more PCBs. Look for best
2352 			 * fit.
2353 			 */
2354 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2355 				wildcard = 0;
2356 				if (!(inp->inp_vflag & INP_IPV4)) {
2357 					continue;
2358 				}
2359 				if (inp->inp_faddr.s_addr != INADDR_ANY) {
2360 					wildcard++;
2361 				}
2362 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
2363 					if (laddr.s_addr == INADDR_ANY) {
2364 						wildcard++;
2365 					} else if (inp->inp_laddr.s_addr !=
2366 					    laddr.s_addr) {
2367 						continue;
2368 					}
2369 				} else {
2370 					if (laddr.s_addr != INADDR_ANY) {
2371 						wildcard++;
2372 					}
2373 				}
2374 				if (wildcard < matchwild) {
2375 					match = inp;
2376 					matchwild = wildcard;
2377 					if (matchwild == 0) {
2378 						break;
2379 					}
2380 				}
2381 			}
2382 		}
2383 		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,
2384 		    0, 0, 0, 0);
2385 		return match;
2386 	}
2387 }
2388 
2389 /*
2390  * Check if PCB exists in hash list.
2391  */
/*
 * Check whether a PCB matching the given 4-tuple exists in the hash lists,
 * without taking a reference on it.  On success returns nonzero and fills
 * in *uid / *gid with the owning socket's credentials; otherwise returns 0
 * with *uid/*gid set to UID_MAX/GID_MAX.  An exact match is tried first;
 * if wildcard is set, a local-address wildcard match is also attempted,
 * preferring a non-IPv6-mapped listener over a mapped one.
 */
int
in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
    uid_t *uid, gid_t *gid, struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
	int found = 0;
	struct inpcb *local_wild = NULL;
	struct inpcb *local_wild_mapped = NULL;

	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Skip PCBs restricted from receiving on this interface */
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
					inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					inp->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
	}

	if (!wildcard) {
		/*
		 * Not found.
		 */
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}

	/* Second pass: wildcard foreign address (unconnected listeners) */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == lport) {
			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Exact local address match wins immediately */
				if ((found = (inp->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
						inp->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
						inp->inp_socket->so_cred);
				}
				lck_rw_done(&pcbinfo->ipi_lock);
				return found;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/*
				 * Remember wildcard candidates; PF_INET6
				 * sockets (v4-mapped) are kept separately
				 * and used only as a fallback.
				 */
				if (inp->inp_socket &&
				    SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
					local_wild_mapped = inp;
				} else {
					local_wild = inp;
				}
			}
		}
	}
	if (local_wild == NULL) {
		if (local_wild_mapped != NULL) {
			if ((found = (local_wild_mapped->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
					local_wild_mapped->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					local_wild_mapped->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}
	if ((found = (local_wild->inp_socket != NULL))) {
		*uid = kauth_cred_getuid(
			local_wild->inp_socket->so_cred);
		*gid = kauth_cred_getgid(
			local_wild->inp_socket->so_cred);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
	return found;
}
2518 
2519 /*
2520  * Lookup PCB in hash list.
2521  */
2522 struct inpcb *
in_pcblookup_hash(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport_arg,struct in_addr laddr,u_int lport_arg,int wildcard,struct ifnet * ifp)2523 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2524     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
2525     struct ifnet *ifp)
2526 {
2527 	struct inpcbhead *head;
2528 	struct inpcb *inp;
2529 	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
2530 	struct inpcb *local_wild = NULL;
2531 	struct inpcb *local_wild_mapped = NULL;
2532 
2533 	/*
2534 	 * We may have found the pcb in the last lookup - check this first.
2535 	 */
2536 
2537 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
2538 
2539 	/*
2540 	 * First look for an exact match.
2541 	 */
2542 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2543 	    pcbinfo->ipi_hashmask)];
2544 	LIST_FOREACH(inp, head, inp_hash) {
2545 		if (!(inp->inp_vflag & INP_IPV4)) {
2546 			continue;
2547 		}
2548 		if (inp_restricted_recv(inp, ifp)) {
2549 			continue;
2550 		}
2551 
2552 #if NECP
2553 		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
2554 			continue;
2555 		}
2556 #endif /* NECP */
2557 
2558 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
2559 		    inp->inp_laddr.s_addr == laddr.s_addr &&
2560 		    inp->inp_fport == fport &&
2561 		    inp->inp_lport == lport) {
2562 			/*
2563 			 * Found.
2564 			 */
2565 			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
2566 			    WNT_STOPUSING) {
2567 				lck_rw_done(&pcbinfo->ipi_lock);
2568 				return inp;
2569 			} else {
2570 				/* it's there but dead, say it isn't found */
2571 				lck_rw_done(&pcbinfo->ipi_lock);
2572 				return NULL;
2573 			}
2574 		}
2575 	}
2576 
2577 	if (!wildcard) {
2578 		/*
2579 		 * Not found.
2580 		 */
2581 		lck_rw_done(&pcbinfo->ipi_lock);
2582 		return NULL;
2583 	}
2584 
2585 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
2586 	    pcbinfo->ipi_hashmask)];
2587 	LIST_FOREACH(inp, head, inp_hash) {
2588 		if (!(inp->inp_vflag & INP_IPV4)) {
2589 			continue;
2590 		}
2591 		if (inp_restricted_recv(inp, ifp)) {
2592 			continue;
2593 		}
2594 
2595 #if NECP
2596 		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
2597 			continue;
2598 		}
2599 #endif /* NECP */
2600 
2601 		if (inp->inp_faddr.s_addr == INADDR_ANY &&
2602 		    inp->inp_lport == lport) {
2603 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
2604 				if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
2605 				    WNT_STOPUSING) {
2606 					lck_rw_done(&pcbinfo->ipi_lock);
2607 					return inp;
2608 				} else {
2609 					/* it's dead; say it isn't found */
2610 					lck_rw_done(&pcbinfo->ipi_lock);
2611 					return NULL;
2612 				}
2613 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2614 				if (SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
2615 					local_wild_mapped = inp;
2616 				} else {
2617 					local_wild = inp;
2618 				}
2619 			}
2620 		}
2621 	}
2622 	if (local_wild == NULL) {
2623 		if (local_wild_mapped != NULL) {
2624 			if (in_pcb_checkstate(local_wild_mapped,
2625 			    WNT_ACQUIRE, 0) != WNT_STOPUSING) {
2626 				lck_rw_done(&pcbinfo->ipi_lock);
2627 				return local_wild_mapped;
2628 			} else {
2629 				/* it's dead; say it isn't found */
2630 				lck_rw_done(&pcbinfo->ipi_lock);
2631 				return NULL;
2632 			}
2633 		}
2634 		lck_rw_done(&pcbinfo->ipi_lock);
2635 		return NULL;
2636 	}
2637 	if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
2638 		lck_rw_done(&pcbinfo->ipi_lock);
2639 		return local_wild;
2640 	}
2641 	/*
2642 	 * It's either not found or is already dead.
2643 	 */
2644 	lck_rw_done(&pcbinfo->ipi_lock);
2645 	return NULL;
2646 }
2647 
2648 /*
2649  * @brief	Insert PCB onto various hash lists.
2650  *
2651  * @param	inp Pointer to internet protocol control block
2652  * @param	locked	Implies if ipi_lock (protecting pcb list)
2653  *              is already locked or not.
2654  *
2655  * @return	int error on failure and 0 on success
2656  */
/*
 * @brief	Insert PCB onto various hash lists.
 *
 * @param	inp Pointer to internet protocol control block
 * @param	locked	Implies if ipi_lock (protecting pcb list)
 *              is already locked or not.
 *
 * @return	int error on failure and 0 on success
 */
int
in_pcbinshash(struct inpcb *inp, int locked)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;

	if (!locked) {
		if (!lck_rw_try_lock_exclusive(&pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets
			 */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
			socket_lock(inp->inp_socket, 0);
		}
	}

	/*
	 * This routine or its caller may have given up
	 * socket's protocol lock briefly.
	 * During that time the socket may have been dropped.
	 * Safe-guarding against that.
	 */
	if (inp->inp_state == INPCB_STATE_DEAD) {
		if (!locked) {
			lck_rw_done(&pcbinfo->ipi_lock);
		}
		return ECONNABORTED;
	}


	/* For v6 PCBs, key the hash on the low 32 bits of the address */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, pcbinfo->ipi_hashmask);

	pcbhash = &pcbinfo->ipi_hashbase[inp->inp_hash_element];

	pcbporthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(inp->inp_lport,
	    pcbinfo->ipi_porthashmask)];

	/*
	 * Go through port list and look for a head for this lport.
	 */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport) {
			break;
		}
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = kalloc_type(struct inpcbport, Z_WAITOK | Z_NOFAIL);
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}

	/* Must not already be on a hash list */
	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

#if SKYWALK
	/*
	 * Reserve the <proto, laddr, lport> tuple in the port namespace
	 * registrar so Skywalk channels cannot claim the same port.
	 */
	int err;
	struct socket *so = inp->inp_socket;
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		if (inp->inp_vflag & INP_IPV6) {
			err = netns_reserve_in6(&inp->inp_netns_token,
			    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		} else {
			err = netns_reserve_in(&inp->inp_netns_token,
			    inp->inp_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		}
		if (err) {
			if (!locked) {
				lck_rw_done(&pcbinfo->ipi_lock);
			}
			return err;
		}
		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */

	/* Link onto the port list, the 4-tuple hash, and mark as hashed */
	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

	if (!locked) {
		lck_rw_done(&pcbinfo->ipi_lock);
	}

#if NECP
	// This call catches the original setting of the local address
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */

	return 0;
}
2768 
2769 /*
2770  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2771  * changed. NOTE: This does not handle the case of the lport changing (the
2772  * hashed port list would have to be updated as well), so the lport must
2773  * not change after in_pcbinshash() has been called.
2774  */
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

#if SKYWALK
	/*
	 * Keep the port namespace registrar in sync with the (possibly
	 * changed) local address: update an existing reservation in place,
	 * or take a fresh one if no valid token is held yet.
	 */
	struct socket *so = inp->inp_socket;
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		int err;
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_change_addr_in6(
					&inp->inp_netns_token, inp->in6p_laddr);
			} else {
				err = netns_change_addr_in(
					&inp->inp_netns_token, inp->inp_laddr);
			}
		} else {
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_reserve_in6(&inp->inp_netns_token,
				    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			} else {
				err = netns_reserve_in(&inp->inp_netns_token,
				    inp->inp_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			}
		}
		/* We are assuming that whatever code paths result in a rehash
		 * did their due diligence and ensured that the given
		 * <proto, laddr, lport> tuple was free ahead of time. Just
		 * reserving the lport on INADDR_ANY should be enough, since
		 * that will block Skywalk from trying to reserve that same
		 * port. Given this assumption, the above netns calls should
		 * never fail*/
		VERIFY(err == 0);

		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */
	/* For v6 PCBs, key the hash on the low 32 bits of the address */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	/* Recompute the bucket from the current 4-tuple */
	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, inp->inp_pcbinfo->ipi_hashmask);
	head = &inp->inp_pcbinfo->ipi_hashbase[inp->inp_hash_element];

	/* Unlink from the old bucket, if currently hashed */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		LIST_REMOVE(inp, inp_hash);
		inp->inp_flags2 &= ~INP2_INHASHLIST;
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));
	LIST_INSERT_HEAD(head, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

#if NECP
	// This call catches updates to the remote addresses
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */
}
2842 
/*
 * Remove PCB from various lists.
 * Must be called with the pcbinfo lock held in exclusive mode.
 */
/*
 * Unlink the PCB from the 4-tuple hash, the port list, the time-wait or
 * global PCB list, and the flow-control tree, updating the generation
 * count and PCB count.  Caller holds the pcbinfo lock exclusively.
 */
void
in_pcbremlists(struct inpcb *inp)
{
	/* Bump the generation count so stale list snapshots can be detected */
	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;

	/*
	 * Check if it's in hashlist -- an inp is placed in hashlist when
	 * it's local port gets assigned. So it should also be present
	 * in the port list.
	 */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		VERIFY(phd != NULL && inp->inp_lport > 0);

		LIST_REMOVE(inp, inp_hash);
		inp->inp_hash.le_next = NULL;
		inp->inp_hash.le_prev = NULL;

		LIST_REMOVE(inp, inp_portlist);
		inp->inp_portlist.le_next = NULL;
		inp->inp_portlist.le_prev = NULL;
		/* Free the per-port head once the last PCB leaves it */
		if (LIST_EMPTY(&phd->phd_pcblist)) {
			LIST_REMOVE(phd, phd_hash);
			kfree_type(struct inpcbport, phd);
		}
		inp->inp_phd = NULL;
		inp->inp_flags2 &= ~INP2_INHASHLIST;
#if SKYWALK
		/* Free up the port in the namespace registrar */
		netns_release(&inp->inp_netns_token);
		netns_release(&inp->inp_wildcard_netns_token);
#endif /* SKYWALK */
	}
	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

	if (inp->inp_flags2 & INP2_TIMEWAIT) {
		/* Remove from time-wait queue */
		tcp_remove_from_time_wait(inp);
		inp->inp_flags2 &= ~INP2_TIMEWAIT;
		VERIFY(inp->inp_pcbinfo->ipi_twcount != 0);
		inp->inp_pcbinfo->ipi_twcount--;
	} else {
		/* Remove from global inp list if it is not time-wait */
		LIST_REMOVE(inp, inp_list);
	}

	/* Drop the flow-control tree entry, if any */
	if (inp->inp_flags2 & INP2_IN_FCTREE) {
		inp_fc_getinp(inp->inp_flowhash, (INPFC_SOLOCKED | INPFC_REMOVE));
		VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE));
	}

	inp->inp_pcbinfo->ipi_count--;
}
2901 
/*
 * Mechanism used to defer the memory release of PCBs.
 * The pcb list will contain the pcb until the reaper can clean it up if
 * the following conditions are met:
 *	1) state "DEAD",
 *	2) wantcnt is STOPUSING
 *	3) usecount is 0
 * This function is called to mark the pcb as unusable (WNT_STOPUSING),
 * to acquire a use reference (WNT_ACQUIRE), or to release one
 * (WNT_RELEASE).
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{
	/*
	 * inp_wantcnt is manipulated lock-free via compare-and-swap;
	 * the low 16 bits hold the use count and 0xffff marks the pcb
	 * as STOPUSING (ready for recycling).
	 */
	volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {
	case WNT_STOPUSING:
		/*
		 * Try to mark the pcb as ready for recycling.  CAS with
		 * STOPUSING, if success we're good, if it's in use, will
		 * be marked later
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}
		pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}
		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}

		/* Ask the pcb garbage collector to run soon */
		inpcb_gc_sched(pcb->inp_pcbinfo, INPCB_TIMER_FAST);

		origwant = *wantcnt;
		if ((UInt16) origwant == 0xffff) { /* should stop using */
			return WNT_STOPUSING;
		}
		newwant = 0xffff;
		if ((UInt16) origwant == 0) {
			/* try to mark it as unusable now */
			OSCompareAndSwap(origwant, newwant, wantcnt);
		}
		return WNT_STOPUSING;

	case WNT_ACQUIRE:
		/*
		 * Try to increase reference to pcb.  If WNT_STOPUSING
		 * should bail out.  If socket state DEAD, try to set count
		 * to STOPUSING, return failed otherwise increase cnt.
		 */
		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				return WNT_STOPUSING;
			}
			newwant = origwant + 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
		return WNT_ACQUIRE;

	case WNT_RELEASE:
		/*
		 * Release reference.  If result is null and pcb state
		 * is DEAD, set wanted bit to STOPUSING
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}

		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0x0) {
				panic("%s: pcb=%p release with zero count",
				    __func__, pcb);
				/* NOTREACHED */
			}
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				if (locked == 0) {
					socket_unlock(pcb->inp_socket, 1);
				}
				return WNT_STOPUSING;
			}
			newwant = origwant - 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

		/* pcb went DEAD while we held a reference: convert to STOPUSING */
		if (pcb->inp_state == INPCB_STATE_DEAD) {
			goto stopusing;
		}
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: RELEASE pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}

		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}
		return WNT_RELEASE;

	default:
		panic("%s: so=%p not a valid state =%x", __func__,
		    pcb->inp_socket, mode);
		/* NOTREACHED */
	}

	/* NOTREACHED */
	return mode;
}
3018 
3019 /*
3020  * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
3021  * The inpcb_compat data structure is passed to user space and must
3022  * not change. We intentionally avoid copying pointers.
3023  */
void
inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat)
{
	bzero(inp_compat, sizeof(*inp_compat));
	inp_compat->inp_fport = inp->inp_fport;
	inp_compat->inp_lport = inp->inp_lport;
	/* NAT fields are legacy; always exported as zero */
	inp_compat->nat_owner = 0;
	inp_compat->nat_cookie = 0;
	inp_compat->inp_gencnt = inp->inp_gencnt;
	inp_compat->inp_flags = inp->inp_flags;
	inp_compat->inp_flow = inp->inp_flow;
	inp_compat->inp_vflag = inp->inp_vflag;
	inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
	inp_compat->inp_ip_p = inp->inp_ip_p;
	inp_compat->inp_dependfaddr.inp6_foreign =
	    inp->inp_dependfaddr.inp6_foreign;
	inp_compat->inp_dependladdr.inp6_local =
	    inp->inp_dependladdr.inp6_local;
	inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
	/* hlim and ifindex are deliberately reported as zero to user space */
	inp_compat->inp_depend6.inp6_hlim = 0;
	inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
	inp_compat->inp_depend6.inp6_ifindex = 0;
	inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
}
3048 
3049 #if XNU_TARGET_OS_OSX
/*
 * Copy selected inpcb fields into the 64-bit export structure handed
 * to user space.  Pointers are intentionally not copied.
 *
 * NOTE(review): unlike inpcb_to_compat(), this does not bzero *xinp;
 * the caller appears responsible for clearing the remainder — confirm.
 */
void
inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp)
{
	xinp->inp_fport = inp->inp_fport;
	xinp->inp_lport = inp->inp_lport;
	xinp->inp_gencnt = inp->inp_gencnt;
	xinp->inp_flags = inp->inp_flags;
	xinp->inp_flow = inp->inp_flow;
	xinp->inp_vflag = inp->inp_vflag;
	xinp->inp_ip_ttl = inp->inp_ip_ttl;
	xinp->inp_ip_p = inp->inp_ip_p;
	xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
	xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
	xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
	/* hlim and ifindex are deliberately reported as zero to user space */
	xinp->inp_depend6.inp6_hlim = 0;
	xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
	xinp->inp_depend6.inp6_ifindex = 0;
	xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
}
3069 #endif /* XNU_TARGET_OS_OSX */
3070 
3071 /*
3072  * The following routines implement this scheme:
3073  *
3074  * Callers of ip_output() that intend to cache the route in the inpcb pass
3075  * a local copy of the struct route to ip_output().  Using a local copy of
3076  * the cached route significantly simplifies things as IP no longer has to
3077  * worry about having exclusive access to the passed in struct route, since
3078  * it's defined in the caller's stack; in essence, this allows for a lock-
3079  * less operation when updating the struct route at the IP level and below,
3080  * whenever necessary. The scheme works as follows:
3081  *
3082  * Prior to dropping the socket's lock and calling ip_output(), the caller
3083  * copies the struct route from the inpcb into its stack, and adds a reference
3084  * to the cached route entry, if there was any.  The socket's lock is then
3085  * dropped and ip_output() is called with a pointer to the copy of struct
3086  * route defined on the stack (not to the one in the inpcb.)
3087  *
3088  * Upon returning from ip_output(), the caller then acquires the socket's
3089  * lock and synchronizes the cache; if there is no route cached in the inpcb,
3090  * it copies the local copy of struct route (which may or may not contain any
3091  * route) back into the cache; otherwise, if the inpcb has a route cached in
3092  * it, the one in the local copy will be freed, if there's any.  Trashing the
3093  * cached route in the inpcb can be avoided because ip_output() is single-
3094  * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
3095  * by the socket/transport layer.)
3096  */
/*
 * Copy the PCB's cached route onto the caller's stack (adding a
 * reference to the route entry, if any) prior to calling ip_output()
 * without the socket lock held.  See the scheme description above.
 */
void
inp_route_copyout(struct inpcb *inp, struct route *dst)
{
	struct route *src = &inp->inp_route;

	socket_lock_assert_owned(inp->inp_socket);

	/*
	 * If the route in the PCB is stale or not for IPv4, blow it away;
	 * this is possible in the case of IPv4-mapped address case.
	 */
	if (ROUTE_UNUSABLE(src) || rt_key(src->ro_rt)->sa_family != AF_INET) {
		ROUTE_RELEASE(src);
	}

	route_copyout(dst, src, sizeof(*dst));
}
3114 
/*
 * Synchronize the caller's local route copy back into the PCB's cache
 * after ip_output() returns.  See the scheme description above.
 */
void
inp_route_copyin(struct inpcb *inp, struct route *src)
{
	struct route *dst = &inp->inp_route;

	socket_lock_assert_owned(inp->inp_socket);

	/* Minor sanity check */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
		panic("%s: wrong or corrupted route: %p", __func__, src);
	}

	route_copyin(src, dst, sizeof(*src));
}
3129 
3130 /*
3131  * Handler for setting IP_BOUND_IF/IPV6_BOUND_IF socket option.
3132  */
int
inp_bindif(struct inpcb *inp, unsigned int ifscope, struct ifnet **pifp)
{
	struct ifnet *ifp = NULL;

	/*
	 * Validate the scope under the ifnet head lock: it must be either
	 * IFSCOPE_NONE or the index of a currently-attached interface.
	 */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
	    (ifp = ifindex2ifnet[ifscope]) == NULL)) {
		ifnet_head_done();
		return ENXIO;
	}
	ifnet_head_done();

	VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);

	/*
	 * A zero interface scope value indicates an "unbind".
	 * Otherwise, take in whatever value the app desires;
	 * the app may already know the scope (or force itself
	 * to such a scope) ahead of time before the interface
	 * gets attached.  It doesn't matter either way; any
	 * route lookup from this point on will require an
	 * exact match for the embedded interface scope.
	 */
	inp->inp_boundifp = ifp;
	if (inp->inp_boundifp == NULL) {
		inp->inp_flags &= ~INP_BOUND_IF;
	} else {
		inp->inp_flags |= INP_BOUND_IF;
	}

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);

	/* Report back the interface (possibly NULL) the PCB is now bound to */
	if (pifp != NULL) {
		*pifp = ifp;
	}

	return 0;
}
3173 
3174 /*
3175  * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3176  * as well as for setting PROC_UUID_NO_CELLULAR policy.
3177  */
void
inp_set_nocellular(struct inpcb *inp)
{
	inp->inp_flags |= INP_NO_IFT_CELLULAR;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3186 
3187 /*
3188  * Handler for clearing IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3189  * as well as for clearing PROC_UUID_NO_CELLULAR policy.
3190  */
void
inp_clear_nocellular(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/*
	 * SO_RESTRICT_DENY_CELLULAR socket restriction issued on the socket
	 * has a higher precedence than INP_NO_IFT_CELLULAR.  Clear the flag
	 * if and only if the socket is unrestricted.
	 */
	if (so != NULL && !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
		inp->inp_flags &= ~INP_NO_IFT_CELLULAR;

		/* Blow away any cached route in the PCB */
		ROUTE_RELEASE(&inp->inp_route);
	}
}
3208 
/*
 * Mark this PCB as prohibited from using expensive interfaces
 * (see INP_NO_EXPENSIVE check in the input restriction path).
 */
void
inp_set_noexpensive(struct inpcb *inp)
{
	inp->inp_flags2 |= INP2_NO_IFF_EXPENSIVE;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3217 
/*
 * Mark this PCB as prohibited from using constrained interfaces
 * (see INP_NO_CONSTRAINED check in the input restriction path).
 */
void
inp_set_noconstrained(struct inpcb *inp)
{
	inp->inp_flags2 |= INP2_NO_IFF_CONSTRAINED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3226 
/*
 * Allow this PCB to use AWDL-restricted interfaces
 * (see INP_AWDL_UNRESTRICTED check in the input restriction path).
 */
void
inp_set_awdl_unrestricted(struct inpcb *inp)
{
	inp->inp_flags2 |= INP2_AWDL_UNRESTRICTED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3235 
3236 boolean_t
inp_get_awdl_unrestricted(struct inpcb * inp)3237 inp_get_awdl_unrestricted(struct inpcb *inp)
3238 {
3239 	return (inp->inp_flags2 & INP2_AWDL_UNRESTRICTED) ? TRUE : FALSE;
3240 }
3241 
/*
 * Revoke this PCB's permission to use AWDL-restricted interfaces.
 */
void
inp_clear_awdl_unrestricted(struct inpcb *inp)
{
	inp->inp_flags2 &= ~INP2_AWDL_UNRESTRICTED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3250 
/*
 * Allow this PCB to use co-processor interfaces
 * (see INP_INTCOPROC_ALLOWED check in the input restriction path).
 */
void
inp_set_intcoproc_allowed(struct inpcb *inp)
{
	inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3259 
3260 boolean_t
inp_get_intcoproc_allowed(struct inpcb * inp)3261 inp_get_intcoproc_allowed(struct inpcb *inp)
3262 {
3263 	return (inp->inp_flags2 & INP2_INTCOPROC_ALLOWED) ? TRUE : FALSE;
3264 }
3265 
/*
 * Revoke this PCB's permission to use co-processor interfaces.
 */
void
inp_clear_intcoproc_allowed(struct inpcb *inp)
{
	inp->inp_flags2 &= ~INP2_INTCOPROC_ALLOWED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3274 
/*
 * Allow this PCB to use management interfaces; also records that the
 * management entitlement has been checked for this PCB.
 */
void
inp_set_management_allowed(struct inpcb *inp)
{
	inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
	inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3284 
3285 boolean_t
inp_get_management_allowed(struct inpcb * inp)3286 inp_get_management_allowed(struct inpcb *inp)
3287 {
3288 	return (inp->inp_flags2 & INP2_MANAGEMENT_ALLOWED) ? TRUE : FALSE;
3289 }
3290 
/*
 * Revoke this PCB's permission to use management interfaces
 * (INP2_MANAGEMENT_CHECKED is intentionally left set).
 */
void
inp_clear_management_allowed(struct inpcb *inp)
{
	inp->inp_flags2 &= ~INP2_MANAGEMENT_ALLOWED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3299 
3300 #if NECP
3301 /*
3302  * Called when PROC_UUID_NECP_APP_POLICY is set.
3303  */
void
inp_set_want_app_policy(struct inpcb *inp)
{
	/* Flag checked by NECP when evaluating per-app policies */
	inp->inp_flags2 |= INP2_WANT_APP_POLICY;
}
3309 
3310 /*
3311  * Called when PROC_UUID_NECP_APP_POLICY is cleared.
3312  */
void
inp_clear_want_app_policy(struct inpcb *inp)
{
	/* Flag checked by NECP when evaluating per-app policies */
	inp->inp_flags2 &= ~INP2_WANT_APP_POLICY;
}
3318 #endif /* NECP */
3319 
3320 /*
3321  * Calculate flow hash for an inp, used by an interface to identify a
3322  * flow. When an interface provides flow control advisory, this flow
3323  * hash is used as an identifier.
3324  */
u_int32_t
inp_calc_flowhash(struct inpcb *inp)
{
#if SKYWALK

	uint32_t flowid;
	struct flowidns_flow_key fk;

	bzero(&fk, sizeof(fk));

	/* Build the flow key from the PCB's address/port/protocol tuple */
	if (inp->inp_vflag & INP_IPV4) {
		fk.ffk_af = AF_INET;
		fk.ffk_laddr_v4 = inp->inp_laddr;
		fk.ffk_raddr_v4 = inp->inp_faddr;
	} else {
		fk.ffk_af = AF_INET6;
		fk.ffk_laddr_v6 = inp->in6p_laddr;
		fk.ffk_raddr_v6 = inp->in6p_faddr;
		/* clear embedded scope ID */
		if (IN6_IS_SCOPE_EMBED(&fk.ffk_laddr_v6)) {
			fk.ffk_laddr_v6.s6_addr16[1] = 0;
		}
		if (IN6_IS_SCOPE_EMBED(&fk.ffk_raddr_v6)) {
			fk.ffk_raddr_v6.s6_addr16[1] = 0;
		}
	}

	fk.ffk_lport = inp->inp_lport;
	fk.ffk_rport = inp->inp_fport;
	/* If inp_ip_p is unset, fall back to the socket's protocol */
	fk.ffk_proto = (inp->inp_ip_p != 0) ? inp->inp_ip_p :
	    (uint8_t)SOCK_PROTO(inp->inp_socket);
	flowidns_allocate_flowid(FLOWIDNS_DOMAIN_INPCB, &fk, &flowid);
	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	ASSERT(inp->inp_flowhash == 0);
	ASSERT((inp->inp_flags2 & INP2_IN_FCTREE) == 0);
	inp->inp_flowhash = flowid;
	VERIFY(RB_INSERT(inp_fc_tree, &inp_fc_tree, inp) == NULL);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowid;

#else /* !SKYWALK */

	struct inp_flowhash_key fh __attribute__((aligned(8)));
	u_int32_t flowhash = 0;
	struct inpcb *tmp_inp = NULL;

	/* Lazily initialize the global hash seed */
	if (inp_hash_seed == 0) {
		inp_hash_seed = RandomULong();
	}

	bzero(&fh, sizeof(fh));

	bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof(fh.infh_laddr));
	bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof(fh.infh_faddr));

	fh.infh_lport = inp->inp_lport;
	fh.infh_fport = inp->inp_fport;
	fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
	fh.infh_proto = inp->inp_ip_p;
	fh.infh_rand1 = RandomULong();
	fh.infh_rand2 = RandomULong();

try_again:
	flowhash = net_flowhash(&fh, sizeof(fh), inp_hash_seed);
	if (flowhash == 0) {
		/* try to get a non-zero flowhash; zero means "no flowhash" */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	inp->inp_flowhash = flowhash;

	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	tmp_inp = RB_FIND(inp_fc_tree, &inp_fc_tree, inp);
	if (tmp_inp != NULL) {
		/*
		 * There is a different inp with the same flowhash.
		 * There can be a collision on flow hash but the
		 * probability is low.  Let's recompute the
		 * flowhash.
		 */
		lck_mtx_unlock(&inp_fc_lck);
		/* recompute hash seed */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	RB_INSERT(inp_fc_tree, &inp_fc_tree, inp);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowhash;

#endif /* !SKYWALK */
}
3424 
3425 void
inp_flowadv(uint32_t flowhash)3426 inp_flowadv(uint32_t flowhash)
3427 {
3428 	struct inpcb *inp;
3429 
3430 	inp = inp_fc_getinp(flowhash, 0);
3431 
3432 	if (inp == NULL) {
3433 		return;
3434 	}
3435 	inp_fc_feedback(inp);
3436 }
3437 
3438 /*
3439  * Function to compare inp_fc_entries in inp flow control tree
3440  */
static inline int
infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2)
{
	/*
	 * Byte-wise comparison of the two flow hashes; the resulting
	 * order is byte-order dependent but consistent, which is all
	 * the red-black tree requires.
	 */
	return memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash),
	           sizeof(inp1->inp_flowhash));
}
3447 
/*
 * Look up the PCB registered in inp_fc_tree under the given flow hash.
 *
 * With INPFC_REMOVE: detach the PCB from the tree (releasing its flow
 * ID when Skywalk is in use) and return NULL.
 * Otherwise: take a use count on the PCB (WNT_ACQUIRE) and return it,
 * or NULL if the PCB is being torn down.  INPFC_SOLOCKED indicates the
 * caller already holds the socket lock.
 */
static struct inpcb *
inp_fc_getinp(u_int32_t flowhash, u_int32_t flags)
{
	struct inpcb *inp = NULL;
	int locked = (flags & INPFC_SOLOCKED) ? 1 : 0;

	lck_mtx_lock_spin(&inp_fc_lck);
	/* key_inp serves only as the RB_FIND search key, under inp_fc_lck */
	key_inp.inp_flowhash = flowhash;
	inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp);
	if (inp == NULL) {
		/* inp is not present, return */
		lck_mtx_unlock(&inp_fc_lck);
		return NULL;
	}

	if (flags & INPFC_REMOVE) {
		ASSERT((inp->inp_flags2 & INP2_IN_FCTREE) != 0);
		/* convert the spin-mode hold to a full mutex hold first */
		lck_mtx_convert_spin(&inp_fc_lck);
		RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp);
		bzero(&(inp->infc_link), sizeof(inp->infc_link));
#if SKYWALK
		VERIFY(inp->inp_flowhash != 0);
		flowidns_release_flowid(inp->inp_flowhash);
		inp->inp_flowhash = 0;
#endif /* !SKYWALK */
		inp->inp_flags2 &= ~INP2_IN_FCTREE;
		lck_mtx_unlock(&inp_fc_lck);
		return NULL;
	}

	/* Bail if the PCB is on its way out; caller gets no reference */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) {
		inp = NULL;
	}
	lck_mtx_unlock(&inp_fc_lck);

	return inp;
}
3485 
/*
 * Process interface feedback for a flow-controlled PCB: clear the
 * flow-control state and wake up blocked writers.  Consumes the use
 * count taken by inp_fc_getinp().
 */
static void
inp_fc_feedback(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* we already hold a want_cnt on this inp, socket can't be null */
	VERIFY(so != NULL);
	socket_lock(so, 1);

	/* Drop the use count; bail if the PCB is going away */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		return;
	}

	/*
	 * A send is in progress; set INP_FC_FEEDBACK so that a racing
	 * flow advisory is ignored by inp_set_fc_state().
	 */
	if (inp->inp_sndinprog_cnt > 0) {
		inp->inp_flags |= INP_FC_FEEDBACK;
	}

	/*
	 * Return if the connection is not in flow-controlled state.
	 * This can happen if the connection experienced
	 * loss while it was in flow controlled state
	 */
	if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
		socket_unlock(so, 1);
		return;
	}
	inp_reset_fc_state(inp);

	/* TCP additionally undoes its throttling */
	if (SOCK_TYPE(so) == SOCK_STREAM) {
		inp_fc_unthrottle_tcp(inp);
	}

	socket_unlock(so, 1);
}
3521 
3522 void
inp_reset_fc_state(struct inpcb * inp)3523 inp_reset_fc_state(struct inpcb *inp)
3524 {
3525 	struct socket *so = inp->inp_socket;
3526 	int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
3527 	int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
3528 
3529 	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
3530 
3531 	if (suspended) {
3532 		so->so_flags &= ~(SOF_SUSPENDED);
3533 		soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
3534 	}
3535 
3536 	/* Give a write wakeup to unblock the socket */
3537 	if (needwakeup) {
3538 		sowwakeup(so);
3539 	}
3540 }
3541 
/*
 * Apply a flow advisory (flow-controlled or suspended) to a PCB.
 * Returns 1 if the advisory was applied, 0 if it was ignored.
 */
int
inp_set_fc_state(struct inpcb *inp, int advcode)
{
	boolean_t is_flow_controlled = INP_WAIT_FOR_IF_FEEDBACK(inp);
	struct inpcb *tmp_inp = NULL;
	/*
	 * If there was a feedback from the interface when
	 * send operation was in progress, we should ignore
	 * this flow advisory to avoid a race between setting
	 * flow controlled state and receiving feedback from
	 * the interface
	 */
	if (inp->inp_flags & INP_FC_FEEDBACK) {
		return 0;
	}

	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
	/*
	 * Confirm the PCB is still registered in the flow-control tree;
	 * inp_fc_getinp() takes a use count that is released just below.
	 */
	if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash,
	    INPFC_SOLOCKED)) != NULL) {
		if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			return 0;
		}
		VERIFY(tmp_inp == inp);
		switch (advcode) {
		case FADV_FLOW_CONTROLLED:
			inp->inp_flags |= INP_FLOW_CONTROLLED;
			inp->inp_fadv_flow_ctrl_cnt++;
			break;
		case FADV_SUSPENDED:
			inp->inp_flags |= INP_FLOW_SUSPENDED;
			inp->inp_fadv_suspended_cnt++;
			soevent(inp->inp_socket,
			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));

			/* Record the fact that suspend event was sent */
			inp->inp_socket->so_flags |= SOF_SUSPENDED;
			break;
		}

		/* Throttle TCP only on the transition into flow control */
		if (!is_flow_controlled && SOCK_TYPE(inp->inp_socket) == SOCK_STREAM) {
			inp_fc_throttle_tcp(inp);
		}
		return 1;
	}
	return 0;
}
3588 
3589 /*
3590  * Handler for SO_FLUSH socket option.
3591  */
3592 int
inp_flush(struct inpcb * inp,int optval)3593 inp_flush(struct inpcb *inp, int optval)
3594 {
3595 	u_int32_t flowhash = inp->inp_flowhash;
3596 	struct ifnet *rtifp, *oifp;
3597 
3598 	/* Either all classes or one of the valid ones */
3599 	if (optval != SO_TC_ALL && !SO_VALID_TC(optval)) {
3600 		return EINVAL;
3601 	}
3602 
3603 	/* We need a flow hash for identification */
3604 	if (flowhash == 0) {
3605 		return 0;
3606 	}
3607 
3608 	/* Grab the interfaces from the route and pcb */
3609 	rtifp = ((inp->inp_route.ro_rt != NULL) ?
3610 	    inp->inp_route.ro_rt->rt_ifp : NULL);
3611 	oifp = inp->inp_last_outifp;
3612 
3613 	if (rtifp != NULL) {
3614 		if_qflush_sc(rtifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3615 	}
3616 	if (oifp != NULL && oifp != rtifp) {
3617 		if_qflush_sc(oifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3618 	}
3619 
3620 	return 0;
3621 }
3622 
3623 /*
3624  * Clear the INP_INADDR_ANY flag (special case for PPP only)
3625  */
3626 void
inp_clear_INP_INADDR_ANY(struct socket * so)3627 inp_clear_INP_INADDR_ANY(struct socket *so)
3628 {
3629 	struct inpcb *inp = NULL;
3630 
3631 	socket_lock(so, 1);
3632 	inp = sotoinpcb(so);
3633 	if (inp) {
3634 		inp->inp_flags &= ~INP_INADDR_ANY;
3635 	}
3636 	socket_unlock(so, 1);
3637 }
3638 
/*
 * Populate a so_procinfo with the last (and, when delegated, the
 * effective) process that used this socket.
 */
void
inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo)
{
	struct socket *so = inp->inp_socket;

	soprocinfo->spi_pid = so->last_pid;
	strlcpy(&soprocinfo->spi_proc_name[0], &inp->inp_last_proc_name[0],
	    sizeof(soprocinfo->spi_proc_name));
	/* last_uuid is only copied when a process has actually used the socket */
	if (so->last_pid != 0) {
		uuid_copy(soprocinfo->spi_uuid, so->last_uuid);
	}
	/*
	 * When not delegated, the effective pid is the same as the real pid
	 */
	if (so->so_flags & SOF_DELEGATED) {
		soprocinfo->spi_delegated = 1;
		soprocinfo->spi_epid = so->e_pid;
		uuid_copy(soprocinfo->spi_euuid, so->e_uuid);
	} else {
		soprocinfo->spi_delegated = 0;
		soprocinfo->spi_epid = so->last_pid;
	}
	strlcpy(&soprocinfo->spi_e_proc_name[0], &inp->inp_e_proc_name[0],
	    sizeof(soprocinfo->spi_e_proc_name));
}
3664 
/*
 * Search pcbinfo's global PCB list for a live PCB with the given flow
 * hash and fill in its process info.  Returns 1 if found, 0 if not,
 * and -1 when flowhash is zero (no identifier to search for).
 */
int
inp_findinpcb_procinfo(struct inpcbinfo *pcbinfo, uint32_t flowhash,
    struct so_procinfo *soprocinfo)
{
	struct inpcb *inp = NULL;
	int found = 0;

	bzero(soprocinfo, sizeof(struct so_procinfo));

	if (!flowhash) {
		return -1;
	}

	/* Shared hold is sufficient: the list is only read here */
	lck_rw_lock_shared(&pcbinfo->ipi_lock);
	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		if (inp->inp_state != INPCB_STATE_DEAD &&
		    inp->inp_socket != NULL &&
		    inp->inp_flowhash == flowhash) {
			found = 1;
			inp_get_soprocinfo(inp, soprocinfo);
			break;
		}
	}
	lck_rw_done(&pcbinfo->ipi_lock);

	return found;
}
3692 
3693 #if CONFIG_PROC_UUID_POLICY
/*
 * Apply or clear the PROC_UUID_NO_CELLULAR policy on a PCB, logging
 * the transition when net_io_policy_log is enabled.
 */
static void
inp_update_cellular_policy(struct inpcb *inp, boolean_t set)
{
	struct socket *so = inp->inp_socket;
	int before, after;

	VERIFY(so != NULL);
	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	before = INP_NO_CELLULAR(inp);
	if (set) {
		inp_set_nocellular(inp);
	} else {
		inp_clear_nocellular(inp);
	}
	/* The clear may be a no-op (socket-level restriction); re-sample */
	after = INP_NO_CELLULAR(inp);
	if (net_io_policy_log && (before != after)) {
		static const char *ok = "OK";
		static const char *nok = "NOACCESS";
		uuid_string_t euuid_buf;
		pid_t epid;

		/* Attribute the change to the effective (delegated) process */
		if (so->so_flags & SOF_DELEGATED) {
			uuid_unparse(so->e_uuid, euuid_buf);
			epid = so->e_pid;
		} else {
			uuid_unparse(so->last_uuid, euuid_buf);
			epid = so->last_pid;
		}

		/* allow this socket to generate another notification event */
		so->so_ifdenied_notifies = 0;

		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s %s->%s\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, euuid_buf,
		    (so->so_flags & SOF_DELEGATED) ?
		    " [delegated]" : "",
		    ((before < after) ? ok : nok),
		    ((before < after) ? nok : ok));
	}
}
3737 
3738 #if NECP
/*
 * Apply or clear the PROC_UUID_NECP_APP_POLICY flag on a PCB, logging
 * the transition when net_io_policy_log is enabled.
 */
static void
inp_update_necp_want_app_policy(struct inpcb *inp, boolean_t set)
{
	struct socket *so = inp->inp_socket;
	int before, after;

	VERIFY(so != NULL);
	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	before = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
	if (set) {
		inp_set_want_app_policy(inp);
	} else {
		inp_clear_want_app_policy(inp);
	}
	after = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
	if (net_io_policy_log && (before != after)) {
		static const char *wanted = "WANTED";
		static const char *unwanted = "UNWANTED";
		uuid_string_t euuid_buf;
		pid_t epid;

		/* Attribute the change to the effective (delegated) process */
		if (so->so_flags & SOF_DELEGATED) {
			uuid_unparse(so->e_uuid, euuid_buf);
			epid = so->e_pid;
		} else {
			uuid_unparse(so->last_uuid, euuid_buf);
			epid = so->last_pid;
		}

		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s %s->%s\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, euuid_buf,
		    (so->so_flags & SOF_DELEGATED) ?
		    " [delegated]" : "",
		    ((before < after) ? unwanted : wanted),
		    ((before < after) ? wanted : unwanted));
	}
}
3779 #endif /* NECP */
3780 #endif /* !CONFIG_PROC_UUID_POLICY */
3781 
3782 #if NECP
/*
 * Re-run NECP policy matching for this socket, and rescope it to the
 * NECP-designated interface if required and the socket is not yet
 * bound to any port or address.
 */
void
inp_update_necp_policy(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int override_bound_interface)
{
	necp_socket_find_policy_match(inp, override_local_addr, override_remote_addr, override_bound_interface);
	if (necp_socket_should_rescope(inp) &&
	    inp->inp_lport == 0 &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
		// If we should rescope, and the socket is not yet bound
		inp_bindif(inp, necp_socket_get_rescope_if_index(inp), NULL);
		inp->inp_flags2 |= INP2_SCOPED_BY_NECP;
	}
}
3796 #endif /* NECP */
3797 
/*
 * Re-evaluate proc-UUID based policies (cellular restriction, NECP app
 * policy) for this PCB.  Returns 0 on success or on a missing table
 * entry (ENOENT is treated as "no policy"); other lookup errors are
 * passed through.
 */
int
inp_update_policy(struct inpcb *inp)
{
#if CONFIG_PROC_UUID_POLICY
	struct socket *so = inp->inp_socket;
	uint32_t pflags = 0;
	int32_t ogencnt;
	int err = 0;
	uint8_t *lookup_uuid = NULL;

	if (!net_io_policy_uuid ||
	    so == NULL || inp->inp_state == INPCB_STATE_DEAD) {
		return 0;
	}

	/*
	 * Kernel-created sockets that aren't delegating other sockets
	 * are currently exempted from UUID policy checks.
	 */
	if (so->last_pid == 0 && !(so->so_flags & SOF_DELEGATED)) {
		return 0;
	}

#if defined(XNU_TARGET_OS_OSX)
	/* Prefer the responsible-process UUID when one is recorded */
	if (so->so_rpid > 0) {
		lookup_uuid = so->so_ruuid;
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}
#endif
	/* Fall back to the effective (or last) process UUID */
	if (lookup_uuid == NULL || err == ENOENT) {
		lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid);
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}

	/*
	 * Discard cached generation count if the entry is gone (ENOENT),
	 * so that we go thru the checks below.
	 */
	if (err == ENOENT && ogencnt != 0) {
		so->so_policy_gencnt = 0;
	}

	/*
	 * If the generation count has changed, inspect the policy flags
	 * and act accordingly.  If a policy flag was previously set and
	 * the UUID is no longer present in the table (ENOENT), treat it
	 * as if the flag has been cleared.
	 */
	if ((err == 0 || err == ENOENT) && ogencnt != so->so_policy_gencnt) {
		/* update cellular policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, FALSE);
		}
#if NECP
		/* update necp want app policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, FALSE);
		}
#endif /* NECP */
	}

	return (err == ENOENT) ? 0 : err;
#else /* !CONFIG_PROC_UUID_POLICY */
#pragma unused(inp)
	return 0;
#endif /* !CONFIG_PROC_UUID_POLICY */
}
3871 
/* Non-zero to log packets denied by interface/network policy restrictions */
unsigned int log_restricted;
SYSCTL_DECL(_net_inet);
SYSCTL_INT(_net_inet, OID_AUTO, log_restricted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &log_restricted, 0,
    "Log network restrictions");
3877 
3878 
3879 /*
3880  * Called when we need to enforce policy restrictions in the input path.
3881  *
3882  * Returns TRUE if we're not allowed to receive data, otherwise FALSE.
3883  */
static boolean_t
_inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
{
	VERIFY(inp != NULL);

	/*
	 * Inbound restrictions.
	 */
	if (!sorestrictrecv) {
		return FALSE;
	}

	if (ifp == NULL) {
		return FALSE;
	}

	/* Per-PCB interface-class restrictions */
	if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
		return TRUE;
	}

	if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
		return TRUE;
	}

	if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
		return TRUE;
	}

	if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
		return TRUE;
	}

	/* The checks below only apply to restricted-receive interfaces */
	if (!(ifp->if_eflags & IFEF_RESTRICTED_RECV)) {
		return FALSE;
	}

	if (inp->inp_flags & INP_RECV_ANYIF) {
		return FALSE;
	}

	/*
	 * An entitled process can use the management interface without being bound
	 * to the interface
	 */
	if (IFNET_IS_MANAGEMENT(ifp)) {
		if (INP_MANAGEMENT_ALLOWED(inp)) {
			return FALSE;
		}
		if (if_management_verbose > 1) {
			os_log(OS_LOG_DEFAULT, "_inp_restricted_recv %s:%d not allowed on management interface %s",
			    proc_best_name(current_proc()), proc_getpid(current_proc()),
			    ifp->if_xname);
		}
		return TRUE;
	}

	/* Being explicitly bound to the interface permits reception */
	if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp == ifp) {
		return FALSE;
	}

	if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
		return TRUE;
	}


	/* Restricted-receive interface with no exemption matched: deny */
	return TRUE;
}
3951 
3952 boolean_t
inp_restricted_recv(struct inpcb * inp,struct ifnet * ifp)3953 inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
3954 {
3955 	boolean_t ret;
3956 
3957 	ret = _inp_restricted_recv(inp, ifp);
3958 	if (ret == TRUE && log_restricted) {
3959 		printf("pid %d (%s) is unable to receive packets on %s\n",
3960 		    proc_getpid(current_proc()), proc_best_name(current_proc()),
3961 		    ifp->if_xname);
3962 	}
3963 	return ret;
3964 }
3965 
3966 /*
3967  * Called when we need to enforce policy restrictions in the output path.
3968  *
3969  * Returns TRUE if we're not allowed to send data out, otherwise FALSE.
3970  */
/*
 * Called when we need to enforce policy restrictions in the output path.
 *
 * Returns TRUE if we're not allowed to send data out, otherwise FALSE.
 *
 * NOTE(review): unlike the receive path, the default here is FALSE
 * (allowed) -- only an explicit restriction match denies the send.
 */
static boolean_t
_inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
{
	VERIFY(inp != NULL);

	/*
	 * Outbound restrictions.
	 */
	/* Global switch: outbound restriction enforcement disabled. */
	if (!sorestrictsend) {
		return FALSE;
	}

	/* No outgoing interface attribution -- nothing to restrict against. */
	if (ifp == NULL) {
		return FALSE;
	}

	/* Socket opted out of cellular, send would go out on cellular. */
	if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
		return TRUE;
	}

	/* Socket opted out of expensive interfaces. */
	if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
		return TRUE;
	}

	/* Socket opted out of constrained (e.g. low-data-mode) interfaces. */
	if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
		return TRUE;
	}

	/* AWDL is restricted unless the socket holds the exemption. */
	if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
		return TRUE;
	}

	/* Management interfaces require the entitlement-derived flag. */
	if (IFNET_IS_MANAGEMENT(ifp)) {
		if (!INP_MANAGEMENT_ALLOWED(inp)) {
			if (if_management_verbose > 1) {
				os_log(OS_LOG_DEFAULT, "_inp_restricted_send %s:%d not allowed on management interface %s",
				    proc_best_name(current_proc()), proc_getpid(current_proc()),
				    ifp->if_xname);
			}
			return TRUE;
		}
	}

	/* Co-processor interfaces require the intcoproc exemption. */
	if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
		return TRUE;
	}

	return FALSE;
}
4020 
4021 boolean_t
inp_restricted_send(struct inpcb * inp,struct ifnet * ifp)4022 inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
4023 {
4024 	boolean_t ret;
4025 
4026 	ret = _inp_restricted_send(inp, ifp);
4027 	if (ret == TRUE && log_restricted) {
4028 		printf("pid %d (%s) is unable to transmit packets on %s\n",
4029 		    proc_getpid(current_proc()), proc_best_name(current_proc()),
4030 		    ifp->if_xname);
4031 	}
4032 	return ret;
4033 }
4034 
4035 inline void
inp_count_sndbytes(struct inpcb * inp,u_int32_t th_ack)4036 inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack)
4037 {
4038 	struct ifnet *ifp = inp->inp_last_outifp;
4039 	struct socket *so = inp->inp_socket;
4040 	if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
4041 	    (ifp->if_type == IFT_CELLULAR || IFNET_IS_WIFI(ifp))) {
4042 		int32_t unsent;
4043 
4044 		so->so_snd.sb_flags |= SB_SNDBYTE_CNT;
4045 
4046 		/*
4047 		 * There can be data outstanding before the connection
4048 		 * becomes established -- TFO case
4049 		 */
4050 		if (so->so_snd.sb_cc > 0) {
4051 			inp_incr_sndbytes_total(so, so->so_snd.sb_cc);
4052 		}
4053 
4054 		unsent = inp_get_sndbytes_allunsent(so, th_ack);
4055 		if (unsent > 0) {
4056 			inp_incr_sndbytes_unsent(so, unsent);
4057 		}
4058 	}
4059 }
4060 
4061 inline void
inp_incr_sndbytes_total(struct socket * so,int32_t len)4062 inp_incr_sndbytes_total(struct socket *so, int32_t len)
4063 {
4064 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4065 	struct ifnet *ifp = inp->inp_last_outifp;
4066 
4067 	if (ifp != NULL) {
4068 		VERIFY(ifp->if_sndbyte_total >= 0);
4069 		OSAddAtomic64(len, &ifp->if_sndbyte_total);
4070 	}
4071 }
4072 
4073 inline void
inp_decr_sndbytes_total(struct socket * so,int32_t len)4074 inp_decr_sndbytes_total(struct socket *so, int32_t len)
4075 {
4076 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4077 	struct ifnet *ifp = inp->inp_last_outifp;
4078 
4079 	if (ifp != NULL) {
4080 		if (ifp->if_sndbyte_total >= len) {
4081 			OSAddAtomic64(-len, &ifp->if_sndbyte_total);
4082 		} else {
4083 			ifp->if_sndbyte_total = 0;
4084 		}
4085 	}
4086 }
4087 
4088 inline void
inp_incr_sndbytes_unsent(struct socket * so,int32_t len)4089 inp_incr_sndbytes_unsent(struct socket *so, int32_t len)
4090 {
4091 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4092 	struct ifnet *ifp = inp->inp_last_outifp;
4093 
4094 	if (ifp != NULL) {
4095 		VERIFY(ifp->if_sndbyte_unsent >= 0);
4096 		OSAddAtomic64(len, &ifp->if_sndbyte_unsent);
4097 	}
4098 }
4099 
4100 inline void
inp_decr_sndbytes_unsent(struct socket * so,int32_t len)4101 inp_decr_sndbytes_unsent(struct socket *so, int32_t len)
4102 {
4103 	if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
4104 		return;
4105 	}
4106 
4107 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4108 	struct ifnet *ifp = inp->inp_last_outifp;
4109 
4110 	if (ifp != NULL) {
4111 		if (ifp->if_sndbyte_unsent >= len) {
4112 			OSAddAtomic64(-len, &ifp->if_sndbyte_unsent);
4113 		} else {
4114 			ifp->if_sndbyte_unsent = 0;
4115 		}
4116 	}
4117 }
4118 
4119 inline void
inp_decr_sndbytes_allunsent(struct socket * so,u_int32_t th_ack)4120 inp_decr_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
4121 {
4122 	int32_t len;
4123 
4124 	if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
4125 		return;
4126 	}
4127 
4128 	len = inp_get_sndbytes_allunsent(so, th_ack);
4129 	inp_decr_sndbytes_unsent(so, len);
4130 }
4131 
#if SKYWALK
/*
 * Propagate socket/inpcb option state (wake-from-sleep, recv-anyif,
 * extended background idle) into the Skywalk network-namespace flags
 * held by this PCB's port reservation token, if one is valid.
 */
inline void
inp_update_netns_flags(struct socket *so)
{
	struct inpcb *pcb;
	uint32_t on = 0;
	uint32_t off = 0;

	/* Only IPv4/IPv6 sockets carry a netns token. */
	if (!SOCK_CHECK_DOM(so, AF_INET) && !SOCK_CHECK_DOM(so, AF_INET6)) {
		return;
	}

	pcb = sotoinpcb(so);
	if (pcb == NULL) {
		return;
	}

	if (!NETNS_TOKEN_VALID(&pcb->inp_netns_token)) {
		return;
	}

	/* Each option maps to exactly one flag, set or cleared. */
	if (so->so_options & SO_NOWAKEFROMSLEEP) {
		on |= NETNS_NOWAKEFROMSLEEP;
	} else {
		off |= NETNS_NOWAKEFROMSLEEP;
	}

	if (pcb->inp_flags & INP_RECV_ANYIF) {
		on |= NETNS_RECVANYIF;
	} else {
		off |= NETNS_RECVANYIF;
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		on |= NETNS_EXTBGIDLE;
	} else {
		off |= NETNS_EXTBGIDLE;
	}

	netns_change_flags(&pcb->inp_netns_token, on, off);
}
#endif /* SKYWALK */
4175 
/*
 * Mark the current time (network uptime) as active in this PCB's
 * network-activity bitmap.
 */
inline void
inp_set_activity_bitmap(struct inpcb *inp)
{
	in_stat_set_activity_bitmap(&inp->inp_nw_activity, net_uptime());
}
4181 
/*
 * Copy this PCB's network-activity bitmap into the caller-supplied
 * buffer ab.
 */
inline void
inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab)
{
	bcopy(&inp->inp_nw_activity, ab, sizeof(*ab));
}
4187 
4188 void
inp_update_last_owner(struct socket * so,struct proc * p,struct proc * ep)4189 inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep)
4190 {
4191 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4192 
4193 	if (inp == NULL) {
4194 		return;
4195 	}
4196 
4197 	if (p != NULL) {
4198 		strlcpy(&inp->inp_last_proc_name[0], proc_name_address(p), sizeof(inp->inp_last_proc_name));
4199 	}
4200 	if (so->so_flags & SOF_DELEGATED) {
4201 		if (ep != NULL) {
4202 			strlcpy(&inp->inp_e_proc_name[0], proc_name_address(ep), sizeof(inp->inp_e_proc_name));
4203 		} else {
4204 			inp->inp_e_proc_name[0] = 0;
4205 		}
4206 	} else {
4207 		inp->inp_e_proc_name[0] = 0;
4208 	}
4209 }
4210 
4211 void
inp_copy_last_owner(struct socket * so,struct socket * head)4212 inp_copy_last_owner(struct socket *so, struct socket *head)
4213 {
4214 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4215 	struct inpcb *head_inp = (struct inpcb *)head->so_pcb;
4216 
4217 	if (inp == NULL || head_inp == NULL) {
4218 		return;
4219 	}
4220 
4221 	strlcpy(&inp->inp_last_proc_name[0], &head_inp->inp_last_proc_name[0], sizeof(inp->inp_last_proc_name));
4222 	strlcpy(&inp->inp_e_proc_name[0], &head_inp->inp_e_proc_name[0], sizeof(inp->inp_e_proc_name));
4223 }
4224 
/*
 * proc_iterate() callout for the one-shot management-interface scan:
 * for every qualifying process, walk its file table and mark each open
 * IPv4/IPv6 socket's PCB as allowed (and checked) for management
 * interfaces.  Always returns PROC_RETURNED to continue iteration.
 */
static int
in_check_management_interface_proc_callout(proc_t proc, void *arg __unused)
{
	struct fileproc *fp = NULL;
	task_t task = proc_task(proc);
	bool allowed = false;

	/* The process qualifies if it holds any of these entitlements. */
	if (IOTaskHasEntitlement(task, INTCOPROC_RESTRICTED_ENTITLEMENT) == true
	    || IOTaskHasEntitlement(task, MANAGEMENT_DATA_ENTITLEMENT) == true
#if DEBUG || DEVELOPMENT
	    || IOTaskHasEntitlement(task, INTCOPROC_RESTRICTED_ENTITLEMENT_DEVELOPMENT) == true
	    || IOTaskHasEntitlement(task, MANAGEMENT_DATA_ENTITLEMENT_DEVELOPMENT) == true
#endif /* DEBUG || DEVELOPMENT */
	    ) {
		allowed = true;
	}
	/*
	 * Skip processes that are neither entitled nor covered by the
	 * global management_data_unrestricted override.
	 */
	if (allowed == false && management_data_unrestricted == false) {
		return PROC_RETURNED;
	}

	proc_fdlock(proc);
	fdt_foreach(fp, proc) {
		struct fileglob *fg = fp->fp_glob;
		struct socket *so;
		struct inpcb *inp;

		/* Only sockets are of interest. */
		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}

		so = (struct socket *)fp_get_data(fp);
		if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
			continue;
		}

		/*
		 * NOTE(review): inp is not NULL-checked before
		 * in_pcb_checkstate() -- presumably so_pcb is non-NULL for
		 * any inet socket still in the fd table; confirm.
		 */
		inp = (struct inpcb *)so->so_pcb;

		/* Take a use-count reference before grabbing the socket lock. */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			continue;
		}

		socket_lock(so, 1);

		/* Re-validate (and drop the reference) under the socket lock. */
		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			socket_unlock(so, 1);
			continue;
		}
		inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
		inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;

		socket_unlock(so, 1);
	}
	proc_fdunlock(proc);

	return PROC_RETURNED;
}
4281 
/* Set once the one-shot management-interface entitlement scan has run. */
static bool in_management_interface_checked = false;
4283 
4284 static void
in_management_interface_event_callback(struct nwk_wq_entry * nwk_item)4285 in_management_interface_event_callback(struct nwk_wq_entry *nwk_item)
4286 {
4287 	kfree_type(struct nwk_wq_entry, nwk_item);
4288 
4289 	if (in_management_interface_checked == true) {
4290 		return;
4291 	}
4292 	in_management_interface_checked = true;
4293 
4294 	proc_iterate(PROC_ALLPROCLIST,
4295 	    in_check_management_interface_proc_callout,
4296 	    NULL, NULL, NULL);
4297 }
4298 
4299 void
in_management_interface_check(void)4300 in_management_interface_check(void)
4301 {
4302 	struct nwk_wq_entry *nwk_item;
4303 
4304 	if (if_management_interface_check_needed == false ||
4305 	    in_management_interface_checked == true) {
4306 		return;
4307 	}
4308 
4309 	nwk_item  = kalloc_type(struct nwk_wq_entry,
4310 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
4311 
4312 	nwk_item->func = in_management_interface_event_callback;
4313 
4314 	nwk_wq_enqueue(nwk_item);
4315 }
4316