xref: /xnu-10002.61.3/bsd/netinet/in_pcb.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1991, 1993, 1995
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
61  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #include <sys/kernel.h>
74 #include <sys/sysctl.h>
75 #include <sys/mcache.h>
76 #include <sys/kauth.h>
77 #include <sys/priv.h>
78 #include <sys/proc_uuid_policy.h>
79 #include <sys/syslog.h>
80 #include <sys/priv.h>
81 #include <sys/file_internal.h>
82 #include <net/dlil.h>
83 
84 #include <libkern/OSAtomic.h>
85 #include <kern/locks.h>
86 
87 #include <machine/limits.h>
88 
89 #include <kern/zalloc.h>
90 
91 #include <net/if.h>
92 #include <net/if_types.h>
93 #include <net/route.h>
94 #include <net/flowhash.h>
95 #include <net/flowadv.h>
96 #include <net/nat464_utils.h>
97 #include <net/ntstat.h>
98 #include <net/nwk_wq.h>
99 #include <net/restricted_in_port.h>
100 
101 #include <netinet/in.h>
102 #include <netinet/in_pcb.h>
103 #include <netinet/inp_log.h>
104 #include <netinet/in_var.h>
105 #include <netinet/ip_var.h>
106 
107 #include <netinet/ip6.h>
108 #include <netinet6/ip6_var.h>
109 
110 #include <sys/kdebug.h>
111 #include <sys/random.h>
112 
113 #include <dev/random/randomdev.h>
114 #include <mach/boolean.h>
115 
116 #include <atm/atm_internal.h>
117 #include <pexpert/pexpert.h>
118 
119 #if NECP
120 #include <net/necp.h>
121 #endif
122 
123 #include <sys/stat.h>
124 #include <sys/ubc.h>
125 #include <sys/vnode.h>
126 
127 #include <os/log.h>
128 
129 #if SKYWALK
130 #include <skywalk/namespace/flowidns.h>
131 #endif /* SKYWALK */
132 
133 #include <IOKit/IOBSD.h>
134 
135 extern const char *proc_name_address(struct proc *);
136 
137 static LCK_GRP_DECLARE(inpcb_lock_grp, "inpcb");
138 static LCK_ATTR_DECLARE(inpcb_lock_attr, 0, 0);
139 static LCK_MTX_DECLARE_ATTR(inpcb_lock, &inpcb_lock_grp, &inpcb_lock_attr);
140 static LCK_MTX_DECLARE_ATTR(inpcb_timeout_lock, &inpcb_lock_grp, &inpcb_lock_attr);
141 
142 static TAILQ_HEAD(, inpcbinfo) inpcb_head = TAILQ_HEAD_INITIALIZER(inpcb_head);
143 
144 static u_int16_t inpcb_timeout_run = 0; /* INPCB timer is scheduled to run */
145 static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */
146 static boolean_t inpcb_ticking = FALSE;         /* "slow" timer is scheduled */
147 static boolean_t inpcb_fast_timer_on = FALSE;
148 
149 #define INPCB_GCREQ_THRESHOLD   50000
150 
151 static thread_call_t inpcb_thread_call, inpcb_fast_thread_call;
152 static void inpcb_sched_timeout(void);
153 static void inpcb_sched_lazy_timeout(void);
154 static void _inpcb_sched_timeout(unsigned int);
155 static void inpcb_timeout(void *, void *);
156 const int inpcb_timeout_lazy = 10;      /* 10 seconds leeway for lazy timers */
157 extern int tvtohz(struct timeval *);
158 
159 #if CONFIG_PROC_UUID_POLICY
160 static void inp_update_cellular_policy(struct inpcb *, boolean_t);
161 #if NECP
162 static void inp_update_necp_want_app_policy(struct inpcb *, boolean_t);
163 #endif /* NECP */
164 #endif /* !CONFIG_PROC_UUID_POLICY */
165 
166 #define DBG_FNC_PCB_LOOKUP      NETDBG_CODE(DBG_NETTCP, (6 << 8))
167 #define DBG_FNC_PCB_HLOOKUP     NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
168 
169 int allow_udp_port_exhaustion = 0;
170 
171 /*
172  * These configure the range of local port addresses assigned to
173  * "unspecified" outgoing connections/packets/whatever.
174  */
175 int     ipport_lowfirstauto  = IPPORT_RESERVED - 1;     /* 1023 */
176 int     ipport_lowlastauto = IPPORT_RESERVEDSTART;      /* 600 */
177 int     ipport_firstauto = IPPORT_HIFIRSTAUTO;          /* 49152 */
178 int     ipport_lastauto  = IPPORT_HILASTAUTO;           /* 65535 */
179 int     ipport_hifirstauto = IPPORT_HIFIRSTAUTO;        /* 49152 */
180 int     ipport_hilastauto  = IPPORT_HILASTAUTO;         /* 65535 */
181 
/*
 * Clamp "var" into the inclusive range [min, max].  Wrapped in
 * do { } while (0) so the macro behaves as a single statement and is
 * safe inside unbraced if/else bodies (the bare if/else form would
 * otherwise create a dangling-else hazard at the expansion site).
 */
#define RANGECHK(var, min, max) do { \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); } \
} while (0)
185 
/*
 * Shared sysctl handler for the net.inet.ip.portrange.* OIDs: reads the
 * proposed value, clamps it to the legal range for the particular knob,
 * and stores it back through oidp->oid_arg1.
 * Returns 0 on success, EPERM on a failed privilege check (DEBUG |
 * DEVELOPMENT only), or an errno from sysctl_handle_int().
 */
static int
sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error;
	/* Work on a copy; the stored value is only updated on success. */
	int new_value = *(int *)oidp->oid_arg1;
#if (DEBUG | DEVELOPMENT)
	int old_value = *(int *)oidp->oid_arg1;
	/*
	 * For unit testing allow a non-superuser process with the
	 * proper entitlement to modify the variables
	 */
	if (req->newptr) {
		if (proc_suser(current_proc()) != 0 &&
		    (error = priv_check_cred(kauth_cred_get(),
		    PRIV_NETINET_RESERVEDPORT, 0))) {
			return EPERM;
		}
	}
#endif /* (DEBUG | DEVELOPMENT) */

	error = sysctl_handle_int(oidp, &new_value, 0, req);
	if (!error) {
		/*
		 * The "low" knobs must stay within the privileged port
		 * range (1..IPPORT_RESERVED-1); every other knob must
		 * stay at or above IPPORT_RESERVED.
		 */
		if (oidp->oid_arg1 == &ipport_lowfirstauto || oidp->oid_arg1 == &ipport_lowlastauto) {
			RANGECHK(new_value, 1, IPPORT_RESERVED - 1);
		} else {
			RANGECHK(new_value, IPPORT_RESERVED, USHRT_MAX);
		}
		*(int *)oidp->oid_arg1 = new_value;
	}

#if (DEBUG | DEVELOPMENT)
	/*
	 * NOTE(review): the message text says "net.restricted_port.verbose"
	 * but this handler serves the portrange OIDs — looks copy-pasted
	 * from restricted_in_port.c; confirm the intended wording.
	 */
	os_log(OS_LOG_DEFAULT,
	    "%s:%u sysctl net.restricted_port.verbose: %d -> %d)",
	    proc_best_name(current_proc()), proc_selfpid(),
	    old_value, *(int *)oidp->oid_arg1);
#endif /* (DEBUG | DEVELOPMENT) */

	return error;
}
226 
227 #undef RANGECHK
228 
229 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
230     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IP Ports");
231 
232 #if (DEBUG | DEVELOPMENT)
233 #define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY)
234 #else
235 #define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED)
236 #endif /* (DEBUG | DEVELOPMENT) */
237 
238 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
239     CTLFAGS_IP_PORTRANGE,
240     &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
241 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
242     CTLFAGS_IP_PORTRANGE,
243     &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
244 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
245     CTLFAGS_IP_PORTRANGE,
246     &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
247 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
248     CTLFAGS_IP_PORTRANGE,
249     &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
250 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
251     CTLFAGS_IP_PORTRANGE,
252     &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
253 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
254     CTLFAGS_IP_PORTRANGE,
255     &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
256 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, ipport_allow_udp_port_exhaustion,
257     CTLFLAG_LOCKED | CTLFLAG_RW, &allow_udp_port_exhaustion, 0, "");
258 
259 static uint32_t apn_fallbk_debug = 0;
260 #define apn_fallbk_log(x)       do { if (apn_fallbk_debug >= 1) log x; } while (0)
261 
262 #if !XNU_TARGET_OS_OSX
263 static boolean_t apn_fallbk_enabled = TRUE;
264 
265 SYSCTL_DECL(_net_inet);
266 SYSCTL_NODE(_net_inet, OID_AUTO, apn_fallback, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "APN Fallback");
267 SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
268     &apn_fallbk_enabled, 0, "APN fallback enable");
269 SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
270     &apn_fallbk_debug, 0, "APN fallback debug enable");
271 #else /* XNU_TARGET_OS_OSX */
272 static boolean_t apn_fallbk_enabled = FALSE;
273 #endif /* XNU_TARGET_OS_OSX */
274 
275 extern int      udp_use_randomport;
276 extern int      tcp_use_randomport;
277 
278 /* Structs used for flowhash computation */
279 struct inp_flowhash_key_addr {
280 	union {
281 		struct in_addr  v4;
282 		struct in6_addr v6;
283 		u_int8_t        addr8[16];
284 		u_int16_t       addr16[8];
285 		u_int32_t       addr32[4];
286 	} infha;
287 };
288 
289 struct inp_flowhash_key {
290 	struct inp_flowhash_key_addr    infh_laddr;
291 	struct inp_flowhash_key_addr    infh_faddr;
292 	u_int32_t                       infh_lport;
293 	u_int32_t                       infh_fport;
294 	u_int32_t                       infh_af;
295 	u_int32_t                       infh_proto;
296 	u_int32_t                       infh_rand1;
297 	u_int32_t                       infh_rand2;
298 };
299 
300 #if !SKYWALK
301 static u_int32_t inp_hash_seed = 0;
302 #endif /* !SKYWALK */
303 
304 static int infc_cmp(const struct inpcb *, const struct inpcb *);
305 
306 /* Flags used by inp_fc_getinp */
307 #define INPFC_SOLOCKED  0x1
308 #define INPFC_REMOVE    0x2
309 static struct inpcb *inp_fc_getinp(u_int32_t, u_int32_t);
310 
311 static void inp_fc_feedback(struct inpcb *);
312 extern void tcp_remove_from_time_wait(struct inpcb *inp);
313 
314 static LCK_MTX_DECLARE_ATTR(inp_fc_lck, &inpcb_lock_grp, &inpcb_lock_attr);
315 
316 RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree;
317 RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp);
318 RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp);
319 
320 /*
321  * Use this inp as a key to find an inp in the flowhash tree.
322  * Accesses to it are protected by inp_fc_lck.
323  */
324 struct inpcb key_inp;
325 
326 /*
327  * in_pcb.c: manage the Protocol Control Blocks.
328  */
329 
330 void
in_pcbinit(void)331 in_pcbinit(void)
332 {
333 	static int inpcb_initialized = 0;
334 	uint32_t logging_config;
335 
336 	VERIFY(!inpcb_initialized);
337 	inpcb_initialized = 1;
338 
339 	logging_config = atm_get_diagnostic_config();
340 	if (logging_config & 0x80000000) {
341 		inp_log_privacy = 1;
342 	}
343 
344 	inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout,
345 	    NULL, THREAD_CALL_PRIORITY_KERNEL);
346 	/* Give it an arg so that we know that this is the fast timer */
347 	inpcb_fast_thread_call = thread_call_allocate_with_priority(
348 		inpcb_timeout, &inpcb_timeout, THREAD_CALL_PRIORITY_KERNEL);
349 	if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) {
350 		panic("unable to alloc the inpcb thread call");
351 	}
352 
353 	/*
354 	 * Initialize data structures required to deliver
355 	 * flow advisories.
356 	 */
357 	lck_mtx_lock(&inp_fc_lck);
358 	RB_INIT(&inp_fc_tree);
359 	bzero(&key_inp, sizeof(key_inp));
360 	lck_mtx_unlock(&inp_fc_lck);
361 }
362 
/* True when any of the three per-pcbinfo timer request counters is armed. */
#define INPCB_HAVE_TIMER_REQ(req)       (((req).intimer_lazy > 0) || \
	((req).intimer_fast > 0) || ((req).intimer_nodelay > 0))
/*
 * Work function for both inpcb thread-call timers.  arg0 is NULL for the
 * lazy timer and non-NULL for the fast timer (see in_pcbinit()).  Runs
 * garbage collection and/or the per-protocol slow timers for every
 * registered inpcbinfo, then decides whether and how to re-arm.
 */
static void
inpcb_timeout(void *arg0, void *arg1)
{
#pragma unused(arg1)
	struct inpcbinfo *ipi;
	boolean_t t, gc;
	struct intimercount gccnt, tmcnt;

	/*
	 * Update coarse-grained networking timestamp (in sec.); the idea
	 * is to piggy-back on the timeout callout to update the counter
	 * returnable via net_uptime().
	 */
	net_update_uptime();

	bzero(&gccnt, sizeof(gccnt));
	bzero(&tmcnt, sizeof(tmcnt));

	/* Consume the pending gc/tick flags under the timeout lock. */
	lck_mtx_lock_spin(&inpcb_timeout_lock);
	gc = inpcb_garbage_collecting;
	inpcb_garbage_collecting = FALSE;

	t = inpcb_ticking;
	inpcb_ticking = FALSE;

	if (gc || t) {
		/* inpcb_lock may block; drop the (spin) timeout lock first. */
		lck_mtx_unlock(&inpcb_timeout_lock);

		lck_mtx_lock(&inpcb_lock);
		TAILQ_FOREACH(ipi, &inpcb_head, ipi_entry) {
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_gc_req)) {
				/*
				 * Clear the request counters BEFORE invoking
				 * the callback: the callback may re-arm them,
				 * and whatever it re-arms is accumulated into
				 * gccnt below to drive rescheduling.
				 */
				bzero(&ipi->ipi_gc_req,
				    sizeof(ipi->ipi_gc_req));
				if (gc && ipi->ipi_gc != NULL) {
					ipi->ipi_gc(ipi);
					gccnt.intimer_lazy +=
					    ipi->ipi_gc_req.intimer_lazy;
					gccnt.intimer_fast +=
					    ipi->ipi_gc_req.intimer_fast;
					gccnt.intimer_nodelay +=
					    ipi->ipi_gc_req.intimer_nodelay;
				}
			}
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_timer_req)) {
				/* Same clear-then-accumulate for slow timers. */
				bzero(&ipi->ipi_timer_req,
				    sizeof(ipi->ipi_timer_req));
				if (t && ipi->ipi_timer != NULL) {
					ipi->ipi_timer(ipi);
					tmcnt.intimer_lazy +=
					    ipi->ipi_timer_req.intimer_lazy;
					tmcnt.intimer_fast +=
					    ipi->ipi_timer_req.intimer_fast;
					tmcnt.intimer_nodelay +=
					    ipi->ipi_timer_req.intimer_nodelay;
				}
			}
		}
		lck_mtx_unlock(&inpcb_lock);
		lck_mtx_lock_spin(&inpcb_timeout_lock);
	}

	/* lock was dropped above, so check first before overriding */
	if (!inpcb_garbage_collecting) {
		inpcb_garbage_collecting = INPCB_HAVE_TIMER_REQ(gccnt);
	}
	if (!inpcb_ticking) {
		inpcb_ticking = INPCB_HAVE_TIMER_REQ(tmcnt);
	}

	/* arg0 will be set if we are the fast timer */
	if (arg0 != NULL) {
		inpcb_fast_timer_on = FALSE;
	}
	inpcb_timeout_run--;
	/*
	 * NOTE(review): inpcb_timeout_run is u_int16_t, so the ">= 0" half
	 * of this assertion is always true; an underflow would instead be
	 * caught by the "< 2" half as a large wrapped value.
	 */
	VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2);

	/* re-arm the timer if there's work to do */
	if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) {
		inpcb_sched_timeout();
	} else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) {
		/* be lazy when idle with little activity */
		inpcb_sched_lazy_timeout();
	} else {
		inpcb_sched_timeout();
	}

	lck_mtx_unlock(&inpcb_timeout_lock);
}
453 
/*
 * Request an immediate (fast, zero-leeway) run of inpcb_timeout().
 * Caller holds inpcb_timeout_lock.
 */
static void
inpcb_sched_timeout(void)
{
	_inpcb_sched_timeout(0);
}
459 
460 static void
inpcb_sched_lazy_timeout(void)461 inpcb_sched_lazy_timeout(void)
462 {
463 	_inpcb_sched_timeout(inpcb_timeout_lazy);
464 }
465 
/*
 * Arm an inpcb timer thread call.  "offset" is the leeway in seconds:
 * 0 requests the fast timer (1 second deadline, no leeway), non-zero
 * requests the lazy timer with that much leeway.  Must be called with
 * inpcb_timeout_lock held (possibly as a spin lock; it is converted to
 * a full mutex before calling into thread_call, which may block).
 */
static void
_inpcb_sched_timeout(unsigned int offset)
{
	uint64_t deadline, leeway;

	/* Either flavor fires one second from now. */
	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);
	LCK_MTX_ASSERT(&inpcb_timeout_lock, LCK_MTX_ASSERT_OWNED);
	if (inpcb_timeout_run == 0 &&
	    (inpcb_garbage_collecting || inpcb_ticking)) {
		/* Nothing scheduled yet and work is pending: arm a timer. */
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		if (offset == 0) {
			inpcb_fast_timer_on = TRUE;
			thread_call_enter_delayed(inpcb_fast_thread_call,
			    deadline);
		} else {
			inpcb_fast_timer_on = FALSE;
			clock_interval_to_absolutetime_interval(offset,
			    NSEC_PER_SEC, &leeway);
			thread_call_enter_delayed_with_leeway(
				inpcb_thread_call, NULL, deadline, leeway,
				THREAD_CALL_DELAY_LEEWAY);
		}
	} else if (inpcb_timeout_run == 1 &&
	    offset == 0 && !inpcb_fast_timer_on) {
		/*
		 * Since the request was for a fast timer but the
		 * scheduled timer is a lazy timer, try to schedule
		 * another instance of fast timer also.
		 */
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		inpcb_fast_timer_on = TRUE;
		thread_call_enter_delayed(inpcb_fast_thread_call, deadline);
	}
}
502 
503 void
inpcb_gc_sched(struct inpcbinfo * ipi,u_int32_t type)504 inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type)
505 {
506 	u_int32_t gccnt;
507 
508 	lck_mtx_lock_spin(&inpcb_timeout_lock);
509 	inpcb_garbage_collecting = TRUE;
510 	gccnt = ipi->ipi_gc_req.intimer_nodelay +
511 	    ipi->ipi_gc_req.intimer_fast;
512 
513 	if (gccnt > INPCB_GCREQ_THRESHOLD) {
514 		type = INPCB_TIMER_FAST;
515 	}
516 
517 	switch (type) {
518 	case INPCB_TIMER_NODELAY:
519 		os_atomic_inc(&ipi->ipi_gc_req.intimer_nodelay, relaxed);
520 		inpcb_sched_timeout();
521 		break;
522 	case INPCB_TIMER_FAST:
523 		os_atomic_inc(&ipi->ipi_gc_req.intimer_fast, relaxed);
524 		inpcb_sched_timeout();
525 		break;
526 	default:
527 		os_atomic_inc(&ipi->ipi_gc_req.intimer_lazy, relaxed);
528 		inpcb_sched_lazy_timeout();
529 		break;
530 	}
531 	lck_mtx_unlock(&inpcb_timeout_lock);
532 }
533 
534 void
inpcb_timer_sched(struct inpcbinfo * ipi,u_int32_t type)535 inpcb_timer_sched(struct inpcbinfo *ipi, u_int32_t type)
536 {
537 	lck_mtx_lock_spin(&inpcb_timeout_lock);
538 	inpcb_ticking = TRUE;
539 	switch (type) {
540 	case INPCB_TIMER_NODELAY:
541 		os_atomic_inc(&ipi->ipi_timer_req.intimer_nodelay, relaxed);
542 		inpcb_sched_timeout();
543 		break;
544 	case INPCB_TIMER_FAST:
545 		os_atomic_inc(&ipi->ipi_timer_req.intimer_fast, relaxed);
546 		inpcb_sched_timeout();
547 		break;
548 	default:
549 		os_atomic_inc(&ipi->ipi_timer_req.intimer_lazy, relaxed);
550 		inpcb_sched_lazy_timeout();
551 		break;
552 	}
553 	lck_mtx_unlock(&inpcb_timeout_lock);
554 }
555 
556 void
in_pcbinfo_attach(struct inpcbinfo * ipi)557 in_pcbinfo_attach(struct inpcbinfo *ipi)
558 {
559 	struct inpcbinfo *ipi0;
560 
561 	lck_mtx_lock(&inpcb_lock);
562 	TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
563 		if (ipi0 == ipi) {
564 			panic("%s: ipi %p already in the list",
565 			    __func__, ipi);
566 			/* NOTREACHED */
567 		}
568 	}
569 	TAILQ_INSERT_TAIL(&inpcb_head, ipi, ipi_entry);
570 	lck_mtx_unlock(&inpcb_lock);
571 }
572 
573 int
in_pcbinfo_detach(struct inpcbinfo * ipi)574 in_pcbinfo_detach(struct inpcbinfo *ipi)
575 {
576 	struct inpcbinfo *ipi0;
577 	int error = 0;
578 
579 	lck_mtx_lock(&inpcb_lock);
580 	TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
581 		if (ipi0 == ipi) {
582 			break;
583 		}
584 	}
585 	if (ipi0 != NULL) {
586 		TAILQ_REMOVE(&inpcb_head, ipi0, ipi_entry);
587 	} else {
588 		error = ENXIO;
589 	}
590 	lck_mtx_unlock(&inpcb_lock);
591 
592 	return error;
593 }
594 
595 __attribute__((noinline))
596 char *
inp_snprintf_tuple(struct inpcb * inp,char * buf,size_t buflen)597 inp_snprintf_tuple(struct inpcb *inp, char *buf, size_t buflen)
598 {
599 	char laddrstr[MAX_IPv6_STR_LEN];
600 	char faddrstr[MAX_IPv6_STR_LEN];
601 	uint16_t lport = 0;
602 	uint16_t fport = 0;
603 	uint16_t proto = IPPROTO_IP;
604 
605 	if (inp->inp_socket != NULL && inp->inp_socket->so_proto != NULL) {
606 		proto = inp->inp_socket->so_proto->pr_protocol;
607 
608 		if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) {
609 			lport  = inp->inp_lport;
610 			fport = inp->inp_fport;
611 		}
612 	}
613 	if (inp->inp_vflag & INP_IPV4) {
614 		inet_ntop(AF_INET, (void *)&inp->inp_laddr.s_addr, laddrstr, sizeof(laddrstr));
615 		inet_ntop(AF_INET, (void *)&inp->inp_faddr.s_addr, faddrstr, sizeof(faddrstr));
616 	} else if (inp->inp_vflag & INP_IPV6) {
617 		inet_ntop(AF_INET6, (void *)&inp->in6p_faddr, laddrstr, sizeof(laddrstr));
618 		inet_ntop(AF_INET6, (void *)&inp->in6p_faddr, faddrstr, sizeof(faddrstr));
619 	}
620 	snprintf(buf, buflen, "[%u %s:%u %s:%u]",
621 	    proto, laddrstr, ntohs(lport), faddrstr, ntohs(fport));
622 
623 	return buf;
624 }
625 
/*
 * Decide whether this PCB's traffic is permitted on management
 * interfaces, caching the verdict in inp_flags2:
 * INP2_MANAGEMENT_CHECKED marks the check as done and
 * INP2_MANAGEMENT_ALLOWED grants access.  When neither the global
 * override nor the interface check applies, no flag is set and the
 * function re-evaluates on the next call.
 */
__attribute__((noinline))
void
in_pcb_check_management_entitled(struct inpcb *inp)
{
	/* Verdict already cached for this PCB. */
	if (inp->inp_flags2 & INP2_MANAGEMENT_CHECKED) {
		return;
	}

	if (management_data_unrestricted) {
		/* Global override: management data is unrestricted. */
		inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
		inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;
	} else if (if_management_interface_check_needed == true) {
		inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;
		/*
		 * Note that soopt_cred_check check both intcoproc entitlements
		 * We check MANAGEMENT_DATA_ENTITLEMENT as there is no corresponding PRIV value
		 */
		if (soopt_cred_check(inp->inp_socket, PRIV_NET_RESTRICTED_INTCOPROC, false, false) == 0
		    || IOCurrentTaskHasEntitlement(MANAGEMENT_DATA_ENTITLEMENT) == true
#if DEBUG || DEVELOPMENT
		    || IOCurrentTaskHasEntitlement(MANAGEMENT_DATA_ENTITLEMENT_DEVELOPMENT) == true
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
		} else {
			/* Denied: optionally log the offending process/tuple. */
			if (__improbable(if_management_verbose > 1)) {
				char buf[128];

				os_log(OS_LOG_DEFAULT, "in_pcb_check_management_entitled %s:%d not management entitled %s",
				    proc_best_name(current_proc()),
				    proc_selfpid(),
				    inp_snprintf_tuple(inp, buf, sizeof(buf)));
			}
		}
	}
}
662 
663 /*
664  * Allocate a PCB and associate it with the socket.
665  *
666  * Returns:	0			Success
667  *		ENOBUFS
668  *		ENOMEM
669  */
670 int
in_pcballoc(struct socket * so,struct inpcbinfo * pcbinfo,struct proc * p)671 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p)
672 {
673 #pragma unused(p)
674 	struct inpcb *inp;
675 	caddr_t temp;
676 
677 	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
678 		inp = zalloc_flags(pcbinfo->ipi_zone,
679 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
680 	} else {
681 		inp = (struct inpcb *)(void *)so->so_saved_pcb;
682 		temp = inp->inp_saved_ppcb;
683 		bzero((caddr_t)inp, sizeof(*inp));
684 		inp->inp_saved_ppcb = temp;
685 	}
686 
687 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
688 	inp->inp_pcbinfo = pcbinfo;
689 	inp->inp_socket = so;
690 	/* make sure inp_stat is always 64-bit aligned */
691 	inp->inp_stat = (struct inp_stat *)P2ROUNDUP(inp->inp_stat_store,
692 	    sizeof(u_int64_t));
693 	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) +
694 	    sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
695 		panic("%s: insufficient space to align inp_stat", __func__);
696 		/* NOTREACHED */
697 	}
698 
699 	/* make sure inp_cstat is always 64-bit aligned */
700 	inp->inp_cstat = (struct inp_stat *)P2ROUNDUP(inp->inp_cstat_store,
701 	    sizeof(u_int64_t));
702 	if (((uintptr_t)inp->inp_cstat - (uintptr_t)inp->inp_cstat_store) +
703 	    sizeof(*inp->inp_cstat) > sizeof(inp->inp_cstat_store)) {
704 		panic("%s: insufficient space to align inp_cstat", __func__);
705 		/* NOTREACHED */
706 	}
707 
708 	/* make sure inp_wstat is always 64-bit aligned */
709 	inp->inp_wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_wstat_store,
710 	    sizeof(u_int64_t));
711 	if (((uintptr_t)inp->inp_wstat - (uintptr_t)inp->inp_wstat_store) +
712 	    sizeof(*inp->inp_wstat) > sizeof(inp->inp_wstat_store)) {
713 		panic("%s: insufficient space to align inp_wstat", __func__);
714 		/* NOTREACHED */
715 	}
716 
717 	/* make sure inp_Wstat is always 64-bit aligned */
718 	inp->inp_Wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_Wstat_store,
719 	    sizeof(u_int64_t));
720 	if (((uintptr_t)inp->inp_Wstat - (uintptr_t)inp->inp_Wstat_store) +
721 	    sizeof(*inp->inp_Wstat) > sizeof(inp->inp_Wstat_store)) {
722 		panic("%s: insufficient space to align inp_Wstat", __func__);
723 		/* NOTREACHED */
724 	}
725 
726 	so->so_pcb = (caddr_t)inp;
727 
728 	if (so->so_proto->pr_flags & PR_PCBLOCK) {
729 		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->ipi_lock_grp,
730 		    &pcbinfo->ipi_lock_attr);
731 	}
732 
733 	if (SOCK_DOM(so) == PF_INET6 && !ip6_mapped_addr_on) {
734 		inp->inp_flags |= IN6P_IPV6_V6ONLY;
735 	}
736 
737 	if (ip6_auto_flowlabel) {
738 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
739 	}
740 	if (intcoproc_unrestricted) {
741 		inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;
742 	}
743 
744 	(void) inp_update_policy(inp);
745 
746 	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
747 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
748 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
749 	pcbinfo->ipi_count++;
750 	lck_rw_done(&pcbinfo->ipi_lock);
751 	return 0;
752 }
753 
754 /*
755  * in_pcblookup_local_and_cleanup does everything
756  * in_pcblookup_local does but it checks for a socket
757  * that's going away. Since we know that the lock is
758  * held read+write when this function is called, we
759  * can safely dispose of this socket like the slow
760  * timer would usually do and return NULL. This is
761  * great for bind.
762  */
763 struct inpcb *
in_pcblookup_local_and_cleanup(struct inpcbinfo * pcbinfo,struct in_addr laddr,u_int lport_arg,int wild_okay)764 in_pcblookup_local_and_cleanup(struct inpcbinfo *pcbinfo, struct in_addr laddr,
765     u_int lport_arg, int wild_okay)
766 {
767 	struct inpcb *inp;
768 
769 	/* Perform normal lookup */
770 	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);
771 
772 	/* Check if we found a match but it's waiting to be disposed */
773 	if (inp != NULL && inp->inp_wantcnt == WNT_STOPUSING) {
774 		struct socket *so = inp->inp_socket;
775 
776 		socket_lock(so, 0);
777 
778 		if (so->so_usecount == 0) {
779 			if (inp->inp_state != INPCB_STATE_DEAD) {
780 				in_pcbdetach(inp);
781 			}
782 			in_pcbdispose(inp);     /* will unlock & destroy */
783 			inp = NULL;
784 		} else {
785 			socket_unlock(so, 0);
786 		}
787 	}
788 
789 	return inp;
790 }
791 
792 static void
in_pcb_conflict_post_msg(u_int16_t port)793 in_pcb_conflict_post_msg(u_int16_t port)
794 {
795 	/*
796 	 * Radar 5523020 send a kernel event notification if a
797 	 * non-participating socket tries to bind the port a socket
798 	 * who has set SOF_NOTIFYCONFLICT owns.
799 	 */
800 	struct kev_msg ev_msg;
801 	struct kev_in_portinuse in_portinuse;
802 
803 	bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
804 	bzero(&ev_msg, sizeof(struct kev_msg));
805 	in_portinuse.port = ntohs(port);        /* port in host order */
806 	in_portinuse.req_pid = proc_selfpid();
807 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
808 	ev_msg.kev_class = KEV_NETWORK_CLASS;
809 	ev_msg.kev_subclass = KEV_INET_SUBCLASS;
810 	ev_msg.event_code = KEV_INET_PORTINUSE;
811 	ev_msg.dv[0].data_ptr = &in_portinuse;
812 	ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
813 	ev_msg.dv[1].data_length = 0;
814 	dlil_post_complete_msg(NULL, &ev_msg);
815 }
816 
817 /*
818  * Bind an INPCB to an address and/or port.  This routine should not alter
819  * the caller-supplied local address "nam".
820  *
821  * Returns:	0			Success
822  *		EADDRNOTAVAIL		Address not available.
823  *		EINVAL			Invalid argument
824  *		EAFNOSUPPORT		Address family not supported [notdef]
825  *		EACCES			Permission denied
826  *		EADDRINUSE		Address in use
827  *		EAGAIN			Resource unavailable, try again
828  *		priv_check_cred:EPERM	Operation not permitted
829  */
830 int
in_pcbbind(struct inpcb * inp,struct sockaddr * nam,struct proc * p)831 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
832 {
833 	struct socket *so = inp->inp_socket;
834 	unsigned short *lastport;
835 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
836 	u_short lport = 0, rand_port = 0;
837 	int wild = 0;
838 	int reuseport = (so->so_options & SO_REUSEPORT);
839 	int error = 0;
840 	int randomport;
841 	int conflict = 0;
842 	boolean_t anonport = FALSE;
843 	kauth_cred_t cred;
844 	struct in_addr laddr;
845 	struct ifnet *outif = NULL;
846 
847 	if (inp->inp_flags2 & INP2_BIND_IN_PROGRESS) {
848 		return EINVAL;
849 	}
850 	inp->inp_flags2 |= INP2_BIND_IN_PROGRESS;
851 
852 	if (TAILQ_EMPTY(&in_ifaddrhead)) { /* XXX broken! */
853 		error = EADDRNOTAVAIL;
854 		goto done;
855 	}
856 	if (!(so->so_options & (SO_REUSEADDR | SO_REUSEPORT))) {
857 		wild = 1;
858 	}
859 
860 	bzero(&laddr, sizeof(laddr));
861 
862 	socket_unlock(so, 0); /* keep reference on socket */
863 	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
864 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
865 		/* another thread completed the bind */
866 		lck_rw_done(&pcbinfo->ipi_lock);
867 		socket_lock(so, 0);
868 		error = EINVAL;
869 		goto done;
870 	}
871 
872 	if (nam != NULL) {
873 		if (nam->sa_len != sizeof(struct sockaddr_in)) {
874 			lck_rw_done(&pcbinfo->ipi_lock);
875 			socket_lock(so, 0);
876 			error = EINVAL;
877 			goto done;
878 		}
879 #if 0
880 		/*
881 		 * We should check the family, but old programs
882 		 * incorrectly fail to initialize it.
883 		 */
884 		if (nam->sa_family != AF_INET) {
885 			lck_rw_done(&pcbinfo->ipi_lock);
886 			socket_lock(so, 0);
887 			error = EAFNOSUPPORT;
888 			goto done;
889 		}
890 #endif /* 0 */
891 		lport = SIN(nam)->sin_port;
892 
893 		if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr))) {
894 			/*
895 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
896 			 * allow complete duplication of binding if
897 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
898 			 * and a multicast address is bound on both
899 			 * new and duplicated sockets.
900 			 */
901 			if (so->so_options & SO_REUSEADDR) {
902 				reuseport = SO_REUSEADDR | SO_REUSEPORT;
903 			}
904 		} else if (SIN(nam)->sin_addr.s_addr != INADDR_ANY) {
905 			struct sockaddr_in sin;
906 			struct ifaddr *ifa;
907 
908 			/* Sanitized for interface address searches */
909 			bzero(&sin, sizeof(sin));
910 			sin.sin_family = AF_INET;
911 			sin.sin_len = sizeof(struct sockaddr_in);
912 			sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
913 
914 			ifa = ifa_ifwithaddr(SA(&sin));
915 			if (ifa == NULL) {
916 				lck_rw_done(&pcbinfo->ipi_lock);
917 				socket_lock(so, 0);
918 				error = EADDRNOTAVAIL;
919 				goto done;
920 			} else {
921 				/*
922 				 * Opportunistically determine the outbound
923 				 * interface that may be used; this may not
924 				 * hold true if we end up using a route
925 				 * going over a different interface, e.g.
926 				 * when sending to a local address.  This
927 				 * will get updated again after sending.
928 				 */
929 				IFA_LOCK(ifa);
930 				outif = ifa->ifa_ifp;
931 				IFA_UNLOCK(ifa);
932 				IFA_REMREF(ifa);
933 			}
934 		}
935 
936 #if SKYWALK
937 		if (inp->inp_flags2 & INP2_EXTERNAL_PORT) {
938 			// Extract the external flow info
939 			struct ns_flow_info nfi = {};
940 			error = necp_client_get_netns_flow_info(inp->necp_client_uuid,
941 			    &nfi);
942 			if (error != 0) {
943 				lck_rw_done(&pcbinfo->ipi_lock);
944 				socket_lock(so, 0);
945 				goto done;
946 			}
947 
948 			// Extract the reserved port
949 			u_int16_t reserved_lport = 0;
950 			if (nfi.nfi_laddr.sa.sa_family == AF_INET) {
951 				reserved_lport = nfi.nfi_laddr.sin.sin_port;
952 			} else if (nfi.nfi_laddr.sa.sa_family == AF_INET6) {
953 				reserved_lport = nfi.nfi_laddr.sin6.sin6_port;
954 			} else {
955 				lck_rw_done(&pcbinfo->ipi_lock);
956 				socket_lock(so, 0);
957 				error = EINVAL;
958 				goto done;
959 			}
960 
961 			// Validate or use the reserved port
962 			if (lport == 0) {
963 				lport = reserved_lport;
964 			} else if (lport != reserved_lport) {
965 				lck_rw_done(&pcbinfo->ipi_lock);
966 				socket_lock(so, 0);
967 				error = EINVAL;
968 				goto done;
969 			}
970 		}
971 
972 		/* Do not allow reserving a UDP port if remaining UDP port count is below 4096 */
973 		if (SOCK_PROTO(so) == IPPROTO_UDP && !allow_udp_port_exhaustion) {
974 			uint32_t current_reservations = 0;
975 			if (inp->inp_vflag & INP_IPV6) {
976 				current_reservations = netns_lookup_reservations_count_in6(inp->in6p_laddr, IPPROTO_UDP);
977 			} else {
978 				current_reservations = netns_lookup_reservations_count_in(inp->inp_laddr, IPPROTO_UDP);
979 			}
980 			if (USHRT_MAX - UDP_RANDOM_PORT_RESERVE < current_reservations) {
981 				log(LOG_ERR, "UDP port not available, less than 4096 UDP ports left");
982 				lck_rw_done(&pcbinfo->ipi_lock);
983 				socket_lock(so, 0);
984 				error = EADDRNOTAVAIL;
985 				goto done;
986 			}
987 		}
988 
989 #endif /* SKYWALK */
990 
991 		if (lport != 0) {
992 			struct inpcb *t;
993 			uid_t u;
994 
995 #if XNU_TARGET_OS_OSX
996 			if (ntohs(lport) < IPPORT_RESERVED &&
997 			    SIN(nam)->sin_addr.s_addr != 0 &&
998 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
999 				cred = kauth_cred_proc_ref(p);
1000 				error = priv_check_cred(cred,
1001 				    PRIV_NETINET_RESERVEDPORT, 0);
1002 				kauth_cred_unref(&cred);
1003 				if (error != 0) {
1004 					lck_rw_done(&pcbinfo->ipi_lock);
1005 					socket_lock(so, 0);
1006 					error = EACCES;
1007 					goto done;
1008 				}
1009 			}
1010 #endif /* XNU_TARGET_OS_OSX */
1011 			/*
1012 			 * Check wether the process is allowed to bind to a restricted port
1013 			 */
1014 			if (!current_task_can_use_restricted_in_port(lport,
1015 			    (uint8_t)so->so_proto->pr_protocol, PORT_FLAGS_BSD)) {
1016 				lck_rw_done(&pcbinfo->ipi_lock);
1017 				socket_lock(so, 0);
1018 				error = EADDRINUSE;
1019 				goto done;
1020 			}
1021 
1022 			if (!IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
1023 			    (u = kauth_cred_getuid(so->so_cred)) != 0 &&
1024 			    (t = in_pcblookup_local_and_cleanup(
1025 				    inp->inp_pcbinfo, SIN(nam)->sin_addr, lport,
1026 				    INPLOOKUP_WILDCARD)) != NULL &&
1027 			    (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
1028 			    t->inp_laddr.s_addr != INADDR_ANY ||
1029 			    !(t->inp_socket->so_options & SO_REUSEPORT)) &&
1030 			    (u != kauth_cred_getuid(t->inp_socket->so_cred)) &&
1031 			    !(t->inp_socket->so_flags & SOF_REUSESHAREUID) &&
1032 			    (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
1033 			    t->inp_laddr.s_addr != INADDR_ANY) &&
1034 			    (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
1035 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
1036 			    uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
1037 				if ((t->inp_socket->so_flags &
1038 				    SOF_NOTIFYCONFLICT) &&
1039 				    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
1040 					conflict = 1;
1041 				}
1042 
1043 				lck_rw_done(&pcbinfo->ipi_lock);
1044 
1045 				if (conflict) {
1046 					in_pcb_conflict_post_msg(lport);
1047 				}
1048 
1049 				socket_lock(so, 0);
1050 				error = EADDRINUSE;
1051 				goto done;
1052 			}
1053 			t = in_pcblookup_local_and_cleanup(pcbinfo,
1054 			    SIN(nam)->sin_addr, lport, wild);
1055 			if (t != NULL &&
1056 			    (reuseport & t->inp_socket->so_options) == 0 &&
1057 			    (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
1058 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
1059 			    uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
1060 				if (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
1061 				    t->inp_laddr.s_addr != INADDR_ANY ||
1062 				    SOCK_DOM(so) != PF_INET6 ||
1063 				    SOCK_DOM(t->inp_socket) != PF_INET6) {
1064 					if ((t->inp_socket->so_flags &
1065 					    SOF_NOTIFYCONFLICT) &&
1066 					    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
1067 						conflict = 1;
1068 					}
1069 
1070 					lck_rw_done(&pcbinfo->ipi_lock);
1071 
1072 					if (conflict) {
1073 						in_pcb_conflict_post_msg(lport);
1074 					}
1075 					socket_lock(so, 0);
1076 					error = EADDRINUSE;
1077 					goto done;
1078 				}
1079 			}
1080 #if SKYWALK
1081 			if ((SOCK_PROTO(so) == IPPROTO_TCP ||
1082 			    SOCK_PROTO(so) == IPPROTO_UDP) &&
1083 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1084 				int res_err = 0;
1085 				if (inp->inp_vflag & INP_IPV6) {
1086 					res_err = netns_reserve_in6(
1087 						&inp->inp_netns_token,
1088 						SIN6(nam)->sin6_addr,
1089 						(uint8_t)SOCK_PROTO(so), lport, NETNS_BSD,
1090 						NULL);
1091 				} else {
1092 					res_err = netns_reserve_in(
1093 						&inp->inp_netns_token,
1094 						SIN(nam)->sin_addr, (uint8_t)SOCK_PROTO(so),
1095 						lport, NETNS_BSD, NULL);
1096 				}
1097 				if (res_err != 0) {
1098 					lck_rw_done(&pcbinfo->ipi_lock);
1099 					socket_lock(so, 0);
1100 					error = EADDRINUSE;
1101 					goto done;
1102 				}
1103 			}
1104 #endif /* SKYWALK */
1105 		}
1106 		laddr = SIN(nam)->sin_addr;
1107 	}
1108 	if (lport == 0) {
1109 		u_short first, last;
1110 		int count;
1111 		bool found;
1112 
1113 		/*
1114 		 * Override wild = 1 for implicit bind (mainly used by connect)
1115 		 * For implicit bind (lport == 0), we always use an unused port,
1116 		 * so REUSEADDR|REUSEPORT don't apply
1117 		 */
1118 		wild = 1;
1119 
1120 		randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
1121 		    (so->so_type == SOCK_STREAM ? tcp_use_randomport :
1122 		    udp_use_randomport);
1123 
1124 		/*
1125 		 * Even though this looks similar to the code in
1126 		 * in6_pcbsetport, the v6 vs v4 checks are different.
1127 		 */
1128 		anonport = TRUE;
1129 		if (inp->inp_flags & INP_HIGHPORT) {
1130 			first = (u_short)ipport_hifirstauto;     /* sysctl */
1131 			last  = (u_short)ipport_hilastauto;
1132 			lastport = &pcbinfo->ipi_lasthi;
1133 		} else if (inp->inp_flags & INP_LOWPORT) {
1134 			cred = kauth_cred_proc_ref(p);
1135 			error = priv_check_cred(cred,
1136 			    PRIV_NETINET_RESERVEDPORT, 0);
1137 			kauth_cred_unref(&cred);
1138 			if (error != 0) {
1139 				lck_rw_done(&pcbinfo->ipi_lock);
1140 				socket_lock(so, 0);
1141 				goto done;
1142 			}
1143 			first = (u_short)ipport_lowfirstauto;    /* 1023 */
1144 			last  = (u_short)ipport_lowlastauto;     /* 600 */
1145 			lastport = &pcbinfo->ipi_lastlow;
1146 		} else {
1147 			first = (u_short)ipport_firstauto;       /* sysctl */
1148 			last  = (u_short)ipport_lastauto;
1149 			lastport = &pcbinfo->ipi_lastport;
1150 		}
1151 		/* No point in randomizing if only one port is available */
1152 
1153 		if (first == last) {
1154 			randomport = 0;
1155 		}
1156 		/*
1157 		 * Simple check to ensure all ports are not used up causing
1158 		 * a deadlock here.
1159 		 *
1160 		 * We split the two cases (up and down) so that the direction
1161 		 * is not being tested on each round of the loop.
1162 		 */
1163 		if (first > last) {
1164 			struct in_addr lookup_addr;
1165 
1166 			/*
1167 			 * counting down
1168 			 */
1169 			if (randomport) {
1170 				read_frandom(&rand_port, sizeof(rand_port));
1171 				*lastport =
1172 				    first - (rand_port % (first - last));
1173 			}
1174 			count = first - last;
1175 
1176 			lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1177 			    inp->inp_laddr;
1178 
1179 			found = false;
1180 			do {
1181 				if (count-- < 0) {      /* completely used? */
1182 					lck_rw_done(&pcbinfo->ipi_lock);
1183 					socket_lock(so, 0);
1184 					error = EADDRNOTAVAIL;
1185 					goto done;
1186 				}
1187 				--*lastport;
1188 				if (*lastport > first || *lastport < last) {
1189 					*lastport = first;
1190 				}
1191 				lport = htons(*lastport);
1192 
1193 				/*
1194 				 * Skip if this is a restricted port as we do not want to
1195 				 * restricted ports as ephemeral
1196 				 */
1197 				if (IS_RESTRICTED_IN_PORT(lport)) {
1198 					continue;
1199 				}
1200 
1201 				found = in_pcblookup_local_and_cleanup(pcbinfo,
1202 				    lookup_addr, lport, wild) == NULL;
1203 #if SKYWALK
1204 				if (found &&
1205 				    (SOCK_PROTO(so) == IPPROTO_TCP ||
1206 				    SOCK_PROTO(so) == IPPROTO_UDP) &&
1207 				    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1208 					int res_err;
1209 					if (inp->inp_vflag & INP_IPV6) {
1210 						res_err = netns_reserve_in6(
1211 							&inp->inp_netns_token,
1212 							inp->in6p_laddr,
1213 							(uint8_t)SOCK_PROTO(so), lport,
1214 							NETNS_BSD, NULL);
1215 					} else {
1216 						res_err = netns_reserve_in(
1217 							&inp->inp_netns_token,
1218 							lookup_addr, (uint8_t)SOCK_PROTO(so),
1219 							lport, NETNS_BSD, NULL);
1220 					}
1221 					found = res_err == 0;
1222 				}
1223 #endif /* SKYWALK */
1224 			} while (!found);
1225 		} else {
1226 			struct in_addr lookup_addr;
1227 
1228 			/*
1229 			 * counting up
1230 			 */
1231 			if (randomport) {
1232 				read_frandom(&rand_port, sizeof(rand_port));
1233 				*lastport =
1234 				    first + (rand_port % (first - last));
1235 			}
1236 			count = last - first;
1237 
1238 			lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1239 			    inp->inp_laddr;
1240 
1241 			found = false;
1242 			do {
1243 				if (count-- < 0) {      /* completely used? */
1244 					lck_rw_done(&pcbinfo->ipi_lock);
1245 					socket_lock(so, 0);
1246 					error = EADDRNOTAVAIL;
1247 					goto done;
1248 				}
1249 				++*lastport;
1250 				if (*lastport < first || *lastport > last) {
1251 					*lastport = first;
1252 				}
1253 				lport = htons(*lastport);
1254 
1255 				/*
1256 				 * Skip if this is a restricted port as we do not want to
1257 				 * restricted ports as ephemeral
1258 				 */
1259 				if (IS_RESTRICTED_IN_PORT(lport)) {
1260 					continue;
1261 				}
1262 
1263 				found = in_pcblookup_local_and_cleanup(pcbinfo,
1264 				    lookup_addr, lport, wild) == NULL;
1265 #if SKYWALK
1266 				if (found &&
1267 				    (SOCK_PROTO(so) == IPPROTO_TCP ||
1268 				    SOCK_PROTO(so) == IPPROTO_UDP) &&
1269 				    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1270 					int res_err;
1271 					if (inp->inp_vflag & INP_IPV6) {
1272 						res_err = netns_reserve_in6(
1273 							&inp->inp_netns_token,
1274 							inp->in6p_laddr,
1275 							(uint8_t)SOCK_PROTO(so), lport,
1276 							NETNS_BSD, NULL);
1277 					} else {
1278 						res_err = netns_reserve_in(
1279 							&inp->inp_netns_token,
1280 							lookup_addr, (uint8_t)SOCK_PROTO(so),
1281 							lport, NETNS_BSD, NULL);
1282 					}
1283 					found = res_err == 0;
1284 				}
1285 #endif /* SKYWALK */
1286 			} while (!found);
1287 		}
1288 	}
1289 	socket_lock(so, 0);
1290 
1291 	/*
1292 	 * We unlocked socket's protocol lock for a long time.
1293 	 * The socket might have been dropped/defuncted.
1294 	 * Checking if world has changed since.
1295 	 */
1296 	if (inp->inp_state == INPCB_STATE_DEAD) {
1297 #if SKYWALK
1298 		netns_release(&inp->inp_netns_token);
1299 #endif /* SKYWALK */
1300 		lck_rw_done(&pcbinfo->ipi_lock);
1301 		error = ECONNABORTED;
1302 		goto done;
1303 	}
1304 
1305 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
1306 #if SKYWALK
1307 		netns_release(&inp->inp_netns_token);
1308 #endif /* SKYWALK */
1309 		lck_rw_done(&pcbinfo->ipi_lock);
1310 		error = EINVAL;
1311 		goto done;
1312 	}
1313 
1314 	if (laddr.s_addr != INADDR_ANY) {
1315 		inp->inp_laddr = laddr;
1316 		inp->inp_last_outifp = outif;
1317 #if SKYWALK
1318 		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
1319 			netns_set_ifnet(&inp->inp_netns_token, outif);
1320 		}
1321 #endif /* SKYWALK */
1322 	}
1323 	inp->inp_lport = lport;
1324 	if (anonport) {
1325 		inp->inp_flags |= INP_ANONPORT;
1326 	}
1327 
1328 	if (in_pcbinshash(inp, 1) != 0) {
1329 		inp->inp_laddr.s_addr = INADDR_ANY;
1330 		inp->inp_last_outifp = NULL;
1331 
1332 #if SKYWALK
1333 		netns_release(&inp->inp_netns_token);
1334 #endif /* SKYWALK */
1335 		inp->inp_lport = 0;
1336 		if (anonport) {
1337 			inp->inp_flags &= ~INP_ANONPORT;
1338 		}
1339 		lck_rw_done(&pcbinfo->ipi_lock);
1340 		error = EAGAIN;
1341 		goto done;
1342 	}
1343 	lck_rw_done(&pcbinfo->ipi_lock);
1344 	sflt_notify(so, sock_evt_bound, NULL);
1345 
1346 	in_pcb_check_management_entitled(inp);
1347 done:
1348 	inp->inp_flags2 &= ~INP2_BIND_IN_PROGRESS;
1349 	return error;
1350 }
1351 
/*
 * True if the destination is a special-use IPv4 address (link-local,
 * loopback, zeronet, multicast, or private ranges) for which an APN
 * fallback notification should never be triggered.  Evaluates its
 * argument multiple times; callers pass a plain sockaddr_in pointer.
 */
#define APN_FALLBACK_IP_FILTER(a)       \
	(IN_LINKLOCAL(ntohl((a)->sin_addr.s_addr)) || \
	 IN_LOOPBACK(ntohl((a)->sin_addr.s_addr)) || \
	 IN_ZERONET(ntohl((a)->sin_addr.s_addr)) || \
	 IN_MULTICAST(ntohl((a)->sin_addr.s_addr)) || \
	 IN_PRIVATE(ntohl((a)->sin_addr.s_addr)))

/* Minimum spacing between fallback notifications, in net_uptime() ticks */
#define APN_FALLBACK_NOTIF_INTERVAL     2 /* Magic Number */
/* net_uptime() timestamp of the last notification posted; used to throttle */
static uint64_t last_apn_fallback = 0;
1361 
/*
 * Decide whether an APN fallback notification should be posted for a
 * connection attempt by "proc" on socket "so" toward "p_dstv4" (which
 * may be NULL).
 *
 * Returns TRUE only when ALL of the following hold:
 *  - the apn_fallbk_enabled switch is on and the caller is not kernproc;
 *  - the socket has not opted out via SO_NOAPNFALLBK;
 *  - we are outside the APN_FALLBACK_NOTIF_INTERVAL throttle window;
 *  - the destination (if supplied) is not a special-use address
 *    (see APN_FALLBACK_IP_FILTER);
 *  - there is an unscoped default IPv6 route over a cellular interface,
 *    and no unscoped default IPv4 route;
 *  - the process looks like a third-party app: its code-signing identity
 *    contains a dot and is not "com.apple.*";
 *  - the binary's atime and birthtime both predate the App Store IPv6
 *    requirement cutoff (June 1st, 2016).
 */
static boolean_t
apn_fallback_required(proc_t proc, struct socket *so, struct sockaddr_in *p_dstv4)
{
	uint64_t timenow;
	struct sockaddr_storage lookup_default_addr;
	struct rtentry *rt = NULL;

	VERIFY(proc != NULL);

	/* Global enable switch and basic caller/socket opt-outs. */
	if (apn_fallbk_enabled == FALSE) {
		return FALSE;
	}

	if (proc == kernproc) {
		return FALSE;
	}

	if (so && (so->so_options & SO_NOAPNFALLBK)) {
		return FALSE;
	}

	/* Rate-limit notifications using the last_apn_fallback timestamp. */
	timenow = net_uptime();
	if ((timenow - last_apn_fallback) < APN_FALLBACK_NOTIF_INTERVAL) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification throttled.\n"));
		return FALSE;
	}

	/* Never trigger for link-local/loopback/zeronet/multicast/private. */
	if (p_dstv4 && APN_FALLBACK_IP_FILTER(p_dstv4)) {
		return FALSE;
	}

	/* Check if we have unscoped IPv6 default route through cellular */
	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET6;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in6);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);
	if (NULL == rt) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route.\n"));
		return FALSE;
	}

	if (!IFNET_IS_CELLULAR(rt->rt_ifp)) {
		rtfree(rt);
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route through cellular interface.\n"));
		return FALSE;
	}

	/*
	 * We have a default IPv6 route, ensure that
	 * we do not have IPv4 default route before triggering
	 * the event
	 */
	rtfree(rt);
	rt = NULL;

	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);

	if (rt) {
		rtfree(rt);
		rt = NULL;
		apn_fallbk_log((LOG_INFO, "APN fallback notification found unscoped "
		    "IPv4 default route!\n"));
		return FALSE;
	}

	{
		/*
		 * We disable APN fallback if the binary is not a third-party app.
		 * Note that platform daemons use their process name as a
		 * bundle ID so we filter out bundle IDs without dots.
		 */
		const char *bundle_id = cs_identity_get(proc);
		if (bundle_id == NULL ||
		    bundle_id[0] == '\0' ||
		    strchr(bundle_id, '.') == NULL ||
		    strncmp(bundle_id, "com.apple.", sizeof("com.apple.") - 1) == 0) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found first-"
			    "party bundle ID \"%s\"!\n", (bundle_id ? bundle_id : "NULL")));
			return FALSE;
		}
	}

	{
		/*
		 * The Apple App Store IPv6 requirement started on
		 * June 1st, 2016 at 12:00:00 AM PDT.
		 * We disable APN fallback if the binary is more recent than that.
		 * We check both atime and birthtime since birthtime is not always supported.
		 */
		static const long ipv6_start_date = 1464764400L;
		vfs_context_t context;
		struct stat64 sb;
		int vn_stat_error;

		/* Stat the process text vnode to learn the binary's timestamps. */
		bzero(&sb, sizeof(struct stat64));
		context = vfs_context_create(NULL);
		vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, 0, context);
		(void)vfs_context_rele(context);

		if (vn_stat_error != 0 ||
		    sb.st_atimespec.tv_sec >= ipv6_start_date ||
		    sb.st_birthtimespec.tv_sec >= ipv6_start_date) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found binary "
			    "too recent! (err %d atime %ld mtime %ld ctime %ld birthtime %ld)\n",
			    vn_stat_error, sb.st_atimespec.tv_sec, sb.st_mtimespec.tv_sec,
			    sb.st_ctimespec.tv_sec, sb.st_birthtimespec.tv_sec));
			return FALSE;
		}
	}
	return TRUE;
}
1480 
1481 static void
apn_fallback_trigger(proc_t proc,struct socket * so)1482 apn_fallback_trigger(proc_t proc, struct socket *so)
1483 {
1484 	pid_t pid = 0;
1485 	struct kev_msg ev_msg;
1486 	struct kev_netevent_apnfallbk_data apnfallbk_data;
1487 
1488 	last_apn_fallback = net_uptime();
1489 	pid = proc_pid(proc);
1490 	uuid_t application_uuid;
1491 	uuid_clear(application_uuid);
1492 	proc_getexecutableuuid(proc, application_uuid,
1493 	    sizeof(application_uuid));
1494 
1495 	bzero(&ev_msg, sizeof(struct kev_msg));
1496 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1497 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1498 	ev_msg.kev_subclass     = KEV_NETEVENT_SUBCLASS;
1499 	ev_msg.event_code       = KEV_NETEVENT_APNFALLBACK;
1500 
1501 	bzero(&apnfallbk_data, sizeof(apnfallbk_data));
1502 
1503 	if (so->so_flags & SOF_DELEGATED) {
1504 		apnfallbk_data.epid = so->e_pid;
1505 		uuid_copy(apnfallbk_data.euuid, so->e_uuid);
1506 	} else {
1507 		apnfallbk_data.epid = so->last_pid;
1508 		uuid_copy(apnfallbk_data.euuid, so->last_uuid);
1509 	}
1510 
1511 	ev_msg.dv[0].data_ptr   = &apnfallbk_data;
1512 	ev_msg.dv[0].data_length = sizeof(apnfallbk_data);
1513 	kev_post_msg(&ev_msg);
1514 	apn_fallbk_log((LOG_INFO, "APN fallback notification issued.\n"));
1515 }
1516 
1517 /*
1518  * Transform old in_pcbconnect() into an inner subroutine for new
1519  * in_pcbconnect(); do some validity-checking on the remote address
1520  * (in "nam") and then determine local host address (i.e., which
1521  * interface) to use to access that remote host.
1522  *
1523  * This routine may alter the caller-supplied remote address "nam".
1524  *
1525  * The caller may override the bound-to-interface setting of the socket
1526  * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1527  *
1528  * This routine might return an ifp with a reference held if the caller
1529  * provides a non-NULL outif, even in the error case.  The caller is
1530  * responsible for releasing its reference.
1531  *
1532  * Returns:	0			Success
1533  *		EINVAL			Invalid argument
1534  *		EAFNOSUPPORT		Address family not supported
1535  *		EADDRNOTAVAIL		Address not available
1536  */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr,
    unsigned int ifscope, struct ifnet **outif, int raw)
{
	struct route *ro = &inp->inp_route;
	struct in_ifaddr *ia = NULL;
	struct sockaddr_in sin;
	int error = 0;
	boolean_t restricted = FALSE;   /* send denied by interface policy */

	if (outif != NULL) {
		*outif = NULL;
	}
	/*
	 * Validate the remote address: exact sockaddr_in length, AF_INET
	 * family, and (unless "raw") a non-zero destination port.
	 */
	if (nam->sa_len != sizeof(struct sockaddr_in)) {
		return EINVAL;
	}
	if (SIN(nam)->sin_family != AF_INET) {
		return EAFNOSUPPORT;
	}
	if (raw == 0 && SIN(nam)->sin_port == 0) {
		return EADDRNOTAVAIL;
	}

	in_pcb_check_management_entitled(inp);

	/*
	 * If the destination address is INADDR_ANY,
	 * use the primary local address.
	 * If the supplied address is INADDR_BROADCAST,
	 * and the primary interface supports broadcast,
	 * choose the broadcast address for that interface.
	 */
	if (raw == 0 && (SIN(nam)->sin_addr.s_addr == INADDR_ANY ||
	    SIN(nam)->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST)) {
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		if (!TAILQ_EMPTY(&in_ifaddrhead)) {
			/* "Primary" address == first entry of in_ifaddrhead */
			ia = TAILQ_FIRST(&in_ifaddrhead);
			IFA_LOCK_SPIN(&ia->ia_ifa);
			if (SIN(nam)->sin_addr.s_addr == INADDR_ANY) {
				SIN(nam)->sin_addr = IA_SIN(ia)->sin_addr;
			} else if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
				SIN(nam)->sin_addr =
				    SIN(&ia->ia_broadaddr)->sin_addr;
			}
			IFA_UNLOCK(&ia->ia_ifa);
			ia = NULL;
		}
		lck_rw_done(&in_ifaddr_rwlock);
	}
	/*
	 * Otherwise, if the socket has already bound the source, just use it.
	 */
	if (inp->inp_laddr.s_addr != INADDR_ANY) {
		VERIFY(ia == NULL);
		*laddr = inp->inp_laddr;
		return 0;
	}

	/*
	 * If the ifscope is specified by the caller (e.g. IP_PKTINFO)
	 * then it overrides the sticky ifscope set for the socket.
	 */
	if (ifscope == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) {
		ifscope = inp->inp_boundifp->if_index;
	}

	/*
	 * If route is known or can be allocated now,
	 * our src addr is taken from the i/f, else punt.
	 * Note that we should check the address family of the cached
	 * destination, in case of sharing the cache with IPv6.
	 */
	if (ro->ro_rt != NULL) {
		RT_LOCK_SPIN(ro->ro_rt);
	}
	/* Discard the cached route if unusable, stale, or routing disabled. */
	if (ROUTE_UNUSABLE(ro) || ro->ro_dst.sa_family != AF_INET ||
	    SIN(&ro->ro_dst)->sin_addr.s_addr != SIN(nam)->sin_addr.s_addr ||
	    (inp->inp_socket->so_options & SO_DONTROUTE)) {
		if (ro->ro_rt != NULL) {
			RT_UNLOCK(ro->ro_rt);
		}
		ROUTE_RELEASE(ro);
	}
	if (!(inp->inp_socket->so_options & SO_DONTROUTE) &&
	    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
		if (ro->ro_rt != NULL) {
			RT_UNLOCK(ro->ro_rt);
		}
		ROUTE_RELEASE(ro);
		/* No route yet, so try to acquire one */
		bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
		ro->ro_dst.sa_family = AF_INET;
		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
		SIN(&ro->ro_dst)->sin_addr = SIN(nam)->sin_addr;
		rtalloc_scoped(ro, ifscope);
		if (ro->ro_rt != NULL) {
			RT_LOCK_SPIN(ro->ro_rt);
		}
	}
	/* Sanitized local copy for interface address searches */
	bzero(&sin, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
	/*
	 * If we did not find (or use) a route, assume dest is reachable
	 * on a directly connected network and try to find a corresponding
	 * interface to take the source address from.
	 */
	if (ro->ro_rt == NULL) {
		proc_t proc = current_proc();

		VERIFY(ia == NULL);
		ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
		}
		error = ((ia == NULL) ? ENETUNREACH : 0);

		/* No IPv4 route at all: consider posting an APN fallback event */
		if (apn_fallback_required(proc, inp->inp_socket,
		    (void *)nam)) {
			apn_fallback_trigger(proc, inp->inp_socket);
		}

		goto done;
	}
	RT_LOCK_ASSERT_HELD(ro->ro_rt);
	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 */
	if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
		VERIFY(ia == NULL);
		/*
		 * If the route points to a cellular interface and the
		 * caller forbids our using interfaces of such type,
		 * pretend that there is no route.
		 * Apply the same logic for expensive interfaces.
		 */
		if (inp_restricted_send(inp, ro->ro_rt->rt_ifp)) {
			RT_UNLOCK(ro->ro_rt);
			ROUTE_RELEASE(ro);
			error = EHOSTUNREACH;
			restricted = TRUE;
		} else {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(ro->ro_rt);
			/* Take a reference on the route's ifaddr; dropped in done: */
			ia = ifatoia(ro->ro_rt->rt_ifa);
			IFA_ADDREF(&ia->ia_ifa);

			/*
			 * Mark the control block for notification of
			 * a possible flow that might undergo clat46
			 * translation.
			 *
			 * We defer the decision to a later point when
			 * inpcb is being disposed off.
			 * The reason is that we only want to send notification
			 * if the flow was ever used to send data.
			 */
			if (IS_INTF_CLAT46(ro->ro_rt->rt_ifp)) {
				inp->inp_flags2 |= INP2_CLAT46_FLOW;
			}

			RT_UNLOCK(ro->ro_rt);
			error = 0;
		}
		goto done;
	}
	VERIFY(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK);
	RT_UNLOCK(ro->ro_rt);
	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 */
	VERIFY(ia == NULL);
	ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
	if (ia == NULL) {
		ia = ifatoia(ifa_ifwithaddr_scoped(SA(&sin), ifscope));
	}
	if (ia == NULL) {
		ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
	}
	if (ia == NULL) {
		/* Last resort: fall back to the route's own ifaddr */
		RT_LOCK(ro->ro_rt);
		ia = ifatoia(ro->ro_rt->rt_ifa);
		if (ia != NULL) {
			IFA_ADDREF(&ia->ia_ifa);
		}
		RT_UNLOCK(ro->ro_rt);
	}
	error = ((ia == NULL) ? ENETUNREACH : 0);

done:
	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, use the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
	    inp->inp_moptions != NULL) {
		struct ip_moptions *imo;
		struct ifnet *ifp;

		imo = inp->inp_moptions;
		IMO_LOCK(imo);
		if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
		    ia->ia_ifp != imo->imo_multicast_ifp)) {
			ifp = imo->imo_multicast_ifp;
			if (ia != NULL) {
				IFA_REMREF(&ia->ia_ifa);
			}
			/* Find an in_ifaddr belonging to the multicast ifp */
			lck_rw_lock_shared(&in_ifaddr_rwlock);
			TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
				if (ia->ia_ifp == ifp) {
					break;
				}
			}
			if (ia != NULL) {
				IFA_ADDREF(&ia->ia_ifa);
			}
			lck_rw_done(&in_ifaddr_rwlock);
			if (ia == NULL) {
				error = EADDRNOTAVAIL;
			} else {
				error = 0;
			}
		}
		IMO_UNLOCK(imo);
	}
	/*
	 * Don't do pcblookup call here; return interface in laddr
	 * and exit to caller, that will do the lookup.
	 */
	if (ia != NULL) {
		/*
		 * If the source address belongs to a cellular interface
		 * and the socket forbids our using interfaces of such
		 * type, pretend that there is no source address.
		 * Apply the same logic for expensive interfaces.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if (inp_restricted_send(inp, ia->ia_ifa.ifa_ifp)) {
			IFA_UNLOCK(&ia->ia_ifa);
			error = EHOSTUNREACH;
			restricted = TRUE;
		} else if (error == 0) {
			*laddr = ia->ia_addr.sin_addr;
			if (outif != NULL) {
				struct ifnet *ifp;

				/* Prefer the route's ifp over the ifaddr's */
				if (ro->ro_rt != NULL) {
					ifp = ro->ro_rt->rt_ifp;
				} else {
					ifp = ia->ia_ifp;
				}

				VERIFY(ifp != NULL);
				IFA_CONVERT_LOCK(&ia->ia_ifa);
				ifnet_reference(ifp);   /* for caller */
				if (*outif != NULL) {
					ifnet_release(*outif);
				}
				*outif = ifp;
			}
			IFA_UNLOCK(&ia->ia_ifa);
		} else {
			IFA_UNLOCK(&ia->ia_ifa);
		}
		IFA_REMREF(&ia->ia_ifa);
		ia = NULL;
	}

	/* Let interested parties know the send was denied by policy */
	if (restricted && error == EHOSTUNREACH) {
		soevent(inp->inp_socket, (SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_IFDENIED));
	}

	return error;
}
1820 
1821 /*
1822  * Outer subroutine:
1823  * Connect from a socket to a specified address.
1824  * Both address and port must be specified in argument sin.
1825  * If don't have a local address for this socket yet,
1826  * then pick one.
1827  *
1828  * The caller may override the bound-to-interface setting of the socket
1829  * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1830  */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
    unsigned int ifscope, struct ifnet **outif)
{
	struct in_addr laddr;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct inpcb *pcb;
	int error;
	struct socket *so = inp->inp_socket;

#if CONTENT_FILTER
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	/*
	 *   Call inner routine, to assign local interface address.
	 */
	if ((error = in_pcbladdr(inp, nam, &laddr, ifscope, outif, 0)) != 0) {
		return error;
	}

	/*
	 * Make sure the 4-tuple (laddr/lport, faddr/fport) is not already
	 * in use.  The socket lock is dropped across the lookup and the
	 * socket state is re-validated below.
	 */
	socket_unlock(so, 0);
	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr ? inp->inp_laddr : laddr,
	    inp->inp_lport, 0, NULL);
	socket_lock(so, 0);

	/*
	 * Check if the socket is still in a valid state. When we unlock this
	 * embryonic socket, it can get aborted if another thread is closing
	 * the listener (radar 7947600).
	 */
	if ((so->so_flags & SOF_ABORTED) != 0) {
		return ECONNREFUSED;
	}

	if (pcb != NULL) {
		/* The 4-tuple is taken; release the lookup reference */
		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
		return EADDRINUSE;
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		/*
		 * No local address bound yet: pick an ephemeral port if
		 * needed, then commit the chosen local address under the
		 * pcbinfo lock.
		 */
		if (inp->inp_lport == 0) {
			error = in_pcbbind(inp, NULL, p);
			if (error) {
				return error;
			}
		}
		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets.
			 */
			socket_unlock(so, 0);
			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
			socket_lock(so, 0);
		}
		inp->inp_laddr = laddr;
		/* no reference needed */
		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
#if SKYWALK
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			netns_set_ifnet(&inp->inp_netns_token,
			    inp->inp_last_outifp);
		}
#endif /* SKYWALK */
		/* Remember that the local address was implicitly chosen */
		inp->inp_flags |= INP_INADDR_ANY;
	} else {
		/*
		 * Usage of IP_PKTINFO, without local port already
		 * speficified will cause kernel to panic,
		 * see rdar://problem/18508185.
		 * For now returning error to avoid a kernel panic
		 * This routines can be refactored and handle this better
		 * in future.
		 */
		if (inp->inp_lport == 0) {
			return EINVAL;
		}
		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets.
			 */
			socket_unlock(so, 0);
			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
			socket_lock(so, 0);
		}
	}
	/* Commit the foreign address/port and re-hash the PCB */
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		/* NOTE(review): drops cached nstat state before the
		 * flow tuple changes -- see nstat_pcb_invalidate_cache */
		nstat_pcb_invalidate_cache(inp);
	}
	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	return 0;
}
1930 
void
in_pcbdisconnect(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/*
	 * For UDP, cache the nstat counters before the foreign address
	 * is cleared; in_pcbconnect() invalidates this cache later.
	 */
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		nstat_pcb_cache(inp);
	}

	/* Clear the foreign 2-tuple; the PCB reverts to unconnected. */
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

#if CONTENT_FILTER
	/* Bump the counter so content filters notice the state change. */
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(so, 0);
		lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
		socket_lock(so, 0);
	}

	/* Re-bucket the PCB now that the foreign address/port are wild. */
	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	/*
	 * A multipath subflow socket would have its SS_NOFDREF set by default,
	 * so check for SOF_MP_SUBFLOW socket flag before detaching the PCB;
	 * when the socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF)) {
		in_pcbdetach(inp);
	}
}
1967 
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == NULL) {
		/* PCB has been disposed */
		panic("%s: inp=%p so=%p proto=%d so_pcb is null!", __func__,
		    inp, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

#if IPSEC
	/* Drop any per-PCB IPsec policy before tearing down. */
	if (inp->inp_sp != NULL) {
		(void) ipsec4_delete_pcbpolicy(inp);
	}
#endif /* IPSEC */

	/* Account for UDP sockets that never moved any data. */
	if (inp->inp_stat != NULL && SOCK_PROTO(so) == IPPROTO_UDP) {
		if (inp->inp_stat->rxpackets == 0 && inp->inp_stat->txpackets == 0) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_no_data);
		}
	}

	/*
	 * Let NetworkStatistics know this PCB is going away
	 * before we detach it.
	 */
	if (nstat_collect &&
	    (SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP)) {
		nstat_pcb_detach(inp);
	}

	/* Free memory buffer held for generating keep alives */
	if (inp->inp_keepalive_data != NULL) {
		kfree_data(inp->inp_keepalive_data, inp->inp_keepalive_datalen);
		inp->inp_keepalive_data = NULL;
	}

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING) {
		panic("%s: so=%p proto=%d couldn't set to STOPUSING",
		    __func__, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

#if SKYWALK
	/* Free up the port in the namespace registrar if not in TIME_WAIT */
	if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
		netns_release(&inp->inp_netns_token);
		netns_release(&inp->inp_wildcard_netns_token);
	}
#endif /* SKYWALK */

	if (!(so->so_flags & SOF_PCBCLEARING)) {
		struct ip_moptions *imo;

		inp->inp_vflag = 0;
		/* Release cached IP options, route and multicast options. */
		if (inp->inp_options != NULL) {
			(void) m_free(inp->inp_options);
			inp->inp_options = NULL;
		}
		ROUTE_RELEASE(&inp->inp_route);
		imo = inp->inp_moptions;
		if (imo != NULL) {
			IMO_REMREF(imo);
		}
		inp->inp_moptions = NULL;
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;

		/*
		 * Enqueue an event to send kernel event notification
		 * if the flow has to CLAT46 for data packets
		 */
		if (inp->inp_flags2 & INP2_CLAT46_FLOW) {
			/*
			 * If there has been any exchange of data bytes
			 * over this flow.
			 * Schedule a notification to report that flow is
			 * using client side translation.
			 */
			if (inp->inp_stat != NULL &&
			    (inp->inp_stat->txbytes != 0 ||
			    inp->inp_stat->rxbytes != 0)) {
				/* Attribute the event to the effective PID if delegated. */
				if (so->so_flags & SOF_DELEGATED) {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->e_pid,
						so->e_uuid);
				} else {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->last_pid,
						so->last_uuid);
				}
			}
		}

		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;

		/* Ask the PCB garbage collector to reap this PCB soon. */
		inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST);
	}
}
2073 
2074 
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

	/* Sanity: the socket must have dropped its last use count. */
	if (so != NULL && so->so_usecount != 0) {
		panic("%s: so %p [%d,%d] usecount %d lockhistory %s",
		    __func__, so, SOCK_DOM(so), SOCK_TYPE(so), so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		/* The PCB must already have been marked STOPUSING. */
		if (so != NULL) {
			panic_plain("%s: inp %p invalid wantcnt %d, so %p "
			    "[%d,%d] usecount %d retaincnt %d state 0x%x "
			    "flags 0x%x lockhistory %s\n", __func__, inp,
			    inp->inp_wantcnt, so, SOCK_DOM(so), SOCK_TYPE(so),
			    so->so_usecount, so->so_retaincnt, so->so_state,
			    so->so_flags, solockhistory_nr(so));
			/* NOTREACHED */
		} else {
			panic("%s: inp %p invalid wantcnt %d no socket",
			    __func__, inp, inp->inp_wantcnt);
			/* NOTREACHED */
		}
	}

	LCK_RW_ASSERT(&ipi->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so != NULL) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			if (so->so_rcv.sb_cc > 0 || so->so_snd.sb_cc > 0) {
				/*
				 * selthreadclear() already called
				 * during sofreelastref() above.
				 */
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL) {
				panic("%s: so=%p head still exist",
				    __func__, so);
				/* NOTREACHED */
			}
			lck_mtx_unlock(&inp->inpcb_mtx);

#if NECP
			necp_inpcb_remove_cb(inp);
#endif /* NECP */

			lck_mtx_destroy(&inp->inpcb_mtx, ipi->ipi_lock_grp);
		}
		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;
		so->so_saved_pcb = (caddr_t)inp;
		/* Sever the socket <-> PCB linkage in both directions. */
		so->so_pcb = NULL;
		inp->inp_socket = NULL;
#if NECP
		necp_inpcb_dispose(inp);
#endif /* NECP */
		/*
		 * In case there a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		ROUTE_RELEASE(&inp->inp_route);
		/*
		 * When the PCB is embedded in the cached socket buffer
		 * (SOF1_CACHED_IN_SOCK_LAYER), sodealloc() reclaims it;
		 * otherwise free it to its zone here.
		 */
		if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
}
2152 
2153 /*
2154  * The calling convention of in_getsockaddr() and in_getpeeraddr() was
2155  * modified to match the pru_sockaddr() and pru_peeraddr() entry points
2156  * in struct pr_usrreqs, so that protocols can just reference then directly
2157  * without the need for a wrapper function.
2158  */
2159 int
in_getsockaddr(struct socket * so,struct sockaddr ** nam)2160 in_getsockaddr(struct socket *so, struct sockaddr **nam)
2161 {
2162 	struct inpcb *inp;
2163 	struct sockaddr_in *sin;
2164 
2165 	/*
2166 	 * Do the malloc first in case it blocks.
2167 	 */
2168 	sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2169 	    Z_WAITOK | Z_NOFAIL);
2170 
2171 	sin->sin_family = AF_INET;
2172 
2173 	if ((inp = sotoinpcb(so)) == NULL) {
2174 		free_sockaddr(sin);
2175 		return EINVAL;
2176 	}
2177 	sin->sin_port = inp->inp_lport;
2178 	sin->sin_addr = inp->inp_laddr;
2179 
2180 	*nam = (struct sockaddr *)sin;
2181 	return 0;
2182 }
2183 
2184 int
in_getsockaddr_s(struct socket * so,struct sockaddr_in * ss)2185 in_getsockaddr_s(struct socket *so, struct sockaddr_in *ss)
2186 {
2187 	struct sockaddr_in *sin = ss;
2188 	struct inpcb *inp;
2189 
2190 	VERIFY(ss != NULL);
2191 	bzero(ss, sizeof(*ss));
2192 
2193 	sin->sin_family = AF_INET;
2194 	sin->sin_len = sizeof(*sin);
2195 
2196 	if ((inp = sotoinpcb(so)) == NULL) {
2197 		return EINVAL;
2198 	}
2199 
2200 	sin->sin_port = inp->inp_lport;
2201 	sin->sin_addr = inp->inp_laddr;
2202 	return 0;
2203 }
2204 
2205 int
in_getpeeraddr(struct socket * so,struct sockaddr ** nam)2206 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
2207 {
2208 	struct inpcb *inp;
2209 	struct sockaddr_in *sin;
2210 
2211 	/*
2212 	 * Do the malloc first in case it blocks.
2213 	 */
2214 	sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2215 	    Z_WAITOK | Z_NOFAIL);
2216 
2217 	sin->sin_family = AF_INET;
2218 
2219 	if ((inp = sotoinpcb(so)) == NULL) {
2220 		free_sockaddr(sin);
2221 		return EINVAL;
2222 	}
2223 	sin->sin_port = inp->inp_fport;
2224 	sin->sin_addr = inp->inp_faddr;
2225 
2226 	*nam = (struct sockaddr *)sin;
2227 	return 0;
2228 }
2229 
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	/* Shared hold on the PCB list; each match is then locked itself. */
	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		/* Only IPv4 PCBs. */
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Only PCBs connected to faddr that still have a socket. */
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL) {
			continue;
		}
		/* Take a use-count reference; skip PCBs being torn down. */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			continue;
		}
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		(void) in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
}
2256 
2257 /*
2258  * Check for alternatives when higher level complains
2259  * about service problems.  For now, invalidate cached
2260  * routing information.  If the route was created dynamically
2261  * (by a redirect), time to try a default gateway again.
2262  */
void
in_losing(struct inpcb *inp)
{
	boolean_t release = FALSE;
	struct rtentry *rt;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia = NULL;

		RT_LOCK(rt);
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			/* Redirect-created route: delete it entirely. */
			(void) rtrequest(RTM_DELETE, rt_key(rt),
			    rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if (inp->inp_laddr.s_addr != INADDR_ANY &&
		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			/*
			 * Address is around; ditch the route.  A new route
			 * can be allocated the next time output is attempted.
			 */
			release = TRUE;
		}
		if (ia != NULL) {
			/* ifa_foraddr() returned a held reference. */
			IFA_REMREF(&ia->ia_ifa);
		}
	}
	if (rt == NULL || release) {
		ROUTE_RELEASE(&inp->inp_route);
	}
}
2303 
2304 /*
2305  * After a routing change, flush old routing
2306  * and allocate a (hopefully) better one.
2307  */
2308 void
in_rtchange(struct inpcb * inp,int errno)2309 in_rtchange(struct inpcb *inp, int errno)
2310 {
2311 #pragma unused(errno)
2312 	boolean_t release = FALSE;
2313 	struct rtentry *rt;
2314 
2315 	if ((rt = inp->inp_route.ro_rt) != NULL) {
2316 		struct in_ifaddr *ia = NULL;
2317 
2318 		/* if address is gone, keep the old route */
2319 		if (inp->inp_laddr.s_addr != INADDR_ANY &&
2320 		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
2321 			/*
2322 			 * Address is around; ditch the route.  A new route
2323 			 * can be allocated the next time output is attempted.
2324 			 */
2325 			release = TRUE;
2326 		}
2327 		if (ia != NULL) {
2328 			IFA_REMREF(&ia->ia_ifa);
2329 		}
2330 	}
2331 	if (rt == NULL || release) {
2332 		ROUTE_RELEASE(&inp->inp_route);
2333 	}
2334 }
2335 
2336 /*
2337  * Lookup a PCB based on the local address and port.
2338  */
2339 struct inpcb *
in_pcblookup_local(struct inpcbinfo * pcbinfo,struct in_addr laddr,unsigned int lport_arg,int wild_okay)2340 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2341     unsigned int lport_arg, int wild_okay)
2342 {
2343 	struct inpcb *inp;
2344 	int matchwild = 3, wildcard;
2345 	u_short lport = (u_short)lport_arg;
2346 
2347 	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0, 0, 0, 0, 0);
2348 
2349 	if (!wild_okay) {
2350 		struct inpcbhead *head;
2351 		/*
2352 		 * Look for an unconnected (wildcard foreign addr) PCB that
2353 		 * matches the local address and port we're looking for.
2354 		 */
2355 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
2356 		    pcbinfo->ipi_hashmask)];
2357 		LIST_FOREACH(inp, head, inp_hash) {
2358 			if (!(inp->inp_vflag & INP_IPV4)) {
2359 				continue;
2360 			}
2361 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
2362 			    inp->inp_laddr.s_addr == laddr.s_addr &&
2363 			    inp->inp_lport == lport) {
2364 				/*
2365 				 * Found.
2366 				 */
2367 				return inp;
2368 			}
2369 		}
2370 		/*
2371 		 * Not found.
2372 		 */
2373 		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0, 0, 0, 0, 0);
2374 		return NULL;
2375 	} else {
2376 		struct inpcbporthead *porthash;
2377 		struct inpcbport *phd;
2378 		struct inpcb *match = NULL;
2379 		/*
2380 		 * Best fit PCB lookup.
2381 		 *
2382 		 * First see if this local port is in use by looking on the
2383 		 * port hash list.
2384 		 */
2385 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2386 		    pcbinfo->ipi_porthashmask)];
2387 		LIST_FOREACH(phd, porthash, phd_hash) {
2388 			if (phd->phd_port == lport) {
2389 				break;
2390 			}
2391 		}
2392 		if (phd != NULL) {
2393 			/*
2394 			 * Port is in use by one or more PCBs. Look for best
2395 			 * fit.
2396 			 */
2397 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2398 				wildcard = 0;
2399 				if (!(inp->inp_vflag & INP_IPV4)) {
2400 					continue;
2401 				}
2402 				if (inp->inp_faddr.s_addr != INADDR_ANY) {
2403 					wildcard++;
2404 				}
2405 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
2406 					if (laddr.s_addr == INADDR_ANY) {
2407 						wildcard++;
2408 					} else if (inp->inp_laddr.s_addr !=
2409 					    laddr.s_addr) {
2410 						continue;
2411 					}
2412 				} else {
2413 					if (laddr.s_addr != INADDR_ANY) {
2414 						wildcard++;
2415 					}
2416 				}
2417 				if (wildcard < matchwild) {
2418 					match = inp;
2419 					matchwild = wildcard;
2420 					if (matchwild == 0) {
2421 						break;
2422 					}
2423 				}
2424 			}
2425 		}
2426 		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,
2427 		    0, 0, 0, 0);
2428 		return match;
2429 	}
2430 }
2431 
2432 /*
2433  * Check if PCB exists in hash list.
2434  */
int
in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
    uid_t *uid, gid_t *gid, struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
	int found = 0;
	struct inpcb *local_wild = NULL;
	struct inpcb *local_wild_mapped = NULL;

	/* Defaults if no owner is found. */
	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Honor interface-scoped receive restrictions. */
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
					inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					inp->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
	}

	if (!wildcard) {
		/*
		 * Not found.
		 */
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}

	/*
	 * Second pass: listeners with a wildcard foreign address,
	 * hashed under INADDR_ANY/port 0.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == lport) {
			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Bound to our exact local address: best. */
				if ((found = (inp->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
						inp->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
						inp->inp_socket->so_cred);
				}
				lck_rw_done(&pcbinfo->ipi_lock);
				return found;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/*
				 * Fully wild local address; remember it as a
				 * fallback, preferring a native IPv4 socket
				 * over a v4-mapped IPv6 one.
				 */
				if (inp->inp_socket &&
				    SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
					local_wild_mapped = inp;
				} else {
					local_wild = inp;
				}
			}
		}
	}
	if (local_wild == NULL) {
		if (local_wild_mapped != NULL) {
			if ((found = (local_wild_mapped->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
					local_wild_mapped->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					local_wild_mapped->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}
	if ((found = (local_wild->inp_socket != NULL))) {
		*uid = kauth_cred_getuid(
			local_wild->inp_socket->so_cred);
		*gid = kauth_cred_getgid(
			local_wild->inp_socket->so_cred);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
	return found;
}
2561 
2562 /*
2563  * Lookup PCB in hash list.
2564  */
2565 struct inpcb *
in_pcblookup_hash(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport_arg,struct in_addr laddr,u_int lport_arg,int wildcard,struct ifnet * ifp)2566 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2567     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
2568     struct ifnet *ifp)
2569 {
2570 	struct inpcbhead *head;
2571 	struct inpcb *inp;
2572 	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
2573 	struct inpcb *local_wild = NULL;
2574 	struct inpcb *local_wild_mapped = NULL;
2575 
2576 	/*
2577 	 * We may have found the pcb in the last lookup - check this first.
2578 	 */
2579 
2580 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
2581 
2582 	/*
2583 	 * First look for an exact match.
2584 	 */
2585 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2586 	    pcbinfo->ipi_hashmask)];
2587 	LIST_FOREACH(inp, head, inp_hash) {
2588 		if (!(inp->inp_vflag & INP_IPV4)) {
2589 			continue;
2590 		}
2591 		if (inp_restricted_recv(inp, ifp)) {
2592 			continue;
2593 		}
2594 
2595 #if NECP
2596 		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
2597 			continue;
2598 		}
2599 #endif /* NECP */
2600 
2601 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
2602 		    inp->inp_laddr.s_addr == laddr.s_addr &&
2603 		    inp->inp_fport == fport &&
2604 		    inp->inp_lport == lport) {
2605 			/*
2606 			 * Found.
2607 			 */
2608 			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
2609 			    WNT_STOPUSING) {
2610 				lck_rw_done(&pcbinfo->ipi_lock);
2611 				return inp;
2612 			} else {
2613 				/* it's there but dead, say it isn't found */
2614 				lck_rw_done(&pcbinfo->ipi_lock);
2615 				return NULL;
2616 			}
2617 		}
2618 	}
2619 
2620 	if (!wildcard) {
2621 		/*
2622 		 * Not found.
2623 		 */
2624 		lck_rw_done(&pcbinfo->ipi_lock);
2625 		return NULL;
2626 	}
2627 
2628 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
2629 	    pcbinfo->ipi_hashmask)];
2630 	LIST_FOREACH(inp, head, inp_hash) {
2631 		if (!(inp->inp_vflag & INP_IPV4)) {
2632 			continue;
2633 		}
2634 		if (inp_restricted_recv(inp, ifp)) {
2635 			continue;
2636 		}
2637 
2638 #if NECP
2639 		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
2640 			continue;
2641 		}
2642 #endif /* NECP */
2643 
2644 		if (inp->inp_faddr.s_addr == INADDR_ANY &&
2645 		    inp->inp_lport == lport) {
2646 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
2647 				if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
2648 				    WNT_STOPUSING) {
2649 					lck_rw_done(&pcbinfo->ipi_lock);
2650 					return inp;
2651 				} else {
2652 					/* it's dead; say it isn't found */
2653 					lck_rw_done(&pcbinfo->ipi_lock);
2654 					return NULL;
2655 				}
2656 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2657 				if (SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
2658 					local_wild_mapped = inp;
2659 				} else {
2660 					local_wild = inp;
2661 				}
2662 			}
2663 		}
2664 	}
2665 	if (local_wild == NULL) {
2666 		if (local_wild_mapped != NULL) {
2667 			if (in_pcb_checkstate(local_wild_mapped,
2668 			    WNT_ACQUIRE, 0) != WNT_STOPUSING) {
2669 				lck_rw_done(&pcbinfo->ipi_lock);
2670 				return local_wild_mapped;
2671 			} else {
2672 				/* it's dead; say it isn't found */
2673 				lck_rw_done(&pcbinfo->ipi_lock);
2674 				return NULL;
2675 			}
2676 		}
2677 		lck_rw_done(&pcbinfo->ipi_lock);
2678 		return NULL;
2679 	}
2680 	if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
2681 		lck_rw_done(&pcbinfo->ipi_lock);
2682 		return local_wild;
2683 	}
2684 	/*
2685 	 * It's either not found or is already dead.
2686 	 */
2687 	lck_rw_done(&pcbinfo->ipi_lock);
2688 	return NULL;
2689 }
2690 
2691 /*
2692  * @brief	Insert PCB onto various hash lists.
2693  *
2694  * @param	inp Pointer to internet protocol control block
2695  * @param	locked	Implies if ipi_lock (protecting pcb list)
2696  *              is already locked or not.
2697  *
2698  * @return	int error on failure and 0 on success
2699  */
int
in_pcbinshash(struct inpcb *inp, int locked)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;

	if (!locked) {
		if (!lck_rw_try_lock_exclusive(&pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets
			 */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
			socket_lock(inp->inp_socket, 0);
		}
	}

	/*
	 * This routine or its caller may have given up
	 * socket's protocol lock briefly.
	 * During that time the socket may have been dropped.
	 * Safe-guarding against that.
	 */
	if (inp->inp_state == INPCB_STATE_DEAD) {
		if (!locked) {
			lck_rw_done(&pcbinfo->ipi_lock);
		}
		return ECONNABORTED;
	}

	/* For v6 PCBs, hash on the low 32 bits of the foreign address. */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, pcbinfo->ipi_hashmask);

	pcbhash = &pcbinfo->ipi_hashbase[inp->inp_hash_element];

	pcbporthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(inp->inp_lport,
	    pcbinfo->ipi_porthashmask)];

	/*
	 * Go through port list and look for a head for this lport.
	 */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport) {
			break;
		}
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = kalloc_type(struct inpcbport, Z_WAITOK | Z_NOFAIL);
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

#if SKYWALK
	int err;
	struct socket *so = inp->inp_socket;
	/*
	 * Reserve the <proto, laddr, lport> tuple in the port namespace
	 * registrar before linking the PCB onto the hash lists, so the
	 * insertion fails cleanly if the port cannot be reserved.
	 */
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		if (inp->inp_vflag & INP_IPV6) {
			err = netns_reserve_in6(&inp->inp_netns_token,
			    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		} else {
			err = netns_reserve_in(&inp->inp_netns_token,
			    inp->inp_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		}
		if (err) {
			if (!locked) {
				lck_rw_done(&pcbinfo->ipi_lock);
			}
			return err;
		}
		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */

	/* Link the PCB onto the port list and the 4-tuple hash chain. */
	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

	if (!locked) {
		lck_rw_done(&pcbinfo->ipi_lock);
	}

#if NECP
	// This call catches the original setting of the local address
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */

	return 0;
}
2811 
2812 /*
2813  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2814  * changed. NOTE: This does not handle the case of the lport changing (the
2815  * hashed port list would have to be updated as well), so the lport must
2816  * not change after in_pcbinshash() has been called.
2817  */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

#if SKYWALK
	struct socket *so = inp->inp_socket;
	/* Keep the Skywalk port-namespace reservation in sync with laddr. */
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		int err;
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			/* Already reserved: just move it to the new address. */
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_change_addr_in6(
					&inp->inp_netns_token, inp->in6p_laddr);
			} else {
				err = netns_change_addr_in(
					&inp->inp_netns_token, inp->inp_laddr);
			}
		} else {
			/* No token yet: take a fresh reservation. */
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_reserve_in6(&inp->inp_netns_token,
				    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			} else {
				err = netns_reserve_in(&inp->inp_netns_token,
				    inp->inp_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			}
		}
		/* We are assuming that whatever code paths result in a rehash
		 * did their due diligence and ensured that the given
		 * <proto, laddr, lport> tuple was free ahead of time. Just
		 * reserving the lport on INADDR_ANY should be enough, since
		 * that will block Skywalk from trying to reserve that same
		 * port. Given this assumption, the above netns calls should
		 * never fail*/
		VERIFY(err == 0);

		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */
	/* For v6 PCBs, hash on the low 32 bits of the foreign address. */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	/* Recompute the bucket for the (possibly new) faddr/fport. */
	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, inp->inp_pcbinfo->ipi_hashmask);
	head = &inp->inp_pcbinfo->ipi_hashbase[inp->inp_hash_element];

	/* Unlink from the old bucket before inserting into the new one. */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		LIST_REMOVE(inp, inp_hash);
		inp->inp_flags2 &= ~INP2_INHASHLIST;
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));
	LIST_INSERT_HEAD(head, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

#if NECP
	// This call catches updates to the remote addresses
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */
}
2885 
2886 /*
2887  * Remove PCB from various lists.
2888  * Must be called pcbinfo lock is held in exclusive mode.
2889  */
void
in_pcbremlists(struct inpcb *inp)
{
	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;

	/*
	 * Check if it's in hashlist -- an inp is placed in hashlist when
	 * it's local port gets assigned. So it should also be present
	 * in the port list.
	 */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		VERIFY(phd != NULL && inp->inp_lport > 0);

		LIST_REMOVE(inp, inp_hash);
		inp->inp_hash.le_next = NULL;
		inp->inp_hash.le_prev = NULL;

		LIST_REMOVE(inp, inp_portlist);
		inp->inp_portlist.le_next = NULL;
		inp->inp_portlist.le_prev = NULL;
		/* Free the per-port head once the last PCB leaves it. */
		if (LIST_EMPTY(&phd->phd_pcblist)) {
			LIST_REMOVE(phd, phd_hash);
			kfree_type(struct inpcbport, phd);
		}
		inp->inp_phd = NULL;
		inp->inp_flags2 &= ~INP2_INHASHLIST;
#if SKYWALK
		/* Free up the port in the namespace registrar */
		netns_release(&inp->inp_netns_token);
		netns_release(&inp->inp_wildcard_netns_token);
#endif /* SKYWALK */
	}
	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

	if (inp->inp_flags2 & INP2_TIMEWAIT) {
		/* Remove from time-wait queue */
		tcp_remove_from_time_wait(inp);
		inp->inp_flags2 &= ~INP2_TIMEWAIT;
		VERIFY(inp->inp_pcbinfo->ipi_twcount != 0);
		inp->inp_pcbinfo->ipi_twcount--;
	} else {
		/* Remove from global inp list if it is not time-wait */
		LIST_REMOVE(inp, inp_list);
	}

	/* Drop any flow-control tree entry keyed by this PCB's flowhash. */
	if (inp->inp_flags2 & INP2_IN_FCTREE) {
		inp_fc_getinp(inp->inp_flowhash, (INPFC_SOLOCKED | INPFC_REMOVE));
		VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE));
	}

	inp->inp_pcbinfo->ipi_count--;
}
2944 
/*
 * Mechanism used to defer the memory release of PCBs
 * The pcb list will contain the pcb until the reaper can clean it up if
 * the following conditions are met:
 *	1) state "DEAD",
 *	2) wantcnt is STOPUSING
 *	3) usecount is 0
 * This function will be called to either mark the pcb as ready for
 * recycling (WNT_STOPUSING), acquire a want reference (WNT_ACQUIRE),
 * or release one (WNT_RELEASE).
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{
	/*
	 * inp_wantcnt is updated lock-free via compare-and-swap.  The
	 * value 0xffff in the low 16 bits is a sentinel meaning "stop
	 * using": the PCB is dead and ready to be reaped.
	 */
	volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {
	case WNT_STOPUSING:
		/*
		 * Try to mark the pcb as ready for recycling.  CAS with
		 * STOPUSING, if success we're good, if it's in use, will
		 * be marked later
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}
		pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}
		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}

		/* Ask the PCB garbage collector to run soon and reap this PCB */
		inpcb_gc_sched(pcb->inp_pcbinfo, INPCB_TIMER_FAST);

		origwant = *wantcnt;
		if ((UInt16) origwant == 0xffff) { /* should stop using */
			return WNT_STOPUSING;
		}
		newwant = 0xffff;
		if ((UInt16) origwant == 0) {
			/* try to mark it as unusable now */
			OSCompareAndSwap(origwant, newwant, wantcnt);
		}
		return WNT_STOPUSING;

	case WNT_ACQUIRE:
		/*
		 * Try to increase reference to pcb.  If WNT_STOPUSING
		 * should bail out.  If socket state DEAD, try to set count
		 * to STOPUSING, return failed otherwise increase cnt.
		 */
		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				return WNT_STOPUSING;
			}
			newwant = origwant + 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
		return WNT_ACQUIRE;

	case WNT_RELEASE:
		/*
		 * Release reference.  If result is null and pcb state
		 * is DEAD, set wanted bit to STOPUSING
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}

		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0x0) {
				panic("%s: pcb=%p release with zero count",
				    __func__, pcb);
				/* NOTREACHED */
			}
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				if (locked == 0) {
					socket_unlock(pcb->inp_socket, 1);
				}
				return WNT_STOPUSING;
			}
			newwant = origwant - 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

		/* PCB is dead: fall into the STOPUSING path above */
		if (pcb->inp_state == INPCB_STATE_DEAD) {
			goto stopusing;
		}
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: RELEASE pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}

		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}
		return WNT_RELEASE;

	default:
		panic("%s: so=%p not a valid state =%x", __func__,
		    pcb->inp_socket, mode);
		/* NOTREACHED */
	}

	/* NOTREACHED */
	return mode;
}
3061 
3062 /*
3063  * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
3064  * The inpcb_compat data structure is passed to user space and must
3065  * not change. We intentionally avoid copying pointers.
3066  */
3067 void
inpcb_to_compat(struct inpcb * inp,struct inpcb_compat * inp_compat)3068 inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat)
3069 {
3070 	bzero(inp_compat, sizeof(*inp_compat));
3071 	inp_compat->inp_fport = inp->inp_fport;
3072 	inp_compat->inp_lport = inp->inp_lport;
3073 	inp_compat->nat_owner = 0;
3074 	inp_compat->nat_cookie = 0;
3075 	inp_compat->inp_gencnt = inp->inp_gencnt;
3076 	inp_compat->inp_flags = inp->inp_flags;
3077 	inp_compat->inp_flow = inp->inp_flow;
3078 	inp_compat->inp_vflag = inp->inp_vflag;
3079 	inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
3080 	inp_compat->inp_ip_p = inp->inp_ip_p;
3081 	inp_compat->inp_dependfaddr.inp6_foreign =
3082 	    inp->inp_dependfaddr.inp6_foreign;
3083 	inp_compat->inp_dependladdr.inp6_local =
3084 	    inp->inp_dependladdr.inp6_local;
3085 	inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
3086 	inp_compat->inp_depend6.inp6_hlim = 0;
3087 	inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
3088 	inp_compat->inp_depend6.inp6_ifindex = 0;
3089 	inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
3090 }
3091 
3092 #if XNU_TARGET_OS_OSX
3093 void
inpcb_to_xinpcb64(struct inpcb * inp,struct xinpcb64 * xinp)3094 inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp)
3095 {
3096 	xinp->inp_fport = inp->inp_fport;
3097 	xinp->inp_lport = inp->inp_lport;
3098 	xinp->inp_gencnt = inp->inp_gencnt;
3099 	xinp->inp_flags = inp->inp_flags;
3100 	xinp->inp_flow = inp->inp_flow;
3101 	xinp->inp_vflag = inp->inp_vflag;
3102 	xinp->inp_ip_ttl = inp->inp_ip_ttl;
3103 	xinp->inp_ip_p = inp->inp_ip_p;
3104 	xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
3105 	xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
3106 	xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
3107 	xinp->inp_depend6.inp6_hlim = 0;
3108 	xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
3109 	xinp->inp_depend6.inp6_ifindex = 0;
3110 	xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
3111 }
3112 #endif /* XNU_TARGET_OS_OSX */
3113 
3114 /*
3115  * The following routines implement this scheme:
3116  *
3117  * Callers of ip_output() that intend to cache the route in the inpcb pass
3118  * a local copy of the struct route to ip_output().  Using a local copy of
3119  * the cached route significantly simplifies things as IP no longer has to
3120  * worry about having exclusive access to the passed in struct route, since
3121  * it's defined in the caller's stack; in essence, this allows for a lock-
3122  * less operation when updating the struct route at the IP level and below,
3123  * whenever necessary. The scheme works as follows:
3124  *
3125  * Prior to dropping the socket's lock and calling ip_output(), the caller
3126  * copies the struct route from the inpcb into its stack, and adds a reference
3127  * to the cached route entry, if there was any.  The socket's lock is then
3128  * dropped and ip_output() is called with a pointer to the copy of struct
3129  * route defined on the stack (not to the one in the inpcb.)
3130  *
3131  * Upon returning from ip_output(), the caller then acquires the socket's
3132  * lock and synchronizes the cache; if there is no route cached in the inpcb,
3133  * it copies the local copy of struct route (which may or may not contain any
3134  * route) back into the cache; otherwise, if the inpcb has a route cached in
3135  * it, the one in the local copy will be freed, if there's any.  Trashing the
3136  * cached route in the inpcb can be avoided because ip_output() is single-
3137  * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
3138  * by the socket/transport layer.)
3139  */
void
inp_route_copyout(struct inpcb *inp, struct route *dst)
{
	struct route *src = &inp->inp_route;

	socket_lock_assert_owned(inp->inp_socket);

	/*
	 * If the route in the PCB is stale or not for IPv4, blow it away;
	 * this is possible in the case of IPv4-mapped address case.
	 * (ROUTE_UNUSABLE is evaluated first so rt_key() is only applied
	 * when the cached rtentry is usable.)
	 */
	if (ROUTE_UNUSABLE(src) || rt_key(src->ro_rt)->sa_family != AF_INET) {
		ROUTE_RELEASE(src);
	}

	/* Hand the caller its own stack copy; see scheme description above */
	route_copyout(dst, src, sizeof(*dst));
}
3157 
void
inp_route_copyin(struct inpcb *inp, struct route *src)
{
	struct route *dst = &inp->inp_route;

	socket_lock_assert_owned(inp->inp_socket);

	/* Minor sanity check: only IPv4 routes belong in this cache */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
		panic("%s: wrong or corrupted route: %p", __func__, src);
	}

	/* Synchronize the PCB cache with the caller's stack copy */
	route_copyin(src, dst, sizeof(*src));
}
3172 
3173 /*
3174  * Handler for setting IP_BOUND_IF/IPV6_BOUND_IF socket option.
3175  */
int
inp_bindif(struct inpcb *inp, unsigned int ifscope, struct ifnet **pifp)
{
	struct ifnet *ifp = NULL;

	/*
	 * Validate the scope and resolve it to an ifnet while holding
	 * the interface list lock; ifscope indexes ifindex2ifnet.
	 */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
	    (ifp = ifindex2ifnet[ifscope]) == NULL)) {
		ifnet_head_done();
		return ENXIO;
	}
	ifnet_head_done();

	VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);

	/*
	 * A zero interface scope value indicates an "unbind".
	 * Otherwise, take in whatever value the app desires;
	 * the app may already know the scope (or force itself
	 * to such a scope) ahead of time before the interface
	 * gets attached.  It doesn't matter either way; any
	 * route lookup from this point on will require an
	 * exact match for the embedded interface scope.
	 */
	inp->inp_boundifp = ifp;
	if (inp->inp_boundifp == NULL) {
		inp->inp_flags &= ~INP_BOUND_IF;
	} else {
		inp->inp_flags |= INP_BOUND_IF;
	}

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);

	/* Report the resolved interface (NULL on unbind) to the caller */
	if (pifp != NULL) {
		*pifp = ifp;
	}

	return 0;
}
3216 
3217 /*
3218  * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3219  * as well as for setting PROC_UUID_NO_CELLULAR policy.
3220  */
3221 void
inp_set_nocellular(struct inpcb * inp)3222 inp_set_nocellular(struct inpcb *inp)
3223 {
3224 	inp->inp_flags |= INP_NO_IFT_CELLULAR;
3225 
3226 	/* Blow away any cached route in the PCB */
3227 	ROUTE_RELEASE(&inp->inp_route);
3228 }
3229 
3230 /*
3231  * Handler for clearing IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3232  * as well as for clearing PROC_UUID_NO_CELLULAR policy.
3233  */
3234 void
inp_clear_nocellular(struct inpcb * inp)3235 inp_clear_nocellular(struct inpcb *inp)
3236 {
3237 	struct socket *so = inp->inp_socket;
3238 
3239 	/*
3240 	 * SO_RESTRICT_DENY_CELLULAR socket restriction issued on the socket
3241 	 * has a higher precendence than INP_NO_IFT_CELLULAR.  Clear the flag
3242 	 * if and only if the socket is unrestricted.
3243 	 */
3244 	if (so != NULL && !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
3245 		inp->inp_flags &= ~INP_NO_IFT_CELLULAR;
3246 
3247 		/* Blow away any cached route in the PCB */
3248 		ROUTE_RELEASE(&inp->inp_route);
3249 	}
3250 }
3251 
3252 void
inp_set_noexpensive(struct inpcb * inp)3253 inp_set_noexpensive(struct inpcb *inp)
3254 {
3255 	inp->inp_flags2 |= INP2_NO_IFF_EXPENSIVE;
3256 
3257 	/* Blow away any cached route in the PCB */
3258 	ROUTE_RELEASE(&inp->inp_route);
3259 }
3260 
3261 void
inp_set_noconstrained(struct inpcb * inp)3262 inp_set_noconstrained(struct inpcb *inp)
3263 {
3264 	inp->inp_flags2 |= INP2_NO_IFF_CONSTRAINED;
3265 
3266 	/* Blow away any cached route in the PCB */
3267 	ROUTE_RELEASE(&inp->inp_route);
3268 }
3269 
3270 void
inp_set_awdl_unrestricted(struct inpcb * inp)3271 inp_set_awdl_unrestricted(struct inpcb *inp)
3272 {
3273 	inp->inp_flags2 |= INP2_AWDL_UNRESTRICTED;
3274 
3275 	/* Blow away any cached route in the PCB */
3276 	ROUTE_RELEASE(&inp->inp_route);
3277 }
3278 
3279 boolean_t
inp_get_awdl_unrestricted(struct inpcb * inp)3280 inp_get_awdl_unrestricted(struct inpcb *inp)
3281 {
3282 	return (inp->inp_flags2 & INP2_AWDL_UNRESTRICTED) ? TRUE : FALSE;
3283 }
3284 
3285 void
inp_clear_awdl_unrestricted(struct inpcb * inp)3286 inp_clear_awdl_unrestricted(struct inpcb *inp)
3287 {
3288 	inp->inp_flags2 &= ~INP2_AWDL_UNRESTRICTED;
3289 
3290 	/* Blow away any cached route in the PCB */
3291 	ROUTE_RELEASE(&inp->inp_route);
3292 }
3293 
3294 void
inp_set_intcoproc_allowed(struct inpcb * inp)3295 inp_set_intcoproc_allowed(struct inpcb *inp)
3296 {
3297 	inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;
3298 
3299 	/* Blow away any cached route in the PCB */
3300 	ROUTE_RELEASE(&inp->inp_route);
3301 }
3302 
3303 boolean_t
inp_get_intcoproc_allowed(struct inpcb * inp)3304 inp_get_intcoproc_allowed(struct inpcb *inp)
3305 {
3306 	return (inp->inp_flags2 & INP2_INTCOPROC_ALLOWED) ? TRUE : FALSE;
3307 }
3308 
3309 void
inp_clear_intcoproc_allowed(struct inpcb * inp)3310 inp_clear_intcoproc_allowed(struct inpcb *inp)
3311 {
3312 	inp->inp_flags2 &= ~INP2_INTCOPROC_ALLOWED;
3313 
3314 	/* Blow away any cached route in the PCB */
3315 	ROUTE_RELEASE(&inp->inp_route);
3316 }
3317 
3318 void
inp_set_management_allowed(struct inpcb * inp)3319 inp_set_management_allowed(struct inpcb *inp)
3320 {
3321 	inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
3322 	inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;
3323 
3324 	/* Blow away any cached route in the PCB */
3325 	ROUTE_RELEASE(&inp->inp_route);
3326 }
3327 
3328 boolean_t
inp_get_management_allowed(struct inpcb * inp)3329 inp_get_management_allowed(struct inpcb *inp)
3330 {
3331 	return (inp->inp_flags2 & INP2_MANAGEMENT_ALLOWED) ? TRUE : FALSE;
3332 }
3333 
3334 void
inp_clear_management_allowed(struct inpcb * inp)3335 inp_clear_management_allowed(struct inpcb *inp)
3336 {
3337 	inp->inp_flags2 &= ~INP2_MANAGEMENT_ALLOWED;
3338 
3339 	/* Blow away any cached route in the PCB */
3340 	ROUTE_RELEASE(&inp->inp_route);
3341 }
3342 
3343 #if NECP
3344 /*
3345  * Called when PROC_UUID_NECP_APP_POLICY is set.
3346  */
3347 void
inp_set_want_app_policy(struct inpcb * inp)3348 inp_set_want_app_policy(struct inpcb *inp)
3349 {
3350 	inp->inp_flags2 |= INP2_WANT_APP_POLICY;
3351 }
3352 
3353 /*
3354  * Called when PROC_UUID_NECP_APP_POLICY is cleared.
3355  */
3356 void
inp_clear_want_app_policy(struct inpcb * inp)3357 inp_clear_want_app_policy(struct inpcb *inp)
3358 {
3359 	inp->inp_flags2 &= ~INP2_WANT_APP_POLICY;
3360 }
3361 #endif /* NECP */
3362 
3363 /*
3364  * Calculate flow hash for an inp, used by an interface to identify a
3365  * flow. When an interface provides flow control advisory, this flow
3366  * hash is used as an identifier.
3367  */
u_int32_t
inp_calc_flowhash(struct inpcb *inp)
{
#if SKYWALK

	uint32_t flowid;
	struct flowidns_flow_key fk;

	bzero(&fk, sizeof(fk));

	/* Build the flow key from the 4-tuple plus protocol */
	if (inp->inp_vflag & INP_IPV4) {
		fk.ffk_af = AF_INET;
		fk.ffk_laddr_v4 = inp->inp_laddr;
		fk.ffk_raddr_v4 = inp->inp_faddr;
	} else {
		fk.ffk_af = AF_INET6;
		fk.ffk_laddr_v6 = inp->in6p_laddr;
		fk.ffk_raddr_v6 = inp->in6p_faddr;
		/* clear embedded scope ID */
		if (IN6_IS_SCOPE_EMBED(&fk.ffk_laddr_v6)) {
			fk.ffk_laddr_v6.s6_addr16[1] = 0;
		}
		if (IN6_IS_SCOPE_EMBED(&fk.ffk_raddr_v6)) {
			fk.ffk_raddr_v6.s6_addr16[1] = 0;
		}
	}

	fk.ffk_lport = inp->inp_lport;
	fk.ffk_rport = inp->inp_fport;
	/* Fall back to the socket's protocol when the PCB has none recorded */
	fk.ffk_proto = (inp->inp_ip_p != 0) ? inp->inp_ip_p :
	    (uint8_t)SOCK_PROTO(inp->inp_socket);
	flowidns_allocate_flowid(FLOWIDNS_DOMAIN_INPCB, &fk, &flowid);
	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	ASSERT(inp->inp_flowhash == 0);
	ASSERT((inp->inp_flags2 & INP2_IN_FCTREE) == 0);
	inp->inp_flowhash = flowid;
	VERIFY(RB_INSERT(inp_fc_tree, &inp_fc_tree, inp) == NULL);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowid;

#else /* !SKYWALK */

	struct inp_flowhash_key fh __attribute__((aligned(8)));
	u_int32_t flowhash = 0;
	struct inpcb *tmp_inp = NULL;

	/* Lazily seed the hash; the seed is regenerated on collisions below */
	if (inp_hash_seed == 0) {
		inp_hash_seed = RandomULong();
	}

	bzero(&fh, sizeof(fh));

	bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof(fh.infh_laddr));
	bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof(fh.infh_faddr));

	fh.infh_lport = inp->inp_lport;
	fh.infh_fport = inp->inp_fport;
	fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
	fh.infh_proto = inp->inp_ip_p;
	fh.infh_rand1 = RandomULong();
	fh.infh_rand2 = RandomULong();

try_again:
	flowhash = net_flowhash(&fh, sizeof(fh), inp_hash_seed);
	if (flowhash == 0) {
		/* try to get a non-zero flowhash */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	inp->inp_flowhash = flowhash;

	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	tmp_inp = RB_FIND(inp_fc_tree, &inp_fc_tree, inp);
	if (tmp_inp != NULL) {
		/*
		 * There is a different inp with the same flowhash.
		 * There can be a collision on flow hash but the
		 * probability is low.  Let's recompute the
		 * flowhash.
		 */
		lck_mtx_unlock(&inp_fc_lck);
		/* recompute hash seed */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	RB_INSERT(inp_fc_tree, &inp_fc_tree, inp);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowhash;

#endif /* !SKYWALK */
}
3467 
3468 void
inp_flowadv(uint32_t flowhash)3469 inp_flowadv(uint32_t flowhash)
3470 {
3471 	struct inpcb *inp;
3472 
3473 	inp = inp_fc_getinp(flowhash, 0);
3474 
3475 	if (inp == NULL) {
3476 		return;
3477 	}
3478 	inp_fc_feedback(inp);
3479 }
3480 
3481 /*
3482  * Function to compare inp_fc_entries in inp flow control tree
3483  */
3484 static inline int
infc_cmp(const struct inpcb * inp1,const struct inpcb * inp2)3485 infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2)
3486 {
3487 	return memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash),
3488 	           sizeof(inp1->inp_flowhash));
3489 }
3490 
static struct inpcb *
inp_fc_getinp(u_int32_t flowhash, u_int32_t flags)
{
	struct inpcb *inp = NULL;
	int locked = (flags & INPFC_SOLOCKED) ? 1 : 0;

	/*
	 * Look up the PCB by flow hash in the global flow-control tree;
	 * key_inp is a dummy node carrying only the search key.
	 */
	lck_mtx_lock_spin(&inp_fc_lck);
	key_inp.inp_flowhash = flowhash;
	inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp);
	if (inp == NULL) {
		/* inp is not present, return */
		lck_mtx_unlock(&inp_fc_lck);
		return NULL;
	}

	if (flags & INPFC_REMOVE) {
		ASSERT((inp->inp_flags2 & INP2_IN_FCTREE) != 0);
		/* convert the spin-mode hold to a full mutex hold before modifying the tree */
		lck_mtx_convert_spin(&inp_fc_lck);
		RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp);
		bzero(&(inp->infc_link), sizeof(inp->infc_link));
#if SKYWALK
		VERIFY(inp->inp_flowhash != 0);
		flowidns_release_flowid(inp->inp_flowhash);
		inp->inp_flowhash = 0;
#endif /* SKYWALK */
		inp->inp_flags2 &= ~INP2_IN_FCTREE;
		lck_mtx_unlock(&inp_fc_lck);
		/* removal path never returns the PCB */
		return NULL;
	}

	/* Take a want reference; fails if the PCB is being torn down */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) {
		inp = NULL;
	}
	lck_mtx_unlock(&inp_fc_lck);

	return inp;
}
3528 
static void
inp_fc_feedback(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* we already hold a want_cnt on this inp, socket can't be null */
	VERIFY(so != NULL);
	socket_lock(so, 1);

	/* Drop the want reference taken by the caller (inp_flowadv) */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		return;
	}

	/*
	 * Note the feedback while a send is in progress so that
	 * inp_set_fc_state() ignores a racing flow advisory.
	 */
	if (inp->inp_sndinprog_cnt > 0) {
		inp->inp_flags |= INP_FC_FEEDBACK;
	}

	/*
	 * Return if the connection is not in flow-controlled state.
	 * This can happen if the connection experienced
	 * loss while it was in flow controlled state
	 */
	if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
		socket_unlock(so, 1);
		return;
	}
	inp_reset_fc_state(inp);

	/* For TCP, also undo the throttling applied when flow control began */
	if (SOCK_TYPE(so) == SOCK_STREAM) {
		inp_fc_unthrottle_tcp(inp);
	}

	socket_unlock(so, 1);
}
3564 
3565 static void
inp_reset_fc_timerstat(struct inpcb * inp)3566 inp_reset_fc_timerstat(struct inpcb *inp)
3567 {
3568 	uint64_t now;
3569 
3570 	if (inp->inp_fadv_start_time == 0) {
3571 		return;
3572 	}
3573 
3574 	now = net_uptime_us();
3575 	ASSERT(now >= inp->inp_fadv_start_time);
3576 
3577 	inp->inp_fadv_total_time += (now - inp->inp_fadv_start_time);
3578 	inp->inp_fadv_cnt++;
3579 
3580 	inp->inp_fadv_start_time = 0;
3581 }
3582 
3583 static void
inp_set_fc_timerstat(struct inpcb * inp)3584 inp_set_fc_timerstat(struct inpcb *inp)
3585 {
3586 	if (inp->inp_fadv_start_time != 0) {
3587 		return;
3588 	}
3589 
3590 	inp->inp_fadv_start_time = net_uptime_us();
3591 }
3592 
3593 void
inp_reset_fc_state(struct inpcb * inp)3594 inp_reset_fc_state(struct inpcb *inp)
3595 {
3596 	struct socket *so = inp->inp_socket;
3597 	int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
3598 	int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
3599 
3600 	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
3601 
3602 	inp_reset_fc_timerstat(inp);
3603 
3604 	if (suspended) {
3605 		so->so_flags &= ~(SOF_SUSPENDED);
3606 		soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
3607 	}
3608 
3609 	/* Give a write wakeup to unblock the socket */
3610 	if (needwakeup) {
3611 		sowwakeup(so);
3612 	}
3613 }
3614 
int
inp_set_fc_state(struct inpcb *inp, int advcode)
{
	boolean_t is_flow_controlled = INP_WAIT_FOR_IF_FEEDBACK(inp);
	struct inpcb *tmp_inp = NULL;
	/*
	 * If there was a feedback from the interface when
	 * send operation was in progress, we should ignore
	 * this flow advisory to avoid a race between setting
	 * flow controlled state and receiving feedback from
	 * the interface
	 */
	if (inp->inp_flags & INP_FC_FEEDBACK) {
		return 0;
	}

	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
	/*
	 * Re-look up the PCB in the flow-control tree to confirm it is
	 * still alive; the lookup takes a want reference which we
	 * release immediately below.
	 */
	if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash,
	    INPFC_SOLOCKED)) != NULL) {
		if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			goto exit_reset;
		}
		VERIFY(tmp_inp == inp);
		switch (advcode) {
		case FADV_FLOW_CONTROLLED:
			inp->inp_flags |= INP_FLOW_CONTROLLED;
			inp_set_fc_timerstat(inp);
			break;
		case FADV_SUSPENDED:
			inp->inp_flags |= INP_FLOW_SUSPENDED;
			inp_set_fc_timerstat(inp);

			soevent(inp->inp_socket,
			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));

			/* Record the fact that suspend event was sent */
			inp->inp_socket->so_flags |= SOF_SUSPENDED;
			break;
		}

		/* Throttle TCP only on the transition into flow control */
		if (!is_flow_controlled && SOCK_TYPE(inp->inp_socket) == SOCK_STREAM) {
			inp_fc_throttle_tcp(inp);
		}
		return 1;
	}

exit_reset:
	/* PCB is gone or dying; close out the interval statistics */
	inp_reset_fc_timerstat(inp);

	return 0;
}
3666 
3667 /*
3668  * Handler for SO_FLUSH socket option.
3669  */
3670 int
inp_flush(struct inpcb * inp,int optval)3671 inp_flush(struct inpcb *inp, int optval)
3672 {
3673 	u_int32_t flowhash = inp->inp_flowhash;
3674 	struct ifnet *rtifp, *oifp;
3675 
3676 	/* Either all classes or one of the valid ones */
3677 	if (optval != SO_TC_ALL && !SO_VALID_TC(optval)) {
3678 		return EINVAL;
3679 	}
3680 
3681 	/* We need a flow hash for identification */
3682 	if (flowhash == 0) {
3683 		return 0;
3684 	}
3685 
3686 	/* Grab the interfaces from the route and pcb */
3687 	rtifp = ((inp->inp_route.ro_rt != NULL) ?
3688 	    inp->inp_route.ro_rt->rt_ifp : NULL);
3689 	oifp = inp->inp_last_outifp;
3690 
3691 	if (rtifp != NULL) {
3692 		if_qflush_sc(rtifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3693 	}
3694 	if (oifp != NULL && oifp != rtifp) {
3695 		if_qflush_sc(oifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3696 	}
3697 
3698 	return 0;
3699 }
3700 
3701 /*
3702  * Clear the INP_INADDR_ANY flag (special case for PPP only)
3703  */
3704 void
inp_clear_INP_INADDR_ANY(struct socket * so)3705 inp_clear_INP_INADDR_ANY(struct socket *so)
3706 {
3707 	struct inpcb *inp = NULL;
3708 
3709 	socket_lock(so, 1);
3710 	inp = sotoinpcb(so);
3711 	if (inp) {
3712 		inp->inp_flags &= ~INP_INADDR_ANY;
3713 	}
3714 	socket_unlock(so, 1);
3715 }
3716 
3717 void
inp_get_soprocinfo(struct inpcb * inp,struct so_procinfo * soprocinfo)3718 inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo)
3719 {
3720 	struct socket *so = inp->inp_socket;
3721 
3722 	soprocinfo->spi_pid = so->last_pid;
3723 	strlcpy(&soprocinfo->spi_proc_name[0], &inp->inp_last_proc_name[0],
3724 	    sizeof(soprocinfo->spi_proc_name));
3725 	if (so->last_pid != 0) {
3726 		uuid_copy(soprocinfo->spi_uuid, so->last_uuid);
3727 	}
3728 	/*
3729 	 * When not delegated, the effective pid is the same as the real pid
3730 	 */
3731 	if (so->so_flags & SOF_DELEGATED) {
3732 		soprocinfo->spi_delegated = 1;
3733 		soprocinfo->spi_epid = so->e_pid;
3734 		uuid_copy(soprocinfo->spi_euuid, so->e_uuid);
3735 	} else {
3736 		soprocinfo->spi_delegated = 0;
3737 		soprocinfo->spi_epid = so->last_pid;
3738 	}
3739 	strlcpy(&soprocinfo->spi_e_proc_name[0], &inp->inp_e_proc_name[0],
3740 	    sizeof(soprocinfo->spi_e_proc_name));
3741 }
3742 
3743 int
inp_findinpcb_procinfo(struct inpcbinfo * pcbinfo,uint32_t flowhash,struct so_procinfo * soprocinfo)3744 inp_findinpcb_procinfo(struct inpcbinfo *pcbinfo, uint32_t flowhash,
3745     struct so_procinfo *soprocinfo)
3746 {
3747 	struct inpcb *inp = NULL;
3748 	int found = 0;
3749 
3750 	bzero(soprocinfo, sizeof(struct so_procinfo));
3751 
3752 	if (!flowhash) {
3753 		return -1;
3754 	}
3755 
3756 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
3757 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
3758 		if (inp->inp_state != INPCB_STATE_DEAD &&
3759 		    inp->inp_socket != NULL &&
3760 		    inp->inp_flowhash == flowhash) {
3761 			found = 1;
3762 			inp_get_soprocinfo(inp, soprocinfo);
3763 			break;
3764 		}
3765 	}
3766 	lck_rw_done(&pcbinfo->ipi_lock);
3767 
3768 	return found;
3769 }
3770 
3771 #if CONFIG_PROC_UUID_POLICY
/*
 * Apply (set=TRUE) or clear (set=FALSE) the per-process "no cellular"
 * policy on this PCB, logging the transition when network I/O policy
 * logging is enabled.
 */
static void
inp_update_cellular_policy(struct inpcb *inp, boolean_t set)
{
	struct socket *so = inp->inp_socket;
	int before, after;

	VERIFY(so != NULL);
	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	before = INP_NO_CELLULAR(inp);
	if (set) {
		inp_set_nocellular(inp);
	} else {
		/* may be a no-op if SO_RESTRICT_DENY_CELLULAR is in effect */
		inp_clear_nocellular(inp);
	}
	after = INP_NO_CELLULAR(inp);
	if (net_io_policy_log && (before != after)) {
		static const char *ok = "OK";
		static const char *nok = "NOACCESS";
		uuid_string_t euuid_buf;
		pid_t epid;

		/* Log the effective (delegated when applicable) identity */
		if (so->so_flags & SOF_DELEGATED) {
			uuid_unparse(so->e_uuid, euuid_buf);
			epid = so->e_pid;
		} else {
			uuid_unparse(so->last_uuid, euuid_buf);
			epid = so->last_pid;
		}

		/* allow this socket to generate another notification event */
		so->so_ifdenied_notifies = 0;

		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s %s->%s\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, euuid_buf,
		    (so->so_flags & SOF_DELEGATED) ?
		    " [delegated]" : "",
		    ((before < after) ? ok : nok),
		    ((before < after) ? nok : ok));
	}
}
3815 
3816 #if NECP
/*
 * Apply (set=TRUE) or clear (set=FALSE) the NECP "want app policy"
 * flag on this PCB, logging the transition when network I/O policy
 * logging is enabled.
 */
static void
inp_update_necp_want_app_policy(struct inpcb *inp, boolean_t set)
{
	struct socket *so = inp->inp_socket;
	int before, after;

	VERIFY(so != NULL);
	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	before = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
	if (set) {
		inp_set_want_app_policy(inp);
	} else {
		inp_clear_want_app_policy(inp);
	}
	after = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
	if (net_io_policy_log && (before != after)) {
		static const char *wanted = "WANTED";
		static const char *unwanted = "UNWANTED";
		uuid_string_t euuid_buf;
		pid_t epid;

		/* Log the effective (delegated when applicable) identity */
		if (so->so_flags & SOF_DELEGATED) {
			uuid_unparse(so->e_uuid, euuid_buf);
			epid = so->e_pid;
		} else {
			uuid_unparse(so->last_uuid, euuid_buf);
			epid = so->last_pid;
		}

		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s %s->%s\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, euuid_buf,
		    (so->so_flags & SOF_DELEGATED) ?
		    " [delegated]" : "",
		    ((before < after) ? unwanted : wanted),
		    ((before < after) ? wanted : unwanted));
	}
}
3857 #endif /* NECP */
3858 #endif /* !CONFIG_PROC_UUID_POLICY */
3859 
3860 #if NECP
/*
 * Re-evaluate the NECP policy match for this socket, optionally using
 * overridden addresses/interface, and rescope the socket to the
 * NECP-suggested interface when it is not yet bound to a port or a
 * local address.
 */
void
inp_update_necp_policy(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int override_bound_interface)
{
	necp_socket_find_policy_match(inp, override_local_addr, override_remote_addr, override_bound_interface);
	if (necp_socket_should_rescope(inp) &&
	    inp->inp_lport == 0 &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
		// If we should rescope, and the socket is not yet bound
		inp_bindif(inp, necp_socket_get_rescope_if_index(inp), NULL);
		inp->inp_flags2 |= INP2_SCOPED_BY_NECP;
	}
}
3874 #endif /* NECP */
3875 
/*
 * Re-sync this PCB with the proc UUID policy table (cellular and NECP
 * app-policy flags).  Returns 0 on success or when no entry exists
 * (ENOENT is not an error); otherwise the lookup error.
 */
int
inp_update_policy(struct inpcb *inp)
{
#if CONFIG_PROC_UUID_POLICY
	struct socket *so = inp->inp_socket;
	uint32_t pflags = 0;
	int32_t ogencnt;
	int err = 0;
	uint8_t *lookup_uuid = NULL;

	if (!net_io_policy_uuid ||
	    so == NULL || inp->inp_state == INPCB_STATE_DEAD) {
		return 0;
	}

	/*
	 * Kernel-created sockets that aren't delegating other sockets
	 * are currently exempted from UUID policy checks.
	 */
	if (so->last_pid == 0 && !(so->so_flags & SOF_DELEGATED)) {
		return 0;
	}

#if defined(XNU_TARGET_OS_OSX)
	/* Prefer the responsible-process UUID when one is recorded */
	if (so->so_rpid > 0) {
		lookup_uuid = so->so_ruuid;
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}
#endif
	/* Fall back to the effective (delegated) or last process UUID */
	if (lookup_uuid == NULL || err == ENOENT) {
		lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid);
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}

	/*
	 * Discard cached generation count if the entry is gone (ENOENT),
	 * so that we go thru the checks below.
	 */
	if (err == ENOENT && ogencnt != 0) {
		so->so_policy_gencnt = 0;
	}

	/*
	 * If the generation count has changed, inspect the policy flags
	 * and act accordingly.  If a policy flag was previously set and
	 * the UUID is no longer present in the table (ENOENT), treat it
	 * as if the flag has been cleared.
	 */
	if ((err == 0 || err == ENOENT) && ogencnt != so->so_policy_gencnt) {
		/* update cellular policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, FALSE);
		}
#if NECP
		/* update necp want app policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, FALSE);
		}
#endif /* NECP */
	}

	return (err == ENOENT) ? 0 : err;
#else /* !CONFIG_PROC_UUID_POLICY */
#pragma unused(inp)
	return 0;
#endif /* !CONFIG_PROC_UUID_POLICY */
}
3949 
/*
 * net.inet.log_restricted: when non-zero, log (via printf) each time a
 * packet is dropped by the inbound/outbound restriction checks below.
 */
unsigned int log_restricted;
SYSCTL_DECL(_net_inet);
SYSCTL_INT(_net_inet, OID_AUTO, log_restricted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &log_restricted, 0,
    "Log network restrictions");
3955 
3956 
3957 /*
3958  * Called when we need to enforce policy restrictions in the input path.
3959  *
3960  * Returns TRUE if we're not allowed to receive data, otherwise FALSE.
3961  */
/*
 * Core policy check for the input path; see inp_restricted_recv() for
 * the logging wrapper.  Evaluates the socket's restriction flags
 * against the properties of the interface the packet arrived on.
 * The order of the checks below is significant.
 *
 * Returns TRUE if the socket is NOT allowed to receive on ifp.
 */
static boolean_t
_inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
{
	VERIFY(inp != NULL);

	/*
	 * Inbound restrictions.
	 */
	if (!sorestrictrecv) {
		/* Global switch for inbound restrictions is off. */
		return FALSE;
	}

	if (ifp == NULL) {
		/* No interface to restrict against. */
		return FALSE;
	}

	/* Socket opted out of cellular interfaces. */
	if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
		return TRUE;
	}

	/* Socket opted out of expensive interfaces. */
	if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
		return TRUE;
	}

	/* Socket opted out of constrained interfaces. */
	if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
		return TRUE;
	}

	/* AWDL is restricted unless the socket is explicitly unrestricted. */
	if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
		return TRUE;
	}

	/* Everything below applies only to restricted-receive interfaces. */
	if (!(ifp->if_eflags & IFEF_RESTRICTED_RECV)) {
		return FALSE;
	}

	/* Socket explicitly asked to receive from any interface. */
	if (inp->inp_flags & INP_RECV_ANYIF) {
		return FALSE;
	}

	/*
	 * An entitled process can use the management interface without being bound
	 * to the interface
	 */
	if (IFNET_IS_MANAGEMENT(ifp)) {
		if (INP_MANAGEMENT_ALLOWED(inp)) {
			return FALSE;
		}
		if (if_management_verbose > 1) {
			os_log(OS_LOG_DEFAULT, "_inp_restricted_recv %s:%d not allowed on management interface %s",
			    proc_best_name(current_proc()), proc_getpid(current_proc()),
			    ifp->if_xname);
		}
		return TRUE;
	}

	/* Being explicitly bound to the receiving interface permits receive. */
	if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp == ifp) {
		return FALSE;
	}

	/*
	 * NOTE(review): this check is unreachable for sockets bound to ifp
	 * (the bound-if test above already returned FALSE) — presumably
	 * binding to an intcoproc interface is gated elsewhere; confirm.
	 */
	if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
		return TRUE;
	}


	/* Default for a restricted-receive interface: deny. */
	return TRUE;
}
4029 
4030 boolean_t
inp_restricted_recv(struct inpcb * inp,struct ifnet * ifp)4031 inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
4032 {
4033 	boolean_t ret;
4034 
4035 	ret = _inp_restricted_recv(inp, ifp);
4036 	if (ret == TRUE && log_restricted) {
4037 		printf("pid %d (%s) is unable to receive packets on %s\n",
4038 		    proc_getpid(current_proc()), proc_best_name(current_proc()),
4039 		    ifp->if_xname);
4040 	}
4041 	return ret;
4042 }
4043 
4044 /*
4045  * Called when we need to enforce policy restrictions in the output path.
4046  *
4047  * Returns TRUE if we're not allowed to send data out, otherwise FALSE.
4048  */
/*
 * Core policy check for the output path; see inp_restricted_send() for
 * the logging wrapper.  Evaluates the socket's restriction flags
 * against the properties of the chosen outbound interface.
 *
 * Returns TRUE if the socket is NOT allowed to send on ifp.
 */
static boolean_t
_inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
{
	VERIFY(inp != NULL);

	/*
	 * Outbound restrictions.
	 */
	if (!sorestrictsend) {
		/* Global switch for outbound restrictions is off. */
		return FALSE;
	}

	if (ifp == NULL) {
		/* No interface to restrict against. */
		return FALSE;
	}

	/* Socket opted out of cellular interfaces. */
	if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
		return TRUE;
	}

	/* Socket opted out of expensive interfaces. */
	if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
		return TRUE;
	}

	/* Socket opted out of constrained interfaces. */
	if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
		return TRUE;
	}

	/* AWDL is restricted unless the socket is explicitly unrestricted. */
	if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
		return TRUE;
	}

	/* Management interfaces require the management entitlement. */
	if (IFNET_IS_MANAGEMENT(ifp)) {
		if (!INP_MANAGEMENT_ALLOWED(inp)) {
			if (if_management_verbose > 1) {
				os_log(OS_LOG_DEFAULT, "_inp_restricted_send %s:%d not allowed on management interface %s",
				    proc_best_name(current_proc()), proc_getpid(current_proc()),
				    ifp->if_xname);
			}
			return TRUE;
		}
	}

	/* Intcoproc interfaces require explicit permission. */
	if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
		return TRUE;
	}

	/* Unlike the receive path, sending is allowed by default. */
	return FALSE;
}
4098 
4099 boolean_t
inp_restricted_send(struct inpcb * inp,struct ifnet * ifp)4100 inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
4101 {
4102 	boolean_t ret;
4103 
4104 	ret = _inp_restricted_send(inp, ifp);
4105 	if (ret == TRUE && log_restricted) {
4106 		printf("pid %d (%s) is unable to transmit packets on %s\n",
4107 		    proc_getpid(current_proc()), proc_best_name(current_proc()),
4108 		    ifp->if_xname);
4109 	}
4110 	return ret;
4111 }
4112 
4113 inline void
inp_count_sndbytes(struct inpcb * inp,u_int32_t th_ack)4114 inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack)
4115 {
4116 	struct ifnet *ifp = inp->inp_last_outifp;
4117 	struct socket *so = inp->inp_socket;
4118 	if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
4119 	    (ifp->if_type == IFT_CELLULAR || IFNET_IS_WIFI(ifp))) {
4120 		int32_t unsent;
4121 
4122 		so->so_snd.sb_flags |= SB_SNDBYTE_CNT;
4123 
4124 		/*
4125 		 * There can be data outstanding before the connection
4126 		 * becomes established -- TFO case
4127 		 */
4128 		if (so->so_snd.sb_cc > 0) {
4129 			inp_incr_sndbytes_total(so, so->so_snd.sb_cc);
4130 		}
4131 
4132 		unsent = inp_get_sndbytes_allunsent(so, th_ack);
4133 		if (unsent > 0) {
4134 			inp_incr_sndbytes_unsent(so, unsent);
4135 		}
4136 	}
4137 }
4138 
4139 inline void
inp_incr_sndbytes_total(struct socket * so,int32_t len)4140 inp_incr_sndbytes_total(struct socket *so, int32_t len)
4141 {
4142 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4143 	struct ifnet *ifp = inp->inp_last_outifp;
4144 
4145 	if (ifp != NULL) {
4146 		VERIFY(ifp->if_sndbyte_total >= 0);
4147 		OSAddAtomic64(len, &ifp->if_sndbyte_total);
4148 	}
4149 }
4150 
/*
 * Subtract len bytes from the total send-byte counter of the socket's
 * last outbound interface, clamping the counter at zero instead of
 * letting it go negative.
 */
inline void
inp_decr_sndbytes_total(struct socket *so, int32_t len)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;
	struct ifnet *ifp = inp->inp_last_outifp;

	if (ifp != NULL) {
		/*
		 * NOTE(review): the compare and the plain store below are
		 * not atomic with respect to the OSAddAtomic64() updates
		 * elsewhere — a concurrent update could be lost; confirm
		 * callers serialize, or that the drift is acceptable.
		 */
		if (ifp->if_sndbyte_total >= len) {
			OSAddAtomic64(-len, &ifp->if_sndbyte_total);
		} else {
			ifp->if_sndbyte_total = 0;
		}
	}
}
4165 
4166 inline void
inp_incr_sndbytes_unsent(struct socket * so,int32_t len)4167 inp_incr_sndbytes_unsent(struct socket *so, int32_t len)
4168 {
4169 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4170 	struct ifnet *ifp = inp->inp_last_outifp;
4171 
4172 	if (ifp != NULL) {
4173 		VERIFY(ifp->if_sndbyte_unsent >= 0);
4174 		OSAddAtomic64(len, &ifp->if_sndbyte_unsent);
4175 	}
4176 }
4177 
/*
 * Subtract len bytes from the unsent-byte counter of the socket's last
 * outbound interface, clamping at zero.  No-op unless the socket's
 * send buffer is marked for byte accounting (SB_SNDBYTE_CNT, set by
 * inp_count_sndbytes()).
 */
inline void
inp_decr_sndbytes_unsent(struct socket *so, int32_t len)
{
	if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
		return;
	}

	struct inpcb *inp = (struct inpcb *)so->so_pcb;
	struct ifnet *ifp = inp->inp_last_outifp;

	if (ifp != NULL) {
		/*
		 * NOTE(review): compare + plain store are not atomic with
		 * the OSAddAtomic64() updates elsewhere — see matching
		 * pattern in inp_decr_sndbytes_total(); confirm intended.
		 */
		if (ifp->if_sndbyte_unsent >= len) {
			OSAddAtomic64(-len, &ifp->if_sndbyte_unsent);
		} else {
			ifp->if_sndbyte_unsent = 0;
		}
	}
}
4196 
4197 inline void
inp_decr_sndbytes_allunsent(struct socket * so,u_int32_t th_ack)4198 inp_decr_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
4199 {
4200 	int32_t len;
4201 
4202 	if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
4203 		return;
4204 	}
4205 
4206 	len = inp_get_sndbytes_allunsent(so, th_ack);
4207 	inp_decr_sndbytes_unsent(so, len);
4208 }
4209 
4210 #if SKYWALK
4211 inline void
inp_update_netns_flags(struct socket * so)4212 inp_update_netns_flags(struct socket *so)
4213 {
4214 	struct inpcb *inp;
4215 	uint32_t set_flags = 0;
4216 	uint32_t clear_flags = 0;
4217 
4218 	if (!(SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
4219 		return;
4220 	}
4221 
4222 	inp = sotoinpcb(so);
4223 
4224 	if (inp == NULL) {
4225 		return;
4226 	}
4227 
4228 	if (!NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
4229 		return;
4230 	}
4231 
4232 	if (so->so_options & SO_NOWAKEFROMSLEEP) {
4233 		set_flags |= NETNS_NOWAKEFROMSLEEP;
4234 	} else {
4235 		clear_flags |= NETNS_NOWAKEFROMSLEEP;
4236 	}
4237 
4238 	if (inp->inp_flags & INP_RECV_ANYIF) {
4239 		set_flags |= NETNS_RECVANYIF;
4240 	} else {
4241 		clear_flags |= NETNS_RECVANYIF;
4242 	}
4243 
4244 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
4245 		set_flags |= NETNS_EXTBGIDLE;
4246 	} else {
4247 		clear_flags |= NETNS_EXTBGIDLE;
4248 	}
4249 
4250 	netns_change_flags(&inp->inp_netns_token, set_flags, clear_flags);
4251 }
4252 #endif /* SKYWALK */
4253 
/*
 * Record network activity for this PCB at the current uptime in its
 * activity bitmap.
 */
inline void
inp_set_activity_bitmap(struct inpcb *inp)
{
	in_stat_set_activity_bitmap(&inp->inp_nw_activity, net_uptime());
}
4259 
/*
 * Copy a snapshot of this PCB's network-activity bitmap into *ab.
 */
inline void
inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab)
{
	bcopy(&inp->inp_nw_activity, ab, sizeof(*ab));
}
4265 
4266 void
inp_update_last_owner(struct socket * so,struct proc * p,struct proc * ep)4267 inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep)
4268 {
4269 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4270 
4271 	if (inp == NULL) {
4272 		return;
4273 	}
4274 
4275 	if (p != NULL) {
4276 		strlcpy(&inp->inp_last_proc_name[0], proc_name_address(p), sizeof(inp->inp_last_proc_name));
4277 	}
4278 	if (so->so_flags & SOF_DELEGATED) {
4279 		if (ep != NULL) {
4280 			strlcpy(&inp->inp_e_proc_name[0], proc_name_address(ep), sizeof(inp->inp_e_proc_name));
4281 		} else {
4282 			inp->inp_e_proc_name[0] = 0;
4283 		}
4284 	} else {
4285 		inp->inp_e_proc_name[0] = 0;
4286 	}
4287 }
4288 
4289 void
inp_copy_last_owner(struct socket * so,struct socket * head)4290 inp_copy_last_owner(struct socket *so, struct socket *head)
4291 {
4292 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4293 	struct inpcb *head_inp = (struct inpcb *)head->so_pcb;
4294 
4295 	if (inp == NULL || head_inp == NULL) {
4296 		return;
4297 	}
4298 
4299 	strlcpy(&inp->inp_last_proc_name[0], &head_inp->inp_last_proc_name[0], sizeof(inp->inp_last_proc_name));
4300 	strlcpy(&inp->inp_e_proc_name[0], &head_inp->inp_e_proc_name[0], sizeof(inp->inp_e_proc_name));
4301 }
4302 
/*
 * proc_iterate() callback: if `proc' holds a management/intcoproc
 * entitlement (or management data is globally unrestricted), walk its
 * open file descriptors and mark every live IPv4/IPv6 socket as
 * allowed (and checked) for management interfaces.
 *
 * Always returns PROC_RETURNED so the iteration continues.
 */
static int
in_check_management_interface_proc_callout(proc_t proc, void *arg __unused)
{
	struct fileproc *fp = NULL;
	task_t task = proc_task(proc);
	bool allowed = false;

	if (IOTaskHasEntitlement(task, INTCOPROC_RESTRICTED_ENTITLEMENT) == true
	    || IOTaskHasEntitlement(task, MANAGEMENT_DATA_ENTITLEMENT) == true
#if DEBUG || DEVELOPMENT
	    || IOTaskHasEntitlement(task, INTCOPROC_RESTRICTED_ENTITLEMENT_DEVELOPMENT) == true
	    || IOTaskHasEntitlement(task, MANAGEMENT_DATA_ENTITLEMENT_DEVELOPMENT) == true
#endif /* DEBUG || DEVELOPMENT */
	    ) {
		allowed = true;
	}
	/* Skip processes that are neither entitled nor globally unrestricted. */
	if (allowed == false && management_data_unrestricted == false) {
		return PROC_RETURNED;
	}

	proc_fdlock(proc);
	fdt_foreach(fp, proc) {
		struct fileglob *fg = fp->fp_glob;
		struct socket *so;
		struct inpcb *inp;

		/* Only socket descriptors are of interest. */
		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}

		so = (struct socket *)fp_get_data(fp);
		if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
			continue;
		}

		/*
		 * NOTE(review): so_pcb is not NULL-checked before use —
		 * presumably an open INET socket descriptor always has a
		 * PCB attached; confirm.
		 */
		inp = (struct inpcb *)so->so_pcb;

		/* Take a use-count reference; skip PCBs being torn down. */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			continue;
		}

		socket_lock(so, 1);

		/* Drop the reference now that the socket is locked; bail if dying. */
		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			socket_unlock(so, 1);
			continue;
		}
		inp->inp_flags2 |= INP2_MANAGEMENT_ALLOWED;
		inp->inp_flags2 |= INP2_MANAGEMENT_CHECKED;

		socket_unlock(so, 1);
	}
	proc_fdunlock(proc);

	return PROC_RETURNED;
}
4359 
/* Set once the one-shot management-interface process scan has run. */
static bool in_management_interface_checked = false;
4361 
4362 static void
in_management_interface_event_callback(struct nwk_wq_entry * nwk_item)4363 in_management_interface_event_callback(struct nwk_wq_entry *nwk_item)
4364 {
4365 	kfree_type(struct nwk_wq_entry, nwk_item);
4366 
4367 	if (in_management_interface_checked == true) {
4368 		return;
4369 	}
4370 	in_management_interface_checked = true;
4371 
4372 	proc_iterate(PROC_ALLPROCLIST,
4373 	    in_check_management_interface_proc_callout,
4374 	    NULL, NULL, NULL);
4375 }
4376 
4377 void
in_management_interface_check(void)4378 in_management_interface_check(void)
4379 {
4380 	struct nwk_wq_entry *nwk_item;
4381 
4382 	if (if_management_interface_check_needed == false ||
4383 	    in_management_interface_checked == true) {
4384 		return;
4385 	}
4386 
4387 	nwk_item  = kalloc_type(struct nwk_wq_entry,
4388 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
4389 
4390 	nwk_item->func = in_management_interface_event_callback;
4391 
4392 	nwk_wq_enqueue(nwk_item);
4393 }
4394