xref: /xnu-8796.101.5/bsd/netinet/in_pcb.c (revision aca3beaa3dfbd42498b42c5e5ce20a938e6554e5)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1982, 1986, 1991, 1993, 1995
30  *	The Regents of the University of California.  All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. All advertising materials mentioning features or use of this software
41  *    must display the following acknowledgement:
42  *	This product includes software developed by the University of
43  *	California, Berkeley and its contributors.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
61  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #include <sys/kernel.h>
74 #include <sys/sysctl.h>
75 #include <sys/mcache.h>
76 #include <sys/kauth.h>
77 #include <sys/priv.h>
78 #include <sys/proc_uuid_policy.h>
79 #include <sys/syslog.h>
80 #include <sys/priv.h>
81 #include <net/dlil.h>
82 
83 #include <libkern/OSAtomic.h>
84 #include <kern/locks.h>
85 
86 #include <machine/limits.h>
87 
88 #include <kern/zalloc.h>
89 
90 #include <net/if.h>
91 #include <net/if_types.h>
92 #include <net/route.h>
93 #include <net/flowhash.h>
94 #include <net/flowadv.h>
95 #include <net/nat464_utils.h>
96 #include <net/ntstat.h>
97 #include <net/restricted_in_port.h>
98 
99 #include <netinet/in.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_var.h>
102 #include <netinet/ip_var.h>
103 
104 #include <netinet/ip6.h>
105 #include <netinet6/ip6_var.h>
106 
107 #include <sys/kdebug.h>
108 #include <sys/random.h>
109 
110 #include <dev/random/randomdev.h>
111 #include <mach/boolean.h>
112 
113 #include <pexpert/pexpert.h>
114 
115 #if NECP
116 #include <net/necp.h>
117 #endif
118 
119 #include <sys/stat.h>
120 #include <sys/ubc.h>
121 #include <sys/vnode.h>
122 
123 #include <os/log.h>
124 
125 #if SKYWALK
126 #include <skywalk/namespace/flowidns.h>
127 #endif /* SKYWALK */
128 
129 extern const char *proc_name_address(struct proc *);
130 
131 static LCK_GRP_DECLARE(inpcb_lock_grp, "inpcb");
132 static LCK_ATTR_DECLARE(inpcb_lock_attr, 0, 0);
133 static LCK_MTX_DECLARE_ATTR(inpcb_lock, &inpcb_lock_grp, &inpcb_lock_attr);
134 static LCK_MTX_DECLARE_ATTR(inpcb_timeout_lock, &inpcb_lock_grp, &inpcb_lock_attr);
135 
136 static TAILQ_HEAD(, inpcbinfo) inpcb_head = TAILQ_HEAD_INITIALIZER(inpcb_head);
137 
138 static u_int16_t inpcb_timeout_run = 0; /* INPCB timer is scheduled to run */
139 static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */
140 static boolean_t inpcb_ticking = FALSE;         /* "slow" timer is scheduled */
141 static boolean_t inpcb_fast_timer_on = FALSE;
142 
143 #define INPCB_GCREQ_THRESHOLD   50000
144 
145 static thread_call_t inpcb_thread_call, inpcb_fast_thread_call;
146 static void inpcb_sched_timeout(void);
147 static void inpcb_sched_lazy_timeout(void);
148 static void _inpcb_sched_timeout(unsigned int);
149 static void inpcb_timeout(void *, void *);
150 const int inpcb_timeout_lazy = 10;      /* 10 seconds leeway for lazy timers */
151 extern int tvtohz(struct timeval *);
152 
153 #if CONFIG_PROC_UUID_POLICY
154 static void inp_update_cellular_policy(struct inpcb *, boolean_t);
155 #if NECP
156 static void inp_update_necp_want_app_policy(struct inpcb *, boolean_t);
157 #endif /* NECP */
158 #endif /* CONFIG_PROC_UUID_POLICY */
159 
160 #define DBG_FNC_PCB_LOOKUP      NETDBG_CODE(DBG_NETTCP, (6 << 8))
161 #define DBG_FNC_PCB_HLOOKUP     NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
162 
163 int allow_udp_port_exhaustion = 0;
164 
165 /*
166  * These configure the range of local port addresses assigned to
167  * "unspecified" outgoing connections/packets/whatever.
168  */
169 int     ipport_lowfirstauto  = IPPORT_RESERVED - 1;     /* 1023 */
170 int     ipport_lowlastauto = IPPORT_RESERVEDSTART;      /* 600 */
171 int     ipport_firstauto = IPPORT_HIFIRSTAUTO;          /* 49152 */
172 int     ipport_lastauto  = IPPORT_HILASTAUTO;           /* 65535 */
173 int     ipport_hifirstauto = IPPORT_HIFIRSTAUTO;        /* 49152 */
174 int     ipport_hilastauto  = IPPORT_HILASTAUTO;         /* 65535 */
175 
176 #define RANGECHK(var, min, max) \
177 	if ((var) < (min)) { (var) = (min); } \
178 	else if ((var) > (max)) { (var) = (max); }
179 
180 static int
181 sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
182 {
183 #pragma unused(arg1, arg2)
184 	int error;
185 	int new_value = *(int *)oidp->oid_arg1;
186 #if (DEBUG | DEVELOPMENT)
187 	int old_value = *(int *)oidp->oid_arg1;
188 	/*
189 	 * For unit testing allow a non-superuser process with the
190 	 * proper entitlement to modify the variables
191 	 */
192 	if (req->newptr) {
193 		if (proc_suser(current_proc()) != 0 &&
194 		    (error = priv_check_cred(kauth_cred_get(),
195 		    PRIV_NETINET_RESERVEDPORT, 0))) {
196 			return EPERM;
197 		}
198 	}
199 #endif /* (DEBUG | DEVELOPMENT) */
200 
201 	error = sysctl_handle_int(oidp, &new_value, 0, req);
202 	if (!error) {
203 		if (oidp->oid_arg1 == &ipport_lowfirstauto || oidp->oid_arg1 == &ipport_lowlastauto) {
204 			RANGECHK(new_value, 1, IPPORT_RESERVED - 1);
205 		} else {
206 			RANGECHK(new_value, IPPORT_RESERVED, USHRT_MAX);
207 		}
208 		*(int *)oidp->oid_arg1 = new_value;
209 	}
210 
211 #if (DEBUG | DEVELOPMENT)
212 	os_log(OS_LOG_DEFAULT,
213 	    "%s:%u sysctl net.restricted_port.verbose: %d -> %d)",
214 	    proc_best_name(current_proc()), proc_selfpid(),
215 	    old_value, *(int *)oidp->oid_arg1);
216 #endif /* (DEBUG | DEVELOPMENT) */
217 
218 	return error;
219 }
220 
221 #undef RANGECHK
222 
223 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
224     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IP Ports");
225 
226 #if (DEBUG | DEVELOPMENT)
227 #define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY)
228 #else
229 #define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED)
230 #endif /* (DEBUG | DEVELOPMENT) */
231 
232 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
233     CTLFAGS_IP_PORTRANGE,
234     &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
235 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
236     CTLFAGS_IP_PORTRANGE,
237     &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
238 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
239     CTLFAGS_IP_PORTRANGE,
240     &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
241 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
242     CTLFAGS_IP_PORTRANGE,
243     &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
244 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
245     CTLFAGS_IP_PORTRANGE,
246     &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
247 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
248     CTLFAGS_IP_PORTRANGE,
249     &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
250 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, ipport_allow_udp_port_exhaustion,
251     CTLFLAG_LOCKED | CTLFLAG_RW, &allow_udp_port_exhaustion, 0, "");
252 
253 static uint32_t apn_fallbk_debug = 0;
254 #define apn_fallbk_log(x)       do { if (apn_fallbk_debug >= 1) log x; } while (0)
255 
256 #if !XNU_TARGET_OS_OSX
257 static boolean_t apn_fallbk_enabled = TRUE;
258 
259 SYSCTL_DECL(_net_inet);
260 SYSCTL_NODE(_net_inet, OID_AUTO, apn_fallback, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "APN Fallback");
261 SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
262     &apn_fallbk_enabled, 0, "APN fallback enable");
263 SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
264     &apn_fallbk_debug, 0, "APN fallback debug enable");
265 #else /* XNU_TARGET_OS_OSX */
266 static boolean_t apn_fallbk_enabled = FALSE;
267 #endif /* XNU_TARGET_OS_OSX */
268 
269 extern int      udp_use_randomport;
270 extern int      tcp_use_randomport;
271 
272 /* Structs used for flowhash computation */
273 struct inp_flowhash_key_addr {
274 	union {
275 		struct in_addr  v4;
276 		struct in6_addr v6;
277 		u_int8_t        addr8[16];
278 		u_int16_t       addr16[8];
279 		u_int32_t       addr32[4];
280 	} infha;
281 };
282 
283 struct inp_flowhash_key {
284 	struct inp_flowhash_key_addr    infh_laddr;
285 	struct inp_flowhash_key_addr    infh_faddr;
286 	u_int32_t                       infh_lport;
287 	u_int32_t                       infh_fport;
288 	u_int32_t                       infh_af;
289 	u_int32_t                       infh_proto;
290 	u_int32_t                       infh_rand1;
291 	u_int32_t                       infh_rand2;
292 };
293 
294 #if !SKYWALK
295 static u_int32_t inp_hash_seed = 0;
296 #endif /* !SKYWALK */
297 
298 static int infc_cmp(const struct inpcb *, const struct inpcb *);
299 
300 /* Flags used by inp_fc_getinp */
301 #define INPFC_SOLOCKED  0x1
302 #define INPFC_REMOVE    0x2
303 static struct inpcb *inp_fc_getinp(u_int32_t, u_int32_t);
304 
305 static void inp_fc_feedback(struct inpcb *);
306 extern void tcp_remove_from_time_wait(struct inpcb *inp);
307 
308 static LCK_MTX_DECLARE_ATTR(inp_fc_lck, &inpcb_lock_grp, &inpcb_lock_attr);
309 
310 RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree;
311 RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp);
312 RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp);
313 
314 /*
315  * Use this inp as a key to find an inp in the flowhash tree.
316  * Accesses to it are protected by inp_fc_lck.
317  */
318 struct inpcb key_inp;
319 
320 /*
321  * in_pcb.c: manage the Protocol Control Blocks.
322  */
323 
324 void
in_pcbinit(void)325 in_pcbinit(void)
326 {
327 	static int inpcb_initialized = 0;
328 
329 	VERIFY(!inpcb_initialized);
330 	inpcb_initialized = 1;
331 
332 	inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout,
333 	    NULL, THREAD_CALL_PRIORITY_KERNEL);
334 	/* Give it an arg so that we know that this is the fast timer */
335 	inpcb_fast_thread_call = thread_call_allocate_with_priority(
336 		inpcb_timeout, &inpcb_timeout, THREAD_CALL_PRIORITY_KERNEL);
337 	if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) {
338 		panic("unable to alloc the inpcb thread call");
339 	}
340 
341 	/*
342 	 * Initialize data structures required to deliver
343 	 * flow advisories.
344 	 */
345 	lck_mtx_lock(&inp_fc_lck);
346 	RB_INIT(&inp_fc_tree);
347 	bzero(&key_inp, sizeof(key_inp));
348 	lck_mtx_unlock(&inp_fc_lck);
349 }
350 
#define INPCB_HAVE_TIMER_REQ(req)       (((req).intimer_lazy > 0) || \
	((req).intimer_fast > 0) || ((req).intimer_nodelay > 0))
/*
 * Thread-call work function shared by the lazy and fast inpcb timers;
 * arg0 is non-NULL only for the fast timer instance (see in_pcbinit()).
 * Runs the garbage-collection and/or slow-timer callbacks of every
 * registered inpcbinfo, then decides how aggressively to re-arm.
 */
static void
inpcb_timeout(void *arg0, void *arg1)
{
#pragma unused(arg1)
	struct inpcbinfo *ipi;
	boolean_t t, gc;
	struct intimercount gccnt, tmcnt;

	/*
	 * Update coarse-grained networking timestamp (in sec.); the idea
	 * is to piggy-back on the timeout callout to update the counter
	 * returnable via net_uptime().
	 */
	net_update_uptime();

	bzero(&gccnt, sizeof(gccnt));
	bzero(&tmcnt, sizeof(tmcnt));

	/* Consume the pending gc/ticking requests under the timeout lock. */
	lck_mtx_lock_spin(&inpcb_timeout_lock);
	gc = inpcb_garbage_collecting;
	inpcb_garbage_collecting = FALSE;

	t = inpcb_ticking;
	inpcb_ticking = FALSE;

	if (gc || t) {
		/*
		 * The callbacks below may block; drop the (spin-acquired)
		 * timeout lock and walk the inpcbinfo list under
		 * inpcb_lock instead.
		 */
		lck_mtx_unlock(&inpcb_timeout_lock);

		lck_mtx_lock(&inpcb_lock);
		TAILQ_FOREACH(ipi, &inpcb_head, ipi_entry) {
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_gc_req)) {
				/*
				 * Reset the request counters before the
				 * callback; ipi_gc repopulates them, so the
				 * sums below reflect only work queued
				 * during this pass.
				 */
				bzero(&ipi->ipi_gc_req,
				    sizeof(ipi->ipi_gc_req));
				if (gc && ipi->ipi_gc != NULL) {
					ipi->ipi_gc(ipi);
					gccnt.intimer_lazy +=
					    ipi->ipi_gc_req.intimer_lazy;
					gccnt.intimer_fast +=
					    ipi->ipi_gc_req.intimer_fast;
					gccnt.intimer_nodelay +=
					    ipi->ipi_gc_req.intimer_nodelay;
				}
			}
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_timer_req)) {
				/*
				 * Same reset-then-recount dance for the
				 * slow-timer requests.
				 */
				bzero(&ipi->ipi_timer_req,
				    sizeof(ipi->ipi_timer_req));
				if (t && ipi->ipi_timer != NULL) {
					ipi->ipi_timer(ipi);
					tmcnt.intimer_lazy +=
					    ipi->ipi_timer_req.intimer_lazy;
					tmcnt.intimer_fast +=
					    ipi->ipi_timer_req.intimer_fast;
					tmcnt.intimer_nodelay +=
					    ipi->ipi_timer_req.intimer_nodelay;
				}
			}
		}
		lck_mtx_unlock(&inpcb_lock);
		lck_mtx_lock_spin(&inpcb_timeout_lock);
	}

	/* lock was dropped above, so check first before overriding */
	if (!inpcb_garbage_collecting) {
		inpcb_garbage_collecting = INPCB_HAVE_TIMER_REQ(gccnt);
	}
	if (!inpcb_ticking) {
		inpcb_ticking = INPCB_HAVE_TIMER_REQ(tmcnt);
	}

	/* arg0 will be set if we are the fast timer */
	if (arg0 != NULL) {
		inpcb_fast_timer_on = FALSE;
	}
	inpcb_timeout_run--;
	/*
	 * inpcb_timeout_run is unsigned (u_int16_t), so the ">= 0" half
	 * is trivially true; the check effectively enforces run < 2.
	 */
	VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2);

	/* re-arm the timer if there's work to do */
	if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) {
		inpcb_sched_timeout();
	} else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) {
		/* be lazy when idle with little activity */
		inpcb_sched_lazy_timeout();
	} else {
		inpcb_sched_timeout();
	}

	lck_mtx_unlock(&inpcb_timeout_lock);
}
441 
/*
 * Request an immediate (no-leeway, "fast") run of the inpcb timer.
 */
static void
inpcb_sched_timeout(void)
{
	_inpcb_sched_timeout(0);
}
447 
448 static void
inpcb_sched_lazy_timeout(void)449 inpcb_sched_lazy_timeout(void)
450 {
451 	_inpcb_sched_timeout(inpcb_timeout_lazy);
452 }
453 
/*
 * Arm the inpcb timer to fire in one second, with "offset" seconds of
 * leeway; offset == 0 requests the fast (no-leeway) timer instance.
 * Called with inpcb_timeout_lock held, possibly spin-acquired (callers
 * use lck_mtx_lock_spin); it is converted to a full mutex before the
 * thread-call enter calls.
 */
static void
_inpcb_sched_timeout(unsigned int offset)
{
	uint64_t deadline, leeway;

	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);
	LCK_MTX_ASSERT(&inpcb_timeout_lock, LCK_MTX_ASSERT_OWNED);
	if (inpcb_timeout_run == 0 &&
	    (inpcb_garbage_collecting || inpcb_ticking)) {
		/* No timer instance pending and there is work: arm one. */
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		if (offset == 0) {
			inpcb_fast_timer_on = TRUE;
			thread_call_enter_delayed(inpcb_fast_thread_call,
			    deadline);
		} else {
			inpcb_fast_timer_on = FALSE;
			clock_interval_to_absolutetime_interval(offset,
			    NSEC_PER_SEC, &leeway);
			thread_call_enter_delayed_with_leeway(
				inpcb_thread_call, NULL, deadline, leeway,
				THREAD_CALL_DELAY_LEEWAY);
		}
	} else if (inpcb_timeout_run == 1 &&
	    offset == 0 && !inpcb_fast_timer_on) {
		/*
		 * Since the request was for a fast timer but the
		 * scheduled timer is a lazy timer, try to schedule
		 * another instance of fast timer also.
		 */
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		inpcb_fast_timer_on = TRUE;
		thread_call_enter_delayed(inpcb_fast_thread_call, deadline);
	}
}
490 
491 void
inpcb_gc_sched(struct inpcbinfo * ipi,u_int32_t type)492 inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type)
493 {
494 	u_int32_t gccnt;
495 
496 	lck_mtx_lock_spin(&inpcb_timeout_lock);
497 	inpcb_garbage_collecting = TRUE;
498 	gccnt = ipi->ipi_gc_req.intimer_nodelay +
499 	    ipi->ipi_gc_req.intimer_fast;
500 
501 	if (gccnt > INPCB_GCREQ_THRESHOLD) {
502 		type = INPCB_TIMER_FAST;
503 	}
504 
505 	switch (type) {
506 	case INPCB_TIMER_NODELAY:
507 		atomic_add_32(&ipi->ipi_gc_req.intimer_nodelay, 1);
508 		inpcb_sched_timeout();
509 		break;
510 	case INPCB_TIMER_FAST:
511 		atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
512 		inpcb_sched_timeout();
513 		break;
514 	default:
515 		atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
516 		inpcb_sched_lazy_timeout();
517 		break;
518 	}
519 	lck_mtx_unlock(&inpcb_timeout_lock);
520 }
521 
522 void
inpcb_timer_sched(struct inpcbinfo * ipi,u_int32_t type)523 inpcb_timer_sched(struct inpcbinfo *ipi, u_int32_t type)
524 {
525 	lck_mtx_lock_spin(&inpcb_timeout_lock);
526 	inpcb_ticking = TRUE;
527 	switch (type) {
528 	case INPCB_TIMER_NODELAY:
529 		atomic_add_32(&ipi->ipi_timer_req.intimer_nodelay, 1);
530 		inpcb_sched_timeout();
531 		break;
532 	case INPCB_TIMER_FAST:
533 		atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
534 		inpcb_sched_timeout();
535 		break;
536 	default:
537 		atomic_add_32(&ipi->ipi_timer_req.intimer_lazy, 1);
538 		inpcb_sched_lazy_timeout();
539 		break;
540 	}
541 	lck_mtx_unlock(&inpcb_timeout_lock);
542 }
543 
544 void
in_pcbinfo_attach(struct inpcbinfo * ipi)545 in_pcbinfo_attach(struct inpcbinfo *ipi)
546 {
547 	struct inpcbinfo *ipi0;
548 
549 	lck_mtx_lock(&inpcb_lock);
550 	TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
551 		if (ipi0 == ipi) {
552 			panic("%s: ipi %p already in the list",
553 			    __func__, ipi);
554 			/* NOTREACHED */
555 		}
556 	}
557 	TAILQ_INSERT_TAIL(&inpcb_head, ipi, ipi_entry);
558 	lck_mtx_unlock(&inpcb_lock);
559 }
560 
561 int
in_pcbinfo_detach(struct inpcbinfo * ipi)562 in_pcbinfo_detach(struct inpcbinfo *ipi)
563 {
564 	struct inpcbinfo *ipi0;
565 	int error = 0;
566 
567 	lck_mtx_lock(&inpcb_lock);
568 	TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
569 		if (ipi0 == ipi) {
570 			break;
571 		}
572 	}
573 	if (ipi0 != NULL) {
574 		TAILQ_REMOVE(&inpcb_head, ipi0, ipi_entry);
575 	} else {
576 		error = ENXIO;
577 	}
578 	lck_mtx_unlock(&inpcb_lock);
579 
580 	return error;
581 }
582 
583 /*
584  * Allocate a PCB and associate it with the socket.
585  *
586  * Returns:	0			Success
587  *		ENOBUFS
588  *		ENOMEM
589  */
590 int
in_pcballoc(struct socket * so,struct inpcbinfo * pcbinfo,struct proc * p)591 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p)
592 {
593 #pragma unused(p)
594 	struct inpcb *inp;
595 	caddr_t temp;
596 
597 	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
598 		inp = zalloc_flags(pcbinfo->ipi_zone,
599 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
600 	} else {
601 		inp = (struct inpcb *)(void *)so->so_saved_pcb;
602 		temp = inp->inp_saved_ppcb;
603 		bzero((caddr_t)inp, sizeof(*inp));
604 		inp->inp_saved_ppcb = temp;
605 	}
606 
607 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
608 	inp->inp_pcbinfo = pcbinfo;
609 	inp->inp_socket = so;
610 	/* make sure inp_stat is always 64-bit aligned */
611 	inp->inp_stat = (struct inp_stat *)P2ROUNDUP(inp->inp_stat_store,
612 	    sizeof(u_int64_t));
613 	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) +
614 	    sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
615 		panic("%s: insufficient space to align inp_stat", __func__);
616 		/* NOTREACHED */
617 	}
618 
619 	/* make sure inp_cstat is always 64-bit aligned */
620 	inp->inp_cstat = (struct inp_stat *)P2ROUNDUP(inp->inp_cstat_store,
621 	    sizeof(u_int64_t));
622 	if (((uintptr_t)inp->inp_cstat - (uintptr_t)inp->inp_cstat_store) +
623 	    sizeof(*inp->inp_cstat) > sizeof(inp->inp_cstat_store)) {
624 		panic("%s: insufficient space to align inp_cstat", __func__);
625 		/* NOTREACHED */
626 	}
627 
628 	/* make sure inp_wstat is always 64-bit aligned */
629 	inp->inp_wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_wstat_store,
630 	    sizeof(u_int64_t));
631 	if (((uintptr_t)inp->inp_wstat - (uintptr_t)inp->inp_wstat_store) +
632 	    sizeof(*inp->inp_wstat) > sizeof(inp->inp_wstat_store)) {
633 		panic("%s: insufficient space to align inp_wstat", __func__);
634 		/* NOTREACHED */
635 	}
636 
637 	/* make sure inp_Wstat is always 64-bit aligned */
638 	inp->inp_Wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_Wstat_store,
639 	    sizeof(u_int64_t));
640 	if (((uintptr_t)inp->inp_Wstat - (uintptr_t)inp->inp_Wstat_store) +
641 	    sizeof(*inp->inp_Wstat) > sizeof(inp->inp_Wstat_store)) {
642 		panic("%s: insufficient space to align inp_Wstat", __func__);
643 		/* NOTREACHED */
644 	}
645 
646 	so->so_pcb = (caddr_t)inp;
647 
648 	if (so->so_proto->pr_flags & PR_PCBLOCK) {
649 		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->ipi_lock_grp,
650 		    &pcbinfo->ipi_lock_attr);
651 	}
652 
653 	if (SOCK_DOM(so) == PF_INET6 && !ip6_mapped_addr_on) {
654 		inp->inp_flags |= IN6P_IPV6_V6ONLY;
655 	}
656 
657 	if (ip6_auto_flowlabel) {
658 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
659 	}
660 	if (intcoproc_unrestricted) {
661 		inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;
662 	}
663 
664 	(void) inp_update_policy(inp);
665 
666 	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
667 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
668 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
669 	pcbinfo->ipi_count++;
670 	lck_rw_done(&pcbinfo->ipi_lock);
671 	return 0;
672 }
673 
674 /*
675  * in_pcblookup_local_and_cleanup does everything
676  * in_pcblookup_local does but it checks for a socket
677  * that's going away. Since we know that the lock is
678  * held read+write when this function is called, we
679  * can safely dispose of this socket like the slow
680  * timer would usually do and return NULL. This is
681  * great for bind.
682  */
683 struct inpcb *
in_pcblookup_local_and_cleanup(struct inpcbinfo * pcbinfo,struct in_addr laddr,u_int lport_arg,int wild_okay)684 in_pcblookup_local_and_cleanup(struct inpcbinfo *pcbinfo, struct in_addr laddr,
685     u_int lport_arg, int wild_okay)
686 {
687 	struct inpcb *inp;
688 
689 	/* Perform normal lookup */
690 	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);
691 
692 	/* Check if we found a match but it's waiting to be disposed */
693 	if (inp != NULL && inp->inp_wantcnt == WNT_STOPUSING) {
694 		struct socket *so = inp->inp_socket;
695 
696 		socket_lock(so, 0);
697 
698 		if (so->so_usecount == 0) {
699 			if (inp->inp_state != INPCB_STATE_DEAD) {
700 				in_pcbdetach(inp);
701 			}
702 			in_pcbdispose(inp);     /* will unlock & destroy */
703 			inp = NULL;
704 		} else {
705 			socket_unlock(so, 0);
706 		}
707 	}
708 
709 	return inp;
710 }
711 
712 static void
in_pcb_conflict_post_msg(u_int16_t port)713 in_pcb_conflict_post_msg(u_int16_t port)
714 {
715 	/*
716 	 * Radar 5523020 send a kernel event notification if a
717 	 * non-participating socket tries to bind the port a socket
718 	 * who has set SOF_NOTIFYCONFLICT owns.
719 	 */
720 	struct kev_msg ev_msg;
721 	struct kev_in_portinuse in_portinuse;
722 
723 	bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
724 	bzero(&ev_msg, sizeof(struct kev_msg));
725 	in_portinuse.port = ntohs(port);        /* port in host order */
726 	in_portinuse.req_pid = proc_selfpid();
727 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
728 	ev_msg.kev_class = KEV_NETWORK_CLASS;
729 	ev_msg.kev_subclass = KEV_INET_SUBCLASS;
730 	ev_msg.event_code = KEV_INET_PORTINUSE;
731 	ev_msg.dv[0].data_ptr = &in_portinuse;
732 	ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
733 	ev_msg.dv[1].data_length = 0;
734 	dlil_post_complete_msg(NULL, &ev_msg);
735 }
736 
737 /*
738  * Bind an INPCB to an address and/or port.  This routine should not alter
739  * the caller-supplied local address "nam".
740  *
741  * Returns:	0			Success
742  *		EADDRNOTAVAIL		Address not available.
743  *		EINVAL			Invalid argument
744  *		EAFNOSUPPORT		Address family not supported [notdef]
745  *		EACCES			Permission denied
746  *		EADDRINUSE		Address in use
747  *		EAGAIN			Resource unavailable, try again
748  *		priv_check_cred:EPERM	Operation not permitted
749  */
750 int
in_pcbbind(struct inpcb * inp,struct sockaddr * nam,struct proc * p)751 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
752 {
753 	struct socket *so = inp->inp_socket;
754 	unsigned short *lastport;
755 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
756 	u_short lport = 0, rand_port = 0;
757 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
758 	int error, randomport, conflict = 0;
759 	boolean_t anonport = FALSE;
760 	kauth_cred_t cred;
761 	struct in_addr laddr;
762 	struct ifnet *outif = NULL;
763 
764 	if (TAILQ_EMPTY(&in_ifaddrhead)) { /* XXX broken! */
765 		return EADDRNOTAVAIL;
766 	}
767 	if (!(so->so_options & (SO_REUSEADDR | SO_REUSEPORT))) {
768 		wild = 1;
769 	}
770 
771 	bzero(&laddr, sizeof(laddr));
772 
773 	socket_unlock(so, 0); /* keep reference on socket */
774 	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
775 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
776 		/* another thread completed the bind */
777 		lck_rw_done(&pcbinfo->ipi_lock);
778 		socket_lock(so, 0);
779 		return EINVAL;
780 	}
781 
782 	if (nam != NULL) {
783 		if (nam->sa_len != sizeof(struct sockaddr_in)) {
784 			lck_rw_done(&pcbinfo->ipi_lock);
785 			socket_lock(so, 0);
786 			return EINVAL;
787 		}
788 #if 0
789 		/*
790 		 * We should check the family, but old programs
791 		 * incorrectly fail to initialize it.
792 		 */
793 		if (nam->sa_family != AF_INET) {
794 			lck_rw_done(&pcbinfo->ipi_lock);
795 			socket_lock(so, 0);
796 			return EAFNOSUPPORT;
797 		}
798 #endif /* 0 */
799 		lport = SIN(nam)->sin_port;
800 
801 		if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr))) {
802 			/*
803 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
804 			 * allow complete duplication of binding if
805 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
806 			 * and a multicast address is bound on both
807 			 * new and duplicated sockets.
808 			 */
809 			if (so->so_options & SO_REUSEADDR) {
810 				reuseport = SO_REUSEADDR | SO_REUSEPORT;
811 			}
812 		} else if (SIN(nam)->sin_addr.s_addr != INADDR_ANY) {
813 			struct sockaddr_in sin;
814 			struct ifaddr *ifa;
815 
816 			/* Sanitized for interface address searches */
817 			bzero(&sin, sizeof(sin));
818 			sin.sin_family = AF_INET;
819 			sin.sin_len = sizeof(struct sockaddr_in);
820 			sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
821 
822 			ifa = ifa_ifwithaddr(SA(&sin));
823 			if (ifa == NULL) {
824 				lck_rw_done(&pcbinfo->ipi_lock);
825 				socket_lock(so, 0);
826 				return EADDRNOTAVAIL;
827 			} else {
828 				/*
829 				 * Opportunistically determine the outbound
830 				 * interface that may be used; this may not
831 				 * hold true if we end up using a route
832 				 * going over a different interface, e.g.
833 				 * when sending to a local address.  This
834 				 * will get updated again after sending.
835 				 */
836 				IFA_LOCK(ifa);
837 				outif = ifa->ifa_ifp;
838 				IFA_UNLOCK(ifa);
839 				IFA_REMREF(ifa);
840 			}
841 		}
842 
843 #if SKYWALK
844 		if (inp->inp_flags2 & INP2_EXTERNAL_PORT) {
845 			// Extract the external flow info
846 			struct ns_flow_info nfi = {};
847 			error = necp_client_get_netns_flow_info(inp->necp_client_uuid,
848 			    &nfi);
849 			if (error != 0) {
850 				lck_rw_done(&pcbinfo->ipi_lock);
851 				socket_lock(so, 0);
852 				return error;
853 			}
854 
855 			// Extract the reserved port
856 			u_int16_t reserved_lport = 0;
857 			if (nfi.nfi_laddr.sa.sa_family == AF_INET) {
858 				reserved_lport = nfi.nfi_laddr.sin.sin_port;
859 			} else if (nfi.nfi_laddr.sa.sa_family == AF_INET6) {
860 				reserved_lport = nfi.nfi_laddr.sin6.sin6_port;
861 			} else {
862 				lck_rw_done(&pcbinfo->ipi_lock);
863 				socket_lock(so, 0);
864 				return EINVAL;
865 			}
866 
867 			// Validate or use the reserved port
868 			if (lport == 0) {
869 				lport = reserved_lport;
870 			} else if (lport != reserved_lport) {
871 				lck_rw_done(&pcbinfo->ipi_lock);
872 				socket_lock(so, 0);
873 				return EINVAL;
874 			}
875 		}
876 
877 		/* Do not allow reserving a UDP port if remaining UDP port count is below 4096 */
878 		if (SOCK_PROTO(so) == IPPROTO_UDP && !allow_udp_port_exhaustion) {
879 			uint32_t current_reservations = 0;
880 			if (inp->inp_vflag & INP_IPV6) {
881 				current_reservations = netns_lookup_reservations_count_in6(inp->in6p_laddr, IPPROTO_UDP);
882 			} else {
883 				current_reservations = netns_lookup_reservations_count_in(inp->inp_laddr, IPPROTO_UDP);
884 			}
885 			if (USHRT_MAX - UDP_RANDOM_PORT_RESERVE < current_reservations) {
886 				log(LOG_ERR, "UDP port not available, less than 4096 UDP ports left");
887 				lck_rw_done(&pcbinfo->ipi_lock);
888 				socket_lock(so, 0);
889 				return EADDRNOTAVAIL;
890 			}
891 		}
892 
893 #endif /* SKYWALK */
894 
895 		if (lport != 0) {
896 			struct inpcb *t;
897 			uid_t u;
898 
899 #if XNU_TARGET_OS_OSX
900 			if (ntohs(lport) < IPPORT_RESERVED &&
901 			    SIN(nam)->sin_addr.s_addr != 0 &&
902 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
903 				cred = kauth_cred_proc_ref(p);
904 				error = priv_check_cred(cred,
905 				    PRIV_NETINET_RESERVEDPORT, 0);
906 				kauth_cred_unref(&cred);
907 				if (error != 0) {
908 					lck_rw_done(&pcbinfo->ipi_lock);
909 					socket_lock(so, 0);
910 					return EACCES;
911 				}
912 			}
913 #endif /* XNU_TARGET_OS_OSX */
914 			/*
915 			 * Check whether the process is allowed to bind to a restricted port
916 			 */
917 			if (!current_task_can_use_restricted_in_port(lport,
918 			    (uint8_t)so->so_proto->pr_protocol, PORT_FLAGS_BSD)) {
919 				lck_rw_done(&pcbinfo->ipi_lock);
920 				socket_lock(so, 0);
921 				return EADDRINUSE;
922 			}
923 
924 			if (!IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
925 			    (u = kauth_cred_getuid(so->so_cred)) != 0 &&
926 			    (t = in_pcblookup_local_and_cleanup(
927 				    inp->inp_pcbinfo, SIN(nam)->sin_addr, lport,
928 				    INPLOOKUP_WILDCARD)) != NULL &&
929 			    (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
930 			    t->inp_laddr.s_addr != INADDR_ANY ||
931 			    !(t->inp_socket->so_options & SO_REUSEPORT)) &&
932 			    (u != kauth_cred_getuid(t->inp_socket->so_cred)) &&
933 			    !(t->inp_socket->so_flags & SOF_REUSESHAREUID) &&
934 			    (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
935 			    t->inp_laddr.s_addr != INADDR_ANY) &&
936 			    (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
937 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
938 			    uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
939 				if ((t->inp_socket->so_flags &
940 				    SOF_NOTIFYCONFLICT) &&
941 				    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
942 					conflict = 1;
943 				}
944 
945 				lck_rw_done(&pcbinfo->ipi_lock);
946 
947 				if (conflict) {
948 					in_pcb_conflict_post_msg(lport);
949 				}
950 
951 				socket_lock(so, 0);
952 				return EADDRINUSE;
953 			}
954 			t = in_pcblookup_local_and_cleanup(pcbinfo,
955 			    SIN(nam)->sin_addr, lport, wild);
956 			if (t != NULL &&
957 			    (reuseport & t->inp_socket->so_options) == 0 &&
958 			    (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
959 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
960 			    uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
961 				if (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
962 				    t->inp_laddr.s_addr != INADDR_ANY ||
963 				    SOCK_DOM(so) != PF_INET6 ||
964 				    SOCK_DOM(t->inp_socket) != PF_INET6) {
965 					if ((t->inp_socket->so_flags &
966 					    SOF_NOTIFYCONFLICT) &&
967 					    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
968 						conflict = 1;
969 					}
970 
971 					lck_rw_done(&pcbinfo->ipi_lock);
972 
973 					if (conflict) {
974 						in_pcb_conflict_post_msg(lport);
975 					}
976 					socket_lock(so, 0);
977 					return EADDRINUSE;
978 				}
979 			}
980 #if SKYWALK
981 			if ((SOCK_PROTO(so) == IPPROTO_TCP ||
982 			    SOCK_PROTO(so) == IPPROTO_UDP) &&
983 			    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
984 				int res_err = 0;
985 				if (inp->inp_vflag & INP_IPV6) {
986 					res_err = netns_reserve_in6(
987 						&inp->inp_netns_token,
988 						SIN6(nam)->sin6_addr,
989 						(uint8_t)SOCK_PROTO(so), lport, NETNS_BSD,
990 						NULL);
991 				} else {
992 					res_err = netns_reserve_in(
993 						&inp->inp_netns_token,
994 						SIN(nam)->sin_addr, (uint8_t)SOCK_PROTO(so),
995 						lport, NETNS_BSD, NULL);
996 				}
997 				if (res_err != 0) {
998 					lck_rw_done(&pcbinfo->ipi_lock);
999 					socket_lock(so, 0);
1000 					return EADDRINUSE;
1001 				}
1002 			}
1003 #endif /* SKYWALK */
1004 		}
1005 		laddr = SIN(nam)->sin_addr;
1006 	}
1007 	if (lport == 0) {
1008 		u_short first, last;
1009 		int count;
1010 		bool found;
1011 
1012 		/*
1013 		 * Override wild = 1 for implicit bind (mainly used by connect)
1014 		 * For implicit bind (lport == 0), we always use an unused port,
1015 		 * so REUSEADDR|REUSEPORT don't apply
1016 		 */
1017 		wild = 1;
1018 
1019 		randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
1020 		    (so->so_type == SOCK_STREAM ? tcp_use_randomport :
1021 		    udp_use_randomport);
1022 
1023 		/*
1024 		 * Even though this looks similar to the code in
1025 		 * in6_pcbsetport, the v6 vs v4 checks are different.
1026 		 */
1027 		anonport = TRUE;
1028 		if (inp->inp_flags & INP_HIGHPORT) {
1029 			first = (u_short)ipport_hifirstauto;     /* sysctl */
1030 			last  = (u_short)ipport_hilastauto;
1031 			lastport = &pcbinfo->ipi_lasthi;
1032 		} else if (inp->inp_flags & INP_LOWPORT) {
1033 			cred = kauth_cred_proc_ref(p);
1034 			error = priv_check_cred(cred,
1035 			    PRIV_NETINET_RESERVEDPORT, 0);
1036 			kauth_cred_unref(&cred);
1037 			if (error != 0) {
1038 				lck_rw_done(&pcbinfo->ipi_lock);
1039 				socket_lock(so, 0);
1040 				return error;
1041 			}
1042 			first = (u_short)ipport_lowfirstauto;    /* 1023 */
1043 			last  = (u_short)ipport_lowlastauto;     /* 600 */
1044 			lastport = &pcbinfo->ipi_lastlow;
1045 		} else {
1046 			first = (u_short)ipport_firstauto;       /* sysctl */
1047 			last  = (u_short)ipport_lastauto;
1048 			lastport = &pcbinfo->ipi_lastport;
1049 		}
1050 		/* No point in randomizing if only one port is available */
1051 
1052 		if (first == last) {
1053 			randomport = 0;
1054 		}
1055 		/*
1056 		 * Simple check to ensure all ports are not used up causing
1057 		 * a deadlock here.
1058 		 *
1059 		 * We split the two cases (up and down) so that the direction
1060 		 * is not being tested on each round of the loop.
1061 		 */
1062 		if (first > last) {
1063 			struct in_addr lookup_addr;
1064 
1065 			/*
1066 			 * counting down
1067 			 */
1068 			if (randomport) {
1069 				read_frandom(&rand_port, sizeof(rand_port));
1070 				*lastport =
1071 				    first - (rand_port % (first - last));
1072 			}
1073 			count = first - last;
1074 
1075 			lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1076 			    inp->inp_laddr;
1077 
1078 			found = false;
1079 			do {
1080 				if (count-- < 0) {      /* completely used? */
1081 					lck_rw_done(&pcbinfo->ipi_lock);
1082 					socket_lock(so, 0);
1083 					return EADDRNOTAVAIL;
1084 				}
1085 				--*lastport;
1086 				if (*lastport > first || *lastport < last) {
1087 					*lastport = first;
1088 				}
1089 				lport = htons(*lastport);
1090 
1091 				/*
1092 				 * Skip if this is a restricted port as we do not want to
1093 				 * restricted ports as ephemeral
1094 				 */
1095 				if (IS_RESTRICTED_IN_PORT(lport)) {
1096 					continue;
1097 				}
1098 
1099 				found = in_pcblookup_local_and_cleanup(pcbinfo,
1100 				    lookup_addr, lport, wild) == NULL;
1101 #if SKYWALK
1102 				if (found &&
1103 				    (SOCK_PROTO(so) == IPPROTO_TCP ||
1104 				    SOCK_PROTO(so) == IPPROTO_UDP) &&
1105 				    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1106 					int res_err;
1107 					if (inp->inp_vflag & INP_IPV6) {
1108 						res_err = netns_reserve_in6(
1109 							&inp->inp_netns_token,
1110 							inp->in6p_laddr,
1111 							(uint8_t)SOCK_PROTO(so), lport,
1112 							NETNS_BSD, NULL);
1113 					} else {
1114 						res_err = netns_reserve_in(
1115 							&inp->inp_netns_token,
1116 							lookup_addr, (uint8_t)SOCK_PROTO(so),
1117 							lport, NETNS_BSD, NULL);
1118 					}
1119 					found = res_err == 0;
1120 				}
1121 #endif /* SKYWALK */
1122 			} while (!found);
1123 		} else {
1124 			struct in_addr lookup_addr;
1125 
1126 			/*
1127 			 * counting up
1128 			 */
1129 			if (randomport) {
1130 				read_frandom(&rand_port, sizeof(rand_port));
1131 				*lastport =
1132 				    first + (rand_port % (first - last));
1133 			}
1134 			count = last - first;
1135 
1136 			lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1137 			    inp->inp_laddr;
1138 
1139 			found = false;
1140 			do {
1141 				if (count-- < 0) {      /* completely used? */
1142 					lck_rw_done(&pcbinfo->ipi_lock);
1143 					socket_lock(so, 0);
1144 					return EADDRNOTAVAIL;
1145 				}
1146 				++*lastport;
1147 				if (*lastport < first || *lastport > last) {
1148 					*lastport = first;
1149 				}
1150 				lport = htons(*lastport);
1151 
1152 				/*
1153 				 * Skip if this is a restricted port as we do not want to
1154 				 * restricted ports as ephemeral
1155 				 */
1156 				if (IS_RESTRICTED_IN_PORT(lport)) {
1157 					continue;
1158 				}
1159 
1160 				found = in_pcblookup_local_and_cleanup(pcbinfo,
1161 				    lookup_addr, lport, wild) == NULL;
1162 #if SKYWALK
1163 				if (found &&
1164 				    (SOCK_PROTO(so) == IPPROTO_TCP ||
1165 				    SOCK_PROTO(so) == IPPROTO_UDP) &&
1166 				    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1167 					int res_err;
1168 					if (inp->inp_vflag & INP_IPV6) {
1169 						res_err = netns_reserve_in6(
1170 							&inp->inp_netns_token,
1171 							inp->in6p_laddr,
1172 							(uint8_t)SOCK_PROTO(so), lport,
1173 							NETNS_BSD, NULL);
1174 					} else {
1175 						res_err = netns_reserve_in(
1176 							&inp->inp_netns_token,
1177 							lookup_addr, (uint8_t)SOCK_PROTO(so),
1178 							lport, NETNS_BSD, NULL);
1179 					}
1180 					found = res_err == 0;
1181 				}
1182 #endif /* SKYWALK */
1183 			} while (!found);
1184 		}
1185 	}
1186 	socket_lock(so, 0);
1187 
1188 	/*
1189 	 * We unlocked socket's protocol lock for a long time.
1190 	 * The socket might have been dropped/defuncted.
1191 	 * Checking if world has changed since.
1192 	 */
1193 	if (inp->inp_state == INPCB_STATE_DEAD) {
1194 #if SKYWALK
1195 		netns_release(&inp->inp_netns_token);
1196 #endif /* SKYWALK */
1197 		lck_rw_done(&pcbinfo->ipi_lock);
1198 		return ECONNABORTED;
1199 	}
1200 
1201 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
1202 #if SKYWALK
1203 		netns_release(&inp->inp_netns_token);
1204 #endif /* SKYWALK */
1205 		lck_rw_done(&pcbinfo->ipi_lock);
1206 		return EINVAL;
1207 	}
1208 
1209 	if (laddr.s_addr != INADDR_ANY) {
1210 		inp->inp_laddr = laddr;
1211 		inp->inp_last_outifp = outif;
1212 #if SKYWALK
1213 		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
1214 			netns_set_ifnet(&inp->inp_netns_token, outif);
1215 		}
1216 #endif /* SKYWALK */
1217 	}
1218 	inp->inp_lport = lport;
1219 	if (anonport) {
1220 		inp->inp_flags |= INP_ANONPORT;
1221 	}
1222 
1223 	if (in_pcbinshash(inp, 1) != 0) {
1224 		inp->inp_laddr.s_addr = INADDR_ANY;
1225 		inp->inp_last_outifp = NULL;
1226 
1227 #if SKYWALK
1228 		netns_release(&inp->inp_netns_token);
1229 #endif /* SKYWALK */
1230 		inp->inp_lport = 0;
1231 		if (anonport) {
1232 			inp->inp_flags &= ~INP_ANONPORT;
1233 		}
1234 		lck_rw_done(&pcbinfo->ipi_lock);
1235 		return EAGAIN;
1236 	}
1237 	lck_rw_done(&pcbinfo->ipi_lock);
1238 	sflt_notify(so, sock_evt_bound, NULL);
1239 	return 0;
1240 }
1241 
/*
 * True when the IPv4 destination (a struct sockaddr_in *) is in an address
 * range for which APN fallback must never be triggered: link-local,
 * loopback, zeronet, multicast, or private address space.
 */
#define APN_FALLBACK_IP_FILTER(a)       \
	(IN_LINKLOCAL(ntohl((a)->sin_addr.s_addr)) || \
	 IN_LOOPBACK(ntohl((a)->sin_addr.s_addr)) || \
	 IN_ZERONET(ntohl((a)->sin_addr.s_addr)) || \
	 IN_MULTICAST(ntohl((a)->sin_addr.s_addr)) || \
	 IN_PRIVATE(ntohl((a)->sin_addr.s_addr)))

/* Minimum spacing (seconds of net_uptime) between fallback notifications */
#define APN_FALLBACK_NOTIF_INTERVAL     2 /* Magic Number */
/* net_uptime() at which the last notification was issued; used to throttle */
static uint64_t last_apn_fallback = 0;
1251 
/*
 * Decide whether an APN fallback kernel event should be posted for an
 * outbound IPv4 connection attempt.  Returns TRUE only when every gating
 * check below passes; each early return documents the condition that
 * disqualifies the notification.
 *
 * @param proc     calling process (must not be NULL; see VERIFY)
 * @param so       socket attempting the connection (may be NULL)
 * @param p_dstv4  IPv4 destination, or NULL to skip the address filter
 */
static boolean_t
apn_fallback_required(proc_t proc, struct socket *so, struct sockaddr_in *p_dstv4)
{
	uint64_t timenow;
	struct sockaddr_storage lookup_default_addr;
	struct rtentry *rt = NULL;

	VERIFY(proc != NULL);

	/* Global feature gate must be enabled */
	if (apn_fallbk_enabled == FALSE) {
		return FALSE;
	}

	/* Kernel-originated traffic never triggers the notification */
	if (proc == kernproc) {
		return FALSE;
	}

	/* The socket can opt out via the SO_NOAPNFALLBK option */
	if (so && (so->so_options & SO_NOAPNFALLBK)) {
		return FALSE;
	}

	/* Throttle: at most one notification per APN_FALLBACK_NOTIF_INTERVAL */
	timenow = net_uptime();
	if ((timenow - last_apn_fallback) < APN_FALLBACK_NOTIF_INTERVAL) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification throttled.\n"));
		return FALSE;
	}

	/* Skip link-local/loopback/zeronet/multicast/private destinations */
	if (p_dstv4 && APN_FALLBACK_IP_FILTER(p_dstv4)) {
		return FALSE;
	}

	/* Check if we have unscoped IPv6 default route through cellular */
	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET6;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in6);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);
	if (NULL == rt) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route.\n"));
		return FALSE;
	}

	if (!IFNET_IS_CELLULAR(rt->rt_ifp)) {
		/* Drop the route reference taken by rtalloc1() */
		rtfree(rt);
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route through cellular interface.\n"));
		return FALSE;
	}

	/*
	 * We have a default IPv6 route, ensure that
	 * we do not have IPv4 default route before triggering
	 * the event
	 */
	rtfree(rt);
	rt = NULL;

	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);

	if (rt) {
		rtfree(rt);
		rt = NULL;
		apn_fallbk_log((LOG_INFO, "APN fallback notification found unscoped "
		    "IPv4 default route!\n"));
		return FALSE;
	}

	{
		/*
		 * We disable APN fallback if the binary is not a third-party app.
		 * Note that platform daemons use their process name as a
		 * bundle ID so we filter out bundle IDs without dots.
		 */
		const char *bundle_id = cs_identity_get(proc);
		if (bundle_id == NULL ||
		    bundle_id[0] == '\0' ||
		    strchr(bundle_id, '.') == NULL ||
		    strncmp(bundle_id, "com.apple.", sizeof("com.apple.") - 1) == 0) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found first-"
			    "party bundle ID \"%s\"!\n", (bundle_id ? bundle_id : "NULL")));
			return FALSE;
		}
	}

	{
		/*
		 * The Apple App Store IPv6 requirement started on
		 * June 1st, 2016 at 12:00:00 AM PDT.
		 * We disable APN fallback if the binary is more recent than that.
		 * We check both atime and birthtime since birthtime is not always supported.
		 */
		static const long ipv6_start_date = 1464764400L;
		vfs_context_t context;
		struct stat64 sb;
		int vn_stat_error;

		bzero(&sb, sizeof(struct stat64));
		context = vfs_context_create(NULL);
		/* Stat the process' executable vnode to read its timestamps */
		vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, 0, context);
		(void)vfs_context_rele(context);

		if (vn_stat_error != 0 ||
		    sb.st_atimespec.tv_sec >= ipv6_start_date ||
		    sb.st_birthtimespec.tv_sec >= ipv6_start_date) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found binary "
			    "too recent! (err %d atime %ld mtime %ld ctime %ld birthtime %ld)\n",
			    vn_stat_error, sb.st_atimespec.tv_sec, sb.st_mtimespec.tv_sec,
			    sb.st_ctimespec.tv_sec, sb.st_birthtimespec.tv_sec));
			return FALSE;
		}
	}
	return TRUE;
}
1370 
1371 static void
apn_fallback_trigger(proc_t proc,struct socket * so)1372 apn_fallback_trigger(proc_t proc, struct socket *so)
1373 {
1374 	pid_t pid = 0;
1375 	struct kev_msg ev_msg;
1376 	struct kev_netevent_apnfallbk_data apnfallbk_data;
1377 
1378 	last_apn_fallback = net_uptime();
1379 	pid = proc_pid(proc);
1380 	uuid_t application_uuid;
1381 	uuid_clear(application_uuid);
1382 	proc_getexecutableuuid(proc, application_uuid,
1383 	    sizeof(application_uuid));
1384 
1385 	bzero(&ev_msg, sizeof(struct kev_msg));
1386 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1387 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1388 	ev_msg.kev_subclass     = KEV_NETEVENT_SUBCLASS;
1389 	ev_msg.event_code       = KEV_NETEVENT_APNFALLBACK;
1390 
1391 	bzero(&apnfallbk_data, sizeof(apnfallbk_data));
1392 
1393 	if (so->so_flags & SOF_DELEGATED) {
1394 		apnfallbk_data.epid = so->e_pid;
1395 		uuid_copy(apnfallbk_data.euuid, so->e_uuid);
1396 	} else {
1397 		apnfallbk_data.epid = so->last_pid;
1398 		uuid_copy(apnfallbk_data.euuid, so->last_uuid);
1399 	}
1400 
1401 	ev_msg.dv[0].data_ptr   = &apnfallbk_data;
1402 	ev_msg.dv[0].data_length = sizeof(apnfallbk_data);
1403 	kev_post_msg(&ev_msg);
1404 	apn_fallbk_log((LOG_INFO, "APN fallback notification issued.\n"));
1405 }
1406 
1407 /*
1408  * Transform old in_pcbconnect() into an inner subroutine for new
1409  * in_pcbconnect(); do some validity-checking on the remote address
1410  * (in "nam") and then determine local host address (i.e., which
1411  * interface) to use to access that remote host.
1412  *
1413  * This routine may alter the caller-supplied remote address "nam".
1414  *
1415  * The caller may override the bound-to-interface setting of the socket
1416  * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1417  *
1418  * This routine might return an ifp with a reference held if the caller
1419  * provides a non-NULL outif, even in the error case.  The caller is
1420  * responsible for releasing its reference.
1421  *
1422  * Returns:	0			Success
1423  *		EINVAL			Invalid argument
1424  *		EAFNOSUPPORT		Address family not supported
1425  *		EADDRNOTAVAIL		Address not available
1426  */
1427 int
in_pcbladdr(struct inpcb * inp,struct sockaddr * nam,struct in_addr * laddr,unsigned int ifscope,struct ifnet ** outif,int raw)1428 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr,
1429     unsigned int ifscope, struct ifnet **outif, int raw)
1430 {
1431 	struct route *ro = &inp->inp_route;
1432 	struct in_ifaddr *ia = NULL;
1433 	struct sockaddr_in sin;
1434 	int error = 0;
1435 	boolean_t restricted = FALSE;
1436 
1437 	if (outif != NULL) {
1438 		*outif = NULL;
1439 	}
1440 	if (nam->sa_len != sizeof(struct sockaddr_in)) {
1441 		return EINVAL;
1442 	}
1443 	if (SIN(nam)->sin_family != AF_INET) {
1444 		return EAFNOSUPPORT;
1445 	}
1446 	if (raw == 0 && SIN(nam)->sin_port == 0) {
1447 		return EADDRNOTAVAIL;
1448 	}
1449 
1450 	/*
1451 	 * If the destination address is INADDR_ANY,
1452 	 * use the primary local address.
1453 	 * If the supplied address is INADDR_BROADCAST,
1454 	 * and the primary interface supports broadcast,
1455 	 * choose the broadcast address for that interface.
1456 	 */
1457 	if (raw == 0 && (SIN(nam)->sin_addr.s_addr == INADDR_ANY ||
1458 	    SIN(nam)->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST)) {
1459 		lck_rw_lock_shared(&in_ifaddr_rwlock);
1460 		if (!TAILQ_EMPTY(&in_ifaddrhead)) {
1461 			ia = TAILQ_FIRST(&in_ifaddrhead);
1462 			IFA_LOCK_SPIN(&ia->ia_ifa);
1463 			if (SIN(nam)->sin_addr.s_addr == INADDR_ANY) {
1464 				SIN(nam)->sin_addr = IA_SIN(ia)->sin_addr;
1465 			} else if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
1466 				SIN(nam)->sin_addr =
1467 				    SIN(&ia->ia_broadaddr)->sin_addr;
1468 			}
1469 			IFA_UNLOCK(&ia->ia_ifa);
1470 			ia = NULL;
1471 		}
1472 		lck_rw_done(&in_ifaddr_rwlock);
1473 	}
1474 	/*
1475 	 * Otherwise, if the socket has already bound the source, just use it.
1476 	 */
1477 	if (inp->inp_laddr.s_addr != INADDR_ANY) {
1478 		VERIFY(ia == NULL);
1479 		*laddr = inp->inp_laddr;
1480 		return 0;
1481 	}
1482 
1483 	/*
1484 	 * If the ifscope is specified by the caller (e.g. IP_PKTINFO)
1485 	 * then it overrides the sticky ifscope set for the socket.
1486 	 */
1487 	if (ifscope == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) {
1488 		ifscope = inp->inp_boundifp->if_index;
1489 	}
1490 
1491 	/*
1492 	 * If route is known or can be allocated now,
1493 	 * our src addr is taken from the i/f, else punt.
1494 	 * Note that we should check the address family of the cached
1495 	 * destination, in case of sharing the cache with IPv6.
1496 	 */
1497 	if (ro->ro_rt != NULL) {
1498 		RT_LOCK_SPIN(ro->ro_rt);
1499 	}
1500 	if (ROUTE_UNUSABLE(ro) || ro->ro_dst.sa_family != AF_INET ||
1501 	    SIN(&ro->ro_dst)->sin_addr.s_addr != SIN(nam)->sin_addr.s_addr ||
1502 	    (inp->inp_socket->so_options & SO_DONTROUTE)) {
1503 		if (ro->ro_rt != NULL) {
1504 			RT_UNLOCK(ro->ro_rt);
1505 		}
1506 		ROUTE_RELEASE(ro);
1507 	}
1508 	if (!(inp->inp_socket->so_options & SO_DONTROUTE) &&
1509 	    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
1510 		if (ro->ro_rt != NULL) {
1511 			RT_UNLOCK(ro->ro_rt);
1512 		}
1513 		ROUTE_RELEASE(ro);
1514 		/* No route yet, so try to acquire one */
1515 		bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
1516 		ro->ro_dst.sa_family = AF_INET;
1517 		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1518 		SIN(&ro->ro_dst)->sin_addr = SIN(nam)->sin_addr;
1519 		rtalloc_scoped(ro, ifscope);
1520 		if (ro->ro_rt != NULL) {
1521 			RT_LOCK_SPIN(ro->ro_rt);
1522 		}
1523 	}
1524 	/* Sanitized local copy for interface address searches */
1525 	bzero(&sin, sizeof(sin));
1526 	sin.sin_family = AF_INET;
1527 	sin.sin_len = sizeof(struct sockaddr_in);
1528 	sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
1529 	/*
1530 	 * If we did not find (or use) a route, assume dest is reachable
1531 	 * on a directly connected network and try to find a corresponding
1532 	 * interface to take the source address from.
1533 	 */
1534 	if (ro->ro_rt == NULL) {
1535 		proc_t proc = current_proc();
1536 
1537 		VERIFY(ia == NULL);
1538 		ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
1539 		if (ia == NULL) {
1540 			ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
1541 		}
1542 		error = ((ia == NULL) ? ENETUNREACH : 0);
1543 
1544 		if (apn_fallback_required(proc, inp->inp_socket,
1545 		    (void *)nam)) {
1546 			apn_fallback_trigger(proc, inp->inp_socket);
1547 		}
1548 
1549 		goto done;
1550 	}
1551 	RT_LOCK_ASSERT_HELD(ro->ro_rt);
1552 	/*
1553 	 * If the outgoing interface on the route found is not
1554 	 * a loopback interface, use the address from that interface.
1555 	 */
1556 	if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
1557 		VERIFY(ia == NULL);
1558 		/*
1559 		 * If the route points to a cellular interface and the
1560 		 * caller forbids our using interfaces of such type,
1561 		 * pretend that there is no route.
1562 		 * Apply the same logic for expensive interfaces.
1563 		 */
1564 		if (inp_restricted_send(inp, ro->ro_rt->rt_ifp)) {
1565 			RT_UNLOCK(ro->ro_rt);
1566 			ROUTE_RELEASE(ro);
1567 			error = EHOSTUNREACH;
1568 			restricted = TRUE;
1569 		} else {
1570 			/* Become a regular mutex */
1571 			RT_CONVERT_LOCK(ro->ro_rt);
1572 			ia = ifatoia(ro->ro_rt->rt_ifa);
1573 			IFA_ADDREF(&ia->ia_ifa);
1574 
1575 			/*
1576 			 * Mark the control block for notification of
1577 			 * a possible flow that might undergo clat46
1578 			 * translation.
1579 			 *
1580 			 * We defer the decision to a later point when
1581 			 * inpcb is being disposed off.
1582 			 * The reason is that we only want to send notification
1583 			 * if the flow was ever used to send data.
1584 			 */
1585 			if (IS_INTF_CLAT46(ro->ro_rt->rt_ifp)) {
1586 				inp->inp_flags2 |= INP2_CLAT46_FLOW;
1587 			}
1588 
1589 			RT_UNLOCK(ro->ro_rt);
1590 			error = 0;
1591 		}
1592 		goto done;
1593 	}
1594 	VERIFY(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK);
1595 	RT_UNLOCK(ro->ro_rt);
1596 	/*
1597 	 * The outgoing interface is marked with 'loopback net', so a route
1598 	 * to ourselves is here.
1599 	 * Try to find the interface of the destination address and then
1600 	 * take the address from there. That interface is not necessarily
1601 	 * a loopback interface.
1602 	 */
1603 	VERIFY(ia == NULL);
1604 	ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
1605 	if (ia == NULL) {
1606 		ia = ifatoia(ifa_ifwithaddr_scoped(SA(&sin), ifscope));
1607 	}
1608 	if (ia == NULL) {
1609 		ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
1610 	}
1611 	if (ia == NULL) {
1612 		RT_LOCK(ro->ro_rt);
1613 		ia = ifatoia(ro->ro_rt->rt_ifa);
1614 		if (ia != NULL) {
1615 			IFA_ADDREF(&ia->ia_ifa);
1616 		}
1617 		RT_UNLOCK(ro->ro_rt);
1618 	}
1619 	error = ((ia == NULL) ? ENETUNREACH : 0);
1620 
1621 done:
1622 	/*
1623 	 * If the destination address is multicast and an outgoing
1624 	 * interface has been set as a multicast option, use the
1625 	 * address of that interface as our source address.
1626 	 */
1627 	if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
1628 	    inp->inp_moptions != NULL) {
1629 		struct ip_moptions *imo;
1630 		struct ifnet *ifp;
1631 
1632 		imo = inp->inp_moptions;
1633 		IMO_LOCK(imo);
1634 		if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
1635 		    ia->ia_ifp != imo->imo_multicast_ifp)) {
1636 			ifp = imo->imo_multicast_ifp;
1637 			if (ia != NULL) {
1638 				IFA_REMREF(&ia->ia_ifa);
1639 			}
1640 			lck_rw_lock_shared(&in_ifaddr_rwlock);
1641 			TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
1642 				if (ia->ia_ifp == ifp) {
1643 					break;
1644 				}
1645 			}
1646 			if (ia != NULL) {
1647 				IFA_ADDREF(&ia->ia_ifa);
1648 			}
1649 			lck_rw_done(&in_ifaddr_rwlock);
1650 			if (ia == NULL) {
1651 				error = EADDRNOTAVAIL;
1652 			} else {
1653 				error = 0;
1654 			}
1655 		}
1656 		IMO_UNLOCK(imo);
1657 	}
1658 	/*
1659 	 * Don't do pcblookup call here; return interface in laddr
1660 	 * and exit to caller, that will do the lookup.
1661 	 */
1662 	if (ia != NULL) {
1663 		/*
1664 		 * If the source address belongs to a cellular interface
1665 		 * and the socket forbids our using interfaces of such
1666 		 * type, pretend that there is no source address.
1667 		 * Apply the same logic for expensive interfaces.
1668 		 */
1669 		IFA_LOCK_SPIN(&ia->ia_ifa);
1670 		if (inp_restricted_send(inp, ia->ia_ifa.ifa_ifp)) {
1671 			IFA_UNLOCK(&ia->ia_ifa);
1672 			error = EHOSTUNREACH;
1673 			restricted = TRUE;
1674 		} else if (error == 0) {
1675 			*laddr = ia->ia_addr.sin_addr;
1676 			if (outif != NULL) {
1677 				struct ifnet *ifp;
1678 
1679 				if (ro->ro_rt != NULL) {
1680 					ifp = ro->ro_rt->rt_ifp;
1681 				} else {
1682 					ifp = ia->ia_ifp;
1683 				}
1684 
1685 				VERIFY(ifp != NULL);
1686 				IFA_CONVERT_LOCK(&ia->ia_ifa);
1687 				ifnet_reference(ifp);   /* for caller */
1688 				if (*outif != NULL) {
1689 					ifnet_release(*outif);
1690 				}
1691 				*outif = ifp;
1692 			}
1693 			IFA_UNLOCK(&ia->ia_ifa);
1694 		} else {
1695 			IFA_UNLOCK(&ia->ia_ifa);
1696 		}
1697 		IFA_REMREF(&ia->ia_ifa);
1698 		ia = NULL;
1699 	}
1700 
1701 	if (restricted && error == EHOSTUNREACH) {
1702 		soevent(inp->inp_socket, (SO_FILT_HINT_LOCKED |
1703 		    SO_FILT_HINT_IFDENIED));
1704 	}
1705 
1706 	return error;
1707 }
1708 
1709 /*
1710  * Outer subroutine:
1711  * Connect from a socket to a specified address.
1712  * Both address and port must be specified in argument sin.
1713  * If don't have a local address for this socket yet,
1714  * then pick one.
1715  *
1716  * The caller may override the bound-to-interface setting of the socket
1717  * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1718  */
1719 int
in_pcbconnect(struct inpcb * inp,struct sockaddr * nam,struct proc * p,unsigned int ifscope,struct ifnet ** outif)1720 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
1721     unsigned int ifscope, struct ifnet **outif)
1722 {
1723 	struct in_addr laddr;
1724 	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
1725 	struct inpcb *pcb;
1726 	int error;
1727 	struct socket *so = inp->inp_socket;
1728 
1729 #if CONTENT_FILTER
1730 	if (so) {
1731 		so->so_state_change_cnt++;
1732 	}
1733 #endif
1734 
1735 	/*
1736 	 *   Call inner routine, to assign local interface address.
1737 	 */
1738 	if ((error = in_pcbladdr(inp, nam, &laddr, ifscope, outif, 0)) != 0) {
1739 		return error;
1740 	}
1741 
1742 	socket_unlock(so, 0);
1743 	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
1744 	    inp->inp_laddr.s_addr ? inp->inp_laddr : laddr,
1745 	    inp->inp_lport, 0, NULL);
1746 	socket_lock(so, 0);
1747 
1748 	/*
1749 	 * Check if the socket is still in a valid state. When we unlock this
1750 	 * embryonic socket, it can get aborted if another thread is closing
1751 	 * the listener (radar 7947600).
1752 	 */
1753 	if ((so->so_flags & SOF_ABORTED) != 0) {
1754 		return ECONNREFUSED;
1755 	}
1756 
1757 	if (pcb != NULL) {
1758 		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
1759 		return EADDRINUSE;
1760 	}
1761 	if (inp->inp_laddr.s_addr == INADDR_ANY) {
1762 		if (inp->inp_lport == 0) {
1763 			error = in_pcbbind(inp, NULL, p);
1764 			if (error) {
1765 				return error;
1766 			}
1767 		}
1768 		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
1769 			/*
1770 			 * Lock inversion issue, mostly with udp
1771 			 * multicast packets.
1772 			 */
1773 			socket_unlock(so, 0);
1774 			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
1775 			socket_lock(so, 0);
1776 		}
1777 		inp->inp_laddr = laddr;
1778 		/* no reference needed */
1779 		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
1780 #if SKYWALK
1781 		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
1782 			netns_set_ifnet(&inp->inp_netns_token,
1783 			    inp->inp_last_outifp);
1784 		}
1785 #endif /* SKYWALK */
1786 		inp->inp_flags |= INP_INADDR_ANY;
1787 	} else {
1788 		/*
1789 		 * Usage of IP_PKTINFO, without local port already
1790 		 * speficified will cause kernel to panic,
1791 		 * see rdar://problem/18508185.
1792 		 * For now returning error to avoid a kernel panic
1793 		 * This routines can be refactored and handle this better
1794 		 * in future.
1795 		 */
1796 		if (inp->inp_lport == 0) {
1797 			return EINVAL;
1798 		}
1799 		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
1800 			/*
1801 			 * Lock inversion issue, mostly with udp
1802 			 * multicast packets.
1803 			 */
1804 			socket_unlock(so, 0);
1805 			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
1806 			socket_lock(so, 0);
1807 		}
1808 	}
1809 	inp->inp_faddr = sin->sin_addr;
1810 	inp->inp_fport = sin->sin_port;
1811 	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
1812 		nstat_pcb_invalidate_cache(inp);
1813 	}
1814 	in_pcbrehash(inp);
1815 	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
1816 	return 0;
1817 }
1818 
/*
 * Disconnect an inpcb: clear the foreign address/port, rehash the PCB
 * under the pcbinfo lock, and detach the PCB when the socket holds no
 * remaining file-descriptor reference (unless it is an MPTCP subflow).
 */
void
in_pcbdisconnect(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* Cache UDP flow statistics for nstat before the tuple changes */
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		nstat_pcb_cache(inp);
	}

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

#if CONTENT_FILTER
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(so, 0);
		lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
		socket_lock(so, 0);
	}

	/* Re-hash the PCB now that its foreign tuple is wildcard again */
	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	/*
	 * A multipath subflow socket would have its SS_NOFDREF set by default,
	 * so check for SOF_MP_SUBFLOW socket flag before detaching the PCB;
	 * when the socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF)) {
		in_pcbdetach(inp);
	}
}
1855 
/*
 * Detach an inpcb from its socket: drop IPSec policies, cached routes
 * and multicast options, release the socket's last reference via
 * sofreelastref(), mark the PCB dead, and schedule it for garbage
 * collection.  Panics if called on an already-disposed PCB.
 */
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == NULL) {
		/* PCB has been disposed */
		panic("%s: inp=%p so=%p proto=%d so_pcb is null!", __func__,
		    inp, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

#if IPSEC
	if (inp->inp_sp != NULL) {
		(void) ipsec4_delete_pcbpolicy(inp);
	}
#endif /* IPSEC */

	/* Count UDP sockets that never moved any data, for net API stats */
	if (inp->inp_stat != NULL && SOCK_PROTO(so) == IPPROTO_UDP) {
		if (inp->inp_stat->rxpackets == 0 && inp->inp_stat->txpackets == 0) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_no_data);
		}
	}

	/*
	 * Let NetworkStatistics know this PCB is going away
	 * before we detach it.
	 */
	if (nstat_collect &&
	    (SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP)) {
		nstat_pcb_detach(inp);
	}

	/* Free memory buffer held for generating keep alives */
	if (inp->inp_keepalive_data != NULL) {
		kfree_data(inp->inp_keepalive_data, inp->inp_keepalive_datalen);
		inp->inp_keepalive_data = NULL;
	}

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING) {
		panic("%s: so=%p proto=%d couldn't set to STOPUSING",
		    __func__, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

	if (!(so->so_flags & SOF_PCBCLEARING)) {
		struct ip_moptions *imo;

		inp->inp_vflag = 0;
		if (inp->inp_options != NULL) {
			(void) m_free(inp->inp_options);
			inp->inp_options = NULL;
		}
		ROUTE_RELEASE(&inp->inp_route);
		imo = inp->inp_moptions;
		if (imo != NULL) {
			IMO_REMREF(imo);
		}
		inp->inp_moptions = NULL;
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;

		/*
		 * Enqueue an event to send kernel event notification
		 * if the flow has to CLAT46 for data packets
		 */
		if (inp->inp_flags2 & INP2_CLAT46_FLOW) {
			/*
			 * If there has been any exchange of data bytes
			 * over this flow.
			 * Schedule a notification to report that flow is
			 * using client side translation.
			 */
			if (inp->inp_stat != NULL &&
			    (inp->inp_stat->txbytes != 0 ||
			    inp->inp_stat->rxbytes != 0)) {
				/* Attribute the event to the effective owner */
				if (so->so_flags & SOF_DELEGATED) {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->e_pid,
						so->e_uuid);
				} else {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->last_pid,
						so->last_uuid);
				}
			}
		}

		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;

		inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST);
	}
}
1953 
1954 
/*
 * in_pcbdispose: final disposal of a PCB (and its socket, if any).
 *
 * Sanity-checks that the socket has no remaining use count and that the
 * inp wantcnt has reached WNT_STOPUSING, unlinks the inp from all lists,
 * severs the inp <-> socket linkage, and frees both structures.
 *
 * Caller must hold ipi_lock exclusively (asserted below).
 */
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

	/* It is a bug to dispose a PCB whose socket is still referenced */
	if (so != NULL && so->so_usecount != 0) {
		panic("%s: so %p [%d,%d] usecount %d lockhistory %s",
		    __func__, so, SOCK_DOM(so), SOCK_TYPE(so), so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		if (so != NULL) {
			panic_plain("%s: inp %p invalid wantcnt %d, so %p "
			    "[%d,%d] usecount %d retaincnt %d state 0x%x "
			    "flags 0x%x lockhistory %s\n", __func__, inp,
			    inp->inp_wantcnt, so, SOCK_DOM(so), SOCK_TYPE(so),
			    so->so_usecount, so->so_retaincnt, so->so_state,
			    so->so_flags, solockhistory_nr(so));
			/* NOTREACHED */
		} else {
			panic("%s: inp %p invalid wantcnt %d no socket",
			    __func__, inp, inp->inp_wantcnt);
			/* NOTREACHED */
		}
	}

	LCK_RW_ASSERT(&ipi->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so != NULL) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			if (so->so_rcv.sb_cc > 0 || so->so_snd.sb_cc > 0) {
				/*
				 * selthreadclear() already called
				 * during sofreelastref() above.
				 */
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL) {
				panic("%s: so=%p head still exist",
				    __func__, so);
				/* NOTREACHED */
			}
			/* Per-PCB mutex is done for; unlock then destroy it */
			lck_mtx_unlock(&inp->inpcb_mtx);

#if NECP
			necp_inpcb_remove_cb(inp);
#endif /* NECP */

			lck_mtx_destroy(&inp->inpcb_mtx, ipi->ipi_lock_grp);
		}
		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;
		so->so_saved_pcb = (caddr_t)inp;
		so->so_pcb = NULL;
		inp->inp_socket = NULL;
#if NECP
		necp_inpcb_dispose(inp);
#endif /* NECP */
		/*
		 * In case there a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		ROUTE_RELEASE(&inp->inp_route);
		/*
		 * When the inp is embedded in the socket-layer cache, it is
		 * reclaimed together with the socket rather than via zfree.
		 */
		if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
}
2032 
2033 /*
2034  * The calling convention of in_getsockaddr() and in_getpeeraddr() was
2035  * modified to match the pru_sockaddr() and pru_peeraddr() entry points
2036  * in struct pr_usrreqs, so that protocols can just reference then directly
2037  * without the need for a wrapper function.
2038  */
2039 int
in_getsockaddr(struct socket * so,struct sockaddr ** nam)2040 in_getsockaddr(struct socket *so, struct sockaddr **nam)
2041 {
2042 	struct inpcb *inp;
2043 	struct sockaddr_in *sin;
2044 
2045 	/*
2046 	 * Do the malloc first in case it blocks.
2047 	 */
2048 	sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2049 	    Z_WAITOK | Z_NOFAIL);
2050 
2051 	sin->sin_family = AF_INET;
2052 
2053 	if ((inp = sotoinpcb(so)) == NULL) {
2054 		free_sockaddr(sin);
2055 		return EINVAL;
2056 	}
2057 	sin->sin_port = inp->inp_lport;
2058 	sin->sin_addr = inp->inp_laddr;
2059 
2060 	*nam = (struct sockaddr *)sin;
2061 	return 0;
2062 }
2063 
2064 int
in_getsockaddr_s(struct socket * so,struct sockaddr_in * ss)2065 in_getsockaddr_s(struct socket *so, struct sockaddr_in *ss)
2066 {
2067 	struct sockaddr_in *sin = ss;
2068 	struct inpcb *inp;
2069 
2070 	VERIFY(ss != NULL);
2071 	bzero(ss, sizeof(*ss));
2072 
2073 	sin->sin_family = AF_INET;
2074 	sin->sin_len = sizeof(*sin);
2075 
2076 	if ((inp = sotoinpcb(so)) == NULL) {
2077 		return EINVAL;
2078 	}
2079 
2080 	sin->sin_port = inp->inp_lport;
2081 	sin->sin_addr = inp->inp_laddr;
2082 	return 0;
2083 }
2084 
2085 int
in_getpeeraddr(struct socket * so,struct sockaddr ** nam)2086 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
2087 {
2088 	struct inpcb *inp;
2089 	struct sockaddr_in *sin;
2090 
2091 	/*
2092 	 * Do the malloc first in case it blocks.
2093 	 */
2094 	sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2095 	    Z_WAITOK | Z_NOFAIL);
2096 
2097 	sin->sin_family = AF_INET;
2098 
2099 	if ((inp = sotoinpcb(so)) == NULL) {
2100 		free_sockaddr(sin);
2101 		return EINVAL;
2102 	}
2103 	sin->sin_port = inp->inp_fport;
2104 	sin->sin_addr = inp->inp_faddr;
2105 
2106 	*nam = (struct sockaddr *)sin;
2107 	return 0;
2108 }
2109 
/*
 * in_pcbnotifyall: invoke (*notify)(inp, errno) on every IPv4 PCB in
 * pcbinfo whose foreign address matches faddr.
 *
 * Walks the global inp list under a shared ipi_lock; each matching PCB
 * is pinned with WNT_ACQUIRE (skipping those already in STOPUSING) and
 * notified with its socket lock held.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		/* Only IPv4 PCBs are candidates */
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL) {
			continue;
		}
		/* Skip PCBs already marked for recycling */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			continue;
		}
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		/* Drop the reference taken above (socket lock held => locked=1) */
		(void) in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
}
2136 
2137 /*
2138  * Check for alternatives when higher level complains
2139  * about service problems.  For now, invalidate cached
2140  * routing information.  If the route was created dynamically
2141  * (by a redirect), time to try a default gateway again.
2142  */
void
in_losing(struct inpcb *inp)
{
	boolean_t release = FALSE;
	struct rtentry *rt;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia = NULL;

		RT_LOCK(rt);
		/* A redirect-created route gets deleted outright */
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			(void) rtrequest(RTM_DELETE, rt_key(rt),
			    rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if (inp->inp_laddr.s_addr != INADDR_ANY &&
		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			/*
			 * Address is around; ditch the route.  A new route
			 * can be allocated the next time output is attempted.
			 */
			release = TRUE;
		}
		/* ifa_foraddr returned a held reference; drop it */
		if (ia != NULL) {
			IFA_REMREF(&ia->ia_ifa);
		}
	}
	if (rt == NULL || release) {
		ROUTE_RELEASE(&inp->inp_route);
	}
}
2183 
2184 /*
2185  * After a routing change, flush old routing
2186  * and allocate a (hopefully) better one.
2187  */
2188 void
in_rtchange(struct inpcb * inp,int errno)2189 in_rtchange(struct inpcb *inp, int errno)
2190 {
2191 #pragma unused(errno)
2192 	boolean_t release = FALSE;
2193 	struct rtentry *rt;
2194 
2195 	if ((rt = inp->inp_route.ro_rt) != NULL) {
2196 		struct in_ifaddr *ia = NULL;
2197 
2198 		/* if address is gone, keep the old route */
2199 		if (inp->inp_laddr.s_addr != INADDR_ANY &&
2200 		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
2201 			/*
2202 			 * Address is around; ditch the route.  A new route
2203 			 * can be allocated the next time output is attempted.
2204 			 */
2205 			release = TRUE;
2206 		}
2207 		if (ia != NULL) {
2208 			IFA_REMREF(&ia->ia_ifa);
2209 		}
2210 	}
2211 	if (rt == NULL || release) {
2212 		ROUTE_RELEASE(&inp->inp_route);
2213 	}
2214 }
2215 
2216 /*
2217  * Lookup a PCB based on the local address and port.
2218  */
/*
 * With wild_okay == 0, only an exact { laddr, lport } match whose
 * foreign side is unconnected (INADDR_ANY) is returned.  Otherwise a
 * best-fit search over the port hash is performed: each candidate is
 * scored by its number of wildcard components (0..2) and the lowest
 * score wins; a score of 0 ends the search early.
 *
 * NOTE(review): no pcbinfo lock is taken here; presumably callers hold
 * ipi_lock -- confirm at call sites.
 */
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    unsigned int lport_arg, int wild_okay)
{
	struct inpcb *inp;
	int matchwild = 3, wildcard;	/* matchwild: best (lowest) score so far */
	u_short lport = (u_short)lport_arg;

	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0, 0, 0, 0, 0);

	if (!wild_okay) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
		    pcbinfo->ipi_hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
			if (!(inp->inp_vflag & INP_IPV4)) {
				continue;
			}
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found.
				 * NOTE(review): this return path does not emit
				 * the DBG_FUNC_END trace point.
				 */
				return inp;
			}
		}
		/*
		 * Not found.
		 */
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0, 0, 0, 0, 0);
		return NULL;
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport) {
				break;
			}
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.
			 */
			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (!(inp->inp_vflag & INP_IPV4)) {
					continue;
				}
				/* Connected foreign side counts as one wildcard miss */
				if (inp->inp_faddr.s_addr != INADDR_ANY) {
					wildcard++;
				}
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY) {
						wildcard++;
					} else if (inp->inp_laddr.s_addr !=
					    laddr.s_addr) {
						/* Bound to a different local address */
						continue;
					}
				} else {
					if (laddr.s_addr != INADDR_ANY) {
						wildcard++;
					}
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0) {
						/* Perfect match; stop searching */
						break;
					}
				}
			}
		}
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,
		    0, 0, 0, 0);
		return match;
	}
}
2311 
2312 /*
2313  * Check if PCB exists in hash list.
2314  */
/*
 * Returns non-zero if a matching PCB with a socket exists, and reports
 * the owning socket's uid/gid through *uid / *gid (UID_MAX/GID_MAX when
 * no match).  Exact { faddr, fport, laddr, lport } matches are preferred;
 * with wildcard != 0, unconnected listeners are considered next, favoring
 * a PCB bound to laddr over one bound to INADDR_ANY, and an IPv4 listener
 * over an IPv6 (mapped) one.
 */
int
in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
    uid_t *uid, gid_t *gid, struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
	int found = 0;
	struct inpcb *local_wild = NULL;
	struct inpcb *local_wild_mapped = NULL;

	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Honor interface-scoped receive restrictions */
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
					inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					inp->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
	}

	if (!wildcard) {
		/*
		 * Not found.
		 */
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}

	/* Second pass: unconnected listeners hashed under INADDR_ANY */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == lport) {
			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Listener bound to our local address wins outright */
				if ((found = (inp->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
						inp->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
						inp->inp_socket->so_cred);
				}
				lck_rw_done(&pcbinfo->ipi_lock);
				return found;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/* Remember wildcard-bound candidates for later */
				if (inp->inp_socket &&
				    SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
					local_wild_mapped = inp;
				} else {
					local_wild = inp;
				}
			}
		}
	}
	if (local_wild == NULL) {
		/* Fall back to an IPv6 socket accepting mapped v4, if any */
		if (local_wild_mapped != NULL) {
			if ((found = (local_wild_mapped->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
					local_wild_mapped->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					local_wild_mapped->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}
	if ((found = (local_wild->inp_socket != NULL))) {
		*uid = kauth_cred_getuid(
			local_wild->inp_socket->so_cred);
		*gid = kauth_cred_getgid(
			local_wild->inp_socket->so_cred);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
	return found;
}
2441 
2442 /*
2443  * Lookup PCB in hash list.
2444  */
/*
 * Returns the matching PCB with a WNT_ACQUIRE reference taken (caller
 * must eventually WNT_RELEASE it), or NULL when no usable match exists.
 * A PCB already in STOPUSING state is treated as not found.  Match
 * preference mirrors in_pcblookup_hash_exists(): exact 4-tuple first,
 * then (with wildcard != 0) listeners bound to laddr, then wildcard
 * listeners, preferring IPv4 over IPv6 mapped.
 */
struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
    struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
	struct inpcb *local_wild = NULL;
	struct inpcb *local_wild_mapped = NULL;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Honor interface-scoped receive restrictions */
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * Found.
			 */
			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
			    WNT_STOPUSING) {
				lck_rw_done(&pcbinfo->ipi_lock);
				return inp;
			} else {
				/* it's there but dead, say it isn't found */
				lck_rw_done(&pcbinfo->ipi_lock);
				return NULL;
			}
		}
	}

	if (!wildcard) {
		/*
		 * Not found.
		 */
		lck_rw_done(&pcbinfo->ipi_lock);
		return NULL;
	}

	/* Second pass: unconnected listeners hashed under INADDR_ANY */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == lport) {
			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Listener bound to our local address wins outright */
				if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
				    WNT_STOPUSING) {
					lck_rw_done(&pcbinfo->ipi_lock);
					return inp;
				} else {
					/* it's dead; say it isn't found */
					lck_rw_done(&pcbinfo->ipi_lock);
					return NULL;
				}
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/* Remember wildcard-bound candidates for later */
				if (SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
					local_wild_mapped = inp;
				} else {
					local_wild = inp;
				}
			}
		}
	}
	if (local_wild == NULL) {
		/* Fall back to an IPv6 socket accepting mapped v4, if any */
		if (local_wild_mapped != NULL) {
			if (in_pcb_checkstate(local_wild_mapped,
			    WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				lck_rw_done(&pcbinfo->ipi_lock);
				return local_wild_mapped;
			} else {
				/* it's dead; say it isn't found */
				lck_rw_done(&pcbinfo->ipi_lock);
				return NULL;
			}
		}
		lck_rw_done(&pcbinfo->ipi_lock);
		return NULL;
	}
	if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
		lck_rw_done(&pcbinfo->ipi_lock);
		return local_wild;
	}
	/*
	 * It's either not found or is already dead.
	 */
	lck_rw_done(&pcbinfo->ipi_lock);
	return NULL;
}
2570 
2571 /*
2572  * @brief	Insert PCB onto various hash lists.
2573  *
2574  * @param	inp Pointer to internet protocol control block
2575  * @param	locked	Implies if ipi_lock (protecting pcb list)
2576  *              is already locked or not.
2577  *
2578  * @return	int error on failure and 0 on success
2579  */
int
in_pcbinshash(struct inpcb *inp, int locked)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;

	if (!locked) {
		if (!lck_rw_try_lock_exclusive(&pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets
			 */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
			socket_lock(inp->inp_socket, 0);
		}
	}

	/*
	 * This routine or its caller may have given up
	 * socket's protocol lock briefly.
	 * During that time the socket may have been dropped.
	 * Safe-guarding against that.
	 */
	if (inp->inp_state == INPCB_STATE_DEAD) {
		if (!locked) {
			lck_rw_done(&pcbinfo->ipi_lock);
		}
		return ECONNABORTED;
	}


	/* For v6 PCBs, hash on the low 32 bits of the foreign address */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, pcbinfo->ipi_hashmask);

	pcbhash = &pcbinfo->ipi_hashbase[inp->inp_hash_element];

	pcbporthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(inp->inp_lport,
	    pcbinfo->ipi_porthashmask)];

	/*
	 * Go through port list and look for a head for this lport.
	 */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport) {
			break;
		}
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = kalloc_type(struct inpcbport, Z_WAITOK | Z_NOFAIL);
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

#if SKYWALK
	/*
	 * Reserve the <proto, laddr, lport> tuple in the port namespace
	 * registrar so Skywalk cannot hand out the same port.
	 */
	int err;
	struct socket *so = inp->inp_socket;
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		if (inp->inp_vflag & INP_IPV6) {
			err = netns_reserve_in6(&inp->inp_netns_token,
			    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		} else {
			err = netns_reserve_in(&inp->inp_netns_token,
			    inp->inp_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		}
		if (err) {
			/* Reservation failed; bail out before touching the hash */
			if (!locked) {
				lck_rw_done(&pcbinfo->ipi_lock);
			}
			return err;
		}
		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */

	/* Link onto the port list and the connection hash chain */
	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

	if (!locked) {
		lck_rw_done(&pcbinfo->ipi_lock);
	}

#if NECP
	// This call catches the original setting of the local address
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */

	return 0;
}
2691 
2692 /*
2693  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2694  * changed. NOTE: This does not handle the case of the lport changing (the
2695  * hashed port list would have to be updated as well), so the lport must
2696  * not change after in_pcbinshash() has been called.
2697  */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

#if SKYWALK
	/*
	 * Keep the Skywalk port namespace reservation in sync with the
	 * (possibly changed) local address before moving hash buckets.
	 */
	struct socket *so = inp->inp_socket;
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		int err;
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			/* Existing reservation: just move it to the new laddr */
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_change_addr_in6(
					&inp->inp_netns_token, inp->in6p_laddr);
			} else {
				err = netns_change_addr_in(
					&inp->inp_netns_token, inp->inp_laddr);
			}
		} else {
			/* No reservation yet: take one for the current tuple */
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_reserve_in6(&inp->inp_netns_token,
				    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			} else {
				err = netns_reserve_in(&inp->inp_netns_token,
				    inp->inp_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			}
		}
		/* We are assuming that whatever code paths result in a rehash
		 * did their due diligence and ensured that the given
		 * <proto, laddr, lport> tuple was free ahead of time. Just
		 * reserving the lport on INADDR_ANY should be enough, since
		 * that will block Skywalk from trying to reserve that same
		 * port. Given this assumption, the above netns calls should
		 * never fail*/
		VERIFY(err == 0);

		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */
	/* For v6 PCBs, hash on the low 32 bits of the foreign address */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, inp->inp_pcbinfo->ipi_hashmask);
	head = &inp->inp_pcbinfo->ipi_hashbase[inp->inp_hash_element];

	/* Unlink from the old bucket (if hashed) before relinking */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		LIST_REMOVE(inp, inp_hash);
		inp->inp_flags2 &= ~INP2_INHASHLIST;
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));
	LIST_INSERT_HEAD(head, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

#if NECP
	// This call catches updates to the remote addresses
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */
}
2765 
2766 /*
2767  * Remove PCB from various lists.
 * Must be called with the pcbinfo lock held in exclusive mode.
2769  */
void
in_pcbremlists(struct inpcb *inp)
{
	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;

	/*
	 * Check if it's in hashlist -- an inp is placed in hashlist when
	 * its local port gets assigned. So it should also be present
	 * in the port list.
	 */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		VERIFY(phd != NULL && inp->inp_lport > 0);

		LIST_REMOVE(inp, inp_hash);
		inp->inp_hash.le_next = NULL;
		inp->inp_hash.le_prev = NULL;

		LIST_REMOVE(inp, inp_portlist);
		inp->inp_portlist.le_next = NULL;
		inp->inp_portlist.le_prev = NULL;
		/* Last PCB on this port: reclaim the port head as well */
		if (LIST_EMPTY(&phd->phd_pcblist)) {
			LIST_REMOVE(phd, phd_hash);
			kfree_type(struct inpcbport, phd);
		}
		inp->inp_phd = NULL;
		inp->inp_flags2 &= ~INP2_INHASHLIST;
#if SKYWALK
		/* Free up the port in the namespace registrar */
		netns_release(&inp->inp_netns_token);
		netns_release(&inp->inp_wildcard_netns_token);
#endif /* SKYWALK */
	}
	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

	if (inp->inp_flags2 & INP2_TIMEWAIT) {
		/* Remove from time-wait queue */
		tcp_remove_from_time_wait(inp);
		inp->inp_flags2 &= ~INP2_TIMEWAIT;
		VERIFY(inp->inp_pcbinfo->ipi_twcount != 0);
		inp->inp_pcbinfo->ipi_twcount--;
	} else {
		/* Remove from global inp list if it is not time-wait */
		LIST_REMOVE(inp, inp_list);
	}

	/* Drop any flow-control tree entry tied to this PCB */
	if (inp->inp_flags2 & INP2_IN_FCTREE) {
		inp_fc_getinp(inp->inp_flowhash, (INPFC_SOLOCKED | INPFC_REMOVE));
		VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE));
	}

	inp->inp_pcbinfo->ipi_count--;
}
2824 
2825 /*
2826  * Mechanism used to defer the memory release of PCBs
2827  * The pcb list will contain the pcb until the reaper can clean it up if
2828  * the following conditions are met:
2829  *	1) state "DEAD",
2830  *	2) wantcnt is STOPUSING
2831  *	3) usecount is 0
 * This function will be called to either mark the pcb as dead (STOPUSING)
 * or to manage its want-reference count (acquire/release).
2833  */
/*
 * Atomically manage inp_wantcnt via compare-and-swap.
 *
 * mode:
 *   WNT_STOPUSING -- mark the PCB DEAD and try to latch wantcnt to
 *                    0xffff (the STOPUSING sentinel); always returns
 *                    WNT_STOPUSING.
 *   WNT_ACQUIRE   -- take a want reference unless the PCB is already
 *                    latched; returns WNT_STOPUSING on failure.
 *   WNT_RELEASE   -- drop a want reference; if the PCB is DEAD, retry
 *                    the STOPUSING latch afterwards.
 *
 * locked: non-zero means the caller already holds the socket lock; when
 * zero, this routine takes/drops it (with refcount, hence the 1 arg).
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{
	volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {
	case WNT_STOPUSING:
		/*
		 * Try to mark the pcb as ready for recycling.  CAS with
		 * STOPUSING, if success we're good, if it's in use, will
		 * be marked later
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}
		pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}
		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}

		/* Ask the reaper to dispose of this PCB soon */
		inpcb_gc_sched(pcb->inp_pcbinfo, INPCB_TIMER_FAST);

		origwant = *wantcnt;
		if ((UInt16) origwant == 0xffff) { /* should stop using */
			return WNT_STOPUSING;
		}
		newwant = 0xffff;
		if ((UInt16) origwant == 0) {
			/* try to mark it as unusable now */
			OSCompareAndSwap(origwant, newwant, wantcnt);
		}
		return WNT_STOPUSING;

	case WNT_ACQUIRE:
		/*
		 * Try to increase reference to pcb.  If WNT_STOPUSING
		 * should bail out.  If socket state DEAD, try to set count
		 * to STOPUSING, return failed otherwise increase cnt.
		 */
		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				return WNT_STOPUSING;
			}
			newwant = origwant + 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
		return WNT_ACQUIRE;

	case WNT_RELEASE:
		/*
		 * Release reference.  If result is null and pcb state
		 * is DEAD, set wanted bit to STOPUSING
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}

		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0x0) {
				/* Releasing without a matching acquire is a bug */
				panic("%s: pcb=%p release with zero count",
				    __func__, pcb);
				/* NOTREACHED */
			}
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				if (locked == 0) {
					socket_unlock(pcb->inp_socket, 1);
				}
				return WNT_STOPUSING;
			}
			newwant = origwant - 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

		/* PCB died while referenced: retry the STOPUSING latch */
		if (pcb->inp_state == INPCB_STATE_DEAD) {
			goto stopusing;
		}
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: RELEASE pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}

		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}
		return WNT_RELEASE;

	default:
		panic("%s: so=%p not a valid state =%x", __func__,
		    pcb->inp_socket, mode);
		/* NOTREACHED */
	}

	/* NOTREACHED */
	return mode;
}
2941 
2942 /*
2943  * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
2944  * The inpcb_compat data structure is passed to user space and must
2945  * not change. We intentionally avoid copying pointers.
2946  */
2947 void
inpcb_to_compat(struct inpcb * inp,struct inpcb_compat * inp_compat)2948 inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat)
2949 {
2950 	bzero(inp_compat, sizeof(*inp_compat));
2951 	inp_compat->inp_fport = inp->inp_fport;
2952 	inp_compat->inp_lport = inp->inp_lport;
2953 	inp_compat->nat_owner = 0;
2954 	inp_compat->nat_cookie = 0;
2955 	inp_compat->inp_gencnt = inp->inp_gencnt;
2956 	inp_compat->inp_flags = inp->inp_flags;
2957 	inp_compat->inp_flow = inp->inp_flow;
2958 	inp_compat->inp_vflag = inp->inp_vflag;
2959 	inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
2960 	inp_compat->inp_ip_p = inp->inp_ip_p;
2961 	inp_compat->inp_dependfaddr.inp6_foreign =
2962 	    inp->inp_dependfaddr.inp6_foreign;
2963 	inp_compat->inp_dependladdr.inp6_local =
2964 	    inp->inp_dependladdr.inp6_local;
2965 	inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
2966 	inp_compat->inp_depend6.inp6_hlim = 0;
2967 	inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
2968 	inp_compat->inp_depend6.inp6_ifindex = 0;
2969 	inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
2970 }
2971 
2972 #if XNU_TARGET_OS_OSX
2973 void
inpcb_to_xinpcb64(struct inpcb * inp,struct xinpcb64 * xinp)2974 inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp)
2975 {
2976 	xinp->inp_fport = inp->inp_fport;
2977 	xinp->inp_lport = inp->inp_lport;
2978 	xinp->inp_gencnt = inp->inp_gencnt;
2979 	xinp->inp_flags = inp->inp_flags;
2980 	xinp->inp_flow = inp->inp_flow;
2981 	xinp->inp_vflag = inp->inp_vflag;
2982 	xinp->inp_ip_ttl = inp->inp_ip_ttl;
2983 	xinp->inp_ip_p = inp->inp_ip_p;
2984 	xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
2985 	xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
2986 	xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
2987 	xinp->inp_depend6.inp6_hlim = 0;
2988 	xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
2989 	xinp->inp_depend6.inp6_ifindex = 0;
2990 	xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
2991 }
2992 #endif /* XNU_TARGET_OS_OSX */
2993 
2994 /*
2995  * The following routines implement this scheme:
2996  *
2997  * Callers of ip_output() that intend to cache the route in the inpcb pass
2998  * a local copy of the struct route to ip_output().  Using a local copy of
2999  * the cached route significantly simplifies things as IP no longer has to
3000  * worry about having exclusive access to the passed in struct route, since
3001  * it's defined in the caller's stack; in essence, this allows for a lock-
3002  * less operation when updating the struct route at the IP level and below,
3003  * whenever necessary. The scheme works as follows:
3004  *
3005  * Prior to dropping the socket's lock and calling ip_output(), the caller
3006  * copies the struct route from the inpcb into its stack, and adds a reference
3007  * to the cached route entry, if there was any.  The socket's lock is then
3008  * dropped and ip_output() is called with a pointer to the copy of struct
3009  * route defined on the stack (not to the one in the inpcb.)
3010  *
3011  * Upon returning from ip_output(), the caller then acquires the socket's
3012  * lock and synchronizes the cache; if there is no route cached in the inpcb,
3013  * it copies the local copy of struct route (which may or may not contain any
3014  * route) back into the cache; otherwise, if the inpcb has a route cached in
3015  * it, the one in the local copy will be freed, if there's any.  Trashing the
3016  * cached route in the inpcb can be avoided because ip_output() is single-
3017  * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
3018  * by the socket/transport layer.)
3019  */
3020 void
inp_route_copyout(struct inpcb * inp,struct route * dst)3021 inp_route_copyout(struct inpcb *inp, struct route *dst)
3022 {
3023 	struct route *src = &inp->inp_route;
3024 
3025 	socket_lock_assert_owned(inp->inp_socket);
3026 
3027 	/*
3028 	 * If the route in the PCB is stale or not for IPv4, blow it away;
3029 	 * this is possible in the case of IPv4-mapped address case.
3030 	 */
3031 	if (ROUTE_UNUSABLE(src) || rt_key(src->ro_rt)->sa_family != AF_INET) {
3032 		ROUTE_RELEASE(src);
3033 	}
3034 
3035 	route_copyout(dst, src, sizeof(*dst));
3036 }
3037 
3038 void
inp_route_copyin(struct inpcb * inp,struct route * src)3039 inp_route_copyin(struct inpcb *inp, struct route *src)
3040 {
3041 	struct route *dst = &inp->inp_route;
3042 
3043 	socket_lock_assert_owned(inp->inp_socket);
3044 
3045 	/* Minor sanity check */
3046 	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
3047 		panic("%s: wrong or corrupted route: %p", __func__, src);
3048 	}
3049 
3050 	route_copyin(src, dst, sizeof(*src));
3051 }
3052 
3053 /*
3054  * Handler for setting IP_BOUND_IF/IPV6_BOUND_IF socket option.
3055  */
3056 int
inp_bindif(struct inpcb * inp,unsigned int ifscope,struct ifnet ** pifp)3057 inp_bindif(struct inpcb *inp, unsigned int ifscope, struct ifnet **pifp)
3058 {
3059 	struct ifnet *ifp = NULL;
3060 
3061 	ifnet_head_lock_shared();
3062 	if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
3063 	    (ifp = ifindex2ifnet[ifscope]) == NULL)) {
3064 		ifnet_head_done();
3065 		return ENXIO;
3066 	}
3067 	ifnet_head_done();
3068 
3069 	VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);
3070 
3071 	/*
3072 	 * A zero interface scope value indicates an "unbind".
3073 	 * Otherwise, take in whatever value the app desires;
3074 	 * the app may already know the scope (or force itself
3075 	 * to such a scope) ahead of time before the interface
3076 	 * gets attached.  It doesn't matter either way; any
3077 	 * route lookup from this point on will require an
3078 	 * exact match for the embedded interface scope.
3079 	 */
3080 	inp->inp_boundifp = ifp;
3081 	if (inp->inp_boundifp == NULL) {
3082 		inp->inp_flags &= ~INP_BOUND_IF;
3083 	} else {
3084 		inp->inp_flags |= INP_BOUND_IF;
3085 	}
3086 
3087 	/* Blow away any cached route in the PCB */
3088 	ROUTE_RELEASE(&inp->inp_route);
3089 
3090 	if (pifp != NULL) {
3091 		*pifp = ifp;
3092 	}
3093 
3094 	return 0;
3095 }
3096 
3097 /*
3098  * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3099  * as well as for setting PROC_UUID_NO_CELLULAR policy.
3100  */
3101 void
inp_set_nocellular(struct inpcb * inp)3102 inp_set_nocellular(struct inpcb *inp)
3103 {
3104 	inp->inp_flags |= INP_NO_IFT_CELLULAR;
3105 
3106 	/* Blow away any cached route in the PCB */
3107 	ROUTE_RELEASE(&inp->inp_route);
3108 }
3109 
3110 /*
3111  * Handler for clearing IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3112  * as well as for clearing PROC_UUID_NO_CELLULAR policy.
3113  */
3114 void
inp_clear_nocellular(struct inpcb * inp)3115 inp_clear_nocellular(struct inpcb *inp)
3116 {
3117 	struct socket *so = inp->inp_socket;
3118 
3119 	/*
3120 	 * SO_RESTRICT_DENY_CELLULAR socket restriction issued on the socket
3121 	 * has a higher precendence than INP_NO_IFT_CELLULAR.  Clear the flag
3122 	 * if and only if the socket is unrestricted.
3123 	 */
3124 	if (so != NULL && !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
3125 		inp->inp_flags &= ~INP_NO_IFT_CELLULAR;
3126 
3127 		/* Blow away any cached route in the PCB */
3128 		ROUTE_RELEASE(&inp->inp_route);
3129 	}
3130 }
3131 
3132 void
inp_set_noexpensive(struct inpcb * inp)3133 inp_set_noexpensive(struct inpcb *inp)
3134 {
3135 	inp->inp_flags2 |= INP2_NO_IFF_EXPENSIVE;
3136 
3137 	/* Blow away any cached route in the PCB */
3138 	ROUTE_RELEASE(&inp->inp_route);
3139 }
3140 
3141 void
inp_set_noconstrained(struct inpcb * inp)3142 inp_set_noconstrained(struct inpcb *inp)
3143 {
3144 	inp->inp_flags2 |= INP2_NO_IFF_CONSTRAINED;
3145 
3146 	/* Blow away any cached route in the PCB */
3147 	ROUTE_RELEASE(&inp->inp_route);
3148 }
3149 
3150 void
inp_set_awdl_unrestricted(struct inpcb * inp)3151 inp_set_awdl_unrestricted(struct inpcb *inp)
3152 {
3153 	inp->inp_flags2 |= INP2_AWDL_UNRESTRICTED;
3154 
3155 	/* Blow away any cached route in the PCB */
3156 	ROUTE_RELEASE(&inp->inp_route);
3157 }
3158 
3159 boolean_t
inp_get_awdl_unrestricted(struct inpcb * inp)3160 inp_get_awdl_unrestricted(struct inpcb *inp)
3161 {
3162 	return (inp->inp_flags2 & INP2_AWDL_UNRESTRICTED) ? TRUE : FALSE;
3163 }
3164 
3165 void
inp_clear_awdl_unrestricted(struct inpcb * inp)3166 inp_clear_awdl_unrestricted(struct inpcb *inp)
3167 {
3168 	inp->inp_flags2 &= ~INP2_AWDL_UNRESTRICTED;
3169 
3170 	/* Blow away any cached route in the PCB */
3171 	ROUTE_RELEASE(&inp->inp_route);
3172 }
3173 
3174 void
inp_set_intcoproc_allowed(struct inpcb * inp)3175 inp_set_intcoproc_allowed(struct inpcb *inp)
3176 {
3177 	inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;
3178 
3179 	/* Blow away any cached route in the PCB */
3180 	ROUTE_RELEASE(&inp->inp_route);
3181 }
3182 
3183 boolean_t
inp_get_intcoproc_allowed(struct inpcb * inp)3184 inp_get_intcoproc_allowed(struct inpcb *inp)
3185 {
3186 	return (inp->inp_flags2 & INP2_INTCOPROC_ALLOWED) ? TRUE : FALSE;
3187 }
3188 
3189 void
inp_clear_intcoproc_allowed(struct inpcb * inp)3190 inp_clear_intcoproc_allowed(struct inpcb *inp)
3191 {
3192 	inp->inp_flags2 &= ~INP2_INTCOPROC_ALLOWED;
3193 
3194 	/* Blow away any cached route in the PCB */
3195 	ROUTE_RELEASE(&inp->inp_route);
3196 }
3197 
3198 #if NECP
3199 /*
3200  * Called when PROC_UUID_NECP_APP_POLICY is set.
3201  */
3202 void
inp_set_want_app_policy(struct inpcb * inp)3203 inp_set_want_app_policy(struct inpcb *inp)
3204 {
3205 	inp->inp_flags2 |= INP2_WANT_APP_POLICY;
3206 }
3207 
3208 /*
3209  * Called when PROC_UUID_NECP_APP_POLICY is cleared.
3210  */
3211 void
inp_clear_want_app_policy(struct inpcb * inp)3212 inp_clear_want_app_policy(struct inpcb *inp)
3213 {
3214 	inp->inp_flags2 &= ~INP2_WANT_APP_POLICY;
3215 }
3216 #endif /* NECP */
3217 
3218 /*
3219  * Calculate flow hash for an inp, used by an interface to identify a
3220  * flow. When an interface provides flow control advisory, this flow
3221  * hash is used as an identifier.
3222  */
u_int32_t
inp_calc_flowhash(struct inpcb *inp)
{
#if SKYWALK

	uint32_t flowid;
	struct flowidns_flow_key fk;

	bzero(&fk, sizeof(fk));

	/* Build the flow key from the PCB's address/port/protocol tuple */
	if (inp->inp_vflag & INP_IPV4) {
		fk.ffk_af = AF_INET;
		fk.ffk_laddr_v4 = inp->inp_laddr;
		fk.ffk_raddr_v4 = inp->inp_faddr;
	} else {
		fk.ffk_af = AF_INET6;
		fk.ffk_laddr_v6 = inp->in6p_laddr;
		fk.ffk_raddr_v6 = inp->in6p_faddr;
		/* clear embedded scope ID */
		if (IN6_IS_SCOPE_EMBED(&fk.ffk_laddr_v6)) {
			fk.ffk_laddr_v6.s6_addr16[1] = 0;
		}
		if (IN6_IS_SCOPE_EMBED(&fk.ffk_raddr_v6)) {
			fk.ffk_raddr_v6.s6_addr16[1] = 0;
		}
	}

	fk.ffk_lport = inp->inp_lport;
	fk.ffk_rport = inp->inp_fport;
	/* fall back to the socket's protocol when inp_ip_p is unset */
	fk.ffk_proto = (inp->inp_ip_p != 0) ? inp->inp_ip_p :
	    (uint8_t)SOCK_PROTO(inp->inp_socket);
	flowidns_allocate_flowid(FLOWIDNS_DOMAIN_INPCB, &fk, &flowid);
	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	/* the PCB must not already have a flowhash or be in the tree */
	ASSERT(inp->inp_flowhash == 0);
	ASSERT((inp->inp_flags2 & INP2_IN_FCTREE) == 0);
	inp->inp_flowhash = flowid;
	VERIFY(RB_INSERT(inp_fc_tree, &inp_fc_tree, inp) == NULL);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowid;

#else /* !SKYWALK */

	struct inp_flowhash_key fh __attribute__((aligned(8)));
	u_int32_t flowhash = 0;
	struct inpcb *tmp_inp = NULL;

	/* lazily seed the hash on first use */
	if (inp_hash_seed == 0) {
		inp_hash_seed = RandomULong();
	}

	bzero(&fh, sizeof(fh));

	/* Hash over addresses, ports, address family and protocol */
	bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof(fh.infh_laddr));
	bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof(fh.infh_faddr));

	fh.infh_lport = inp->inp_lport;
	fh.infh_fport = inp->inp_fport;
	fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
	fh.infh_proto = inp->inp_ip_p;
	fh.infh_rand1 = RandomULong();
	fh.infh_rand2 = RandomULong();

try_again:
	flowhash = net_flowhash(&fh, sizeof(fh), inp_hash_seed);
	if (flowhash == 0) {
		/* try to get a non-zero flowhash */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	inp->inp_flowhash = flowhash;

	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	tmp_inp = RB_FIND(inp_fc_tree, &inp_fc_tree, inp);
	if (tmp_inp != NULL) {
		/*
		 * There is a different inp with the same flowhash.
		 * There can be a collision on flow hash but the
		 * probability is low.  Let's recompute the
		 * flowhash.
		 */
		lck_mtx_unlock(&inp_fc_lck);
		/* recompute hash seed */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	RB_INSERT(inp_fc_tree, &inp_fc_tree, inp);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowhash;

#endif /* !SKYWALK */
}
3322 
3323 void
inp_flowadv(uint32_t flowhash)3324 inp_flowadv(uint32_t flowhash)
3325 {
3326 	struct inpcb *inp;
3327 
3328 	inp = inp_fc_getinp(flowhash, 0);
3329 
3330 	if (inp == NULL) {
3331 		return;
3332 	}
3333 	inp_fc_feedback(inp);
3334 }
3335 
3336 /*
3337  * Function to compare inp_fc_entries in inp flow control tree
3338  */
3339 static inline int
infc_cmp(const struct inpcb * inp1,const struct inpcb * inp2)3340 infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2)
3341 {
3342 	return memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash),
3343 	           sizeof(inp1->inp_flowhash));
3344 }
3345 
/*
 * Look up a PCB in the flow control tree by flow hash.
 *
 * With INPFC_REMOVE, the PCB is taken out of the tree and NULL is
 * returned.  Otherwise a want reference is taken via WNT_ACQUIRE and
 * the PCB is returned, or NULL if it is being torn down.  INPFC_SOLOCKED
 * indicates the caller already holds the socket lock.
 */
static struct inpcb *
inp_fc_getinp(u_int32_t flowhash, u_int32_t flags)
{
	struct inpcb *inp = NULL;
	int locked = (flags & INPFC_SOLOCKED) ? 1 : 0;

	lck_mtx_lock_spin(&inp_fc_lck);
	/* key_inp serves only as the RB_FIND lookup key */
	key_inp.inp_flowhash = flowhash;
	inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp);
	if (inp == NULL) {
		/* inp is not present, return */
		lck_mtx_unlock(&inp_fc_lck);
		return NULL;
	}

	if (flags & INPFC_REMOVE) {
		ASSERT((inp->inp_flags2 & INP2_IN_FCTREE) != 0);
		/* leave spin mode before the heavier tree manipulation */
		lck_mtx_convert_spin(&inp_fc_lck);
		RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp);
		bzero(&(inp->infc_link), sizeof(inp->infc_link));
#if SKYWALK
		/* under SKYWALK the flowhash is a namespace-allocated id */
		VERIFY(inp->inp_flowhash != 0);
		flowidns_release_flowid(inp->inp_flowhash);
		inp->inp_flowhash = 0;
#endif /* !SKYWALK */
		inp->inp_flags2 &= ~INP2_IN_FCTREE;
		lck_mtx_unlock(&inp_fc_lck);
		/* removal path never hands the PCB back */
		return NULL;
	}

	/* take a want ref; fail if the PCB is marked for teardown */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) {
		inp = NULL;
	}
	lck_mtx_unlock(&inp_fc_lck);

	return inp;
}
3383 
/*
 * Deliver interface flow-advisory feedback to a PCB previously looked
 * up via inp_fc_getinp() (which took a want reference for us): clear
 * the flow-controlled/suspended state and wake up the socket.
 */
static void
inp_fc_feedback(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* we already hold a want_cnt on this inp, socket can't be null */
	VERIFY(so != NULL);
	socket_lock(so, 1);

	/* drop the want ref; bail if the PCB is being torn down */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		return;
	}

	/*
	 * A send is in progress; record the feedback so that a racing
	 * inp_set_fc_state() will ignore its flow advisory.
	 */
	if (inp->inp_sndinprog_cnt > 0) {
		inp->inp_flags |= INP_FC_FEEDBACK;
	}

	/*
	 * Return if the connection is not in flow-controlled state.
	 * This can happen if the connection experienced
	 * loss while it was in flow controlled state
	 */
	if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
		socket_unlock(so, 1);
		return;
	}
	inp_reset_fc_state(inp);

	/* for TCP, also undo the throttling applied under flow control */
	if (SOCK_TYPE(so) == SOCK_STREAM) {
		inp_fc_unthrottle_tcp(inp);
	}

	socket_unlock(so, 1);
}
3419 
3420 void
inp_reset_fc_state(struct inpcb * inp)3421 inp_reset_fc_state(struct inpcb *inp)
3422 {
3423 	struct socket *so = inp->inp_socket;
3424 	int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
3425 	int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
3426 
3427 	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
3428 
3429 	if (suspended) {
3430 		so->so_flags &= ~(SOF_SUSPENDED);
3431 		soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
3432 	}
3433 
3434 	/* Give a write wakeup to unblock the socket */
3435 	if (needwakeup) {
3436 		sowwakeup(so);
3437 	}
3438 }
3439 
/*
 * Apply a flow advisory (FADV_FLOW_CONTROLLED or FADV_SUSPENDED) from
 * the interface to this PCB.  Returns 1 if the advisory was applied,
 * 0 if it was ignored (racing feedback, or PCB not found/dying).
 */
int
inp_set_fc_state(struct inpcb *inp, int advcode)
{
	boolean_t is_flow_controlled = INP_WAIT_FOR_IF_FEEDBACK(inp);
	struct inpcb *tmp_inp = NULL;
	/*
	 * If there was a feedback from the interface when
	 * send operation was in progress, we should ignore
	 * this flow advisory to avoid a race between setting
	 * flow controlled state and receiving feedback from
	 * the interface
	 */
	if (inp->inp_flags & INP_FC_FEEDBACK) {
		return 0;
	}

	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
	/* re-find the PCB in the fc tree; this takes a want reference */
	if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash,
	    INPFC_SOLOCKED)) != NULL) {
		if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			return 0;
		}
		VERIFY(tmp_inp == inp);
		switch (advcode) {
		case FADV_FLOW_CONTROLLED:
			inp->inp_flags |= INP_FLOW_CONTROLLED;
			inp->inp_fadv_flow_ctrl_cnt++;
			break;
		case FADV_SUSPENDED:
			inp->inp_flags |= INP_FLOW_SUSPENDED;
			inp->inp_fadv_suspended_cnt++;
			soevent(inp->inp_socket,
			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));

			/* Record the fact that suspend event was sent */
			inp->inp_socket->so_flags |= SOF_SUSPENDED;
			break;
		}

		/* throttle TCP only on the transition into flow control */
		if (!is_flow_controlled && SOCK_TYPE(inp->inp_socket) == SOCK_STREAM) {
			inp_fc_throttle_tcp(inp);
		}
		return 1;
	}
	return 0;
}
3486 
3487 /*
3488  * Handler for SO_FLUSH socket option.
3489  */
3490 int
inp_flush(struct inpcb * inp,int optval)3491 inp_flush(struct inpcb *inp, int optval)
3492 {
3493 	u_int32_t flowhash = inp->inp_flowhash;
3494 	struct ifnet *rtifp, *oifp;
3495 
3496 	/* Either all classes or one of the valid ones */
3497 	if (optval != SO_TC_ALL && !SO_VALID_TC(optval)) {
3498 		return EINVAL;
3499 	}
3500 
3501 	/* We need a flow hash for identification */
3502 	if (flowhash == 0) {
3503 		return 0;
3504 	}
3505 
3506 	/* Grab the interfaces from the route and pcb */
3507 	rtifp = ((inp->inp_route.ro_rt != NULL) ?
3508 	    inp->inp_route.ro_rt->rt_ifp : NULL);
3509 	oifp = inp->inp_last_outifp;
3510 
3511 	if (rtifp != NULL) {
3512 		if_qflush_sc(rtifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3513 	}
3514 	if (oifp != NULL && oifp != rtifp) {
3515 		if_qflush_sc(oifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3516 	}
3517 
3518 	return 0;
3519 }
3520 
3521 /*
3522  * Clear the INP_INADDR_ANY flag (special case for PPP only)
3523  */
3524 void
inp_clear_INP_INADDR_ANY(struct socket * so)3525 inp_clear_INP_INADDR_ANY(struct socket *so)
3526 {
3527 	struct inpcb *inp = NULL;
3528 
3529 	socket_lock(so, 1);
3530 	inp = sotoinpcb(so);
3531 	if (inp) {
3532 		inp->inp_flags &= ~INP_INADDR_ANY;
3533 	}
3534 	socket_unlock(so, 1);
3535 }
3536 
3537 void
inp_get_soprocinfo(struct inpcb * inp,struct so_procinfo * soprocinfo)3538 inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo)
3539 {
3540 	struct socket *so = inp->inp_socket;
3541 
3542 	soprocinfo->spi_pid = so->last_pid;
3543 	strlcpy(&soprocinfo->spi_proc_name[0], &inp->inp_last_proc_name[0],
3544 	    sizeof(soprocinfo->spi_proc_name));
3545 	if (so->last_pid != 0) {
3546 		uuid_copy(soprocinfo->spi_uuid, so->last_uuid);
3547 	}
3548 	/*
3549 	 * When not delegated, the effective pid is the same as the real pid
3550 	 */
3551 	if (so->so_flags & SOF_DELEGATED) {
3552 		soprocinfo->spi_delegated = 1;
3553 		soprocinfo->spi_epid = so->e_pid;
3554 		uuid_copy(soprocinfo->spi_euuid, so->e_uuid);
3555 	} else {
3556 		soprocinfo->spi_delegated = 0;
3557 		soprocinfo->spi_epid = so->last_pid;
3558 	}
3559 	strlcpy(&soprocinfo->spi_e_proc_name[0], &inp->inp_e_proc_name[0],
3560 	    sizeof(soprocinfo->spi_e_proc_name));
3561 }
3562 
3563 int
inp_findinpcb_procinfo(struct inpcbinfo * pcbinfo,uint32_t flowhash,struct so_procinfo * soprocinfo)3564 inp_findinpcb_procinfo(struct inpcbinfo *pcbinfo, uint32_t flowhash,
3565     struct so_procinfo *soprocinfo)
3566 {
3567 	struct inpcb *inp = NULL;
3568 	int found = 0;
3569 
3570 	bzero(soprocinfo, sizeof(struct so_procinfo));
3571 
3572 	if (!flowhash) {
3573 		return -1;
3574 	}
3575 
3576 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
3577 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
3578 		if (inp->inp_state != INPCB_STATE_DEAD &&
3579 		    inp->inp_socket != NULL &&
3580 		    inp->inp_flowhash == flowhash) {
3581 			found = 1;
3582 			inp_get_soprocinfo(inp, soprocinfo);
3583 			break;
3584 		}
3585 	}
3586 	lck_rw_done(&pcbinfo->ipi_lock);
3587 
3588 	return found;
3589 }
3590 
3591 #if CONFIG_PROC_UUID_POLICY
3592 static void
inp_update_cellular_policy(struct inpcb * inp,boolean_t set)3593 inp_update_cellular_policy(struct inpcb *inp, boolean_t set)
3594 {
3595 	struct socket *so = inp->inp_socket;
3596 	int before, after;
3597 
3598 	VERIFY(so != NULL);
3599 	VERIFY(inp->inp_state != INPCB_STATE_DEAD);
3600 
3601 	before = INP_NO_CELLULAR(inp);
3602 	if (set) {
3603 		inp_set_nocellular(inp);
3604 	} else {
3605 		inp_clear_nocellular(inp);
3606 	}
3607 	after = INP_NO_CELLULAR(inp);
3608 	if (net_io_policy_log && (before != after)) {
3609 		static const char *ok = "OK";
3610 		static const char *nok = "NOACCESS";
3611 		uuid_string_t euuid_buf;
3612 		pid_t epid;
3613 
3614 		if (so->so_flags & SOF_DELEGATED) {
3615 			uuid_unparse(so->e_uuid, euuid_buf);
3616 			epid = so->e_pid;
3617 		} else {
3618 			uuid_unparse(so->last_uuid, euuid_buf);
3619 			epid = so->last_pid;
3620 		}
3621 
3622 		/* allow this socket to generate another notification event */
3623 		so->so_ifdenied_notifies = 0;
3624 
3625 		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
3626 		    "euuid %s%s %s->%s\n", __func__,
3627 		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
3628 		    SOCK_TYPE(so), epid, euuid_buf,
3629 		    (so->so_flags & SOF_DELEGATED) ?
3630 		    " [delegated]" : "",
3631 		    ((before < after) ? ok : nok),
3632 		    ((before < after) ? nok : ok));
3633 	}
3634 }
3635 
3636 #if NECP
3637 static void
inp_update_necp_want_app_policy(struct inpcb * inp,boolean_t set)3638 inp_update_necp_want_app_policy(struct inpcb *inp, boolean_t set)
3639 {
3640 	struct socket *so = inp->inp_socket;
3641 	int before, after;
3642 
3643 	VERIFY(so != NULL);
3644 	VERIFY(inp->inp_state != INPCB_STATE_DEAD);
3645 
3646 	before = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
3647 	if (set) {
3648 		inp_set_want_app_policy(inp);
3649 	} else {
3650 		inp_clear_want_app_policy(inp);
3651 	}
3652 	after = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
3653 	if (net_io_policy_log && (before != after)) {
3654 		static const char *wanted = "WANTED";
3655 		static const char *unwanted = "UNWANTED";
3656 		uuid_string_t euuid_buf;
3657 		pid_t epid;
3658 
3659 		if (so->so_flags & SOF_DELEGATED) {
3660 			uuid_unparse(so->e_uuid, euuid_buf);
3661 			epid = so->e_pid;
3662 		} else {
3663 			uuid_unparse(so->last_uuid, euuid_buf);
3664 			epid = so->last_pid;
3665 		}
3666 
3667 		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
3668 		    "euuid %s%s %s->%s\n", __func__,
3669 		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
3670 		    SOCK_TYPE(so), epid, euuid_buf,
3671 		    (so->so_flags & SOF_DELEGATED) ?
3672 		    " [delegated]" : "",
3673 		    ((before < after) ? unwanted : wanted),
3674 		    ((before < after) ? wanted : unwanted));
3675 	}
3676 }
3677 #endif /* NECP */
3678 #endif /* !CONFIG_PROC_UUID_POLICY */
3679 
3680 #if NECP
/*
 * Re-evaluate the NECP policy match for this socket, optionally with
 * overridden addresses/interface, and rescope the socket to the NECP
 * suggested interface when it is not yet bound.
 */
void
inp_update_necp_policy(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int override_bound_interface)
{
	necp_socket_find_policy_match(inp, override_local_addr, override_remote_addr, override_bound_interface);
	if (necp_socket_should_rescope(inp) &&
	    inp->inp_lport == 0 &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
		// If we should rescope, and the socket is not yet bound
		inp_bindif(inp, necp_socket_get_rescope_if_index(inp), NULL);
		inp->inp_flags2 |= INP2_SCOPED_BY_NECP;
	}
}
3694 #endif /* NECP */
3695 
/*
 * Re-evaluate the per-process UUID network policy (cellular denial and
 * NECP app policy) for this PCB.  Returns 0 on success or when no
 * policy entry exists, otherwise the proc_uuid_policy_lookup error.
 */
int
inp_update_policy(struct inpcb *inp)
{
#if CONFIG_PROC_UUID_POLICY
	struct socket *so = inp->inp_socket;
	uint32_t pflags = 0;
	int32_t ogencnt;
	int err = 0;
	uint8_t *lookup_uuid = NULL;

	if (!net_io_policy_uuid ||
	    so == NULL || inp->inp_state == INPCB_STATE_DEAD) {
		return 0;
	}

	/*
	 * Kernel-created sockets that aren't delegating other sockets
	 * are currently exempted from UUID policy checks.
	 */
	if (so->last_pid == 0 && !(so->so_flags & SOF_DELEGATED)) {
		return 0;
	}

#if defined(XNU_TARGET_OS_OSX)
	/* prefer the responsible process UUID when one is recorded */
	if (so->so_rpid > 0) {
		lookup_uuid = so->so_ruuid;
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}
#endif
	/* otherwise fall back to the effective (or last) process UUID */
	if (lookup_uuid == NULL || err == ENOENT) {
		lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid);
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}

	/*
	 * Discard cached generation count if the entry is gone (ENOENT),
	 * so that we go thru the checks below.
	 */
	if (err == ENOENT && ogencnt != 0) {
		so->so_policy_gencnt = 0;
	}

	/*
	 * If the generation count has changed, inspect the policy flags
	 * and act accordingly.  If a policy flag was previously set and
	 * the UUID is no longer present in the table (ENOENT), treat it
	 * as if the flag has been cleared.
	 */
	if ((err == 0 || err == ENOENT) && ogencnt != so->so_policy_gencnt) {
		/* update cellular policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, FALSE);
		}
#if NECP
		/* update necp want app policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, FALSE);
		}
#endif /* NECP */
	}

	/* ENOENT merely means there is no policy entry for this UUID */
	return (err == ENOENT) ? 0 : err;
#else /* !CONFIG_PROC_UUID_POLICY */
#pragma unused(inp)
	return 0;
#endif /* !CONFIG_PROC_UUID_POLICY */
}
3769 
/*
 * "net.inet.log_restricted" sysctl: when non-zero, the wrappers below
 * log each send/receive denied by the policy restriction checks.
 */
static unsigned int log_restricted;
SYSCTL_DECL(_net_inet);
SYSCTL_INT(_net_inet, OID_AUTO, log_restricted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &log_restricted, 0,
    "Log network restrictions");
3775 /*
3776  * Called when we need to enforce policy restrictions in the input path.
3777  *
3778  * Returns TRUE if we're not allowed to receive data, otherwise FALSE.
3779  */
3780 static boolean_t
_inp_restricted_recv(struct inpcb * inp,struct ifnet * ifp)3781 _inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
3782 {
3783 	VERIFY(inp != NULL);
3784 
3785 	/*
3786 	 * Inbound restrictions.
3787 	 */
3788 	if (!sorestrictrecv) {
3789 		return FALSE;
3790 	}
3791 
3792 	if (ifp == NULL) {
3793 		return FALSE;
3794 	}
3795 
3796 	if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
3797 		return TRUE;
3798 	}
3799 
3800 	if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
3801 		return TRUE;
3802 	}
3803 
3804 	if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
3805 		return TRUE;
3806 	}
3807 
3808 	if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
3809 		return TRUE;
3810 	}
3811 
3812 	if (!(ifp->if_eflags & IFEF_RESTRICTED_RECV)) {
3813 		return FALSE;
3814 	}
3815 
3816 	if (inp->inp_flags & INP_RECV_ANYIF) {
3817 		return FALSE;
3818 	}
3819 
3820 	if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp == ifp) {
3821 		return FALSE;
3822 	}
3823 
3824 	if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
3825 		return TRUE;
3826 	}
3827 
3828 	return TRUE;
3829 }
3830 
3831 boolean_t
inp_restricted_recv(struct inpcb * inp,struct ifnet * ifp)3832 inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
3833 {
3834 	boolean_t ret;
3835 
3836 	ret = _inp_restricted_recv(inp, ifp);
3837 	if (ret == TRUE && log_restricted) {
3838 		printf("pid %d (%s) is unable to receive packets on %s\n",
3839 		    proc_getpid(current_proc()), proc_best_name(current_proc()),
3840 		    ifp->if_xname);
3841 	}
3842 	return ret;
3843 }
3844 
3845 /*
3846  * Called when we need to enforce policy restrictions in the output path.
3847  *
3848  * Returns TRUE if we're not allowed to send data out, otherwise FALSE.
3849  */
3850 static boolean_t
_inp_restricted_send(struct inpcb * inp,struct ifnet * ifp)3851 _inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
3852 {
3853 	VERIFY(inp != NULL);
3854 
3855 	/*
3856 	 * Outbound restrictions.
3857 	 */
3858 	if (!sorestrictsend) {
3859 		return FALSE;
3860 	}
3861 
3862 	if (ifp == NULL) {
3863 		return FALSE;
3864 	}
3865 
3866 	if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
3867 		return TRUE;
3868 	}
3869 
3870 	if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
3871 		return TRUE;
3872 	}
3873 
3874 	if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
3875 		return TRUE;
3876 	}
3877 
3878 	if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
3879 		return TRUE;
3880 	}
3881 
3882 	if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
3883 		return TRUE;
3884 	}
3885 
3886 	return FALSE;
3887 }
3888 
3889 boolean_t
inp_restricted_send(struct inpcb * inp,struct ifnet * ifp)3890 inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
3891 {
3892 	boolean_t ret;
3893 
3894 	ret = _inp_restricted_send(inp, ifp);
3895 	if (ret == TRUE && log_restricted) {
3896 		printf("pid %d (%s) is unable to transmit packets on %s\n",
3897 		    proc_getpid(current_proc()), proc_best_name(current_proc()),
3898 		    ifp->if_xname);
3899 	}
3900 	return ret;
3901 }
3902 
3903 inline void
inp_count_sndbytes(struct inpcb * inp,u_int32_t th_ack)3904 inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack)
3905 {
3906 	struct ifnet *ifp = inp->inp_last_outifp;
3907 	struct socket *so = inp->inp_socket;
3908 	if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
3909 	    (ifp->if_type == IFT_CELLULAR || IFNET_IS_WIFI(ifp))) {
3910 		int32_t unsent;
3911 
3912 		so->so_snd.sb_flags |= SB_SNDBYTE_CNT;
3913 
3914 		/*
3915 		 * There can be data outstanding before the connection
3916 		 * becomes established -- TFO case
3917 		 */
3918 		if (so->so_snd.sb_cc > 0) {
3919 			inp_incr_sndbytes_total(so, so->so_snd.sb_cc);
3920 		}
3921 
3922 		unsent = inp_get_sndbytes_allunsent(so, th_ack);
3923 		if (unsent > 0) {
3924 			inp_incr_sndbytes_unsent(so, unsent);
3925 		}
3926 	}
3927 }
3928 
3929 inline void
inp_incr_sndbytes_total(struct socket * so,int32_t len)3930 inp_incr_sndbytes_total(struct socket *so, int32_t len)
3931 {
3932 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3933 	struct ifnet *ifp = inp->inp_last_outifp;
3934 
3935 	if (ifp != NULL) {
3936 		VERIFY(ifp->if_sndbyte_total >= 0);
3937 		OSAddAtomic64(len, &ifp->if_sndbyte_total);
3938 	}
3939 }
3940 
3941 inline void
inp_decr_sndbytes_total(struct socket * so,int32_t len)3942 inp_decr_sndbytes_total(struct socket *so, int32_t len)
3943 {
3944 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3945 	struct ifnet *ifp = inp->inp_last_outifp;
3946 
3947 	if (ifp != NULL) {
3948 		if (ifp->if_sndbyte_total >= len) {
3949 			OSAddAtomic64(-len, &ifp->if_sndbyte_total);
3950 		} else {
3951 			ifp->if_sndbyte_total = 0;
3952 		}
3953 	}
3954 }
3955 
3956 inline void
inp_incr_sndbytes_unsent(struct socket * so,int32_t len)3957 inp_incr_sndbytes_unsent(struct socket *so, int32_t len)
3958 {
3959 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3960 	struct ifnet *ifp = inp->inp_last_outifp;
3961 
3962 	if (ifp != NULL) {
3963 		VERIFY(ifp->if_sndbyte_unsent >= 0);
3964 		OSAddAtomic64(len, &ifp->if_sndbyte_unsent);
3965 	}
3966 }
3967 
3968 inline void
inp_decr_sndbytes_unsent(struct socket * so,int32_t len)3969 inp_decr_sndbytes_unsent(struct socket *so, int32_t len)
3970 {
3971 	if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
3972 		return;
3973 	}
3974 
3975 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3976 	struct ifnet *ifp = inp->inp_last_outifp;
3977 
3978 	if (ifp != NULL) {
3979 		if (ifp->if_sndbyte_unsent >= len) {
3980 			OSAddAtomic64(-len, &ifp->if_sndbyte_unsent);
3981 		} else {
3982 			ifp->if_sndbyte_unsent = 0;
3983 		}
3984 	}
3985 }
3986 
3987 inline void
inp_decr_sndbytes_allunsent(struct socket * so,u_int32_t th_ack)3988 inp_decr_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
3989 {
3990 	int32_t len;
3991 
3992 	if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
3993 		return;
3994 	}
3995 
3996 	len = inp_get_sndbytes_allunsent(so, th_ack);
3997 	inp_decr_sndbytes_unsent(so, len);
3998 }
3999 
#if SKYWALK
/*
 * Propagate socket/PCB option state into the PCB's netns token by
 * computing which of the netns flags to set and which to clear,
 * then applying both in one netns_change_flags() call.  No-op for
 * non-INET/INET6 sockets or when the token is not valid.
 */
inline void
inp_update_netns_flags(struct socket *so)
{
	struct inpcb *inp;
	uint32_t on = 0;
	uint32_t off = 0;

	if (!(SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		return;
	}

	inp = sotoinpcb(so);
	if (inp == NULL || !NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
		return;
	}

	/* Each flag mirrors the current socket/PCB option state */
	if (so->so_options & SO_NOWAKEFROMSLEEP) {
		on |= NETNS_NOWAKEFROMSLEEP;
	} else {
		off |= NETNS_NOWAKEFROMSLEEP;
	}

	if (inp->inp_flags & INP_RECV_ANYIF) {
		on |= NETNS_RECVANYIF;
	} else {
		off |= NETNS_RECVANYIF;
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		on |= NETNS_EXTBGIDLE;
	} else {
		off |= NETNS_EXTBGIDLE;
	}

	netns_change_flags(&inp->inp_netns_token, on, off);
}
#endif /* SKYWALK */
4043 
/*
 * Record network activity for this PCB at the current uptime in its
 * activity bitmap (see in_stat_set_activity_bitmap()).
 */
inline void
inp_set_activity_bitmap(struct inpcb *inp)
{
	in_stat_set_activity_bitmap(&inp->inp_nw_activity, net_uptime());
}
4049 
/*
 * Copy the PCB's network-activity bitmap into the caller-supplied
 * buffer *ab.
 */
inline void
inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab)
{
	bcopy(&inp->inp_nw_activity, ab, sizeof(*ab));
}
4055 
4056 void
inp_update_last_owner(struct socket * so,struct proc * p,struct proc * ep)4057 inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep)
4058 {
4059 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4060 
4061 	if (inp == NULL) {
4062 		return;
4063 	}
4064 
4065 	if (p != NULL) {
4066 		strlcpy(&inp->inp_last_proc_name[0], proc_name_address(p), sizeof(inp->inp_last_proc_name));
4067 	}
4068 	if (so->so_flags & SOF_DELEGATED) {
4069 		if (ep != NULL) {
4070 			strlcpy(&inp->inp_e_proc_name[0], proc_name_address(ep), sizeof(inp->inp_e_proc_name));
4071 		} else {
4072 			inp->inp_e_proc_name[0] = 0;
4073 		}
4074 	} else {
4075 		inp->inp_e_proc_name[0] = 0;
4076 	}
4077 }
4078 
4079 void
inp_copy_last_owner(struct socket * so,struct socket * head)4080 inp_copy_last_owner(struct socket *so, struct socket *head)
4081 {
4082 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4083 	struct inpcb *head_inp = (struct inpcb *)head->so_pcb;
4084 
4085 	if (inp == NULL || head_inp == NULL) {
4086 		return;
4087 	}
4088 
4089 	strlcpy(&inp->inp_last_proc_name[0], &head_inp->inp_last_proc_name[0], sizeof(inp->inp_last_proc_name));
4090 	strlcpy(&inp->inp_e_proc_name[0], &head_inp->inp_e_proc_name[0], sizeof(inp->inp_e_proc_name));
4091 }
4092