1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1991, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62 */
63
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #include <sys/kernel.h>
74 #include <sys/sysctl.h>
75 #include <sys/mcache.h>
76 #include <sys/kauth.h>
77 #include <sys/priv.h>
78 #include <sys/proc_uuid_policy.h>
79 #include <sys/syslog.h>
80 #include <sys/priv.h>
81 #include <net/dlil.h>
82
83 #include <libkern/OSAtomic.h>
84 #include <kern/locks.h>
85
86 #include <machine/limits.h>
87
88 #include <kern/zalloc.h>
89
90 #include <net/if.h>
91 #include <net/if_types.h>
92 #include <net/route.h>
93 #include <net/flowhash.h>
94 #include <net/flowadv.h>
95 #include <net/nat464_utils.h>
96 #include <net/ntstat.h>
97 #include <net/restricted_in_port.h>
98
99 #include <netinet/in.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_var.h>
102 #include <netinet/ip_var.h>
103
104 #include <netinet/ip6.h>
105 #include <netinet6/ip6_var.h>
106
107 #include <sys/kdebug.h>
108 #include <sys/random.h>
109
110 #include <dev/random/randomdev.h>
111 #include <mach/boolean.h>
112
113 #include <pexpert/pexpert.h>
114
115 #if NECP
116 #include <net/necp.h>
117 #endif
118
119 #include <sys/stat.h>
120 #include <sys/ubc.h>
121 #include <sys/vnode.h>
122
123 #include <os/log.h>
124
extern const char *proc_name_address(struct proc *);

/* Lock group/attributes shared by the global inpcb mutexes below */
static LCK_GRP_DECLARE(inpcb_lock_grp, "inpcb");
static LCK_ATTR_DECLARE(inpcb_lock_attr, 0, 0);
/* Protects inpcb_head, the list of attached inpcbinfo's */
static LCK_MTX_DECLARE_ATTR(inpcb_lock, &inpcb_lock_grp, &inpcb_lock_attr);
/* Protects the timer bookkeeping state (inpcb_timeout_run et al.) */
static LCK_MTX_DECLARE_ATTR(inpcb_timeout_lock, &inpcb_lock_grp, &inpcb_lock_attr);

/* All registered protocol inpcbinfo's; walked by inpcb_timeout() */
static TAILQ_HEAD(, inpcbinfo) inpcb_head = TAILQ_HEAD_INITIALIZER(inpcb_head);

static u_int16_t inpcb_timeout_run = 0; /* INPCB timer is scheduled to run */
static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */
static boolean_t inpcb_ticking = FALSE; /* "slow" timer is scheduled */
static boolean_t inpcb_fast_timer_on = FALSE;

/* Outstanding-gc-request count above which a request is promoted to the fast timer */
#define INPCB_GCREQ_THRESHOLD 50000

/* Lazy and fast timer thread calls; both invoke inpcb_timeout() (see in_pcbinit) */
static thread_call_t inpcb_thread_call, inpcb_fast_thread_call;
static void inpcb_sched_timeout(void);
static void inpcb_sched_lazy_timeout(void);
static void _inpcb_sched_timeout(unsigned int);
static void inpcb_timeout(void *, void *);
const int inpcb_timeout_lazy = 10; /* 10 seconds leeway for lazy timers */
extern int tvtohz(struct timeval *);

#if CONFIG_PROC_UUID_POLICY
static void inp_update_cellular_policy(struct inpcb *, boolean_t);
#if NECP
static void inp_update_necp_want_app_policy(struct inpcb *, boolean_t);
#endif /* NECP */
#endif /* CONFIG_PROC_UUID_POLICY */

/* kdebug trace codes for PCB lookups */
#define DBG_FNC_PCB_LOOKUP      NETDBG_CODE(DBG_NETTCP, (6 << 8))
#define DBG_FNC_PCB_HLOOKUP     NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
158
/* When non-zero, allow UDP port reservations even when the pool is nearly exhausted */
int allow_udp_port_exhaustion = 0;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */
int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
171
/*
 * Clamp "var" into the inclusive range [min, max].  Wrapped in
 * do { } while (0) so the macro expands to exactly one statement and
 * composes safely with a surrounding if/else (the bare if/else-if form
 * would bind a following "else" to the macro's internal "if").
 */
#define RANGECHK(var, min, max) do {					\
	if ((var) < (min)) {						\
		(var) = (min);						\
	} else if ((var) > (max)) {					\
		(var) = (max);						\
	}								\
} while (0)
175
176 static int
177 sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
178 {
179 #pragma unused(arg1, arg2)
180 int error;
181 int new_value = *(int *)oidp->oid_arg1;
182 #if (DEBUG | DEVELOPMENT)
183 int old_value = *(int *)oidp->oid_arg1;
184 /*
185 * For unit testing allow a non-superuser process with the
186 * proper entitlement to modify the variables
187 */
188 if (req->newptr) {
189 if (proc_suser(current_proc()) != 0 &&
190 (error = priv_check_cred(kauth_cred_get(),
191 PRIV_NETINET_RESERVEDPORT, 0))) {
192 return EPERM;
193 }
194 }
195 #endif /* (DEBUG | DEVELOPMENT) */
196
197 error = sysctl_handle_int(oidp, &new_value, 0, req);
198 if (!error) {
199 if (oidp->oid_arg1 == &ipport_lowfirstauto || oidp->oid_arg1 == &ipport_lowlastauto) {
200 RANGECHK(new_value, 1, IPPORT_RESERVED - 1);
201 } else {
202 RANGECHK(new_value, IPPORT_RESERVED, USHRT_MAX);
203 }
204 *(int *)oidp->oid_arg1 = new_value;
205 }
206
207 #if (DEBUG | DEVELOPMENT)
208 os_log(OS_LOG_DEFAULT,
209 "%s:%u sysctl net.restricted_port.verbose: %d -> %d)",
210 proc_best_name(current_proc()), proc_selfpid(),
211 old_value, *(int *)oidp->oid_arg1);
212 #endif /* (DEBUG | DEVELOPMENT) */
213
214 return error;
215 }
216
217 #undef RANGECHK
218
/* Parent node: net.inet.ip.portrange */
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IP Ports");

/*
 * On DEBUG/DEVELOPMENT kernels these sysctls are writable by any process
 * (CTLFLAG_ANYBODY); the handler still gates writes on the reserved-port
 * entitlement in that configuration.
 */
#if (DEBUG | DEVELOPMENT)
#define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY)
#else
#define CTLFAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED)
#endif /* (DEBUG | DEVELOPMENT) */

/* All six range endpoints share sysctl_net_ipport_check for clamping */
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFAGS_IP_PORTRANGE,
    &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFAGS_IP_PORTRANGE,
    &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFAGS_IP_PORTRANGE,
    &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFAGS_IP_PORTRANGE,
    &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFAGS_IP_PORTRANGE,
    &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFAGS_IP_PORTRANGE,
    &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, ipport_allow_udp_port_exhaustion,
    CTLFLAG_LOCKED | CTLFLAG_RW, &allow_udp_port_exhaustion, 0, "");
248
/* APN fallback debug logging; apn_fallbk_log() is a no-op unless apn_fallbk_debug >= 1 */
static uint32_t apn_fallbk_debug = 0;
#define apn_fallbk_log(x) do { if (apn_fallbk_debug >= 1) log x; } while (0)

#if !XNU_TARGET_OS_OSX
/* APN fallback defaults to enabled on non-macOS targets and is tunable via sysctl */
static boolean_t apn_fallbk_enabled = TRUE;

SYSCTL_DECL(_net_inet);
SYSCTL_NODE(_net_inet, OID_AUTO, apn_fallback, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "APN Fallback");
SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
    &apn_fallbk_enabled, 0, "APN fallback enable");
SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &apn_fallbk_debug, 0, "APN fallback debug enable");
#else /* XNU_TARGET_OS_OSX */
/* Disabled (and not tunable) on macOS */
static boolean_t apn_fallbk_enabled = FALSE;
#endif /* XNU_TARGET_OS_OSX */
264
extern int udp_use_randomport;
extern int tcp_use_randomport;

/* Structs used for flowhash computation */
struct inp_flowhash_key_addr {
	/* One storage for either address family, with byte/word aliases */
	union {
		struct in_addr v4;
		struct in6_addr v6;
		u_int8_t addr8[16];
		u_int16_t addr16[8];
		u_int32_t addr32[4];
	} infha;
};

/*
 * Flow-hash input: local/foreign address and port, address family,
 * protocol, plus two random values to salt the hash.
 */
struct inp_flowhash_key {
	struct inp_flowhash_key_addr infh_laddr;
	struct inp_flowhash_key_addr infh_faddr;
	u_int32_t infh_lport;
	u_int32_t infh_fport;
	u_int32_t infh_af;
	u_int32_t infh_proto;
	u_int32_t infh_rand1;
	u_int32_t infh_rand2;
};

/* Seed for flow-hash computation; 0 until set (presumably on first use -- set elsewhere) */
static u_int32_t inp_hash_seed = 0;

/* Comparator for the inp_fc_tree red-black tree */
static int infc_cmp(const struct inpcb *, const struct inpcb *);

/* Flags used by inp_fc_getinp */
#define INPFC_SOLOCKED  0x1
#define INPFC_REMOVE    0x2
static struct inpcb *inp_fc_getinp(u_int32_t, u_int32_t);

static void inp_fc_feedback(struct inpcb *);
extern void tcp_remove_from_time_wait(struct inpcb *inp);

/* Protects inp_fc_tree and key_inp below */
static LCK_MTX_DECLARE_ATTR(inp_fc_lck, &inpcb_lock_grp, &inpcb_lock_attr);

/* Red-black tree of inpcbs participating in flow control/advisories */
RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree;
RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp);
RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp);

/*
 * Use this inp as a key to find an inp in the flowhash tree.
 * Accesses to it are protected by inp_fc_lck.
 */
struct inpcb key_inp;
313
/*
 * in_pcb.c: manage the Protocol Control Blocks.
 */

/*
 * One-time initialization of the global inpcb machinery: allocates the
 * lazy and fast timeout thread calls (both run inpcb_timeout) and
 * initializes the flow-advisory red-black tree.  Panics on allocation
 * failure; VERIFYs it is called exactly once.
 */
void
in_pcbinit(void)
{
	static int inpcb_initialized = 0;

	VERIFY(!inpcb_initialized);
	inpcb_initialized = 1;

	inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout,
	    NULL, THREAD_CALL_PRIORITY_KERNEL);
	/* Give it an arg so that we know that this is the fast timer */
	inpcb_fast_thread_call = thread_call_allocate_with_priority(
	    inpcb_timeout, &inpcb_timeout, THREAD_CALL_PRIORITY_KERNEL);
	if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) {
		panic("unable to alloc the inpcb thread call");
	}

	/*
	 * Initialize data structures required to deliver
	 * flow advisories.
	 */
	lck_mtx_lock(&inp_fc_lck);
	RB_INIT(&inp_fc_tree);
	bzero(&key_inp, sizeof(key_inp));
	lck_mtx_unlock(&inp_fc_lck);
}
344
/* TRUE if any timer request (lazy, fast, or nodelay) is pending in "req" */
#define INPCB_HAVE_TIMER_REQ(req) (((req).intimer_lazy > 0) || \
	((req).intimer_fast > 0) || ((req).intimer_nodelay > 0))

/*
 * Common handler for both the lazy and the fast inpcb thread calls.
 * Runs each registered inpcbinfo's garbage collector (ipi_gc) and/or
 * slow timer (ipi_timer) when requests are pending, then re-arms a
 * timer if the callbacks posted fresh requests.
 *
 * arg0 is non-NULL only when invoked via inpcb_fast_thread_call
 * (see in_pcbinit); arg1 is unused.
 */
static void
inpcb_timeout(void *arg0, void *arg1)
{
#pragma unused(arg1)
	struct inpcbinfo *ipi;
	boolean_t t, gc;
	struct intimercount gccnt, tmcnt;

	/*
	 * Update coarse-grained networking timestamp (in sec.); the idea
	 * is to piggy-back on the timeout callout to update the counter
	 * returnable via net_uptime().
	 */
	net_update_uptime();

	bzero(&gccnt, sizeof(gccnt));
	bzero(&tmcnt, sizeof(tmcnt));

	/* Consume the pending gc/ticking flags under the timeout lock */
	lck_mtx_lock_spin(&inpcb_timeout_lock);
	gc = inpcb_garbage_collecting;
	inpcb_garbage_collecting = FALSE;

	t = inpcb_ticking;
	inpcb_ticking = FALSE;

	if (gc || t) {
		/* Drop the timeout lock before taking the inpcb list lock */
		lck_mtx_unlock(&inpcb_timeout_lock);

		lck_mtx_lock(&inpcb_lock);
		TAILQ_FOREACH(ipi, &inpcb_head, ipi_entry) {
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_gc_req)) {
				/*
				 * Clear the request counts first; the
				 * callback may post new requests, which
				 * are then accumulated below to decide
				 * whether to re-arm.
				 */
				bzero(&ipi->ipi_gc_req,
				    sizeof(ipi->ipi_gc_req));
				if (gc && ipi->ipi_gc != NULL) {
					ipi->ipi_gc(ipi);
					gccnt.intimer_lazy +=
					    ipi->ipi_gc_req.intimer_lazy;
					gccnt.intimer_fast +=
					    ipi->ipi_gc_req.intimer_fast;
					gccnt.intimer_nodelay +=
					    ipi->ipi_gc_req.intimer_nodelay;
				}
			}
			if (INPCB_HAVE_TIMER_REQ(ipi->ipi_timer_req)) {
				bzero(&ipi->ipi_timer_req,
				    sizeof(ipi->ipi_timer_req));
				if (t && ipi->ipi_timer != NULL) {
					ipi->ipi_timer(ipi);
					tmcnt.intimer_lazy +=
					    ipi->ipi_timer_req.intimer_lazy;
					tmcnt.intimer_fast +=
					    ipi->ipi_timer_req.intimer_fast;
					tmcnt.intimer_nodelay +=
					    ipi->ipi_timer_req.intimer_nodelay;
				}
			}
		}
		lck_mtx_unlock(&inpcb_lock);
		lck_mtx_lock_spin(&inpcb_timeout_lock);
	}

	/* lock was dropped above, so check first before overriding */
	if (!inpcb_garbage_collecting) {
		inpcb_garbage_collecting = INPCB_HAVE_TIMER_REQ(gccnt);
	}
	if (!inpcb_ticking) {
		inpcb_ticking = INPCB_HAVE_TIMER_REQ(tmcnt);
	}

	/* arg0 will be set if we are the fast timer */
	if (arg0 != NULL) {
		inpcb_fast_timer_on = FALSE;
	}
	inpcb_timeout_run--;
	/*
	 * NOTE(review): inpcb_timeout_run is unsigned, so the ">= 0" half
	 * of this check can never fail on its own; an underflow would wrap
	 * and be caught by the "< 2" half instead.
	 */
	VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2);

	/* re-arm the timer if there's work to do */
	if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) {
		inpcb_sched_timeout();
	} else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) {
		/* be lazy when idle with little activity */
		inpcb_sched_lazy_timeout();
	} else {
		inpcb_sched_timeout();
	}

	lck_mtx_unlock(&inpcb_timeout_lock);
}
435
/* Arm the fast (1 second, no leeway) inpcb timer */
static void
inpcb_sched_timeout(void)
{
	_inpcb_sched_timeout(0);
}

/* Arm the lazy inpcb timer, with inpcb_timeout_lazy seconds of leeway */
static void
inpcb_sched_lazy_timeout(void)
{
	_inpcb_sched_timeout(inpcb_timeout_lazy);
}
447
/*
 * Arm the inpcb timeout thread call.  offset == 0 requests the fast
 * timer (1 second deadline, no leeway); a non-zero offset arms the
 * lazy timer with "offset" seconds of leeway.  The caller must hold
 * inpcb_timeout_lock (it may be held spin; it is converted to a full
 * mutex before the thread call is entered).  At most two instances
 * (one fast, one lazy) are ever in flight (tracked by inpcb_timeout_run).
 */
static void
_inpcb_sched_timeout(unsigned int offset)
{
	uint64_t deadline, leeway;

	/* Both timers fire 1 second out; only the lazy one carries leeway */
	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);
	LCK_MTX_ASSERT(&inpcb_timeout_lock, LCK_MTX_ASSERT_OWNED);
	if (inpcb_timeout_run == 0 &&
	    (inpcb_garbage_collecting || inpcb_ticking)) {
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		if (offset == 0) {
			inpcb_fast_timer_on = TRUE;
			thread_call_enter_delayed(inpcb_fast_thread_call,
			    deadline);
		} else {
			inpcb_fast_timer_on = FALSE;
			clock_interval_to_absolutetime_interval(offset,
			    NSEC_PER_SEC, &leeway);
			thread_call_enter_delayed_with_leeway(
			    inpcb_thread_call, NULL, deadline, leeway,
			    THREAD_CALL_DELAY_LEEWAY);
		}
	} else if (inpcb_timeout_run == 1 &&
	    offset == 0 && !inpcb_fast_timer_on) {
		/*
		 * Since the request was for a fast timer but the
		 * scheduled timer is a lazy timer, try to schedule
		 * another instance of fast timer also.
		 */
		lck_mtx_convert_spin(&inpcb_timeout_lock);
		inpcb_timeout_run++;
		inpcb_fast_timer_on = TRUE;
		thread_call_enter_delayed(inpcb_fast_thread_call, deadline);
	}
}
484
485 void
inpcb_gc_sched(struct inpcbinfo * ipi,u_int32_t type)486 inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type)
487 {
488 u_int32_t gccnt;
489
490 lck_mtx_lock_spin(&inpcb_timeout_lock);
491 inpcb_garbage_collecting = TRUE;
492 gccnt = ipi->ipi_gc_req.intimer_nodelay +
493 ipi->ipi_gc_req.intimer_fast;
494
495 if (gccnt > INPCB_GCREQ_THRESHOLD) {
496 type = INPCB_TIMER_FAST;
497 }
498
499 switch (type) {
500 case INPCB_TIMER_NODELAY:
501 atomic_add_32(&ipi->ipi_gc_req.intimer_nodelay, 1);
502 inpcb_sched_timeout();
503 break;
504 case INPCB_TIMER_FAST:
505 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
506 inpcb_sched_timeout();
507 break;
508 default:
509 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
510 inpcb_sched_lazy_timeout();
511 break;
512 }
513 lck_mtx_unlock(&inpcb_timeout_lock);
514 }
515
516 void
inpcb_timer_sched(struct inpcbinfo * ipi,u_int32_t type)517 inpcb_timer_sched(struct inpcbinfo *ipi, u_int32_t type)
518 {
519 lck_mtx_lock_spin(&inpcb_timeout_lock);
520 inpcb_ticking = TRUE;
521 switch (type) {
522 case INPCB_TIMER_NODELAY:
523 atomic_add_32(&ipi->ipi_timer_req.intimer_nodelay, 1);
524 inpcb_sched_timeout();
525 break;
526 case INPCB_TIMER_FAST:
527 atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
528 inpcb_sched_timeout();
529 break;
530 default:
531 atomic_add_32(&ipi->ipi_timer_req.intimer_lazy, 1);
532 inpcb_sched_lazy_timeout();
533 break;
534 }
535 lck_mtx_unlock(&inpcb_timeout_lock);
536 }
537
538 void
in_pcbinfo_attach(struct inpcbinfo * ipi)539 in_pcbinfo_attach(struct inpcbinfo *ipi)
540 {
541 struct inpcbinfo *ipi0;
542
543 lck_mtx_lock(&inpcb_lock);
544 TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
545 if (ipi0 == ipi) {
546 panic("%s: ipi %p already in the list",
547 __func__, ipi);
548 /* NOTREACHED */
549 }
550 }
551 TAILQ_INSERT_TAIL(&inpcb_head, ipi, ipi_entry);
552 lck_mtx_unlock(&inpcb_lock);
553 }
554
555 int
in_pcbinfo_detach(struct inpcbinfo * ipi)556 in_pcbinfo_detach(struct inpcbinfo *ipi)
557 {
558 struct inpcbinfo *ipi0;
559 int error = 0;
560
561 lck_mtx_lock(&inpcb_lock);
562 TAILQ_FOREACH(ipi0, &inpcb_head, ipi_entry) {
563 if (ipi0 == ipi) {
564 break;
565 }
566 }
567 if (ipi0 != NULL) {
568 TAILQ_REMOVE(&inpcb_head, ipi0, ipi_entry);
569 } else {
570 error = ENXIO;
571 }
572 lck_mtx_unlock(&inpcb_lock);
573
574 return error;
575 }
576
/*
 * Allocate a PCB and associate it with the socket.
 *
 * Returns: 0 Success
 * ENOBUFS
 * ENOMEM
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p)
{
#pragma unused(p)
	struct inpcb *inp;
	caddr_t temp;

	/*
	 * Either allocate a fresh zeroed inpcb from the protocol's zone,
	 * or reuse the one cached in the socket layer; in the latter case
	 * only the saved protocol-PCB pointer survives the bzero.
	 */
	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
		inp = zalloc_flags(pcbinfo->ipi_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
	} else {
		inp = (struct inpcb *)(void *)so->so_saved_pcb;
		temp = inp->inp_saved_ppcb;
		bzero((caddr_t)inp, sizeof(*inp));
		inp->inp_saved_ppcb = temp;
	}

	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;

	/*
	 * Each of the four stat pointers below is rounded up into its
	 * backing *_store buffer; the panic checks verify the aligned
	 * struct still fits entirely inside the store.
	 */
	/* make sure inp_stat is always 64-bit aligned */
	inp->inp_stat = (struct inp_stat *)P2ROUNDUP(inp->inp_stat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) +
	    sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
		panic("%s: insufficient space to align inp_stat", __func__);
		/* NOTREACHED */
	}

	/* make sure inp_cstat is always 64-bit aligned */
	inp->inp_cstat = (struct inp_stat *)P2ROUNDUP(inp->inp_cstat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_cstat - (uintptr_t)inp->inp_cstat_store) +
	    sizeof(*inp->inp_cstat) > sizeof(inp->inp_cstat_store)) {
		panic("%s: insufficient space to align inp_cstat", __func__);
		/* NOTREACHED */
	}

	/* make sure inp_wstat is always 64-bit aligned */
	inp->inp_wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_wstat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_wstat - (uintptr_t)inp->inp_wstat_store) +
	    sizeof(*inp->inp_wstat) > sizeof(inp->inp_wstat_store)) {
		panic("%s: insufficient space to align inp_wstat", __func__);
		/* NOTREACHED */
	}

	/* make sure inp_Wstat is always 64-bit aligned */
	inp->inp_Wstat = (struct inp_stat *)P2ROUNDUP(inp->inp_Wstat_store,
	    sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_Wstat - (uintptr_t)inp->inp_Wstat_store) +
	    sizeof(*inp->inp_Wstat) > sizeof(inp->inp_Wstat_store)) {
		panic("%s: insufficient space to align inp_Wstat", __func__);
		/* NOTREACHED */
	}

	so->so_pcb = (caddr_t)inp;

	/* Per-PCB mutex, only for protocols that request PCB-level locking */
	if (so->so_proto->pr_flags & PR_PCBLOCK) {
		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->ipi_lock_grp,
		    &pcbinfo->ipi_lock_attr);
	}

	/* v6-only semantics when IPv4-mapped addresses are disabled system-wide */
	if (SOCK_DOM(so) == PF_INET6 && !ip6_mapped_addr_on) {
		inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}

	if (ip6_auto_flowlabel) {
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
	}
	if (intcoproc_unrestricted) {
		inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;
	}

	(void) inp_update_policy(inp);

	/* Publish the new PCB on the protocol's global PCB list */
	lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	lck_rw_done(&pcbinfo->ipi_lock);
	return 0;
}
667
/*
 * in_pcblookup_local_and_cleanup does everything
 * in_pcblookup_local does but it checks for a socket
 * that's going away. Since we know that the lock is
 * held read+write when this function is called, we
 * can safely dispose of this socket like the slow
 * timer would usually do and return NULL. This is
 * great for bind.
 */
struct inpcb *
in_pcblookup_local_and_cleanup(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_int lport_arg, int wild_okay)
{
	struct inpcb *inp;

	/* Perform normal lookup */
	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);

	/* Check if we found a match but it's waiting to be disposed */
	if (inp != NULL && inp->inp_wantcnt == WNT_STOPUSING) {
		struct socket *so = inp->inp_socket;

		socket_lock(so, 0);

		if (so->so_usecount == 0) {
			/* Detach first unless the PCB is already marked dead */
			if (inp->inp_state != INPCB_STATE_DEAD) {
				in_pcbdetach(inp);
			}
			in_pcbdispose(inp); /* will unlock & destroy */
			inp = NULL;
		} else {
			/* Still in use elsewhere; hand it back untouched */
			socket_unlock(so, 0);
		}
	}

	return inp;
}
705
/*
 * Post a KEV_INET_PORTINUSE kernel event for the given port (network
 * byte order), identifying the current process as the one that hit
 * the conflict.
 */
static void
in_pcb_conflict_post_msg(u_int16_t port)
{
	/*
	 * Radar 5523020 send a kernel event notification if a
	 * non-participating socket tries to bind the port a socket
	 * who has set SOF_NOTIFYCONFLICT owns.
	 */
	struct kev_msg ev_msg;
	struct kev_in_portinuse in_portinuse;

	/* bzero the whole structs so any padding handed to the kev layer is zeroed */
	bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
	bzero(&ev_msg, sizeof(struct kev_msg));
	in_portinuse.port = ntohs(port); /* port in host order */
	in_portinuse.req_pid = proc_selfpid();
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_INET_SUBCLASS;
	ev_msg.event_code = KEV_INET_PORTINUSE;
	ev_msg.dv[0].data_ptr = &in_portinuse;
	ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
	/* Terminate the data vector after the single payload entry */
	ev_msg.dv[1].data_length = 0;
	dlil_post_complete_msg(NULL, &ev_msg);
}
730
731 /*
732 * Bind an INPCB to an address and/or port. This routine should not alter
733 * the caller-supplied local address "nam".
734 *
735 * Returns: 0 Success
736 * EADDRNOTAVAIL Address not available.
737 * EINVAL Invalid argument
738 * EAFNOSUPPORT Address family not supported [notdef]
739 * EACCES Permission denied
740 * EADDRINUSE Address in use
741 * EAGAIN Resource unavailable, try again
742 * priv_check_cred:EPERM Operation not permitted
743 */
744 int
in_pcbbind(struct inpcb * inp,struct sockaddr * nam,struct proc * p)745 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
746 {
747 struct socket *so = inp->inp_socket;
748 unsigned short *lastport;
749 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
750 u_short lport = 0, rand_port = 0;
751 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
752 int error, randomport, conflict = 0;
753 boolean_t anonport = FALSE;
754 kauth_cred_t cred;
755 struct in_addr laddr;
756 struct ifnet *outif = NULL;
757
758 if (TAILQ_EMPTY(&in_ifaddrhead)) { /* XXX broken! */
759 return EADDRNOTAVAIL;
760 }
761 if (!(so->so_options & (SO_REUSEADDR | SO_REUSEPORT))) {
762 wild = 1;
763 }
764
765 bzero(&laddr, sizeof(laddr));
766
767 socket_unlock(so, 0); /* keep reference on socket */
768 lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
769 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
770 /* another thread completed the bind */
771 lck_rw_done(&pcbinfo->ipi_lock);
772 socket_lock(so, 0);
773 return EINVAL;
774 }
775
776 if (nam != NULL) {
777 if (nam->sa_len != sizeof(struct sockaddr_in)) {
778 lck_rw_done(&pcbinfo->ipi_lock);
779 socket_lock(so, 0);
780 return EINVAL;
781 }
782 #if 0
783 /*
784 * We should check the family, but old programs
785 * incorrectly fail to initialize it.
786 */
787 if (nam->sa_family != AF_INET) {
788 lck_rw_done(&pcbinfo->ipi_lock);
789 socket_lock(so, 0);
790 return EAFNOSUPPORT;
791 }
792 #endif /* 0 */
793 lport = SIN(nam)->sin_port;
794
795 if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr))) {
796 /*
797 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
798 * allow complete duplication of binding if
799 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
800 * and a multicast address is bound on both
801 * new and duplicated sockets.
802 */
803 if (so->so_options & SO_REUSEADDR) {
804 reuseport = SO_REUSEADDR | SO_REUSEPORT;
805 }
806 } else if (SIN(nam)->sin_addr.s_addr != INADDR_ANY) {
807 struct sockaddr_in sin;
808 struct ifaddr *ifa;
809
810 /* Sanitized for interface address searches */
811 bzero(&sin, sizeof(sin));
812 sin.sin_family = AF_INET;
813 sin.sin_len = sizeof(struct sockaddr_in);
814 sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
815
816 ifa = ifa_ifwithaddr(SA(&sin));
817 if (ifa == NULL) {
818 lck_rw_done(&pcbinfo->ipi_lock);
819 socket_lock(so, 0);
820 return EADDRNOTAVAIL;
821 } else {
822 /*
823 * Opportunistically determine the outbound
824 * interface that may be used; this may not
825 * hold true if we end up using a route
826 * going over a different interface, e.g.
827 * when sending to a local address. This
828 * will get updated again after sending.
829 */
830 IFA_LOCK(ifa);
831 outif = ifa->ifa_ifp;
832 IFA_UNLOCK(ifa);
833 IFA_REMREF(ifa);
834 }
835 }
836
837 #if SKYWALK
838 if (inp->inp_flags2 & INP2_EXTERNAL_PORT) {
839 // Extract the external flow info
840 struct ns_flow_info nfi = {};
841 error = necp_client_get_netns_flow_info(inp->necp_client_uuid,
842 &nfi);
843 if (error != 0) {
844 lck_rw_done(&pcbinfo->ipi_lock);
845 socket_lock(so, 0);
846 return error;
847 }
848
849 // Extract the reserved port
850 u_int16_t reserved_lport = 0;
851 if (nfi.nfi_laddr.sa.sa_family == AF_INET) {
852 reserved_lport = nfi.nfi_laddr.sin.sin_port;
853 } else if (nfi.nfi_laddr.sa.sa_family == AF_INET6) {
854 reserved_lport = nfi.nfi_laddr.sin6.sin6_port;
855 } else {
856 lck_rw_done(&pcbinfo->ipi_lock);
857 socket_lock(so, 0);
858 return EINVAL;
859 }
860
861 // Validate or use the reserved port
862 if (lport == 0) {
863 lport = reserved_lport;
864 } else if (lport != reserved_lport) {
865 lck_rw_done(&pcbinfo->ipi_lock);
866 socket_lock(so, 0);
867 return EINVAL;
868 }
869 }
870
871 /* Do not allow reserving a UDP port if remaining UDP port count is below 4096 */
872 if (SOCK_PROTO(so) == IPPROTO_UDP && !allow_udp_port_exhaustion) {
873 uint32_t current_reservations = 0;
874 if (inp->inp_vflag & INP_IPV6) {
875 current_reservations = netns_lookup_reservations_count_in6(inp->in6p_laddr, IPPROTO_UDP);
876 } else {
877 current_reservations = netns_lookup_reservations_count_in(inp->inp_laddr, IPPROTO_UDP);
878 }
879 if (USHRT_MAX - UDP_RANDOM_PORT_RESERVE < current_reservations) {
880 log(LOG_ERR, "UDP port not available, less than 4096 UDP ports left");
881 lck_rw_done(&pcbinfo->ipi_lock);
882 socket_lock(so, 0);
883 return EADDRNOTAVAIL;
884 }
885 }
886
887 #endif /* SKYWALK */
888
889 if (lport != 0) {
890 struct inpcb *t;
891 uid_t u;
892
893 #if XNU_TARGET_OS_OSX
894 if (ntohs(lport) < IPPORT_RESERVED &&
895 SIN(nam)->sin_addr.s_addr != 0 &&
896 !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
897 cred = kauth_cred_proc_ref(p);
898 error = priv_check_cred(cred,
899 PRIV_NETINET_RESERVEDPORT, 0);
900 kauth_cred_unref(&cred);
901 if (error != 0) {
902 lck_rw_done(&pcbinfo->ipi_lock);
903 socket_lock(so, 0);
904 return EACCES;
905 }
906 }
907 #endif /* XNU_TARGET_OS_OSX */
908 /*
	 * Check whether the process is allowed to bind to a restricted port
910 */
911 if (!current_task_can_use_restricted_in_port(lport,
912 (uint8_t)so->so_proto->pr_protocol, PORT_FLAGS_BSD)) {
913 lck_rw_done(&pcbinfo->ipi_lock);
914 socket_lock(so, 0);
915 return EADDRINUSE;
916 }
917
918 if (!IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
919 (u = kauth_cred_getuid(so->so_cred)) != 0 &&
920 (t = in_pcblookup_local_and_cleanup(
921 inp->inp_pcbinfo, SIN(nam)->sin_addr, lport,
922 INPLOOKUP_WILDCARD)) != NULL &&
923 (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
924 t->inp_laddr.s_addr != INADDR_ANY ||
925 !(t->inp_socket->so_options & SO_REUSEPORT)) &&
926 (u != kauth_cred_getuid(t->inp_socket->so_cred)) &&
927 !(t->inp_socket->so_flags & SOF_REUSESHAREUID) &&
928 (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
929 t->inp_laddr.s_addr != INADDR_ANY) &&
930 (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
931 !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
932 uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
933 if ((t->inp_socket->so_flags &
934 SOF_NOTIFYCONFLICT) &&
935 !(so->so_flags & SOF_NOTIFYCONFLICT)) {
936 conflict = 1;
937 }
938
939 lck_rw_done(&pcbinfo->ipi_lock);
940
941 if (conflict) {
942 in_pcb_conflict_post_msg(lport);
943 }
944
945 socket_lock(so, 0);
946 return EADDRINUSE;
947 }
948 t = in_pcblookup_local_and_cleanup(pcbinfo,
949 SIN(nam)->sin_addr, lport, wild);
950 if (t != NULL &&
951 (reuseport & t->inp_socket->so_options) == 0 &&
952 (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
953 !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
954 uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
955 if (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
956 t->inp_laddr.s_addr != INADDR_ANY ||
957 SOCK_DOM(so) != PF_INET6 ||
958 SOCK_DOM(t->inp_socket) != PF_INET6) {
959 if ((t->inp_socket->so_flags &
960 SOF_NOTIFYCONFLICT) &&
961 !(so->so_flags & SOF_NOTIFYCONFLICT)) {
962 conflict = 1;
963 }
964
965 lck_rw_done(&pcbinfo->ipi_lock);
966
967 if (conflict) {
968 in_pcb_conflict_post_msg(lport);
969 }
970 socket_lock(so, 0);
971 return EADDRINUSE;
972 }
973 }
974 #if SKYWALK
975 if ((SOCK_PROTO(so) == IPPROTO_TCP ||
976 SOCK_PROTO(so) == IPPROTO_UDP) &&
977 !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
978 int res_err = 0;
979 if (inp->inp_vflag & INP_IPV6) {
980 res_err = netns_reserve_in6(
981 &inp->inp_netns_token,
982 SIN6(nam)->sin6_addr,
983 (uint8_t)SOCK_PROTO(so), lport, NETNS_BSD,
984 NULL);
985 } else {
986 res_err = netns_reserve_in(
987 &inp->inp_netns_token,
988 SIN(nam)->sin_addr, (uint8_t)SOCK_PROTO(so),
989 lport, NETNS_BSD, NULL);
990 }
991 if (res_err != 0) {
992 lck_rw_done(&pcbinfo->ipi_lock);
993 socket_lock(so, 0);
994 return EADDRINUSE;
995 }
996 }
997 #endif /* SKYWALK */
998 }
999 laddr = SIN(nam)->sin_addr;
1000 }
1001 if (lport == 0) {
1002 u_short first, last;
1003 int count;
1004 bool found;
1005
1006 /*
1007 * Override wild = 1 for implicit bind (mainly used by connect)
1008 * For implicit bind (lport == 0), we always use an unused port,
1009 * so REUSEADDR|REUSEPORT don't apply
1010 */
1011 wild = 1;
1012
1013 randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
1014 (so->so_type == SOCK_STREAM ? tcp_use_randomport :
1015 udp_use_randomport);
1016
1017 /*
1018 * Even though this looks similar to the code in
1019 * in6_pcbsetport, the v6 vs v4 checks are different.
1020 */
1021 anonport = TRUE;
1022 if (inp->inp_flags & INP_HIGHPORT) {
1023 first = (u_short)ipport_hifirstauto; /* sysctl */
1024 last = (u_short)ipport_hilastauto;
1025 lastport = &pcbinfo->ipi_lasthi;
1026 } else if (inp->inp_flags & INP_LOWPORT) {
1027 cred = kauth_cred_proc_ref(p);
1028 error = priv_check_cred(cred,
1029 PRIV_NETINET_RESERVEDPORT, 0);
1030 kauth_cred_unref(&cred);
1031 if (error != 0) {
1032 lck_rw_done(&pcbinfo->ipi_lock);
1033 socket_lock(so, 0);
1034 return error;
1035 }
1036 first = (u_short)ipport_lowfirstauto; /* 1023 */
1037 last = (u_short)ipport_lowlastauto; /* 600 */
1038 lastport = &pcbinfo->ipi_lastlow;
1039 } else {
1040 first = (u_short)ipport_firstauto; /* sysctl */
1041 last = (u_short)ipport_lastauto;
1042 lastport = &pcbinfo->ipi_lastport;
1043 }
1044 /* No point in randomizing if only one port is available */
1045
1046 if (first == last) {
1047 randomport = 0;
1048 }
1049 /*
1050 * Simple check to ensure all ports are not used up causing
1051 * a deadlock here.
1052 *
1053 * We split the two cases (up and down) so that the direction
1054 * is not being tested on each round of the loop.
1055 */
1056 if (first > last) {
1057 struct in_addr lookup_addr;
1058
1059 /*
1060 * counting down
1061 */
1062 if (randomport) {
1063 read_frandom(&rand_port, sizeof(rand_port));
1064 *lastport =
1065 first - (rand_port % (first - last));
1066 }
1067 count = first - last;
1068
1069 lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1070 inp->inp_laddr;
1071
1072 found = false;
1073 do {
1074 if (count-- < 0) { /* completely used? */
1075 lck_rw_done(&pcbinfo->ipi_lock);
1076 socket_lock(so, 0);
1077 return EADDRNOTAVAIL;
1078 }
1079 --*lastport;
1080 if (*lastport > first || *lastport < last) {
1081 *lastport = first;
1082 }
1083 lport = htons(*lastport);
1084
1085 /*
1086 * Skip if this is a restricted port as we do not want to
1087 * restricted ports as ephemeral
1088 */
1089 if (IS_RESTRICTED_IN_PORT(lport)) {
1090 continue;
1091 }
1092
1093 found = in_pcblookup_local_and_cleanup(pcbinfo,
1094 lookup_addr, lport, wild) == NULL;
1095 #if SKYWALK
1096 if (found &&
1097 (SOCK_PROTO(so) == IPPROTO_TCP ||
1098 SOCK_PROTO(so) == IPPROTO_UDP) &&
1099 !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1100 int res_err;
1101 if (inp->inp_vflag & INP_IPV6) {
1102 res_err = netns_reserve_in6(
1103 &inp->inp_netns_token,
1104 inp->in6p_laddr,
1105 (uint8_t)SOCK_PROTO(so), lport,
1106 NETNS_BSD, NULL);
1107 } else {
1108 res_err = netns_reserve_in(
1109 &inp->inp_netns_token,
1110 lookup_addr, (uint8_t)SOCK_PROTO(so),
1111 lport, NETNS_BSD, NULL);
1112 }
1113 found = res_err == 0;
1114 }
1115 #endif /* SKYWALK */
1116 } while (!found);
1117 } else {
1118 struct in_addr lookup_addr;
1119
1120 /*
1121 * counting up
1122 */
1123 if (randomport) {
1124 read_frandom(&rand_port, sizeof(rand_port));
1125 *lastport =
1126 first + (rand_port % (first - last));
1127 }
1128 count = last - first;
1129
1130 lookup_addr = (laddr.s_addr != INADDR_ANY) ? laddr :
1131 inp->inp_laddr;
1132
1133 found = false;
1134 do {
1135 if (count-- < 0) { /* completely used? */
1136 lck_rw_done(&pcbinfo->ipi_lock);
1137 socket_lock(so, 0);
1138 return EADDRNOTAVAIL;
1139 }
1140 ++*lastport;
1141 if (*lastport < first || *lastport > last) {
1142 *lastport = first;
1143 }
1144 lport = htons(*lastport);
1145
1146 /*
1147 * Skip if this is a restricted port as we do not want to
1148 * restricted ports as ephemeral
1149 */
1150 if (IS_RESTRICTED_IN_PORT(lport)) {
1151 continue;
1152 }
1153
1154 found = in_pcblookup_local_and_cleanup(pcbinfo,
1155 lookup_addr, lport, wild) == NULL;
1156 #if SKYWALK
1157 if (found &&
1158 (SOCK_PROTO(so) == IPPROTO_TCP ||
1159 SOCK_PROTO(so) == IPPROTO_UDP) &&
1160 !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
1161 int res_err;
1162 if (inp->inp_vflag & INP_IPV6) {
1163 res_err = netns_reserve_in6(
1164 &inp->inp_netns_token,
1165 inp->in6p_laddr,
1166 (uint8_t)SOCK_PROTO(so), lport,
1167 NETNS_BSD, NULL);
1168 } else {
1169 res_err = netns_reserve_in(
1170 &inp->inp_netns_token,
1171 lookup_addr, (uint8_t)SOCK_PROTO(so),
1172 lport, NETNS_BSD, NULL);
1173 }
1174 found = res_err == 0;
1175 }
1176 #endif /* SKYWALK */
1177 } while (!found);
1178 }
1179 }
1180 socket_lock(so, 0);
1181
1182 /*
1183 * We unlocked socket's protocol lock for a long time.
1184 * The socket might have been dropped/defuncted.
1185 * Checking if world has changed since.
1186 */
1187 if (inp->inp_state == INPCB_STATE_DEAD) {
1188 #if SKYWALK
1189 netns_release(&inp->inp_netns_token);
1190 #endif /* SKYWALK */
1191 lck_rw_done(&pcbinfo->ipi_lock);
1192 return ECONNABORTED;
1193 }
1194
1195 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
1196 #if SKYWALK
1197 netns_release(&inp->inp_netns_token);
1198 #endif /* SKYWALK */
1199 lck_rw_done(&pcbinfo->ipi_lock);
1200 return EINVAL;
1201 }
1202
1203 if (laddr.s_addr != INADDR_ANY) {
1204 inp->inp_laddr = laddr;
1205 inp->inp_last_outifp = outif;
1206 #if SKYWALK
1207 if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
1208 netns_set_ifnet(&inp->inp_netns_token, outif);
1209 }
1210 #endif /* SKYWALK */
1211 }
1212 inp->inp_lport = lport;
1213 if (anonport) {
1214 inp->inp_flags |= INP_ANONPORT;
1215 }
1216
1217 if (in_pcbinshash(inp, 1) != 0) {
1218 inp->inp_laddr.s_addr = INADDR_ANY;
1219 inp->inp_last_outifp = NULL;
1220
1221 #if SKYWALK
1222 netns_release(&inp->inp_netns_token);
1223 #endif /* SKYWALK */
1224 inp->inp_lport = 0;
1225 if (anonport) {
1226 inp->inp_flags &= ~INP_ANONPORT;
1227 }
1228 lck_rw_done(&pcbinfo->ipi_lock);
1229 return EAGAIN;
1230 }
1231 lck_rw_done(&pcbinfo->ipi_lock);
1232 sflt_notify(so, sock_evt_bound, NULL);
1233 return 0;
1234 }
1235
/*
 * Destination filter for APN fallback: evaluates TRUE for IPv4 destinations
 * that should never trigger a fallback notification (link-local, loopback,
 * zeronet, multicast, or private address space).
 */
#define APN_FALLBACK_IP_FILTER(a) \
	(IN_LINKLOCAL(ntohl((a)->sin_addr.s_addr)) || \
	IN_LOOPBACK(ntohl((a)->sin_addr.s_addr)) || \
	IN_ZERONET(ntohl((a)->sin_addr.s_addr)) || \
	IN_MULTICAST(ntohl((a)->sin_addr.s_addr)) || \
	IN_PRIVATE(ntohl((a)->sin_addr.s_addr)))

/* Minimum seconds between notifications (rate limit) */
#define APN_FALLBACK_NOTIF_INTERVAL 2 /* Magic Number */
/* net_uptime() value at which the last notification was posted */
static uint64_t last_apn_fallback = 0;
1245
/*
 * Decide whether an APN fallback notification should be posted for this
 * connection attempt.  Returns TRUE only when ALL of the following hold:
 *  - the feature is enabled and the caller is not the kernel task;
 *  - the socket has not opted out via SO_NOAPNFALLBK;
 *  - the rate limiter (last_apn_fallback, see APN_FALLBACK_NOTIF_INTERVAL)
 *    permits another notification;
 *  - the IPv4 destination, if given, passes APN_FALLBACK_IP_FILTER;
 *  - an unscoped IPv6 default route via a cellular interface exists while
 *    no unscoped IPv4 default route exists;
 *  - the code-signing identity looks like a third-party bundle ID; and
 *  - the binary predates the App Store IPv6 requirement cutoff date.
 */
static boolean_t
apn_fallback_required(proc_t proc, struct socket *so, struct sockaddr_in *p_dstv4)
{
	uint64_t timenow;
	struct sockaddr_storage lookup_default_addr;
	struct rtentry *rt = NULL;

	VERIFY(proc != NULL);

	if (apn_fallbk_enabled == FALSE) {
		return FALSE;
	}

	if (proc == kernproc) {
		return FALSE;
	}

	if (so && (so->so_options & SO_NOAPNFALLBK)) {
		return FALSE;
	}

	/* Rate-limit: at most one notification per interval */
	timenow = net_uptime();
	if ((timenow - last_apn_fallback) < APN_FALLBACK_NOTIF_INTERVAL) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification throttled.\n"));
		return FALSE;
	}

	if (p_dstv4 && APN_FALLBACK_IP_FILTER(p_dstv4)) {
		return FALSE;
	}

	/* Check if we have unscoped IPv6 default route through cellular */
	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET6;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in6);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);
	if (NULL == rt) {
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route.\n"));
		return FALSE;
	}

	if (!IFNET_IS_CELLULAR(rt->rt_ifp)) {
		/* Drop the route reference taken by rtalloc1() before bailing */
		rtfree(rt);
		apn_fallbk_log((LOG_INFO, "APN fallback notification could not find "
		    "unscoped default IPv6 route through cellular interface.\n"));
		return FALSE;
	}

	/*
	 * We have a default IPv6 route, ensure that
	 * we do not have IPv4 default route before triggering
	 * the event
	 */
	rtfree(rt);
	rt = NULL;

	bzero(&lookup_default_addr, sizeof(lookup_default_addr));
	lookup_default_addr.ss_family = AF_INET;
	lookup_default_addr.ss_len = sizeof(struct sockaddr_in);

	rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0);

	if (rt) {
		rtfree(rt);
		rt = NULL;
		apn_fallbk_log((LOG_INFO, "APN fallback notification found unscoped "
		    "IPv4 default route!\n"));
		return FALSE;
	}

	{
		/*
		 * We disable APN fallback if the binary is not a third-party app.
		 * Note that platform daemons use their process name as a
		 * bundle ID so we filter out bundle IDs without dots.
		 */
		const char *bundle_id = cs_identity_get(proc);
		if (bundle_id == NULL ||
		    bundle_id[0] == '\0' ||
		    strchr(bundle_id, '.') == NULL ||
		    strncmp(bundle_id, "com.apple.", sizeof("com.apple.") - 1) == 0) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found first-"
			    "party bundle ID \"%s\"!\n", (bundle_id ? bundle_id : "NULL")));
			return FALSE;
		}
	}

	{
		/*
		 * The Apple App Store IPv6 requirement started on
		 * June 1st, 2016 at 12:00:00 AM PDT.
		 * We disable APN fallback if the binary is more recent than that.
		 * We check both atime and birthtime since birthtime is not always supported.
		 */
		static const long ipv6_start_date = 1464764400L;
		vfs_context_t context;
		struct stat64 sb;
		int vn_stat_error;

		bzero(&sb, sizeof(struct stat64));
		context = vfs_context_create(NULL);
		/* stat the process text vnode to learn when the binary was created/used */
		vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, 0, context);
		(void)vfs_context_rele(context);

		if (vn_stat_error != 0 ||
		    sb.st_atimespec.tv_sec >= ipv6_start_date ||
		    sb.st_birthtimespec.tv_sec >= ipv6_start_date) {
			apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found binary "
			    "too recent! (err %d atime %ld mtime %ld ctime %ld birthtime %ld)\n",
			    vn_stat_error, sb.st_atimespec.tv_sec, sb.st_mtimespec.tv_sec,
			    sb.st_ctimespec.tv_sec, sb.st_birthtimespec.tv_sec));
			return FALSE;
		}
	}
	return TRUE;
}
1364
1365 static void
apn_fallback_trigger(proc_t proc,struct socket * so)1366 apn_fallback_trigger(proc_t proc, struct socket *so)
1367 {
1368 pid_t pid = 0;
1369 struct kev_msg ev_msg;
1370 struct kev_netevent_apnfallbk_data apnfallbk_data;
1371
1372 last_apn_fallback = net_uptime();
1373 pid = proc_pid(proc);
1374 uuid_t application_uuid;
1375 uuid_clear(application_uuid);
1376 proc_getexecutableuuid(proc, application_uuid,
1377 sizeof(application_uuid));
1378
1379 bzero(&ev_msg, sizeof(struct kev_msg));
1380 ev_msg.vendor_code = KEV_VENDOR_APPLE;
1381 ev_msg.kev_class = KEV_NETWORK_CLASS;
1382 ev_msg.kev_subclass = KEV_NETEVENT_SUBCLASS;
1383 ev_msg.event_code = KEV_NETEVENT_APNFALLBACK;
1384
1385 bzero(&apnfallbk_data, sizeof(apnfallbk_data));
1386
1387 if (so->so_flags & SOF_DELEGATED) {
1388 apnfallbk_data.epid = so->e_pid;
1389 uuid_copy(apnfallbk_data.euuid, so->e_uuid);
1390 } else {
1391 apnfallbk_data.epid = so->last_pid;
1392 uuid_copy(apnfallbk_data.euuid, so->last_uuid);
1393 }
1394
1395 ev_msg.dv[0].data_ptr = &apnfallbk_data;
1396 ev_msg.dv[0].data_length = sizeof(apnfallbk_data);
1397 kev_post_msg(&ev_msg);
1398 apn_fallbk_log((LOG_INFO, "APN fallback notification issued.\n"));
1399 }
1400
1401 /*
1402 * Transform old in_pcbconnect() into an inner subroutine for new
1403 * in_pcbconnect(); do some validity-checking on the remote address
1404 * (in "nam") and then determine local host address (i.e., which
1405 * interface) to use to access that remote host.
1406 *
1407 * This routine may alter the caller-supplied remote address "nam".
1408 *
1409 * The caller may override the bound-to-interface setting of the socket
1410 * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1411 *
1412 * This routine might return an ifp with a reference held if the caller
1413 * provides a non-NULL outif, even in the error case. The caller is
1414 * responsible for releasing its reference.
1415 *
1416 * Returns: 0 Success
1417 * EINVAL Invalid argument
1418 * EAFNOSUPPORT Address family not supported
1419 * EADDRNOTAVAIL Address not available
1420 */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr,
    unsigned int ifscope, struct ifnet **outif, int raw)
{
	struct route *ro = &inp->inp_route;
	struct in_ifaddr *ia = NULL;
	struct sockaddr_in sin;
	int error = 0;
	boolean_t restricted = FALSE;

	if (outif != NULL) {
		*outif = NULL;
	}
	/* Validate the remote address: exact sockaddr_in length, AF_INET family */
	if (nam->sa_len != sizeof(struct sockaddr_in)) {
		return EINVAL;
	}
	if (SIN(nam)->sin_family != AF_INET) {
		return EAFNOSUPPORT;
	}
	/* A remote port is mandatory except for raw usage */
	if (raw == 0 && SIN(nam)->sin_port == 0) {
		return EADDRNOTAVAIL;
	}

	/*
	 * If the destination address is INADDR_ANY,
	 * use the primary local address.
	 * If the supplied address is INADDR_BROADCAST,
	 * and the primary interface supports broadcast,
	 * choose the broadcast address for that interface.
	 *
	 * Note: this rewrites the caller-supplied "nam" in place.
	 */
	if (raw == 0 && (SIN(nam)->sin_addr.s_addr == INADDR_ANY ||
	    SIN(nam)->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST)) {
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		if (!TAILQ_EMPTY(&in_ifaddrhead)) {
			ia = TAILQ_FIRST(&in_ifaddrhead);
			IFA_LOCK_SPIN(&ia->ia_ifa);
			if (SIN(nam)->sin_addr.s_addr == INADDR_ANY) {
				SIN(nam)->sin_addr = IA_SIN(ia)->sin_addr;
			} else if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
				SIN(nam)->sin_addr =
				    SIN(&ia->ia_broadaddr)->sin_addr;
			}
			IFA_UNLOCK(&ia->ia_ifa);
			ia = NULL;
		}
		lck_rw_done(&in_ifaddr_rwlock);
	}
	/*
	 * Otherwise, if the socket has already bound the source, just use it.
	 */
	if (inp->inp_laddr.s_addr != INADDR_ANY) {
		VERIFY(ia == NULL);
		*laddr = inp->inp_laddr;
		return 0;
	}

	/*
	 * If the ifscope is specified by the caller (e.g. IP_PKTINFO)
	 * then it overrides the sticky ifscope set for the socket.
	 */
	if (ifscope == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) {
		ifscope = inp->inp_boundifp->if_index;
	}

	/*
	 * If route is known or can be allocated now,
	 * our src addr is taken from the i/f, else punt.
	 * Note that we should check the address family of the cached
	 * destination, in case of sharing the cache with IPv6.
	 */
	if (ro->ro_rt != NULL) {
		RT_LOCK_SPIN(ro->ro_rt);
	}
	/* Invalidate the cached route if it is stale, for the wrong family/
	 * destination, or if SO_DONTROUTE forbids using it. */
	if (ROUTE_UNUSABLE(ro) || ro->ro_dst.sa_family != AF_INET ||
	    SIN(&ro->ro_dst)->sin_addr.s_addr != SIN(nam)->sin_addr.s_addr ||
	    (inp->inp_socket->so_options & SO_DONTROUTE)) {
		if (ro->ro_rt != NULL) {
			RT_UNLOCK(ro->ro_rt);
		}
		ROUTE_RELEASE(ro);
	}
	if (!(inp->inp_socket->so_options & SO_DONTROUTE) &&
	    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
		if (ro->ro_rt != NULL) {
			RT_UNLOCK(ro->ro_rt);
		}
		ROUTE_RELEASE(ro);
		/* No route yet, so try to acquire one */
		bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
		ro->ro_dst.sa_family = AF_INET;
		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
		SIN(&ro->ro_dst)->sin_addr = SIN(nam)->sin_addr;
		rtalloc_scoped(ro, ifscope);
		if (ro->ro_rt != NULL) {
			RT_LOCK_SPIN(ro->ro_rt);
		}
	}
	/* Sanitized local copy for interface address searches */
	bzero(&sin, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
	/*
	 * If we did not find (or use) a route, assume dest is reachable
	 * on a directly connected network and try to find a corresponding
	 * interface to take the source address from.
	 */
	if (ro->ro_rt == NULL) {
		proc_t proc = current_proc();

		VERIFY(ia == NULL);
		ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
		}
		error = ((ia == NULL) ? ENETUNREACH : 0);

		/* Possibly notify user space that cellular APN fallback applies */
		if (apn_fallback_required(proc, inp->inp_socket,
		    (void *)nam)) {
			apn_fallback_trigger(proc, inp->inp_socket);
		}

		goto done;
	}
	RT_LOCK_ASSERT_HELD(ro->ro_rt);
	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 */
	if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
		VERIFY(ia == NULL);
		/*
		 * If the route points to a cellular interface and the
		 * caller forbids our using interfaces of such type,
		 * pretend that there is no route.
		 * Apply the same logic for expensive interfaces.
		 */
		if (inp_restricted_send(inp, ro->ro_rt->rt_ifp)) {
			RT_UNLOCK(ro->ro_rt);
			ROUTE_RELEASE(ro);
			error = EHOSTUNREACH;
			restricted = TRUE;
		} else {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(ro->ro_rt);
			ia = ifatoia(ro->ro_rt->rt_ifa);
			IFA_ADDREF(&ia->ia_ifa);

			/*
			 * Mark the control block for notification of
			 * a possible flow that might undergo clat46
			 * translation.
			 *
			 * We defer the decision to a later point when
			 * inpcb is being disposed off.
			 * The reason is that we only want to send notification
			 * if the flow was ever used to send data.
			 */
			if (IS_INTF_CLAT46(ro->ro_rt->rt_ifp)) {
				inp->inp_flags2 |= INP2_CLAT46_FLOW;
			}

			RT_UNLOCK(ro->ro_rt);
			error = 0;
		}
		goto done;
	}
	VERIFY(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK);
	RT_UNLOCK(ro->ro_rt);
	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 */
	VERIFY(ia == NULL);
	ia = ifatoia(ifa_ifwithdstaddr(SA(&sin)));
	if (ia == NULL) {
		ia = ifatoia(ifa_ifwithaddr_scoped(SA(&sin), ifscope));
	}
	if (ia == NULL) {
		ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope));
	}
	if (ia == NULL) {
		/* Last resort: take the address attached to the route itself */
		RT_LOCK(ro->ro_rt);
		ia = ifatoia(ro->ro_rt->rt_ifa);
		if (ia != NULL) {
			IFA_ADDREF(&ia->ia_ifa);
		}
		RT_UNLOCK(ro->ro_rt);
	}
	error = ((ia == NULL) ? ENETUNREACH : 0);

done:
	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, use the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
	    inp->inp_moptions != NULL) {
		struct ip_moptions *imo;
		struct ifnet *ifp;

		imo = inp->inp_moptions;
		IMO_LOCK(imo);
		if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
		    ia->ia_ifp != imo->imo_multicast_ifp)) {
			ifp = imo->imo_multicast_ifp;
			/* Swap the reference: drop the old ia, find one on ifp */
			if (ia != NULL) {
				IFA_REMREF(&ia->ia_ifa);
			}
			lck_rw_lock_shared(&in_ifaddr_rwlock);
			TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
				if (ia->ia_ifp == ifp) {
					break;
				}
			}
			if (ia != NULL) {
				IFA_ADDREF(&ia->ia_ifa);
			}
			lck_rw_done(&in_ifaddr_rwlock);
			if (ia == NULL) {
				error = EADDRNOTAVAIL;
			} else {
				error = 0;
			}
		}
		IMO_UNLOCK(imo);
	}
	/*
	 * Don't do pcblookup call here; return interface in laddr
	 * and exit to caller, that will do the lookup.
	 */
	if (ia != NULL) {
		/*
		 * If the source address belongs to a cellular interface
		 * and the socket forbids our using interfaces of such
		 * type, pretend that there is no source address.
		 * Apply the same logic for expensive interfaces.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if (inp_restricted_send(inp, ia->ia_ifa.ifa_ifp)) {
			IFA_UNLOCK(&ia->ia_ifa);
			error = EHOSTUNREACH;
			restricted = TRUE;
		} else if (error == 0) {
			*laddr = ia->ia_addr.sin_addr;
			if (outif != NULL) {
				struct ifnet *ifp;

				if (ro->ro_rt != NULL) {
					ifp = ro->ro_rt->rt_ifp;
				} else {
					ifp = ia->ia_ifp;
				}

				VERIFY(ifp != NULL);
				IFA_CONVERT_LOCK(&ia->ia_ifa);
				ifnet_reference(ifp);   /* for caller */
				if (*outif != NULL) {
					ifnet_release(*outif);
				}
				*outif = ifp;
			}
			IFA_UNLOCK(&ia->ia_ifa);
		} else {
			IFA_UNLOCK(&ia->ia_ifa);
		}
		IFA_REMREF(&ia->ia_ifa);
		ia = NULL;
	}

	/* Tell interested filters that policy denied the interface */
	if (restricted && error == EHOSTUNREACH) {
		soevent(inp->inp_socket, (SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_IFDENIED));
	}

	return error;
}
1702
1703 /*
1704 * Outer subroutine:
1705 * Connect from a socket to a specified address.
1706 * Both address and port must be specified in argument sin.
1707 * If don't have a local address for this socket yet,
1708 * then pick one.
1709 *
1710 * The caller may override the bound-to-interface setting of the socket
1711 * by specifying the ifscope parameter (e.g. from IP_PKTINFO.)
1712 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
    unsigned int ifscope, struct ifnet **outif)
{
	struct in_addr laddr;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct inpcb *pcb;
	int error;
	struct socket *so = inp->inp_socket;

#if CONTENT_FILTER
	/* Content filters track socket state transitions via this counter */
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	/*
	 * Call inner routine, to assign local interface address.
	 */
	if ((error = in_pcbladdr(inp, nam, &laddr, ifscope, outif, 0)) != 0) {
		return error;
	}

	/*
	 * Reject the connect if the 4-tuple is already in use.  The socket
	 * lock must be dropped around the hash lookup.
	 */
	socket_unlock(so, 0);
	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr ? inp->inp_laddr : laddr,
	    inp->inp_lport, 0, NULL);
	socket_lock(so, 0);

	/*
	 * Check if the socket is still in a valid state. When we unlock this
	 * embryonic socket, it can get aborted if another thread is closing
	 * the listener (radar 7947600).
	 */
	if ((so->so_flags & SOF_ABORTED) != 0) {
		return ECONNREFUSED;
	}

	if (pcb != NULL) {
		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
		return EADDRINUSE;
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		/* Not yet bound: pick an ephemeral port first if needed */
		if (inp->inp_lport == 0) {
			error = in_pcbbind(inp, NULL, p);
			if (error) {
				return error;
			}
		}
		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets.
			 */
			socket_unlock(so, 0);
			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
			socket_lock(so, 0);
		}
		inp->inp_laddr = laddr;
		/* no reference needed */
		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
#if SKYWALK
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			netns_set_ifnet(&inp->inp_netns_token,
			    inp->inp_last_outifp);
		}
#endif /* SKYWALK */
		/* Remember the local address was implicit, not user-chosen */
		inp->inp_flags |= INP_INADDR_ANY;
	} else {
		/*
		 * Usage of IP_PKTINFO, without local port already
		 * specified will cause kernel to panic,
		 * see rdar://problem/18508185.
		 * For now returning error to avoid a kernel panic
		 * This routines can be refactored and handle this better
		 * in future.
		 */
		if (inp->inp_lport == 0) {
			return EINVAL;
		}
		if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets.
			 */
			socket_unlock(so, 0);
			lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
			socket_lock(so, 0);
		}
	}
	/* Commit the foreign endpoint and move the PCB to its new hash slot */
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		nstat_pcb_invalidate_cache(inp);
	}
	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	return 0;
}
1812
/*
 * Disconnect an inpcb: clear the foreign address/port, rehash the PCB,
 * and detach it if the socket no longer holds a file-descriptor reference.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* Cache nstat data for UDP before the foreign address is cleared */
	if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) {
		nstat_pcb_cache(inp);
	}

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

#if CONTENT_FILTER
	/* Content filters track socket state transitions via this counter */
	if (so) {
		so->so_state_change_cnt++;
	}
#endif

	if (!lck_rw_try_lock_exclusive(&inp->inp_pcbinfo->ipi_lock)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(so, 0);
		lck_rw_lock_exclusive(&inp->inp_pcbinfo->ipi_lock);
		socket_lock(so, 0);
	}

	in_pcbrehash(inp);
	lck_rw_done(&inp->inp_pcbinfo->ipi_lock);
	/*
	 * A multipath subflow socket would have its SS_NOFDREF set by default,
	 * so check for SOF_MP_SUBFLOW socket flag before detaching the PCB;
	 * when the socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF)) {
		in_pcbdetach(inp);
	}
}
1849
/*
 * Detach an inpcb from its socket: release IPsec policy, routes and
 * multicast options, mark the PCB dead, and schedule it for garbage
 * collection.  Guarded by SOF_PCBCLEARING so a second call from
 * so_close is a no-op.
 */
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == NULL) {
		/* PCB has been disposed */
		panic("%s: inp=%p so=%p proto=%d so_pcb is null!", __func__,
		    inp, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

#if IPSEC
	if (inp->inp_sp != NULL) {
		(void) ipsec4_delete_pcbpolicy(inp);
	}
#endif /* IPSEC */

	/* Account for UDP sockets that never moved any data */
	if (inp->inp_stat != NULL && SOCK_PROTO(so) == IPPROTO_UDP) {
		if (inp->inp_stat->rxpackets == 0 && inp->inp_stat->txpackets == 0) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_no_data);
		}
	}

	/*
	 * Let NetworkStatistics know this PCB is going away
	 * before we detach it.
	 */
	if (nstat_collect &&
	    (SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP)) {
		nstat_pcb_detach(inp);
	}

	/* Free memory buffer held for generating keep alives */
	if (inp->inp_keepalive_data != NULL) {
		kfree_data(inp->inp_keepalive_data, inp->inp_keepalive_datalen);
		inp->inp_keepalive_data = NULL;
	}

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING) {
		panic("%s: so=%p proto=%d couldn't set to STOPUSING",
		    __func__, so, SOCK_PROTO(so));
		/* NOTREACHED */
	}

	if (!(so->so_flags & SOF_PCBCLEARING)) {
		struct ip_moptions *imo;

		inp->inp_vflag = 0;
		if (inp->inp_options != NULL) {
			(void) m_free(inp->inp_options);
			inp->inp_options = NULL;
		}
		ROUTE_RELEASE(&inp->inp_route);
		imo = inp->inp_moptions;
		if (imo != NULL) {
			IMO_REMREF(imo);
		}
		inp->inp_moptions = NULL;
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;

		/*
		 * Enqueue an event to send kernel event notification
		 * if the flow has to CLAT46 for data packets
		 */
		if (inp->inp_flags2 & INP2_CLAT46_FLOW) {
			/*
			 * If there has been any exchange of data bytes
			 * over this flow.
			 * Schedule a notification to report that flow is
			 * using client side translation.
			 */
			if (inp->inp_stat != NULL &&
			    (inp->inp_stat->txbytes != 0 ||
			    inp->inp_stat->rxbytes != 0)) {
				if (so->so_flags & SOF_DELEGATED) {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->e_pid,
						so->e_uuid);
				} else {
					in6_clat46_event_enqueue_nwk_wq_entry(
						IN6_CLAT46_EVENT_V4_FLOW,
						so->last_pid,
						so->last_uuid);
				}
			}
		}

		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;

		inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST);
	}
}
1947
1948
/*
 * Final disposal of an inpcb: remove it from the pcbinfo lists, sever the
 * socket<->PCB linkage, and free both.  Caller must hold ipi_lock
 * exclusively; the PCB must already be in WNT_STOPUSING state with a
 * zero socket use count (both are enforced by panic).
 */
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

	if (so != NULL && so->so_usecount != 0) {
		panic("%s: so %p [%d,%d] usecount %d lockhistory %s",
		    __func__, so, SOCK_DOM(so), SOCK_TYPE(so), so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		if (so != NULL) {
			panic_plain("%s: inp %p invalid wantcnt %d, so %p "
			    "[%d,%d] usecount %d retaincnt %d state 0x%x "
			    "flags 0x%x lockhistory %s\n", __func__, inp,
			    inp->inp_wantcnt, so, SOCK_DOM(so), SOCK_TYPE(so),
			    so->so_usecount, so->so_retaincnt, so->so_state,
			    so->so_flags, solockhistory_nr(so));
			/* NOTREACHED */
		} else {
			panic("%s: inp %p invalid wantcnt %d no socket",
			    __func__, inp, inp->inp_wantcnt);
			/* NOTREACHED */
		}
	}

	LCK_RW_ASSERT(&ipi->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so != NULL) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			if (so->so_rcv.sb_cc > 0 || so->so_snd.sb_cc > 0) {
				/*
				 * selthreadclear() already called
				 * during sofreelastref() above.
				 */
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL) {
				panic("%s: so=%p head still exist",
				    __func__, so);
				/* NOTREACHED */
			}
			lck_mtx_unlock(&inp->inpcb_mtx);

#if NECP
			necp_inpcb_remove_cb(inp);
#endif /* NECP */

			lck_mtx_destroy(&inp->inpcb_mtx, ipi->ipi_lock_grp);
		}
		/* makes sure we're not called twice from so_close */
		so->so_flags |= SOF_PCBCLEARING;
		so->so_saved_pcb = (caddr_t)inp;
		so->so_pcb = NULL;
		inp->inp_socket = NULL;
#if NECP
		necp_inpcb_dispose(inp);
#endif /* NECP */
		/*
		 * In case there a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		ROUTE_RELEASE(&inp->inp_route);
		/* PCBs cached in the socket layer are freed with the socket */
		if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
}
2026
2027 /*
2028 * The calling convention of in_getsockaddr() and in_getpeeraddr() was
2029 * modified to match the pru_sockaddr() and pru_peeraddr() entry points
2030 * in struct pr_usrreqs, so that protocols can just reference then directly
2031 * without the need for a wrapper function.
2032 */
2033 int
in_getsockaddr(struct socket * so,struct sockaddr ** nam)2034 in_getsockaddr(struct socket *so, struct sockaddr **nam)
2035 {
2036 struct inpcb *inp;
2037 struct sockaddr_in *sin;
2038
2039 /*
2040 * Do the malloc first in case it blocks.
2041 */
2042 sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2043 Z_WAITOK | Z_NOFAIL);
2044
2045 sin->sin_family = AF_INET;
2046
2047 if ((inp = sotoinpcb(so)) == NULL) {
2048 free_sockaddr(sin);
2049 return EINVAL;
2050 }
2051 sin->sin_port = inp->inp_lport;
2052 sin->sin_addr = inp->inp_laddr;
2053
2054 *nam = (struct sockaddr *)sin;
2055 return 0;
2056 }
2057
2058 int
in_getsockaddr_s(struct socket * so,struct sockaddr_in * ss)2059 in_getsockaddr_s(struct socket *so, struct sockaddr_in *ss)
2060 {
2061 struct sockaddr_in *sin = ss;
2062 struct inpcb *inp;
2063
2064 VERIFY(ss != NULL);
2065 bzero(ss, sizeof(*ss));
2066
2067 sin->sin_family = AF_INET;
2068 sin->sin_len = sizeof(*sin);
2069
2070 if ((inp = sotoinpcb(so)) == NULL) {
2071 return EINVAL;
2072 }
2073
2074 sin->sin_port = inp->inp_lport;
2075 sin->sin_addr = inp->inp_laddr;
2076 return 0;
2077 }
2078
2079 int
in_getpeeraddr(struct socket * so,struct sockaddr ** nam)2080 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
2081 {
2082 struct inpcb *inp;
2083 struct sockaddr_in *sin;
2084
2085 /*
2086 * Do the malloc first in case it blocks.
2087 */
2088 sin = (struct sockaddr_in *)alloc_sockaddr(sizeof(*sin),
2089 Z_WAITOK | Z_NOFAIL);
2090
2091 sin->sin_family = AF_INET;
2092
2093 if ((inp = sotoinpcb(so)) == NULL) {
2094 free_sockaddr(sin);
2095 return EINVAL;
2096 }
2097 sin->sin_port = inp->inp_fport;
2098 sin->sin_addr = inp->inp_faddr;
2099
2100 *nam = (struct sockaddr *)sin;
2101 return 0;
2102 }
2103
/*
 * Invoke *notify on every IPv4 PCB in pcbinfo whose foreign address
 * matches faddr, passing it the given errno.  Used to fan out ICMP
 * error notifications (e.g. unreachable/redirect) to affected PCBs.
 *
 * Locking: takes ipi_lock shared for the walk; each matching PCB is
 * pinned with WNT_ACQUIRE before its socket lock is taken, and the
 * want count is dropped (WNT_RELEASE, locked=1) while the socket lock
 * is still held.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		/* Only IPv4 PCBs are eligible. */
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL) {
			continue;
		}
		/* Skip PCBs already being torn down. */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			continue;
		}
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		(void) in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
}
2130
2131 /*
2132 * Check for alternatives when higher level complains
2133 * about service problems. For now, invalidate cached
2134 * routing information. If the route was created dynamically
2135 * (by a redirect), time to try a default gateway again.
2136 */
void
in_losing(struct inpcb *inp)
{
	boolean_t release = FALSE;
	struct rtentry *rt;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia = NULL;

		RT_LOCK(rt);
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			/* Redirect-created route: delete it so the
			 * default gateway gets another chance. */
			(void) rtrequest(RTM_DELETE, rt_key(rt),
			    rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if (inp->inp_laddr.s_addr != INADDR_ANY &&
		    (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			/*
			 * Address is around; ditch the route. A new route
			 * can be allocated the next time output is attempted.
			 */
			release = TRUE;
		}
		/* ifa_foraddr returned a referenced ifaddr; drop it. */
		if (ia != NULL) {
			IFA_REMREF(&ia->ia_ifa);
		}
	}
	/* No cached route at all, or the local address is still valid:
	 * clear the cache so output re-resolves. */
	if (rt == NULL || release) {
		ROUTE_RELEASE(&inp->inp_route);
	}
}
2177
2178 /*
2179 * After a routing change, flush old routing
2180 * and allocate a (hopefully) better one.
2181 */
2182 void
in_rtchange(struct inpcb * inp,int errno)2183 in_rtchange(struct inpcb *inp, int errno)
2184 {
2185 #pragma unused(errno)
2186 boolean_t release = FALSE;
2187 struct rtentry *rt;
2188
2189 if ((rt = inp->inp_route.ro_rt) != NULL) {
2190 struct in_ifaddr *ia = NULL;
2191
2192 /* if address is gone, keep the old route */
2193 if (inp->inp_laddr.s_addr != INADDR_ANY &&
2194 (ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
2195 /*
2196 * Address is around; ditch the route. A new route
2197 * can be allocated the next time output is attempted.
2198 */
2199 release = TRUE;
2200 }
2201 if (ia != NULL) {
2202 IFA_REMREF(&ia->ia_ifa);
2203 }
2204 }
2205 if (rt == NULL || release) {
2206 ROUTE_RELEASE(&inp->inp_route);
2207 }
2208 }
2209
2210 /*
2211 * Lookup a PCB based on the local address and port.
2212 */
/*
 * Look up a PCB by local address/port only.
 *
 * wild_okay == 0: exact-match search on the main hash for an
 * unconnected PCB (wildcard foreign address) bound to laddr:lport.
 * wild_okay != 0: best-fit search on the port hash; each candidate is
 * scored by how many of its addresses are wildcards relative to the
 * request (0 = perfect match, up to 3), and the lowest score wins.
 *
 * Returns the matching PCB or NULL.  NOTE(review): no want-count is
 * taken on the result and ipi_lock is not acquired here — presumably
 * callers hold the appropriate pcbinfo lock; confirm at call sites.
 */
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    unsigned int lport_arg, int wild_okay)
{
	struct inpcb *inp;
	int matchwild = 3, wildcard;
	u_short lport = (u_short)lport_arg;

	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0, 0, 0, 0, 0);

	if (!wild_okay) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
		    pcbinfo->ipi_hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
			if (!(inp->inp_vflag & INP_IPV4)) {
				continue;
			}
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found.
				 */
				return inp;
			}
		}
		/*
		 * Not found.
		 */
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0, 0, 0, 0, 0);
		return NULL;
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport) {
				break;
			}
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.
			 */
			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (!(inp->inp_vflag & INP_IPV4)) {
					continue;
				}
				/* Connected PCB: one wildcard penalty. */
				if (inp->inp_faddr.s_addr != INADDR_ANY) {
					wildcard++;
				}
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY) {
						wildcard++;
					} else if (inp->inp_laddr.s_addr !=
					    laddr.s_addr) {
						/* Different specific local
						 * address: not a match. */
						continue;
					}
				} else {
					if (laddr.s_addr != INADDR_ANY) {
						wildcard++;
					}
				}
				/* Keep the least-wildcarded candidate;
				 * a perfect match (0) ends the scan. */
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0) {
						break;
					}
				}
			}
		}
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,
		    0, 0, 0, 0);
		return match;
	}
}
2305
2306 /*
2307 * Check if PCB exists in hash list.
2308 */
/*
 * Test whether a PCB matching the 4-tuple (or, with wildcard != 0, a
 * listener matching laddr:lport) exists, reporting the owning socket's
 * credentials through *uid/*gid.  Used for pcblist/credential queries
 * rather than packet delivery: unlike in_pcblookup_hash() it does not
 * take a want-count on the PCB, it only reports existence.
 *
 * Returns 1 and fills *uid/*gid when a PCB with a socket is found;
 * returns 0 otherwise (*uid/*gid left at UID_MAX/GID_MAX).
 * ipi_lock is taken shared and released on every exit path.
 */
int
in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
    uid_t *uid, gid_t *gid, struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
	int found = 0;
	struct inpcb *local_wild = NULL;
	struct inpcb *local_wild_mapped = NULL;

	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Honor interface-based receive restrictions. */
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
					inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					inp->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
	}

	if (!wildcard) {
		/*
		 * Not found.
		 */
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}

	/* Second pass: listeners (wildcard foreign address) on lport. */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == lport) {
			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Listener bound to our exact local addr. */
				if ((found = (inp->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
						inp->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
						inp->inp_socket->so_cred);
				}
				lck_rw_done(&pcbinfo->ipi_lock);
				return found;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/* Fully-wild listener: remember it; an
				 * IPv6 socket (mapped v4) ranks below a
				 * native IPv4 one. */
				if (inp->inp_socket &&
				    SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
					local_wild_mapped = inp;
				} else {
					local_wild = inp;
				}
			}
		}
	}
	if (local_wild == NULL) {
		if (local_wild_mapped != NULL) {
			if ((found = (local_wild_mapped->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
					local_wild_mapped->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
					local_wild_mapped->inp_socket->so_cred);
			}
			lck_rw_done(&pcbinfo->ipi_lock);
			return found;
		}
		lck_rw_done(&pcbinfo->ipi_lock);
		return 0;
	}
	if ((found = (local_wild->inp_socket != NULL))) {
		*uid = kauth_cred_getuid(
			local_wild->inp_socket->so_cred);
		*gid = kauth_cred_getgid(
			local_wild->inp_socket->so_cred);
	}
	lck_rw_done(&pcbinfo->ipi_lock);
	return found;
}
2435
2436 /*
2437 * Lookup PCB in hash list.
2438 */
/*
 * Look up the PCB for an inbound packet by 4-tuple, falling back (when
 * wildcard != 0) to the best listener on laddr:lport, then a
 * fully-wild listener.  A native IPv4 wild listener is preferred over
 * an IPv6 (v4-mapped) one.
 *
 * On success the returned PCB has been pinned via
 * in_pcb_checkstate(WNT_ACQUIRE); the caller must balance with
 * WNT_RELEASE.  A matching but dying PCB (WNT_STOPUSING) is reported
 * as not found.  ipi_lock is taken shared and released on every exit.
 */
struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
    struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = (u_short)fport_arg, lport = (u_short)lport_arg;
	struct inpcb *local_wild = NULL;
	struct inpcb *local_wild_mapped = NULL;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(&pcbinfo->ipi_lock);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		/* Honor interface-based receive restrictions. */
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * Found.
			 */
			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
			    WNT_STOPUSING) {
				lck_rw_done(&pcbinfo->ipi_lock);
				return inp;
			} else {
				/* it's there but dead, say it isn't found */
				lck_rw_done(&pcbinfo->ipi_lock);
				return NULL;
			}
		}
	}

	if (!wildcard) {
		/*
		 * Not found.
		 */
		lck_rw_done(&pcbinfo->ipi_lock);
		return NULL;
	}

	/* Second pass: listeners (wildcard foreign address) on lport. */
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
	    pcbinfo->ipi_hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
		if (!(inp->inp_vflag & INP_IPV4)) {
			continue;
		}
		if (inp_restricted_recv(inp, ifp)) {
			continue;
		}

#if NECP
		if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
			continue;
		}
#endif /* NECP */

		if (inp->inp_faddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == lport) {
			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Listener bound to our exact local addr. */
				if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
				    WNT_STOPUSING) {
					lck_rw_done(&pcbinfo->ipi_lock);
					return inp;
				} else {
					/* it's dead; say it isn't found */
					lck_rw_done(&pcbinfo->ipi_lock);
					return NULL;
				}
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/* Fully-wild listener: remember it; mapped
				 * (PF_INET6) sockets rank below native v4. */
				if (SOCK_CHECK_DOM(inp->inp_socket, PF_INET6)) {
					local_wild_mapped = inp;
				} else {
					local_wild = inp;
				}
			}
		}
	}
	if (local_wild == NULL) {
		if (local_wild_mapped != NULL) {
			if (in_pcb_checkstate(local_wild_mapped,
			    WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				lck_rw_done(&pcbinfo->ipi_lock);
				return local_wild_mapped;
			} else {
				/* it's dead; say it isn't found */
				lck_rw_done(&pcbinfo->ipi_lock);
				return NULL;
			}
		}
		lck_rw_done(&pcbinfo->ipi_lock);
		return NULL;
	}
	if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
		lck_rw_done(&pcbinfo->ipi_lock);
		return local_wild;
	}
	/*
	 * It's either not found or is already dead.
	 */
	lck_rw_done(&pcbinfo->ipi_lock);
	return NULL;
}
2564
2565 /*
2566 * @brief Insert PCB onto various hash lists.
2567 *
2568 * @param inp Pointer to internet protocol control block
2569 * @param locked Implies if ipi_lock (protecting pcb list)
2570 * is already locked or not.
2571 *
2572 * @return int error on failure and 0 on success
2573 */
int
in_pcbinshash(struct inpcb *inp, int locked)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;

	if (!locked) {
		if (!lck_rw_try_lock_exclusive(&pcbinfo->ipi_lock)) {
			/*
			 * Lock inversion issue, mostly with udp
			 * multicast packets: ipi_lock must be taken
			 * before the socket lock, so drop and retake
			 * the socket lock around the blocking acquire.
			 */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(&pcbinfo->ipi_lock);
			socket_lock(inp->inp_socket, 0);
		}
	}

	/*
	 * This routine or its caller may have given up
	 * socket's protocol lock briefly.
	 * During that time the socket may have been dropped.
	 * Safe-guarding against that.
	 */
	if (inp->inp_state == INPCB_STATE_DEAD) {
		if (!locked) {
			lck_rw_done(&pcbinfo->ipi_lock);
		}
		return ECONNABORTED;
	}


	/* Hash on the low 32 bits of the v6 address for mapped sockets. */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, pcbinfo->ipi_hashmask);

	pcbhash = &pcbinfo->ipi_hashbase[inp->inp_hash_element];

	pcbporthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(inp->inp_lport,
	    pcbinfo->ipi_porthashmask)];

	/*
	 * Go through port list and look for a head for this lport.
	 */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport) {
			break;
		}
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = kalloc_type(struct inpcbport, Z_WAITOK | Z_NOFAIL);
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

#if SKYWALK
	/* Reserve the <proto, laddr, lport> tuple in the port namespace
	 * so Skywalk channels cannot claim it. */
	int err;
	struct socket *so = inp->inp_socket;
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		if (inp->inp_vflag & INP_IPV6) {
			err = netns_reserve_in6(&inp->inp_netns_token,
			    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		} else {
			err = netns_reserve_in(&inp->inp_netns_token,
			    inp->inp_laddr, (uint8_t)SOCK_PROTO(so), inp->inp_lport,
			    NETNS_BSD | NETNS_PRERESERVED, NULL);
		}
		if (err) {
			/* Reservation failed: bail out before touching
			 * any hash list so nothing needs undoing. */
			if (!locked) {
				lck_rw_done(&pcbinfo->ipi_lock);
			}
			return err;
		}
		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */

	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

	if (!locked) {
		lck_rw_done(&pcbinfo->ipi_lock);
	}

#if NECP
	// This call catches the original setting of the local address
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */

	return 0;
}
2685
2686 /*
2687 * Move PCB to the proper hash bucket when { faddr, fport } have been
2688 * changed. NOTE: This does not handle the case of the lport changing (the
2689 * hashed port list would have to be updated as well), so the lport must
2690 * not change after in_pcbinshash() has been called.
2691 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

#if SKYWALK
	/* Keep the namespace reservation in sync with the (possibly
	 * changed) local address before moving hash buckets. */
	struct socket *so = inp->inp_socket;
	if ((SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP) &&
	    !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
		int err;
		if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
			/* Existing reservation: just change its address. */
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_change_addr_in6(
					&inp->inp_netns_token, inp->in6p_laddr);
			} else {
				err = netns_change_addr_in(
					&inp->inp_netns_token, inp->inp_laddr);
			}
		} else {
			/* No token yet: take a fresh reservation. */
			if (inp->inp_vflag & INP_IPV6) {
				err = netns_reserve_in6(&inp->inp_netns_token,
				    inp->in6p_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			} else {
				err = netns_reserve_in(&inp->inp_netns_token,
				    inp->inp_laddr, (uint8_t)SOCK_PROTO(so),
				    inp->inp_lport, NETNS_BSD, NULL);
			}
		}
		/* We are assuming that whatever code paths result in a rehash
		 * did their due diligence and ensured that the given
		 * <proto, laddr, lport> tuple was free ahead of time. Just
		 * reserving the lport on INADDR_ANY should be enough, since
		 * that will block Skywalk from trying to reserve that same
		 * port. Given this assumption, the above netns calls should
		 * never fail*/
		VERIFY(err == 0);

		netns_set_ifnet(&inp->inp_netns_token, inp->inp_last_outifp);
		inp_update_netns_flags(so);
	}
#endif /* SKYWALK */
	/* Hash on the low 32 bits of the v6 address for mapped sockets. */
	if (inp->inp_vflag & INP_IPV6) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	} else {
		hashkey_faddr = inp->inp_faddr.s_addr;
	}

	inp->inp_hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, inp->inp_pcbinfo->ipi_hashmask);
	head = &inp->inp_pcbinfo->ipi_hashbase[inp->inp_hash_element];

	/* Unlink from the old bucket before inserting into the new one. */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		LIST_REMOVE(inp, inp_hash);
		inp->inp_flags2 &= ~INP2_INHASHLIST;
	}

	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));
	LIST_INSERT_HEAD(head, inp, inp_hash);
	inp->inp_flags2 |= INP2_INHASHLIST;

#if NECP
	// This call catches updates to the remote addresses
	inp_update_necp_policy(inp, NULL, NULL, 0);
#endif /* NECP */
}
2759
/*
 * Remove PCB from various lists.
 * Must be called while the pcbinfo lock is held in exclusive mode.
 */
void
in_pcbremlists(struct inpcb *inp)
{
	/* Bump the generation count so stale cached lookups miss. */
	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;

	/*
	 * Check if it's in hashlist -- an inp is placed in hashlist when
	 * it's local port gets assigned. So it should also be present
	 * in the port list.
	 */
	if (inp->inp_flags2 & INP2_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		VERIFY(phd != NULL && inp->inp_lport > 0);

		LIST_REMOVE(inp, inp_hash);
		/* Poison the link fields so a double-remove is detectable. */
		inp->inp_hash.le_next = NULL;
		inp->inp_hash.le_prev = NULL;

		LIST_REMOVE(inp, inp_portlist);
		inp->inp_portlist.le_next = NULL;
		inp->inp_portlist.le_prev = NULL;
		/* Free the per-port head once its last PCB is gone. */
		if (LIST_EMPTY(&phd->phd_pcblist)) {
			LIST_REMOVE(phd, phd_hash);
			kfree_type(struct inpcbport, phd);
		}
		inp->inp_phd = NULL;
		inp->inp_flags2 &= ~INP2_INHASHLIST;
#if SKYWALK
		/* Free up the port in the namespace registrar */
		netns_release(&inp->inp_netns_token);
		netns_release(&inp->inp_wildcard_netns_token);
#endif /* SKYWALK */
	}
	VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST));

	if (inp->inp_flags2 & INP2_TIMEWAIT) {
		/* Remove from time-wait queue */
		tcp_remove_from_time_wait(inp);
		inp->inp_flags2 &= ~INP2_TIMEWAIT;
		VERIFY(inp->inp_pcbinfo->ipi_twcount != 0);
		inp->inp_pcbinfo->ipi_twcount--;
	} else {
		/* Remove from global inp list if it is not time-wait */
		LIST_REMOVE(inp, inp_list);
	}

	/* Detach from the flow-control tree if still linked in. */
	if (inp->inp_flags2 & INP2_IN_FCTREE) {
		inp_fc_getinp(inp->inp_flowhash, (INPFC_SOLOCKED | INPFC_REMOVE));
		VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE));
	}

	inp->inp_pcbinfo->ipi_count--;
}
2818
/*
 * Mechanism used to defer the memory release of PCBs
 * The pcb list will contain the pcb until the reaper can clean it up if
 * the following conditions are met:
 * 1) state "DEAD",
 * 2) wantcnt is STOPUSING
 * 3) usecount is 0
 * This function will be called to either mark the pcb as unusable
 * (WNT_STOPUSING) or to acquire/release a want reference on it.
 */
/*
 * Manage the PCB want count (inp_wantcnt, manipulated via CAS on its
 * low 16 bits; 0xffff is the STOPUSING sentinel).
 *
 *   WNT_STOPUSING: mark the PCB dead and, if no references remain,
 *                  latch the count at 0xffff; always returns
 *                  WNT_STOPUSING.
 *   WNT_ACQUIRE:   take a reference; returns WNT_STOPUSING (caller
 *                  must not use the PCB) if already latched.
 *   WNT_RELEASE:   drop a reference; re-runs the STOPUSING latch if
 *                  the PCB state is DEAD.
 *
 * locked == 0 means the caller does NOT hold the socket lock, and
 * this routine takes/drops it around state changes.
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{
	volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {
	case WNT_STOPUSING:
		/*
		 * Try to mark the pcb as ready for recycling. CAS with
		 * STOPUSING, if success we're good, if it's in use, will
		 * be marked later
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}
		pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}
		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}

		/* Ask the GC to reap this PCB soon. */
		inpcb_gc_sched(pcb->inp_pcbinfo, INPCB_TIMER_FAST);

		origwant = *wantcnt;
		if ((UInt16) origwant == 0xffff) { /* should stop using */
			return WNT_STOPUSING;
		}
		newwant = 0xffff;
		if ((UInt16) origwant == 0) {
			/* try to mark it as unusable now; if the CAS
			 * loses a race, a later release will latch it */
			OSCompareAndSwap(origwant, newwant, wantcnt);
		}
		return WNT_STOPUSING;

	case WNT_ACQUIRE:
		/*
		 * Try to increase reference to pcb. If WNT_STOPUSING
		 * should bail out. If socket state DEAD, try to set count
		 * to STOPUSING, return failed otherwise increase cnt.
		 */
		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				return WNT_STOPUSING;
			}
			newwant = origwant + 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
		return WNT_ACQUIRE;

	case WNT_RELEASE:
		/*
		 * Release reference. If result is null and pcb state
		 * is DEAD, set wanted bit to STOPUSING
		 */
		if (locked == 0) {
			socket_lock(pcb->inp_socket, 1);
		}

		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0x0) {
				panic("%s: pcb=%p release with zero count",
				    __func__, pcb);
				/* NOTREACHED */
			}
			if ((UInt16) origwant == 0xffff) {
				/* should stop using */
				if (locked == 0) {
					socket_unlock(pcb->inp_socket, 1);
				}
				return WNT_STOPUSING;
			}
			newwant = origwant - 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

		/* Last path to stopusing shares the socket-lock state
		 * with the WNT_STOPUSING case above. */
		if (pcb->inp_state == INPCB_STATE_DEAD) {
			goto stopusing;
		}
		if (pcb->inp_socket->so_usecount < 0) {
			panic("%s: RELEASE pcb=%p so=%p usecount is negative",
			    __func__, pcb, pcb->inp_socket);
			/* NOTREACHED */
		}

		if (locked == 0) {
			socket_unlock(pcb->inp_socket, 1);
		}
		return WNT_RELEASE;

	default:
		panic("%s: so=%p not a valid state =%x", __func__,
		    pcb->inp_socket, mode);
		/* NOTREACHED */
	}

	/* NOTREACHED */
	return mode;
}
2935
2936 /*
2937 * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
2938 * The inpcb_compat data structure is passed to user space and must
2939 * not change. We intentionally avoid copying pointers.
2940 */
2941 void
inpcb_to_compat(struct inpcb * inp,struct inpcb_compat * inp_compat)2942 inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat)
2943 {
2944 bzero(inp_compat, sizeof(*inp_compat));
2945 inp_compat->inp_fport = inp->inp_fport;
2946 inp_compat->inp_lport = inp->inp_lport;
2947 inp_compat->nat_owner = 0;
2948 inp_compat->nat_cookie = 0;
2949 inp_compat->inp_gencnt = inp->inp_gencnt;
2950 inp_compat->inp_flags = inp->inp_flags;
2951 inp_compat->inp_flow = inp->inp_flow;
2952 inp_compat->inp_vflag = inp->inp_vflag;
2953 inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
2954 inp_compat->inp_ip_p = inp->inp_ip_p;
2955 inp_compat->inp_dependfaddr.inp6_foreign =
2956 inp->inp_dependfaddr.inp6_foreign;
2957 inp_compat->inp_dependladdr.inp6_local =
2958 inp->inp_dependladdr.inp6_local;
2959 inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
2960 inp_compat->inp_depend6.inp6_hlim = 0;
2961 inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
2962 inp_compat->inp_depend6.inp6_ifindex = 0;
2963 inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
2964 }
2965
2966 #if XNU_TARGET_OS_OSX
2967 void
inpcb_to_xinpcb64(struct inpcb * inp,struct xinpcb64 * xinp)2968 inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp)
2969 {
2970 xinp->inp_fport = inp->inp_fport;
2971 xinp->inp_lport = inp->inp_lport;
2972 xinp->inp_gencnt = inp->inp_gencnt;
2973 xinp->inp_flags = inp->inp_flags;
2974 xinp->inp_flow = inp->inp_flow;
2975 xinp->inp_vflag = inp->inp_vflag;
2976 xinp->inp_ip_ttl = inp->inp_ip_ttl;
2977 xinp->inp_ip_p = inp->inp_ip_p;
2978 xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
2979 xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
2980 xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
2981 xinp->inp_depend6.inp6_hlim = 0;
2982 xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
2983 xinp->inp_depend6.inp6_ifindex = 0;
2984 xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
2985 }
2986 #endif /* XNU_TARGET_OS_OSX */
2987
2988 /*
2989 * The following routines implement this scheme:
2990 *
2991 * Callers of ip_output() that intend to cache the route in the inpcb pass
2992 * a local copy of the struct route to ip_output(). Using a local copy of
2993 * the cached route significantly simplifies things as IP no longer has to
2994 * worry about having exclusive access to the passed in struct route, since
2995 * it's defined in the caller's stack; in essence, this allows for a lock-
2996 * less operation when updating the struct route at the IP level and below,
2997 * whenever necessary. The scheme works as follows:
2998 *
2999 * Prior to dropping the socket's lock and calling ip_output(), the caller
3000 * copies the struct route from the inpcb into its stack, and adds a reference
3001 * to the cached route entry, if there was any. The socket's lock is then
3002 * dropped and ip_output() is called with a pointer to the copy of struct
3003 * route defined on the stack (not to the one in the inpcb.)
3004 *
3005 * Upon returning from ip_output(), the caller then acquires the socket's
3006 * lock and synchronizes the cache; if there is no route cached in the inpcb,
3007 * it copies the local copy of struct route (which may or may not contain any
3008 * route) back into the cache; otherwise, if the inpcb has a route cached in
3009 * it, the one in the local copy will be freed, if there's any. Trashing the
3010 * cached route in the inpcb can be avoided because ip_output() is single-
3011 * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
3012 * by the socket/transport layer.)
3013 */
3014 void
inp_route_copyout(struct inpcb * inp,struct route * dst)3015 inp_route_copyout(struct inpcb *inp, struct route *dst)
3016 {
3017 struct route *src = &inp->inp_route;
3018
3019 socket_lock_assert_owned(inp->inp_socket);
3020
3021 /*
3022 * If the route in the PCB is stale or not for IPv4, blow it away;
3023 * this is possible in the case of IPv4-mapped address case.
3024 */
3025 if (ROUTE_UNUSABLE(src) || rt_key(src->ro_rt)->sa_family != AF_INET) {
3026 ROUTE_RELEASE(src);
3027 }
3028
3029 route_copyout(dst, src, sizeof(*dst));
3030 }
3031
3032 void
inp_route_copyin(struct inpcb * inp,struct route * src)3033 inp_route_copyin(struct inpcb *inp, struct route *src)
3034 {
3035 struct route *dst = &inp->inp_route;
3036
3037 socket_lock_assert_owned(inp->inp_socket);
3038
3039 /* Minor sanity check */
3040 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
3041 panic("%s: wrong or corrupted route: %p", __func__, src);
3042 }
3043
3044 route_copyin(src, dst, sizeof(*src));
3045 }
3046
3047 /*
3048 * Handler for setting IP_BOUND_IF/IPV6_BOUND_IF socket option.
3049 */
3050 int
inp_bindif(struct inpcb * inp,unsigned int ifscope,struct ifnet ** pifp)3051 inp_bindif(struct inpcb *inp, unsigned int ifscope, struct ifnet **pifp)
3052 {
3053 struct ifnet *ifp = NULL;
3054
3055 ifnet_head_lock_shared();
3056 if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
3057 (ifp = ifindex2ifnet[ifscope]) == NULL)) {
3058 ifnet_head_done();
3059 return ENXIO;
3060 }
3061 ifnet_head_done();
3062
3063 VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);
3064
3065 /*
3066 * A zero interface scope value indicates an "unbind".
3067 * Otherwise, take in whatever value the app desires;
3068 * the app may already know the scope (or force itself
3069 * to such a scope) ahead of time before the interface
3070 * gets attached. It doesn't matter either way; any
3071 * route lookup from this point on will require an
3072 * exact match for the embedded interface scope.
3073 */
3074 inp->inp_boundifp = ifp;
3075 if (inp->inp_boundifp == NULL) {
3076 inp->inp_flags &= ~INP_BOUND_IF;
3077 } else {
3078 inp->inp_flags |= INP_BOUND_IF;
3079 }
3080
3081 /* Blow away any cached route in the PCB */
3082 ROUTE_RELEASE(&inp->inp_route);
3083
3084 if (pifp != NULL) {
3085 *pifp = ifp;
3086 }
3087
3088 return 0;
3089 }
3090
3091 /*
3092 * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3093 * as well as for setting PROC_UUID_NO_CELLULAR policy.
3094 */
3095 void
inp_set_nocellular(struct inpcb * inp)3096 inp_set_nocellular(struct inpcb *inp)
3097 {
3098 inp->inp_flags |= INP_NO_IFT_CELLULAR;
3099
3100 /* Blow away any cached route in the PCB */
3101 ROUTE_RELEASE(&inp->inp_route);
3102 }
3103
3104 /*
3105 * Handler for clearing IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option,
3106 * as well as for clearing PROC_UUID_NO_CELLULAR policy.
3107 */
3108 void
inp_clear_nocellular(struct inpcb * inp)3109 inp_clear_nocellular(struct inpcb *inp)
3110 {
3111 struct socket *so = inp->inp_socket;
3112
3113 /*
3114 * SO_RESTRICT_DENY_CELLULAR socket restriction issued on the socket
3115 * has a higher precendence than INP_NO_IFT_CELLULAR. Clear the flag
3116 * if and only if the socket is unrestricted.
3117 */
3118 if (so != NULL && !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
3119 inp->inp_flags &= ~INP_NO_IFT_CELLULAR;
3120
3121 /* Blow away any cached route in the PCB */
3122 ROUTE_RELEASE(&inp->inp_route);
3123 }
3124 }
3125
void
inp_set_noexpensive(struct inpcb *inp)
{
	/* Deny this PCB the use of expensive interfaces. */
	inp->inp_flags2 |= INP2_NO_IFF_EXPENSIVE;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3134
void
inp_set_noconstrained(struct inpcb *inp)
{
	/* Deny this PCB the use of constrained interfaces. */
	inp->inp_flags2 |= INP2_NO_IFF_CONSTRAINED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3143
void
inp_set_awdl_unrestricted(struct inpcb *inp)
{
	/* Grant this PCB access to restricted AWDL interfaces. */
	inp->inp_flags2 |= INP2_AWDL_UNRESTRICTED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3152
3153 boolean_t
inp_get_awdl_unrestricted(struct inpcb * inp)3154 inp_get_awdl_unrestricted(struct inpcb *inp)
3155 {
3156 return (inp->inp_flags2 & INP2_AWDL_UNRESTRICTED) ? TRUE : FALSE;
3157 }
3158
void
inp_clear_awdl_unrestricted(struct inpcb *inp)
{
	/* Revoke this PCB's access to restricted AWDL interfaces. */
	inp->inp_flags2 &= ~INP2_AWDL_UNRESTRICTED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3167
void
inp_set_intcoproc_allowed(struct inpcb *inp)
{
	/* Allow this PCB to use co-processor interfaces. */
	inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3176
3177 boolean_t
inp_get_intcoproc_allowed(struct inpcb * inp)3178 inp_get_intcoproc_allowed(struct inpcb *inp)
3179 {
3180 return (inp->inp_flags2 & INP2_INTCOPROC_ALLOWED) ? TRUE : FALSE;
3181 }
3182
void
inp_clear_intcoproc_allowed(struct inpcb *inp)
{
	/* Revoke this PCB's permission to use co-processor interfaces. */
	inp->inp_flags2 &= ~INP2_INTCOPROC_ALLOWED;

	/* Blow away any cached route in the PCB */
	ROUTE_RELEASE(&inp->inp_route);
}
3191
3192 #if NECP
3193 /*
3194 * Called when PROC_UUID_NECP_APP_POLICY is set.
3195 */
void
inp_set_want_app_policy(struct inpcb *inp)
{
	/* Request NECP app-policy evaluation for this PCB. */
	inp->inp_flags2 |= INP2_WANT_APP_POLICY;
}
3201
3202 /*
3203 * Called when PROC_UUID_NECP_APP_POLICY is cleared.
3204 */
void
inp_clear_want_app_policy(struct inpcb *inp)
{
	/* Withdraw the NECP app-policy request for this PCB. */
	inp->inp_flags2 &= ~INP2_WANT_APP_POLICY;
}
3210 #endif /* NECP */
3211
3212 /*
3213 * Calculate flow hash for an inp, used by an interface to identify a
3214 * flow. When an interface provides flow control advisory, this flow
3215 * hash is used as an identifier.
3216 */
u_int32_t
inp_calc_flowhash(struct inpcb *inp)
{
	struct inp_flowhash_key fh __attribute__((aligned(8)));
	u_int32_t flowhash = 0;
	struct inpcb *tmp_inp = NULL;

	/* Lazily seed the hash; the seed is regenerated below on collision. */
	if (inp_hash_seed == 0) {
		inp_hash_seed = RandomULong();
	}

	bzero(&fh, sizeof(fh));

	/* Key the hash on the flow's addresses, ports, family and protocol. */
	bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof(fh.infh_laddr));
	bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof(fh.infh_faddr));

	fh.infh_lport = inp->inp_lport;
	fh.infh_fport = inp->inp_fport;
	fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
	fh.infh_proto = inp->inp_ip_p;
	fh.infh_rand1 = RandomULong();
	fh.infh_rand2 = RandomULong();

try_again:
	flowhash = net_flowhash(&fh, sizeof(fh), inp_hash_seed);
	if (flowhash == 0) {
		/* try to get a non-zero flowhash */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	inp->inp_flowhash = flowhash;

	/* Insert the inp into inp_fc_tree */
	lck_mtx_lock_spin(&inp_fc_lck);
	tmp_inp = RB_FIND(inp_fc_tree, &inp_fc_tree, inp);
	if (tmp_inp != NULL) {
		/*
		 * There is a different inp with the same flowhash.
		 * There can be a collision on flow hash but the
		 * probability is low. Let's recompute the
		 * flowhash.
		 */
		lck_mtx_unlock(&inp_fc_lck);
		/* recompute hash seed */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	RB_INSERT(inp_fc_tree, &inp_fc_tree, inp);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowhash;
}
3272
3273 void
inp_flowadv(uint32_t flowhash)3274 inp_flowadv(uint32_t flowhash)
3275 {
3276 struct inpcb *inp;
3277
3278 inp = inp_fc_getinp(flowhash, 0);
3279
3280 if (inp == NULL) {
3281 return;
3282 }
3283 inp_fc_feedback(inp);
3284 }
3285
3286 /*
3287 * Function to compare inp_fc_entries in inp flow control tree
3288 */
3289 static inline int
infc_cmp(const struct inpcb * inp1,const struct inpcb * inp2)3290 infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2)
3291 {
3292 return memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash),
3293 sizeof(inp1->inp_flowhash));
3294 }
3295
/*
 * Look up a PCB in the flow-control tree by flow hash.  On success a
 * want-count is held on the returned PCB (caller must release it).
 * With INPFC_REMOVE, the entry is taken out of the tree instead and
 * NULL is returned.
 */
static struct inpcb *
inp_fc_getinp(u_int32_t flowhash, u_int32_t flags)
{
	struct inpcb *inp = NULL;
	int locked = (flags & INPFC_SOLOCKED) ? 1 : 0;

	lck_mtx_lock_spin(&inp_fc_lck);
	key_inp.inp_flowhash = flowhash;
	inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp);
	if (inp == NULL) {
		/* inp is not present, return */
		lck_mtx_unlock(&inp_fc_lck);
		return NULL;
	}

	if (flags & INPFC_REMOVE) {
		/* Caller wants the entry removed, not a reference. */
		RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp);
		lck_mtx_unlock(&inp_fc_lck);

		bzero(&(inp->infc_link), sizeof(inp->infc_link));
		inp->inp_flags2 &= ~INP2_IN_FCTREE;
		return NULL;
	}

	/* Acquire a want-count; fail if the PCB is being torn down. */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) {
		inp = NULL;
	}
	lck_mtx_unlock(&inp_fc_lck);

	return inp;
}
3327
/*
 * Interface feedback for a flow-controlled PCB: clear the flow-control
 * state and wake the socket.  Consumes the want-count acquired by
 * inp_fc_getinp().
 */
static void
inp_fc_feedback(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* we already hold a want_cnt on this inp, socket can't be null */
	VERIFY(so != NULL);
	socket_lock(so, 1);

	/* Release the want-count; bail if the PCB is going away. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		return;
	}

	/* Note the feedback so an in-progress send won't re-enter flow control. */
	if (inp->inp_sndinprog_cnt > 0) {
		inp->inp_flags |= INP_FC_FEEDBACK;
	}

	/*
	 * Return if the connection is not in flow-controlled state.
	 * This can happen if the connection experienced
	 * loss while it was in flow controlled state
	 */
	if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
		socket_unlock(so, 1);
		return;
	}
	inp_reset_fc_state(inp);

	/* TCP also needs its congestion state unthrottled. */
	if (SOCK_TYPE(so) == SOCK_STREAM) {
		inp_fc_unthrottle_tcp(inp);
	}

	socket_unlock(so, 1);
}
3363
3364 void
inp_reset_fc_state(struct inpcb * inp)3365 inp_reset_fc_state(struct inpcb *inp)
3366 {
3367 struct socket *so = inp->inp_socket;
3368 int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
3369 int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
3370
3371 inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
3372
3373 if (suspended) {
3374 so->so_flags &= ~(SOF_SUSPENDED);
3375 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
3376 }
3377
3378 /* Give a write wakeup to unblock the socket */
3379 if (needwakeup) {
3380 sowwakeup(so);
3381 }
3382 }
3383
/*
 * Apply a flow advisory (advcode) from the interface to this PCB.
 * Returns 1 if flow-control state was updated, 0 otherwise.
 */
int
inp_set_fc_state(struct inpcb *inp, int advcode)
{
	boolean_t is_flow_controlled = INP_WAIT_FOR_IF_FEEDBACK(inp);
	struct inpcb *tmp_inp = NULL;
	/*
	 * If there was a feedback from the interface when
	 * send operation was in progress, we should ignore
	 * this flow advisory to avoid a race between setting
	 * flow controlled state and receiving feedback from
	 * the interface
	 */
	if (inp->inp_flags & INP_FC_FEEDBACK) {
		return 0;
	}

	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
	/* Only act if this PCB is still present in the flow-control tree. */
	if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash,
	    INPFC_SOLOCKED)) != NULL) {
		if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			return 0;
		}
		VERIFY(tmp_inp == inp);
		switch (advcode) {
		case FADV_FLOW_CONTROLLED:
			inp->inp_flags |= INP_FLOW_CONTROLLED;
			break;
		case FADV_SUSPENDED:
			inp->inp_flags |= INP_FLOW_SUSPENDED;
			soevent(inp->inp_socket,
			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));

			/* Record the fact that suspend event was sent */
			inp->inp_socket->so_flags |= SOF_SUSPENDED;
			break;
		}

		/* Throttle TCP only on a fresh transition into flow control. */
		if (!is_flow_controlled && SOCK_TYPE(inp->inp_socket) == SOCK_STREAM) {
			inp_fc_throttle_tcp(inp);
		}
		return 1;
	}
	return 0;
}
3428
3429 /*
3430 * Handler for SO_FLUSH socket option.
3431 */
3432 int
inp_flush(struct inpcb * inp,int optval)3433 inp_flush(struct inpcb *inp, int optval)
3434 {
3435 u_int32_t flowhash = inp->inp_flowhash;
3436 struct ifnet *rtifp, *oifp;
3437
3438 /* Either all classes or one of the valid ones */
3439 if (optval != SO_TC_ALL && !SO_VALID_TC(optval)) {
3440 return EINVAL;
3441 }
3442
3443 /* We need a flow hash for identification */
3444 if (flowhash == 0) {
3445 return 0;
3446 }
3447
3448 /* Grab the interfaces from the route and pcb */
3449 rtifp = ((inp->inp_route.ro_rt != NULL) ?
3450 inp->inp_route.ro_rt->rt_ifp : NULL);
3451 oifp = inp->inp_last_outifp;
3452
3453 if (rtifp != NULL) {
3454 if_qflush_sc(rtifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3455 }
3456 if (oifp != NULL && oifp != rtifp) {
3457 if_qflush_sc(oifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
3458 }
3459
3460 return 0;
3461 }
3462
3463 /*
3464 * Clear the INP_INADDR_ANY flag (special case for PPP only)
3465 */
3466 void
inp_clear_INP_INADDR_ANY(struct socket * so)3467 inp_clear_INP_INADDR_ANY(struct socket *so)
3468 {
3469 struct inpcb *inp = NULL;
3470
3471 socket_lock(so, 1);
3472 inp = sotoinpcb(so);
3473 if (inp) {
3474 inp->inp_flags &= ~INP_INADDR_ANY;
3475 }
3476 socket_unlock(so, 1);
3477 }
3478
3479 void
inp_get_soprocinfo(struct inpcb * inp,struct so_procinfo * soprocinfo)3480 inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo)
3481 {
3482 struct socket *so = inp->inp_socket;
3483
3484 soprocinfo->spi_pid = so->last_pid;
3485 strlcpy(&soprocinfo->spi_proc_name[0], &inp->inp_last_proc_name[0],
3486 sizeof(soprocinfo->spi_proc_name));
3487 if (so->last_pid != 0) {
3488 uuid_copy(soprocinfo->spi_uuid, so->last_uuid);
3489 }
3490 /*
3491 * When not delegated, the effective pid is the same as the real pid
3492 */
3493 if (so->so_flags & SOF_DELEGATED) {
3494 soprocinfo->spi_delegated = 1;
3495 soprocinfo->spi_epid = so->e_pid;
3496 uuid_copy(soprocinfo->spi_euuid, so->e_uuid);
3497 } else {
3498 soprocinfo->spi_delegated = 0;
3499 soprocinfo->spi_epid = so->last_pid;
3500 }
3501 strlcpy(&soprocinfo->spi_e_proc_name[0], &inp->inp_e_proc_name[0],
3502 sizeof(soprocinfo->spi_e_proc_name));
3503 }
3504
3505 int
inp_findinpcb_procinfo(struct inpcbinfo * pcbinfo,uint32_t flowhash,struct so_procinfo * soprocinfo)3506 inp_findinpcb_procinfo(struct inpcbinfo *pcbinfo, uint32_t flowhash,
3507 struct so_procinfo *soprocinfo)
3508 {
3509 struct inpcb *inp = NULL;
3510 int found = 0;
3511
3512 bzero(soprocinfo, sizeof(struct so_procinfo));
3513
3514 if (!flowhash) {
3515 return -1;
3516 }
3517
3518 lck_rw_lock_shared(&pcbinfo->ipi_lock);
3519 LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
3520 if (inp->inp_state != INPCB_STATE_DEAD &&
3521 inp->inp_socket != NULL &&
3522 inp->inp_flowhash == flowhash) {
3523 found = 1;
3524 inp_get_soprocinfo(inp, soprocinfo);
3525 break;
3526 }
3527 }
3528 lck_rw_done(&pcbinfo->ipi_lock);
3529
3530 return found;
3531 }
3532
3533 #if CONFIG_PROC_UUID_POLICY
/*
 * Apply or remove the PROC_UUID_NO_CELLULAR policy on a socket's PCB,
 * logging the transition when net_io_policy_log is enabled.
 */
static void
inp_update_cellular_policy(struct inpcb *inp, boolean_t set)
{
	struct socket *so = inp->inp_socket;
	int before, after;

	VERIFY(so != NULL);
	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	/* Capture effective state before and after to detect a real change. */
	before = INP_NO_CELLULAR(inp);
	if (set) {
		inp_set_nocellular(inp);
	} else {
		inp_clear_nocellular(inp);
	}
	after = INP_NO_CELLULAR(inp);
	if (net_io_policy_log && (before != after)) {
		static const char *ok = "OK";
		static const char *nok = "NOACCESS";
		uuid_string_t euuid_buf;
		pid_t epid;

		/* Log under the effective identity for delegated sockets. */
		if (so->so_flags & SOF_DELEGATED) {
			uuid_unparse(so->e_uuid, euuid_buf);
			epid = so->e_pid;
		} else {
			uuid_unparse(so->last_uuid, euuid_buf);
			epid = so->last_pid;
		}

		/* allow this socket to generate another notification event */
		so->so_ifdenied_notifies = 0;

		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s %s->%s\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, euuid_buf,
		    (so->so_flags & SOF_DELEGATED) ?
		    " [delegated]" : "",
		    ((before < after) ? ok : nok),
		    ((before < after) ? nok : ok));
	}
}
3577
3578 #if NECP
/*
 * Apply or remove the PROC_UUID_NECP_APP_POLICY flag on a socket's PCB,
 * logging the transition when net_io_policy_log is enabled.
 */
static void
inp_update_necp_want_app_policy(struct inpcb *inp, boolean_t set)
{
	struct socket *so = inp->inp_socket;
	int before, after;

	VERIFY(so != NULL);
	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	/* Capture the flag before and after to detect a real change. */
	before = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
	if (set) {
		inp_set_want_app_policy(inp);
	} else {
		inp_clear_want_app_policy(inp);
	}
	after = (inp->inp_flags2 & INP2_WANT_APP_POLICY);
	if (net_io_policy_log && (before != after)) {
		static const char *wanted = "WANTED";
		static const char *unwanted = "UNWANTED";
		uuid_string_t euuid_buf;
		pid_t epid;

		/* Log under the effective identity for delegated sockets. */
		if (so->so_flags & SOF_DELEGATED) {
			uuid_unparse(so->e_uuid, euuid_buf);
			epid = so->e_pid;
		} else {
			uuid_unparse(so->last_uuid, euuid_buf);
			epid = so->last_pid;
		}

		log(LOG_DEBUG, "%s: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s %s->%s\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, euuid_buf,
		    (so->so_flags & SOF_DELEGATED) ?
		    " [delegated]" : "",
		    ((before < after) ? unwanted : wanted),
		    ((before < after) ? wanted : unwanted));
	}
}
3619 #endif /* NECP */
3620 #endif /* !CONFIG_PROC_UUID_POLICY */
3621
3622 #if NECP
/*
 * Re-evaluate the NECP policy match for this socket, optionally with
 * overridden addresses/interface, and rescope an unbound socket to the
 * interface NECP prescribes.
 */
void
inp_update_necp_policy(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int override_bound_interface)
{
	necp_socket_find_policy_match(inp, override_local_addr, override_remote_addr, override_bound_interface);
	if (necp_socket_should_rescope(inp) &&
	    inp->inp_lport == 0 &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
		// If we should rescope, and the socket is not yet bound
		inp_bindif(inp, necp_socket_get_rescope_if_index(inp), NULL);
		inp->inp_flags2 |= INP2_SCOPED_BY_NECP;
	}
}
3636 #endif /* NECP */
3637
/*
 * Refresh the per-UUID networking policy (cellular denial, NECP app
 * policy) for this PCB from the proc UUID policy table.  Returns 0 on
 * success or when no policy exists; otherwise an errno from the lookup.
 */
int
inp_update_policy(struct inpcb *inp)
{
#if CONFIG_PROC_UUID_POLICY
	struct socket *so = inp->inp_socket;
	uint32_t pflags = 0;
	int32_t ogencnt;
	int err = 0;
	uint8_t *lookup_uuid = NULL;

	if (!net_io_policy_uuid ||
	    so == NULL || inp->inp_state == INPCB_STATE_DEAD) {
		return 0;
	}

	/*
	 * Kernel-created sockets that aren't delegating other sockets
	 * are currently exempted from UUID policy checks.
	 */
	if (so->last_pid == 0 && !(so->so_flags & SOF_DELEGATED)) {
		return 0;
	}

#if defined(XNU_TARGET_OS_OSX)
	/* On macOS, prefer the responsible process UUID when present. */
	if (so->so_rpid > 0) {
		lookup_uuid = so->so_ruuid;
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}
#endif
	/* Fall back to the effective/last UUID of the socket owner. */
	if (lookup_uuid == NULL || err == ENOENT) {
		lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid);
		ogencnt = so->so_policy_gencnt;
		err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
	}

	/*
	 * Discard cached generation count if the entry is gone (ENOENT),
	 * so that we go thru the checks below.
	 */
	if (err == ENOENT && ogencnt != 0) {
		so->so_policy_gencnt = 0;
	}

	/*
	 * If the generation count has changed, inspect the policy flags
	 * and act accordingly. If a policy flag was previously set and
	 * the UUID is no longer present in the table (ENOENT), treat it
	 * as if the flag has been cleared.
	 */
	if ((err == 0 || err == ENOENT) && ogencnt != so->so_policy_gencnt) {
		/* update cellular policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NO_CELLULAR)) {
			inp_update_cellular_policy(inp, FALSE);
		}
#if NECP
		/* update necp want app policy for this socket */
		if (err == 0 && (pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, TRUE);
		} else if (!(pflags & PROC_UUID_NECP_APP_POLICY)) {
			inp_update_necp_want_app_policy(inp, FALSE);
		}
#endif /* NECP */
	}

	/* ENOENT simply means "no policy for this UUID"; not an error. */
	return (err == ENOENT) ? 0 : err;
#else /* !CONFIG_PROC_UUID_POLICY */
#pragma unused(inp)
	return 0;
#endif /* !CONFIG_PROC_UUID_POLICY */
}
3711
/* When non-zero, each packet denied by the restriction checks below is logged. */
static unsigned int log_restricted;
SYSCTL_DECL(_net_inet);
SYSCTL_INT(_net_inet, OID_AUTO, log_restricted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &log_restricted, 0,
    "Log network restrictions");
3717 /*
3718 * Called when we need to enforce policy restrictions in the input path.
3719 *
3720 * Returns TRUE if we're not allowed to receive data, otherwise FALSE.
3721 */
3722 static boolean_t
_inp_restricted_recv(struct inpcb * inp,struct ifnet * ifp)3723 _inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
3724 {
3725 VERIFY(inp != NULL);
3726
3727 /*
3728 * Inbound restrictions.
3729 */
3730 if (!sorestrictrecv) {
3731 return FALSE;
3732 }
3733
3734 if (ifp == NULL) {
3735 return FALSE;
3736 }
3737
3738 if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
3739 return TRUE;
3740 }
3741
3742 if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
3743 return TRUE;
3744 }
3745
3746 if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
3747 return TRUE;
3748 }
3749
3750 if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
3751 return TRUE;
3752 }
3753
3754 if (!(ifp->if_eflags & IFEF_RESTRICTED_RECV)) {
3755 return FALSE;
3756 }
3757
3758 if (inp->inp_flags & INP_RECV_ANYIF) {
3759 return FALSE;
3760 }
3761
3762 if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp == ifp) {
3763 return FALSE;
3764 }
3765
3766 if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
3767 return TRUE;
3768 }
3769
3770 return TRUE;
3771 }
3772
3773 boolean_t
inp_restricted_recv(struct inpcb * inp,struct ifnet * ifp)3774 inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
3775 {
3776 boolean_t ret;
3777
3778 ret = _inp_restricted_recv(inp, ifp);
3779 if (ret == TRUE && log_restricted) {
3780 printf("pid %d (%s) is unable to receive packets on %s\n",
3781 proc_getpid(current_proc()), proc_best_name(current_proc()),
3782 ifp->if_xname);
3783 }
3784 return ret;
3785 }
3786
3787 /*
3788 * Called when we need to enforce policy restrictions in the output path.
3789 *
3790 * Returns TRUE if we're not allowed to send data out, otherwise FALSE.
3791 */
3792 static boolean_t
_inp_restricted_send(struct inpcb * inp,struct ifnet * ifp)3793 _inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
3794 {
3795 VERIFY(inp != NULL);
3796
3797 /*
3798 * Outbound restrictions.
3799 */
3800 if (!sorestrictsend) {
3801 return FALSE;
3802 }
3803
3804 if (ifp == NULL) {
3805 return FALSE;
3806 }
3807
3808 if (IFNET_IS_CELLULAR(ifp) && INP_NO_CELLULAR(inp)) {
3809 return TRUE;
3810 }
3811
3812 if (IFNET_IS_EXPENSIVE(ifp) && INP_NO_EXPENSIVE(inp)) {
3813 return TRUE;
3814 }
3815
3816 if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
3817 return TRUE;
3818 }
3819
3820 if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
3821 return TRUE;
3822 }
3823
3824 if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) {
3825 return TRUE;
3826 }
3827
3828 return FALSE;
3829 }
3830
3831 boolean_t
inp_restricted_send(struct inpcb * inp,struct ifnet * ifp)3832 inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
3833 {
3834 boolean_t ret;
3835
3836 ret = _inp_restricted_send(inp, ifp);
3837 if (ret == TRUE && log_restricted) {
3838 printf("pid %d (%s) is unable to transmit packets on %s\n",
3839 proc_getpid(current_proc()), proc_best_name(current_proc()),
3840 ifp->if_xname);
3841 }
3842 return ret;
3843 }
3844
inline void
inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack)
{
	struct ifnet *ifp = inp->inp_last_outifp;
	struct socket *so = inp->inp_socket;
	/* Byte accounting is only done for cellular/Wi-Fi, non-MPTCP-subflow sockets. */
	if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (ifp->if_type == IFT_CELLULAR || IFNET_IS_WIFI(ifp))) {
		int32_t unsent;

		so->so_snd.sb_flags |= SB_SNDBYTE_CNT;

		/*
		 * There can be data outstanding before the connection
		 * becomes established -- TFO case
		 */
		if (so->so_snd.sb_cc > 0) {
			inp_incr_sndbytes_total(so, so->so_snd.sb_cc);
		}

		unsent = inp_get_sndbytes_allunsent(so, th_ack);
		if (unsent > 0) {
			inp_incr_sndbytes_unsent(so, unsent);
		}
	}
}
3870
3871 inline void
inp_incr_sndbytes_total(struct socket * so,int32_t len)3872 inp_incr_sndbytes_total(struct socket *so, int32_t len)
3873 {
3874 struct inpcb *inp = (struct inpcb *)so->so_pcb;
3875 struct ifnet *ifp = inp->inp_last_outifp;
3876
3877 if (ifp != NULL) {
3878 VERIFY(ifp->if_sndbyte_total >= 0);
3879 OSAddAtomic64(len, &ifp->if_sndbyte_total);
3880 }
3881 }
3882
3883 inline void
inp_decr_sndbytes_total(struct socket * so,int32_t len)3884 inp_decr_sndbytes_total(struct socket *so, int32_t len)
3885 {
3886 struct inpcb *inp = (struct inpcb *)so->so_pcb;
3887 struct ifnet *ifp = inp->inp_last_outifp;
3888
3889 if (ifp != NULL) {
3890 if (ifp->if_sndbyte_total >= len) {
3891 OSAddAtomic64(-len, &ifp->if_sndbyte_total);
3892 } else {
3893 ifp->if_sndbyte_total = 0;
3894 }
3895 }
3896 }
3897
3898 inline void
inp_incr_sndbytes_unsent(struct socket * so,int32_t len)3899 inp_incr_sndbytes_unsent(struct socket *so, int32_t len)
3900 {
3901 struct inpcb *inp = (struct inpcb *)so->so_pcb;
3902 struct ifnet *ifp = inp->inp_last_outifp;
3903
3904 if (ifp != NULL) {
3905 VERIFY(ifp->if_sndbyte_unsent >= 0);
3906 OSAddAtomic64(len, &ifp->if_sndbyte_unsent);
3907 }
3908 }
3909
3910 inline void
inp_decr_sndbytes_unsent(struct socket * so,int32_t len)3911 inp_decr_sndbytes_unsent(struct socket *so, int32_t len)
3912 {
3913 if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
3914 return;
3915 }
3916
3917 struct inpcb *inp = (struct inpcb *)so->so_pcb;
3918 struct ifnet *ifp = inp->inp_last_outifp;
3919
3920 if (ifp != NULL) {
3921 if (ifp->if_sndbyte_unsent >= len) {
3922 OSAddAtomic64(-len, &ifp->if_sndbyte_unsent);
3923 } else {
3924 ifp->if_sndbyte_unsent = 0;
3925 }
3926 }
3927 }
3928
3929 inline void
inp_decr_sndbytes_allunsent(struct socket * so,u_int32_t th_ack)3930 inp_decr_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
3931 {
3932 int32_t len;
3933
3934 if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
3935 return;
3936 }
3937
3938 len = inp_get_sndbytes_allunsent(so, th_ack);
3939 inp_decr_sndbytes_unsent(so, len);
3940 }
3941
3942 #if SKYWALK
3943 inline void
inp_update_netns_flags(struct socket * so)3944 inp_update_netns_flags(struct socket *so)
3945 {
3946 struct inpcb *inp;
3947 uint32_t set_flags = 0;
3948 uint32_t clear_flags = 0;
3949
3950 if (!(SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3951 return;
3952 }
3953
3954 inp = sotoinpcb(so);
3955
3956 if (inp == NULL) {
3957 return;
3958 }
3959
3960 if (!NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3961 return;
3962 }
3963
3964 if (so->so_options & SO_NOWAKEFROMSLEEP) {
3965 set_flags |= NETNS_NOWAKEFROMSLEEP;
3966 } else {
3967 clear_flags |= NETNS_NOWAKEFROMSLEEP;
3968 }
3969
3970 if (inp->inp_flags & INP_RECV_ANYIF) {
3971 set_flags |= NETNS_RECVANYIF;
3972 } else {
3973 clear_flags |= NETNS_RECVANYIF;
3974 }
3975
3976 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
3977 set_flags |= NETNS_EXTBGIDLE;
3978 } else {
3979 clear_flags |= NETNS_EXTBGIDLE;
3980 }
3981
3982 netns_change_flags(&inp->inp_netns_token, set_flags, clear_flags);
3983 }
3984 #endif /* SKYWALK */
3985
inline void
inp_set_activity_bitmap(struct inpcb *inp)
{
	/* Record network activity for this PCB at the current uptime. */
	in_stat_set_activity_bitmap(&inp->inp_nw_activity, net_uptime());
}
3991
inline void
inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab)
{
	/* Copy out a snapshot of this PCB's activity bitmap. */
	bcopy(&inp->inp_nw_activity, ab, sizeof(*ab));
}
3997
3998 void
inp_update_last_owner(struct socket * so,struct proc * p,struct proc * ep)3999 inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep)
4000 {
4001 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4002
4003 if (inp == NULL) {
4004 return;
4005 }
4006
4007 if (p != NULL) {
4008 strlcpy(&inp->inp_last_proc_name[0], proc_name_address(p), sizeof(inp->inp_last_proc_name));
4009 }
4010 if (so->so_flags & SOF_DELEGATED) {
4011 if (ep != NULL) {
4012 strlcpy(&inp->inp_e_proc_name[0], proc_name_address(ep), sizeof(inp->inp_e_proc_name));
4013 } else {
4014 inp->inp_e_proc_name[0] = 0;
4015 }
4016 } else {
4017 inp->inp_e_proc_name[0] = 0;
4018 }
4019 }
4020
4021 void
inp_copy_last_owner(struct socket * so,struct socket * head)4022 inp_copy_last_owner(struct socket *so, struct socket *head)
4023 {
4024 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4025 struct inpcb *head_inp = (struct inpcb *)head->so_pcb;
4026
4027 if (inp == NULL || head_inp == NULL) {
4028 return;
4029 }
4030
4031 strlcpy(&inp->inp_last_proc_name[0], &head_inp->inp_last_proc_name[0], sizeof(inp->inp_last_proc_name));
4032 strlcpy(&inp->inp_e_proc_name[0], &head_inp->inp_e_proc_name[0], sizeof(inp->inp_e_proc_name));
4033 }
4034