xref: /xnu-12377.61.12/bsd/skywalk/namespace/netns.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/assert.h>
30 #include <kern/locks.h>
31 #include <kern/zalloc.h>
32 #include <libkern/tree.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/bitstring.h>
36 #include <net/if.h>
37 #include <net/kpi_interface.h>
38 #include <net/restricted_in_port.h>
39 
40 #include <netinet/in.h>
41 #include <netinet/in_pcb.h>
42 #include <netinet/tcp_fsm.h>
43 #include <netinet/tcp_var.h>
44 
45 #include <netinet6/in6_var.h>
46 #include <string.h>
47 
48 #include <skywalk/os_skywalk.h>
49 #include <skywalk/os_skywalk_private.h>
50 #include <skywalk/os_stats_private.h>
51 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
52 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
53 
54 #include <net/if_ports_used.h>
55 
/*
 * File-scope flag, statically initialised to 0.
 * NOTE(review): presumably records whether netns initialisation has run;
 * the code that reads and writes it is outside this part of the file.
 */
56 static int __netns_inited = 0;
57 
58 /*
59  * Logging
60  */
61 
/*
 * Map a protocol number or an address length onto the verbosity bits,
 * strings and address families used by the logging calls in this file.
 * Anything that is not IPPROTO_TCP is treated as UDP, and anything that is
 * not sizeof (struct in_addr) is treated as IPv6.
 *
 * Every macro parameter is parenthesized so that an argument containing a
 * lower-precedence operator (e.g. a conditional expression) is evaluated
 * as a unit.
 */
#define NS_VERB_PROTO(proto)    (((proto) == IPPROTO_TCP) ? SK_VERB_NS_TCP : \
	                                    SK_VERB_NS_UDP)
#define NS_VERB_IP(addr_len)    (((addr_len) == sizeof (struct in_addr)) ? \
	                                    SK_VERB_NS_IPV4 : SK_VERB_NS_IPV6)
#define PROTO_STR(proto)        (((proto) == IPPROTO_TCP) ? "tcp" : "udp")
#define LEN_TO_AF(len)          ((((len) == sizeof (struct in_addr)) ? \
	                            AF_INET : AF_INET6))

/*
 * Log a port-reservation error, prefixed with the current process name,
 * its pid and the port number. The macro reads a variable named `port'
 * from the scope in which it is expanded.
 *
 * No semicolon follows `while (0)': the caller's own `;' terminates the
 * statement, which keeps the macro usable as the body of an un-braced
 * if/else.
 */
#define NS_PORT_ERR(_fmt, ...) do { \
	SK_ERR("%s(%d) port %u: " _fmt, sk_proc_name(current_proc()), \
	    sk_proc_pid(current_proc()), port, ##__VA_ARGS__); \
} while (0)
73 
74 /*
75  * Locking
76  * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is
77  * acquired at the entry of every kernel-facing function, and released at the
78  * end. Data within netns_token structures is also protected under this lock.
79  */
80 
81 #define NETNS_LOCK()                    \
82 	lck_mtx_lock(&netns_lock)
83 #define NETNS_LOCK_SPIN()               \
84 	lck_mtx_lock_spin(&netns_lock)
85 #define NETNS_LOCK_CONVERT() do {       \
86 	NETNS_LOCK_ASSERT_HELD();       \
87 	lck_mtx_convert_spin(&netns_lock); \
88 } while (0)
89 #define NETNS_UNLOCK()                  \
90 	lck_mtx_unlock(&netns_lock)
91 #define NETNS_LOCK_ASSERT_HELD()        \
92 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_OWNED)
93 #define NETNS_LOCK_ASSERT_NOTHELD()     \
94 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_NOTOWNED)
95 
96 static LCK_GRP_DECLARE(netns_lock_group, "netns_lock");
97 static LCK_MTX_DECLARE(netns_lock, &netns_lock_group);
98 
99 /*
100  * Internal data structures and parameters
101  */
102 
103 /*
104  * Local ports are kept track of by reference counts kept in a tree specific to
105  * an <IP, protocol> tuple (see struct ns).
106  *
107  * Note: port numbers are stored in host byte order.
108  */
/* One node of a per-namespace reservation tree, ordered by nsr_port. */
109 struct ns_reservation {
110 	RB_ENTRY(ns_reservation) nsr_link;	/* linkage in ns_reservation_tree */
111 	uint32_t nsr_refs[NETNS_OWNER_MAX + 1];	/* one reference count per owner */
112 	in_port_t nsr_port;	/* tree key */
113 	bool nsr_reuseport:1;	/* set from NETNS_REUSEPORT when allocated */
114 };
115 
/*
 * Lvalue for the reference count of the owner encoded in `flags'
 * (the bits selected by NETNS_OWNER_MASK).
 */
116 #define NETNS_REF_COUNT(nsr, flags)     \
117 	(nsr)->nsr_refs[((flags) & NETNS_OWNER_MASK)]
118 
/* Ordering function for the tree; defined further down in this file. */
119 static inline int nsr_cmp(const struct ns_reservation *,
120     const struct ns_reservation *);
121 
/* Declare the tree head type and generate the tree functions for it. */
122 RB_HEAD(ns_reservation_tree, ns_reservation);
123 RB_PROTOTYPE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
124 RB_GENERATE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
125 
/* Look up the reservation for a port in a tree; defined further down. */
126 static inline struct ns_reservation *ns_reservation_tree_find(
127 	struct ns_reservation_tree *, const in_port_t);
128 
129 /*
130  * A namespace keeps track of the local port numbers in use for a given
131  * <IP, protocol> tuple. There are also global namespaces for each
132  * protocol to accommodate INADDR_ANY behavior and diagnostics.
133  */
134 struct ns {
135 	RB_ENTRY(ns)    ns_link;	/* linkage in netns_namespaces */
136 
	/*
	 * Address pointer handed to flow_ip_cmp() by ns_cmp(). _netns_get_ns()
	 * points it at ns_addr for namespaces it creates, and at the caller's
	 * address in the on-stack key it uses for lookups.
	 */
137 	void            *ns_addr_key;
138 
	/* The address itself; ns_addr_len says how many bytes are valid. */
139 	union {
140 		uint32_t        ns_addr[4];
141 		struct in_addr  ns_inaddr;
142 		struct in6_addr ns_in6addr;
143 	};
144 	uint8_t         ns_addr_len;	/* sizeof in_addr or sizeof in6_addr */
145 	uint8_t         ns_proto;	/* compared against IPPROTO_TCP by the macros above */
146 
	/*
	 * Both are initialised to the same random starting port by
	 * netns_ns_alloc(); the code that advances them is outside this view.
	 */
147 	in_port_t       ns_last_ephemeral_port_down;
148 	in_port_t       ns_last_ephemeral_port_up;
149 
	/*
	 * Set to 1 by netns_ns_alloc() and cleared by netns_init_global_ns();
	 * netns_ns_cleanup() only frees a namespace while this is non-zero.
	 */
150 	uint8_t         ns_is_freeable;
151 
152 	uint32_t        ns_n_reservations;	/* number of nodes in ns_reservations */
153 	struct ns_reservation_tree ns_reservations;
154 };
155 
/* Number of namespaces currently linked into netns_namespaces. */
156 static uint32_t netns_n_namespaces;
157 
/* Ordering function for the namespace tree; defined further down. */
158 static inline int ns_cmp(const struct ns *, const struct ns *);
159 
/* The tree of all namespaces created through _netns_get_ns(). */
160 RB_HEAD(netns_namespaces_tree, ns) netns_namespaces =
161     RB_INITIALIZER(netns_namespaces);
162 RB_PROTOTYPE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
163 RB_GENERATE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
164 
/*
 * Global namespaces, one slot per <protocol, address family> pair.
 *
 * netns_global_non_wild[] is the namespace used purely for reference
 * counting of non-wildcard reservations (see _netns_reserve_common).
 * netns_global_wild[] is filled in by _netns_get_ns() with the first
 * wildcard-address namespace created for the pair.
 */
#define NETNS_ADDRLEN_V4 (sizeof(struct in_addr))
#define NETNS_ADDRLEN_V6 (sizeof(struct in6_addr))

/* Slot index = protocol bit | address-family bit, giving 0..3. */
#define NETNS_NS_TCP    0
#define NETNS_NS_UDP    1
#define NETNS_NS_V4     0
#define NETNS_NS_V6     2
#define NETNS_N_GLOBAL  4
#define NETNS_NS_GLOBAL_IDX(proto, addrlen)     \
	((((proto) == IPPROTO_TCP) ? NETNS_NS_TCP : NETNS_NS_UDP) | \
	(((addrlen) == NETNS_ADDRLEN_V4) ? NETNS_NS_V4 : NETNS_NS_V6))

static struct ns *netns_global_non_wild[NETNS_N_GLOBAL];
static struct ns *netns_global_wild[NETNS_N_GLOBAL];

#define NETNS_NS_UDP_EPHEMERAL_RESERVE  4096
183 
184 /*
185  * Internal token structure
186  *
187  * Note: port numbers are stored in host byte order.
188  */
189 struct ns_token {
190 	/* Reservation state */
	/*
	 * Interface the token is bound to, or NULL. netns_clear_ifnet() drops
	 * an ifnet I/O reference when it clears a non-NULL value.
	 */
191 	ifnet_t                 nt_ifp;
	/*
	 * netns_clear_ifnet() unlinks the token through this entry whether or
	 * not nt_ifp is set. NOTE(review): presumably the token sits on a
	 * per-ifnet list when bound and on netns_unbound_tokens otherwise --
	 * the inserting code is outside this view.
	 */
192 	LIST_ENTRY(ns_token)    nt_ifp_link;
193 	LIST_ENTRY(ns_token)    nt_all_link;	/* linkage on netns_all_tokens */
194 	uint32_t                nt_state;       /* NETNS_STATE_* */
195 
196 	/* Reservation context */
197 	union {
198 		uint32_t        nt_addr[4];
199 		struct in_addr  nt_inaddr;
200 		struct in6_addr nt_in6addr;
201 	};
202 	uint8_t                 nt_addr_len;	/* valid bytes in the address union */
203 	uint8_t                 nt_proto;
204 	in_port_t               nt_port;
205 	uint32_t                nt_flags;
206 
207 	/* Optional information about the flow */
	/* Allocated by netns_ns_token_alloc() only when asked to; else NULL. */
208 	struct ns_flow_info     *nt_flow_info;
209 };
210 
211 /* Valid values for nt_state */
212 #define NETNS_STATE_HALFCLOSED  0x1     /* half closed */
213 #define NETNS_STATE_WITHDRAWN   0x2     /* withdrawn; not offloadable */
214 
/*
 * NOTE(review): looks like a "%b"-style bit-name description of nt_state
 * for logging; its consumer is outside this view -- confirm.
 */
215 #define NETNS_STATE_BITS        "\020\01HALFCLOSED\02WITHDRAWN"
216 
217 /* List of tokens not bound to an ifnet */
218 LIST_HEAD(, ns_token) netns_unbound_tokens = LIST_HEAD_INITIALIZER(
219 	netns_unbound_tokens);
220 
221 /* List of all tokens currently allocated in the system */
222 LIST_HEAD(, ns_token) netns_all_tokens = LIST_HEAD_INITIALIZER(
223 	netns_all_tokens);
224 
225 /*
226  * Memory management
227  */
/* Zone backing struct ns; used by netns_ns_alloc() and netns_ns_free(). */
228 static SKMEM_TYPE_DEFINE(netns_ns_zone, struct ns);
229 
/*
 * For each of the three skmem caches below: a name string, the element size
 * (used to bzero freshly allocated elements) and the cache pointer. The code
 * that creates the caches and sets the sizes is outside this view.
 */
230 #define NETNS_NS_TOKEN_ZONE_NAME        "netns.ns_token"
231 static unsigned int netns_ns_token_size; /* size of zone element */
232 static struct skmem_cache *netns_ns_token_cache; /* for ns_token */
233 
234 #define NETNS_NS_FLOW_INFO_ZONE_NAME    "netns.ns_flow_info"
235 static unsigned int netns_ns_flow_info_size; /* size of zone element */
236 static struct skmem_cache *netns_ns_flow_info_cache; /* for ns_flow_info */
237 
238 #define NETNS_NS_RESERVATION_ZONE_NAME  "netns.ns_reservation"
239 static unsigned int netns_ns_reservation_size; /* size of zone element */
240 static struct skmem_cache *netns_ns_reservation_cache; /* for ns_reservation */
241 
/* Allocation and release helpers for the structures above. */
242 static struct ns_reservation *netns_ns_reservation_alloc(in_port_t, uint32_t);
243 static void netns_ns_reservation_free(struct ns_reservation *);
244 static struct ns *netns_ns_alloc(zalloc_flags_t);
245 static void netns_ns_free(struct ns *);
246 static void netns_ns_cleanup(struct ns *);
247 static struct ns_token *netns_ns_token_alloc(boolean_t);
248 static void netns_ns_token_free(struct ns_token *);
249 
250 /*
251  * Utility/internal code
252  */
253 static struct ns *_netns_get_ns(uint32_t *__sized_by(addr_len), uint8_t addr_len,
254     uint8_t, bool);
255 static inline boolean_t _netns_is_wildcard_addr(
256 	const uint32_t *__sized_by(addr_len), uint8_t addr_len);
257 static int _netns_reserve_common(struct ns *, in_port_t, uint32_t);
258 static void _netns_release_common(struct ns *, in_port_t, uint32_t);
259 static inline void netns_clear_ifnet(struct ns_token *);
260 static int _netns_reserve_kpi_common(struct ns *, netns_token *,
261     uint32_t *__sized_by(addr_len), uint8_t addr_len, uint8_t, in_port_t *,
262     uint32_t, struct ns_flow_info *);
263 static void _netns_set_ifnet_internal(struct ns_token *, struct ifnet *);
264 
265 static struct ns_reservation *
netns_ns_reservation_alloc(in_port_t port,uint32_t flags)266 netns_ns_reservation_alloc(in_port_t port, uint32_t flags)
267 {
268 	struct ns_reservation *res;
269 
270 	VERIFY(port != 0);
271 
272 	res = skmem_cache_alloc(netns_ns_reservation_cache, SKMEM_SLEEP);
273 	ASSERT(res != NULL);
274 
275 	bzero(res, netns_ns_reservation_size);
276 	res->nsr_port = port;
277 	res->nsr_reuseport = ((flags & NETNS_REUSEPORT) != 0);
278 	return res;
279 }
280 
/*
 * Return a reservation record to netns_ns_reservation_cache. The record is
 * not unlinked from any tree here; that is the caller's job.
 */
281 static void
netns_ns_reservation_free(struct ns_reservation * res)282 netns_ns_reservation_free(struct ns_reservation *res)
283 {
284 	skmem_cache_free(netns_ns_reservation_cache, res);
285 }
286 
287 static struct ns *
netns_ns_alloc(zalloc_flags_t how)288 netns_ns_alloc(zalloc_flags_t how)
289 {
290 	struct ns *namespace;
291 	in_port_t first = (in_port_t)ipport_firstauto;
292 	in_port_t last  = (in_port_t)ipport_lastauto;
293 	in_port_t rand_port;
294 
295 	namespace = zalloc_flags(netns_ns_zone, how | Z_ZERO);
296 	if (namespace == NULL) {
297 		return NULL;
298 	}
299 
300 	namespace->ns_is_freeable = 1;
301 
302 	RB_INIT(&namespace->ns_reservations);
303 
304 	/*
305 	 * Randomize the initial ephemeral port starting point, just in case
306 	 * this namespace is for an ipv6 address which gets brought up and
307 	 * down often.
308 	 */
309 	if (first == last) {
310 		rand_port = first;
311 	} else {
312 		read_frandom(&rand_port, sizeof(rand_port));
313 
314 		if (first > last) {
315 			rand_port = last + (rand_port % (first - last));
316 		} else {
317 			rand_port = first + (rand_port % (last - first));
318 		}
319 	}
320 	namespace->ns_last_ephemeral_port_down = rand_port;
321 	namespace->ns_last_ephemeral_port_up = rand_port;
322 
323 	return namespace;
324 }
325 
326 static void
netns_ns_free(struct ns * namespace)327 netns_ns_free(struct ns *namespace)
328 {
329 	struct ns_reservation *res;
330 	struct ns_reservation *tmp_res;
331 #if SK_LOG
332 	char tmp_ip_str[MAX_IPv6_STR_LEN];
333 #endif /* SK_LOG */
334 
335 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
336 	    NS_VERB_PROTO(namespace->ns_proto),
337 	    "freeing %s ns for IP %s",
338 	    PROTO_STR(namespace->ns_proto),
339 	    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
340 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)));
341 
342 	RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations,
343 	    tmp_res) {
344 		netns_ns_reservation_free(res);
345 		namespace->ns_n_reservations--;
346 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
347 		    res);
348 	}
349 
350 	VERIFY(RB_EMPTY(&namespace->ns_reservations));
351 
352 	if (netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
353 	    namespace->ns_addr_len)] == namespace) {
354 		netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
355 		namespace->ns_addr_len)] = NULL;
356 	}
357 	if (netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
358 	    namespace->ns_addr_len)] == namespace) {
359 		netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
360 		namespace->ns_addr_len)] = NULL;
361 	}
362 
363 	zfree(netns_ns_zone, namespace);
364 }
365 
366 static void
netns_ns_cleanup(struct ns * namespace)367 netns_ns_cleanup(struct ns *namespace)
368 {
369 	if (namespace->ns_is_freeable &&
370 	    RB_EMPTY(&namespace->ns_reservations)) {
371 		RB_REMOVE(netns_namespaces_tree, &netns_namespaces, namespace);
372 		netns_n_namespaces--;
373 		netns_ns_free(namespace);
374 	}
375 }
376 
377 static struct ns_token *
netns_ns_token_alloc(boolean_t with_nfi)378 netns_ns_token_alloc(boolean_t with_nfi)
379 {
380 	struct ns_token *token;
381 
382 	NETNS_LOCK_ASSERT_HELD();
383 	NETNS_LOCK_CONVERT();
384 
385 	token = skmem_cache_alloc(netns_ns_token_cache, SKMEM_SLEEP);
386 	ASSERT(token != NULL);
387 
388 	bzero(token, netns_ns_token_size);
389 
390 	if (with_nfi) {
391 		token->nt_flow_info =  skmem_cache_alloc(netns_ns_flow_info_cache,
392 		    SKMEM_SLEEP);
393 		ASSERT(token->nt_flow_info != NULL);
394 	}
395 	LIST_INSERT_HEAD(&netns_all_tokens, token, nt_all_link);
396 
397 	return token;
398 }
399 
400 static void
netns_ns_token_free(struct ns_token * token)401 netns_ns_token_free(struct ns_token *token)
402 {
403 	NETNS_LOCK_ASSERT_HELD();
404 	NETNS_LOCK_CONVERT();
405 	LIST_REMOVE(token, nt_all_link);
406 
407 	if (token->nt_flow_info != NULL) {
408 		skmem_cache_free(netns_ns_flow_info_cache, token->nt_flow_info);
409 	}
410 	skmem_cache_free(netns_ns_token_cache, token);
411 }
412 
/*
 * Ordering function for ns_reservation_tree: orders reservations by
 * nsr_port. Returns a negative, zero or positive value when nsr1's port is
 * below, equal to or above nsr2's. Both ports are widened to int before the
 * subtraction.
 */
413 __attribute__((always_inline))
414 static inline int
nsr_cmp(const struct ns_reservation * nsr1,const struct ns_reservation * nsr2)415 nsr_cmp(const struct ns_reservation *nsr1, const struct ns_reservation *nsr2)
416 {
	/* Defined inside the function body, but like any macro it stays visible for the rest of the file. */
417 #define NSR_COMPARE(r1, r2)     ((int)(r1)->nsr_port - (int)(r2)->nsr_port)
418 	return NSR_COMPARE(nsr1, nsr2);
419 }
420 
421 __attribute__((always_inline))
422 static inline int
ns_cmp(const struct ns * a,const struct ns * b)423 ns_cmp(const struct ns *a, const struct ns *b)
424 {
425 	int d;
426 
427 	if ((d = (a->ns_addr_len - b->ns_addr_len)) != 0) {
428 		return d;
429 	}
430 	if ((d = (a->ns_proto - b->ns_proto)) != 0) {
431 		return d;
432 	}
433 	if ((d = flow_ip_cmp(a->ns_addr_key, b->ns_addr_key,
434 	    b->ns_addr_len)) != 0) {
435 		return d;
436 	}
437 
438 	return 0;
439 }
440 
441 /*
442  * Common routine to look up a reservation.
443  *
444  * NOTE: Assumes the caller holds the NETNS global lock
445  */
446 __attribute__((always_inline))
447 static inline struct ns_reservation *
ns_reservation_tree_find(struct ns_reservation_tree * tree,const in_port_t port)448 ns_reservation_tree_find(struct ns_reservation_tree *tree, const in_port_t port)
449 {
450 	struct ns_reservation res;
451 	res.nsr_port = port;
452 	return RB_FIND(ns_reservation_tree, tree, &res);
453 }
454 
455 /*
456  * Retrieve the namespace for the supplied <address, protocol> tuple.
457  * If create is set and such a namespace doesn't already exist, one will be
458  * created.
459  */
460 static struct ns *
_netns_get_ns(uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,bool create)461 _netns_get_ns(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto, bool create)
462 {
463 	struct ns *namespace = NULL;
464 	struct ns find = {
465 		.ns_addr_key = addr,
466 		.ns_addr_len = addr_len,
467 		.ns_proto = proto,
468 	};
469 #if SK_LOG
470 	char tmp_ip_str[MAX_IPv6_STR_LEN];
471 #endif /* SK_LOG */
472 
473 	VERIFY(addr_len == sizeof(struct in_addr) ||
474 	    addr_len == sizeof(struct in6_addr));
475 
476 	NETNS_LOCK_ASSERT_HELD();
477 
478 	namespace = RB_FIND(netns_namespaces_tree, &netns_namespaces, &find);
479 
480 	if (create && namespace == NULL) {
481 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
482 		    "allocating %s ns for IP %s",
483 		    PROTO_STR(proto), sk_ntop(LEN_TO_AF(addr_len), addr,
484 		    tmp_ip_str, sizeof(tmp_ip_str)));
485 		NETNS_LOCK_CONVERT();
486 		namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL);
487 		__builtin_assume(namespace != NULL);
488 		memcpy(namespace->ns_addr, addr, addr_len);
489 		namespace->ns_addr_key = &namespace->ns_addr;
490 		namespace->ns_addr_len = addr_len;
491 		namespace->ns_proto = proto;
492 		RB_INSERT(netns_namespaces_tree, &netns_namespaces, namespace);
493 		netns_n_namespaces++;
494 
495 		if (_netns_is_wildcard_addr(addr, addr_len) &&
496 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
497 		    addr_len)] == NULL) {
498 			netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
499 			addr_len)] = namespace;
500 		}
501 	}
502 
503 	return namespace;
504 }
505 
506 /*
507  * Return true if the supplied address is a wildcard (INADDR_ANY)
508  */
509 __attribute__((always_inline))
510 static boolean_t
_netns_is_wildcard_addr(const uint32_t * __sized_by (addr_len)addr,uint8_t addr_len)511 _netns_is_wildcard_addr(const uint32_t *__sized_by(addr_len)addr, uint8_t addr_len)
512 {
513 	boolean_t wildcard;
514 
515 	switch (addr_len) {
516 	case sizeof(struct in_addr):
517 		wildcard = (addr[0] == 0);
518 		break;
519 
520 	case sizeof(struct in6_addr):
521 		wildcard = (addr[0] == 0 && addr[1] == 0 &&
522 		    addr[2] == 0 && addr[3] == 0);
523 		break;
524 
525 	default:
526 		wildcard = FALSE;
527 		break;
528 	}
529 
530 	return wildcard;
531 }
532 
533 __attribute__((always_inline))
534 static boolean_t
_netns_is_port_used(struct ns * gns,struct ns_reservation * curr_res,in_port_t port)535 _netns_is_port_used(struct ns * gns, struct ns_reservation *curr_res, in_port_t port)
536 {
537 	struct ns_reservation *res = NULL;
538 
539 	if (gns == NULL) {
540 		return FALSE;
541 	}
542 
543 	res = ns_reservation_tree_find(&gns->ns_reservations, port);
544 	if (res != NULL && res != curr_res) {
545 		if (!res->nsr_reuseport) {
546 			return TRUE;
547 		}
548 	}
549 
550 	return FALSE;
551 }
552 
553 /*
554  * Internal shared code to reserve ports within a specific namespace.
555  *
556  * Note: port numbers are in host byte-order here.
 *
 * namespace: namespace to reserve in.
 * port:      port to reserve; must be non-zero (VERIFY'd).
 * flags:     the NETNS_OWNER_MASK bits select the owner whose reference
 *            count is taken; NETNS_REUSEPORT is recorded on a newly created
 *            reservation.
 *
 * Returns 0 after incrementing the owner's reference count on the port's
 * reservation (creating the reservation if needed), or EADDRINUSE when one
 * of the collision rules below rejects the request. An ENOMEM return is
 * also coded for a failed allocation.
 *
 * All collision rules are skipped when `namespace' is the global non-wild
 * namespace for its protocol and address length.
557  */
558 static int
_netns_reserve_common(struct ns * namespace,in_port_t port,uint32_t flags)559 _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags)
560 {
561 	struct ns_reservation *res = NULL, *exist = NULL;
562 	uint8_t proto, addr_len;
563 	int err = 0;
564 #if SK_LOG
565 	char tmp_ip_str[MAX_IPv6_STR_LEN];
566 #endif /* SK_LOG */
567 
568 	VERIFY(port != 0);
569 	proto = namespace->ns_proto;
570 	addr_len = namespace->ns_addr_len;
571 	NETNS_LOCK_CONVERT();
572 	res = netns_ns_reservation_alloc(port, flags);
	/*
	 * Defensive: netns_ns_reservation_alloc() asserts a non-NULL result
	 * and writes through the pointer before returning it.
	 */
573 	if (res == NULL) {
574 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
575 		    "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY",
576 		    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
577 		    namespace->ns_addr, tmp_ip_str,
578 		    sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags);
579 		return ENOMEM;
580 	}
	/*
	 * Try to insert the fresh record. A non-NULL result is treated as the
	 * reservation already present for this port: the fresh record is
	 * discarded and the existing one is used from here on.
	 */
581 	exist = RB_INSERT(ns_reservation_tree, &namespace->ns_reservations,
582 	    res);
583 	if (__probable(exist == NULL)) {
584 		namespace->ns_n_reservations++;
585 	} else {
586 		netns_ns_reservation_free(res);
587 		res = exist;
588 	}
589 
590 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
591 	    "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, "
592 	    "%d bsd %d pf", sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
593 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
594 	    PROTO_STR(proto), port, flags,
595 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
596 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
597 	    NETNS_REF_COUNT(res, NETNS_BSD),
598 	    NETNS_REF_COUNT(res, NETNS_PF));
599 
600 	/* Make reservation */
601 	/*
602 	 * Bypass collision detection for reservations in the global non-wild
603 	 * namespace. We use that namespace for reference counts only.
604 	 */
605 	if (namespace !=
606 	    netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)]) {
607 		struct ns_reservation *skres;
608 		boolean_t is_wild = _netns_is_wildcard_addr(namespace->ns_addr,
609 		    addr_len);
610 		struct ns *gns =
611 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)];
612 
		/*
		 * Stage 1: checks against the global wild namespace (and, for
		 * wildcard requests, the global non-wild namespaces).
		 */
613 		if (NETNS_IS_SKYWALK(flags)) {
614 			if ((!is_wild || exist != NULL) && gns != NULL &&
615 			    (skres = ns_reservation_tree_find(
616 				    &gns->ns_reservations, port)) != NULL &&
617 			    NETNS_REF_COUNT(skres, NETNS_LISTENER) == 0) {
618 				/*
619 				 * The mere existence of any non-skywalk
620 				 * listener wildcard entry for this
621 				 * protocol/port number means this must fail.
622 				 */
623 				NS_PORT_ERR("ADDRINUSE: Duplicate wildcard");
624 				err = EADDRINUSE;
625 				goto done;
626 			}
627 
628 			if (is_wild) {
629 				gns = netns_global_non_wild[
630 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
631 				VERIFY(gns != NULL);
632 
633 				if (_netns_is_port_used(netns_global_non_wild[
634 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
635 				    _netns_is_port_used(netns_global_non_wild[
636 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
637 					/*
638 					 * If Skywalk is trying to reserve a
639 					 * wildcard, then the mere existence of
640 					 * any entry in either v4/v6 non-wild
641 					 * namespace for this port means this
642 					 * must fail.
643 					 */
644 					NS_PORT_ERR("ADDRINUSE: Wildcard with non-wild.");
645 					err = EADDRINUSE;
646 					goto done;
647 				}
648 			}
649 		} else {
650 			/*
651 			 * Check if Skywalk has reserved a wildcard entry.
652 			 * Note that the arithmetic OR here is intentional.
653 			 */
654 			if ((!is_wild || exist != NULL) && gns != NULL &&
655 			    (skres = ns_reservation_tree_find(
656 				    &gns->ns_reservations, port)) != NULL &&
657 			    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
658 			    NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) {
659 				/*
660 				 * BSD is trying to reserve a proto/port for
661 				 * which Skywalk already has a wildcard
662 				 * reservation.
663 				 */
664 				NS_PORT_ERR("ADDRINUSE: BSD requesting Skywalk port");
665 				err = EADDRINUSE;
666 				goto done;
667 			}
668 
669 			/*
670 			 * If BSD is trying to reserve a wildcard,
671 			 * ensure Skywalk has not already reserved
672 			 * a non-wildcard.
673 			 */
674 			if (is_wild) {
675 				gns = netns_global_non_wild[
676 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
677 				VERIFY(gns != NULL);
678 
679 				/*
680 				 * Note that the arithmetic OR here is
681 				 * intentional.
682 				 */
683 				if ((skres = ns_reservation_tree_find(
684 					    &gns->ns_reservations, port)) != NULL &&
685 				    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
686 				    NETNS_REF_COUNT(skres,
687 				    NETNS_LISTENER)) != 0) {
688 					NS_PORT_ERR("ADDRINUSE: BSD wildcard with non-wild.");
689 					err = EADDRINUSE;
690 					goto done;
691 				}
692 			}
693 		}
694 
		/*
		 * Stage 2: per-owner checks against the reference counts
		 * already held on this namespace's own reservation.
		 */
695 		switch (flags & NETNS_OWNER_MASK) {
696 		case NETNS_SKYWALK:
697 			/* check collision w/ BSD */
698 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
699 			    NETNS_REF_COUNT(res, NETNS_PF) > 0) {
700 				NS_PORT_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)");
701 				err = EADDRINUSE;
702 				goto done;
703 			}
704 
705 			/* BEGIN CSTYLED */
706 			/*
707 			 * Scenarios with new Skywalk connected flow:
708 			 * 1. With existing Skywalk connected flow,
709 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
710 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 1
711 			 *    reject by failing the wild gns lookup below.
712 			 * 2. With existing Skywalk 3-tuple listener,
713 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 1
714 			 *    bypass the check below.
715 			 * 3. With existing Skywalk 2-tuple listener,
716 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
717 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 0
718 			 *    pass with successful wild gns lookup.
719 			 */
720 			/* END CSTYLED */
721 			if (NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
722 			    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0) {
723 				/* check if covered by wild Skywalk listener */
724 				gns = netns_global_wild[
725 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
726 				if (gns != NULL &&
727 				    (skres = ns_reservation_tree_find(
728 					    &gns->ns_reservations, port)) != NULL &&
729 				    NETNS_REF_COUNT(skres, NETNS_LISTENER)
730 				    != 0) {
731 					err = 0;
732 					goto done;
733 				}
734 				if (addr_len == sizeof(struct in_addr)) {
735 					/* If address is IPv4, also check for wild IPv6 registration */
736 					gns = netns_global_wild[
737 						NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)];
738 					if (gns != NULL &&
739 					    (skres = ns_reservation_tree_find(
740 						    &gns->ns_reservations, port)) != NULL &&
741 					    NETNS_REF_COUNT(skres, NETNS_LISTENER)
742 					    != 0) {
743 						err = 0;
744 						goto done;
745 					}
746 				}
747 				NS_PORT_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)");
748 				err = EADDRINUSE;
749 			}
750 			/*
751 			 * XXX: Duplicate 5-tuple flows under a Skywalk
752 			 * listener are currently detected by flow manager,
753 			 * till we implement 5-tuple-aware netns.
754 			 */
755 			break;
756 
757 		case NETNS_LISTENER:
758 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
759 			    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
760 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
761 			    _netns_is_port_used(netns_global_wild[
762 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
763 			    _netns_is_port_used(netns_global_wild[
764 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port) ||
765 			    _netns_is_port_used(netns_global_non_wild[
766 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
767 			    _netns_is_port_used(netns_global_non_wild[
768 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
769 				NS_PORT_ERR("ERROR - Listener got ADDRINUSE");
770 				err = EADDRINUSE;
771 			}
772 			break;
773 
774 		case NETNS_BSD:
775 		case NETNS_PF:
776 			if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 ||
777 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) {
778 				NS_PORT_ERR("ERROR - %s got ADDRINUSE",
779 				    ((flags & NETNS_OWNER_MASK) == NETNS_PF) ?
780 				    "PF" : "BSD");
781 				err = EADDRINUSE;
782 			}
783 			break;
784 
785 		default:
786 			panic("_netns_reserve_common: invalid owner 0x%x",
787 			    flags & NETNS_OWNER_MASK);
788 			/* NOTREACHED */
789 			__builtin_unreachable();
790 		}
791 	}
792 
	/*
	 * Common exit. On success take the owner's reference. On failure undo
	 * the tree insertion, but only if this call created the node; an
	 * already existing reservation is left untouched.
	 */
793 done:
794 	ASSERT(res != NULL);
795 	if (__probable(err == 0)) {
796 		NETNS_REF_COUNT(res, flags)++;
797 		/* Check for wrap around */
798 		VERIFY(NETNS_REF_COUNT(res, flags) != 0);
799 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
800 		    NS_VERB_PROTO(namespace->ns_proto),
801 		    "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, "
802 		    "%d ls, %d bsd %d pf",
803 		    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
804 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
805 		    PROTO_STR(namespace->ns_proto), port, err, flags,
806 		    NETNS_REF_COUNT(res, NETNS_SKYWALK),
807 		    NETNS_REF_COUNT(res, NETNS_LISTENER),
808 		    NETNS_REF_COUNT(res, NETNS_BSD),
809 		    NETNS_REF_COUNT(res, NETNS_PF));
810 	} else {
811 		if (exist == NULL) {
812 			RB_REMOVE(ns_reservation_tree,
813 			    &namespace->ns_reservations, res);
814 			namespace->ns_n_reservations--;
815 			netns_ns_reservation_free(res);
816 		}
817 	}
818 	return err;
819 }
820 
821 /*
822  * Internal shared code to release ports within a specific namespace.
823  */
824 static void
_netns_release_common(struct ns * namespace,in_port_t port,uint32_t flags)825 _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags)
826 {
827 	struct ns_reservation *res;
828 	uint32_t refs;
829 	int i;
830 #if SK_LOG
831 	char tmp_ip_str[MAX_IPv6_STR_LEN];
832 #endif /* SK_LOG */
833 
834 	NETNS_LOCK_ASSERT_HELD();
835 
836 	res = ns_reservation_tree_find(&namespace->ns_reservations, port);
837 	if (res == NULL) {
838 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
839 		    NS_VERB_PROTO(namespace->ns_proto),
840 		    "ERROR %s:%s:%d // flags 0x%x // not found",
841 		    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
842 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
843 		    PROTO_STR(namespace->ns_proto), port, flags);
844 		VERIFY(res != NULL);
845 	}
846 
847 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
848 	    NS_VERB_PROTO(namespace->ns_proto),
849 	    "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf",
850 	    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
851 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
852 	    PROTO_STR(namespace->ns_proto), port, flags,
853 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
854 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
855 	    NETNS_REF_COUNT(res, NETNS_BSD),
856 	    NETNS_REF_COUNT(res, NETNS_PF));
857 
858 	/* Release reservation */
859 	VERIFY(NETNS_REF_COUNT(res, flags) > 0);
860 	NETNS_REF_COUNT(res, flags) -= 1;
861 
862 	/* Clean up memory, if appropriate */
863 	for (i = 0, refs = 0; i <= NETNS_OWNER_MAX && refs == 0; i++) {
864 		refs |= res->nsr_refs[i];
865 	}
866 	if (refs == 0) {
867 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
868 		    res);
869 		namespace->ns_n_reservations--;
870 		NETNS_LOCK_CONVERT();
871 		netns_ns_reservation_free(res);
872 		netns_ns_cleanup(namespace);
873 	}
874 }
875 
876 __attribute__((always_inline))
877 static inline void
netns_init_global_ns(struct ns ** global_ptr,uint8_t proto,uint8_t addrlen)878 netns_init_global_ns(struct ns **global_ptr, uint8_t proto, uint8_t addrlen)
879 {
880 	struct ns *namespace;
881 
882 	namespace = *global_ptr = netns_ns_alloc(Z_WAITOK);
883 	memset(namespace->ns_addr, 0xFF, addrlen);
884 	namespace->ns_addr_len = addrlen;
885 	namespace->ns_proto = proto;
886 	namespace->ns_is_freeable = 0;
887 }
888 
889 __attribute__((always_inline))
890 static inline void
netns_clear_ifnet(struct ns_token * nstoken)891 netns_clear_ifnet(struct ns_token *nstoken)
892 {
893 #if SK_LOG
894 	char tmp_ip_str[MAX_IPv6_STR_LEN];
895 #endif /* SK_LOG */
896 
897 	NETNS_LOCK_ASSERT_HELD();
898 
899 	if (nstoken->nt_ifp != NULL) {
900 		LIST_REMOVE(nstoken, nt_ifp_link);
901 
902 		SK_DF(NS_VERB_IP(nstoken->nt_addr_len) |
903 		    NS_VERB_PROTO(nstoken->nt_proto),
904 		    "%s:%s:%d // removed from ifnet %d",
905 		    sk_ntop(LEN_TO_AF(nstoken->nt_addr_len),
906 		    nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
907 		    PROTO_STR(nstoken->nt_proto), nstoken->nt_port,
908 		    nstoken->nt_ifp->if_index);
909 
910 		NETNS_LOCK_CONVERT();
911 		ifnet_decr_iorefcnt(nstoken->nt_ifp);
912 		nstoken->nt_ifp = NULL;
913 	} else {
914 		LIST_REMOVE(nstoken, nt_ifp_link);
915 	}
916 }
917 
918 /*
919  * Internal shared code to perform a port[-range] reservation, along with all
920  * the boilerplate and sanity checks expected for a call coming in from the
921  * surrounding kernel code.
922  */
923 static int
_netns_reserve_kpi_common(struct ns * ns,netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)924 _netns_reserve_kpi_common(struct ns *ns, netns_token *token,
925     uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto,
926     in_port_t *port, uint32_t flags, struct ns_flow_info *nfi)
927 {
928 	boolean_t ns_want_cleanup = (ns == NULL);
929 	struct ns_token *nt;
930 	int err = 0;
931 	in_port_t hport;
932 #if SK_LOG
933 	char tmp_ip_str[MAX_IPv6_STR_LEN];
934 #endif /* SK_LOG */
935 	struct ifnet *ifp = (nfi != NULL) ? nfi->nfi_ifp : NULL;
936 
937 	NETNS_LOCK_ASSERT_HELD();
938 
939 	hport = ntohs(*port);
940 
941 	VERIFY((flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
942 	VERIFY(addr_len == sizeof(struct in_addr) ||
943 	    addr_len == sizeof(struct in6_addr));
944 	VERIFY(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
945 	VERIFY(hport != 0);
946 
947 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
948 	    "reserving %s:%s:%d // flags 0x%x // token %svalid",
949 	    sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str,
950 	    sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags,
951 	    NETNS_TOKEN_VALID(token) ? "" : "in");
952 
953 	/*
954 	 * See the documentation for NETNS_PRERESERVED in netns.h for an
955 	 * explanation of this block.
956 	 */
957 	if (NETNS_TOKEN_VALID(token)) {
958 		if (flags & NETNS_PRERESERVED) {
959 			nt = *token;
960 			VERIFY(nt->nt_addr_len == addr_len);
961 			VERIFY(memcmp(nt->nt_addr, addr, addr_len) == 0);
962 			VERIFY(nt->nt_proto == proto);
963 			VERIFY(nt->nt_port == hport);
964 			VERIFY((nt->nt_flags &
965 			    NETNS_RESERVATION_FLAGS | NETNS_PRERESERVED) ==
966 			    (flags & NETNS_RESERVATION_FLAGS));
967 
968 			if ((nt->nt_flags & NETNS_CONFIGURATION_FLAGS) ==
969 			    (flags & NETNS_CONFIGURATION_FLAGS)) {
970 				SK_DF(NS_VERB_IP(nt->nt_addr_len) |
971 				    NS_VERB_PROTO(nt->nt_proto),
972 				    "%s:%s:%d // flags 0x%x -> 0x%x",
973 				    sk_ntop(LEN_TO_AF(nt->nt_addr_len),
974 				    nt->nt_addr, tmp_ip_str,
975 				    sizeof(tmp_ip_str)),
976 				    PROTO_STR(nt->nt_proto),
977 				    nt->nt_port, nt->nt_flags, flags);
978 				nt->nt_flags &= ~NETNS_CONFIGURATION_FLAGS;
979 				nt->nt_flags |=
980 				    flags & NETNS_CONFIGURATION_FLAGS;
981 			}
982 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
983 			    "token was prereserved");
984 			goto done;
985 		} else {
986 			panic("Request to overwrite valid netns token");
987 			/* NOTREACHED */
988 			__builtin_unreachable();
989 		}
990 	}
991 
992 	/*
993 	 * TODO: Check range against bitmap
994 	 */
995 	if (hport == 0) {
996 		/*
997 		 * Caller request an arbitrary range of ports
998 		 * TODO: Need to figure out how to allocate
999 		 * emphemeral ports only.
1000 		 */
1001 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1002 		    "ERROR - wildcard port not yet supported");
1003 		err = ENOMEM;
1004 		goto done;
1005 	}
1006 
1007 	/*
1008 	 * Fetch namespace for the specified address/protocol, creating
1009 	 * a new namespace if necessary.
1010 	 */
1011 	if (ns == NULL) {
1012 		ASSERT(ns_want_cleanup);
1013 		ns = _netns_get_ns(addr, addr_len, proto, true);
1014 	}
1015 	if (__improbable(ns == NULL)) {
1016 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1017 		    "ERROR - couldn't create namespace");
1018 		err = ENOMEM;
1019 		goto done;
1020 	}
1021 
1022 	/*
1023 	 * Make a reservation in the namespace
1024 	 * This will return an error if an incompatible reservation
1025 	 * already exists.
1026 	 */
1027 	err = _netns_reserve_common(ns, hport, flags);
1028 	if (__improbable(err != 0)) {
1029 		NETNS_LOCK_CONVERT();
1030 		if (ns_want_cleanup) {
1031 			netns_ns_cleanup(ns);
1032 		}
1033 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1034 		    "ERROR - reservation collision");
1035 		goto done;
1036 	}
1037 
1038 	if (!_netns_is_wildcard_addr(ns->ns_addr, addr_len)) {
1039 		/* Record the reservation in the non-wild namespace */
1040 		struct ns *nwns;
1041 
1042 		nwns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1043 		    addr_len)];
1044 		err = _netns_reserve_common(nwns, hport, flags);
1045 		if (__improbable(err != 0)) {
1046 			/* Need to free the specific namespace entry */
1047 			NETNS_LOCK_CONVERT();
1048 			_netns_release_common(ns, hport, flags);
1049 			if (ns_want_cleanup) {
1050 				netns_ns_cleanup(ns);
1051 			}
1052 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1053 			    "ERROR - reservation collision");
1054 			goto done;
1055 		}
1056 	}
1057 
1058 	nt = netns_ns_token_alloc(nfi != NULL ? true : false);
1059 	ASSERT(nt->nt_ifp == NULL);
1060 	_netns_set_ifnet_internal(nt, ifp);
1061 
1062 	memcpy(nt->nt_addr, addr, addr_len);
1063 	nt->nt_addr_len = addr_len;
1064 	nt->nt_proto = proto;
1065 	nt->nt_port = hport;
1066 	nt->nt_flags = flags;
1067 
1068 	if (nfi != NULL) {
1069 		VERIFY(nt->nt_flow_info != NULL);
1070 
1071 		memcpy(nt->nt_flow_info, nfi, sizeof(struct ns_flow_info));
1072 		/*
1073 		 * The local port is passed as a separate argument
1074 		 */
1075 		if (nfi->nfi_laddr.sa.sa_family == AF_INET) {
1076 			nt->nt_flow_info->nfi_laddr.sin.sin_port = *port;
1077 		} else if (nfi->nfi_laddr.sa.sa_family == AF_INET6) {
1078 			nt->nt_flow_info->nfi_laddr.sin6.sin6_port = *port;
1079 		}
1080 	}
1081 	*token = nt;
1082 
1083 done:
1084 	return err;
1085 }
1086 
1087 /*
1088  * Kernel-facing functions
1089  */
1090 
1091 int
netns_init(void)1092 netns_init(void)
1093 {
1094 	VERIFY(__netns_inited == 0);
1095 
1096 	netns_ns_reservation_size = sizeof(struct ns_reservation);
1097 	netns_ns_reservation_cache = skmem_cache_create(NETNS_NS_RESERVATION_ZONE_NAME,
1098 	    netns_ns_reservation_size, sizeof(uint64_t), NULL, NULL, NULL,
1099 	    NULL, NULL, 0);
1100 	if (netns_ns_reservation_cache == NULL) {
1101 		panic("%s: skmem_cache create failed (%s)", __func__,
1102 		    NETNS_NS_RESERVATION_ZONE_NAME);
1103 		/* NOTREACHED */
1104 		__builtin_unreachable();
1105 	}
1106 
1107 	netns_ns_token_size = sizeof(struct ns_token);
1108 	netns_ns_token_cache = skmem_cache_create(NETNS_NS_TOKEN_ZONE_NAME,
1109 	    netns_ns_token_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
1110 	    NULL, 0);
1111 	if (netns_ns_token_cache == NULL) {
1112 		panic("%s: skmem_cache create failed (%s)", __func__,
1113 		    NETNS_NS_TOKEN_ZONE_NAME);
1114 		/* NOTREACHED */
1115 		__builtin_unreachable();
1116 	}
1117 
1118 	netns_ns_flow_info_size = sizeof(struct ns_flow_info);
1119 	netns_ns_flow_info_cache = skmem_cache_create(NETNS_NS_FLOW_INFO_ZONE_NAME,
1120 	    netns_ns_flow_info_size, sizeof(uint64_t), NULL, NULL, NULL,
1121 	    NULL, NULL, 0);
1122 	if (netns_ns_flow_info_cache == NULL) {
1123 		panic("%s: skmem_cache create failed (%s)", __func__,
1124 		    NETNS_NS_FLOW_INFO_ZONE_NAME);
1125 		/* NOTREACHED */
1126 		__builtin_unreachable();
1127 	}
1128 
1129 	LIST_INIT(&netns_unbound_tokens);
1130 	LIST_INIT(&netns_all_tokens);
1131 
1132 	netns_n_namespaces = 0;
1133 	RB_INIT(&netns_namespaces);
1134 
1135 	SK_D("initializing global namespaces");
1136 
1137 	netns_init_global_ns(
1138 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1139 		NETNS_ADDRLEN_V4)], IPPROTO_TCP, sizeof(struct in_addr));
1140 
1141 	netns_init_global_ns(
1142 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1143 		NETNS_ADDRLEN_V4)], IPPROTO_UDP, sizeof(struct in_addr));
1144 
1145 	netns_init_global_ns(
1146 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1147 		NETNS_ADDRLEN_V6)], IPPROTO_TCP, sizeof(struct in6_addr));
1148 
1149 	netns_init_global_ns(
1150 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1151 		NETNS_ADDRLEN_V6)], IPPROTO_UDP, sizeof(struct in6_addr));
1152 
1153 	/* Done */
1154 
1155 	__netns_inited = 1;
1156 	sk_features |= SK_FEATURE_NETNS;
1157 
1158 	SK_D("initialized netns");
1159 
1160 	return 0;
1161 }
1162 
1163 void
netns_uninit(void)1164 netns_uninit(void)
1165 {
1166 	if (__netns_inited == 1) {
1167 		struct ns *namespace;
1168 		struct ns *temp_namespace;
1169 		int i;
1170 
1171 		RB_FOREACH_SAFE(namespace, netns_namespaces_tree,
1172 		    &netns_namespaces, temp_namespace) {
1173 			RB_REMOVE(netns_namespaces_tree, &netns_namespaces,
1174 			    namespace);
1175 			netns_n_namespaces--;
1176 			netns_ns_free(namespace);
1177 		}
1178 
1179 		for (i = 0; i < NETNS_N_GLOBAL; i++) {
1180 			netns_ns_free(netns_global_non_wild[i]);
1181 		}
1182 
1183 		if (netns_ns_flow_info_cache != NULL) {
1184 			skmem_cache_destroy(netns_ns_flow_info_cache);
1185 			netns_ns_flow_info_cache = NULL;
1186 		}
1187 		if (netns_ns_token_cache != NULL) {
1188 			skmem_cache_destroy(netns_ns_token_cache);
1189 			netns_ns_token_cache = NULL;
1190 		}
1191 		if (netns_ns_reservation_cache != NULL) {
1192 			skmem_cache_destroy(netns_ns_reservation_cache);
1193 			netns_ns_reservation_cache = NULL;
1194 		}
1195 
1196 		__netns_inited = 0;
1197 		sk_features &= ~SK_FEATURE_NETNS;
1198 
1199 		SK_D("uninitialized netns");
1200 	}
1201 }
1202 
1203 void
netns_reap_caches(boolean_t purge)1204 netns_reap_caches(boolean_t purge)
1205 {
1206 	/* these aren't created unless netns is enabled */
1207 	if (netns_ns_token_cache != NULL) {
1208 		skmem_cache_reap_now(netns_ns_token_cache, purge);
1209 	}
1210 	if (netns_ns_reservation_cache != NULL) {
1211 		skmem_cache_reap_now(netns_ns_reservation_cache, purge);
1212 	}
1213 	if (netns_ns_flow_info_cache != NULL) {
1214 		skmem_cache_reap_now(netns_ns_flow_info_cache, purge);
1215 	}
1216 }
1217 
1218 boolean_t
netns_is_enabled(void)1219 netns_is_enabled(void)
1220 {
1221 	return __netns_inited == 1;
1222 }
1223 
1224 int
netns_reserve(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t port,uint32_t flags,struct ns_flow_info * nfi)1225 netns_reserve(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1226     uint8_t addr_len, uint8_t proto, in_port_t port, uint32_t flags,
1227     struct ns_flow_info *nfi)
1228 {
1229 	int err = 0;
1230 #if SK_LOG
1231 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1232 #endif /* SK_LOG */
1233 
1234 	if (__netns_inited == 0) {
1235 		*token = NULL;
1236 		return err;
1237 	}
1238 
1239 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1240 		NS_PORT_ERR("netns doesn't support non TCP/UDP protocol");
1241 		return ENOTSUP;
1242 	}
1243 
1244 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1245 	    "%s:%s:%d // flags 0x%x", sk_ntop(LEN_TO_AF(addr_len), addr,
1246 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1247 	    flags);
1248 
1249 	/*
1250 	 * Check wether the process is allowed to bind to a restricted port
1251 	 */
1252 	if (!current_task_can_use_restricted_in_port(port,
1253 	    proto, flags)) {
1254 		*token = NULL;
1255 		return EADDRINUSE;
1256 	}
1257 
1258 	NETNS_LOCK_SPIN();
1259 	err = _netns_reserve_kpi_common(NULL, token, addr, addr_len,
1260 	    proto, &port, flags, nfi);
1261 	NETNS_UNLOCK();
1262 
1263 	return err;
1264 }
1265 
1266 /* Import net.inet.{tcp,udp}.randomize_ports sysctls */
1267 extern int      udp_use_randomport;
1268 extern int      tcp_use_randomport;
1269 
1270 int
netns_reserve_ephemeral(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t * pport,uint32_t flags,struct ns_flow_info * nfi)1271 netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1272     uint8_t addr_len, uint8_t proto, in_port_t *pport, uint32_t flags,
1273     struct ns_flow_info *nfi)
1274 {
1275 	int err = 0;
1276 	SK_LOG_VAR(in_port_t port = *pport);
1277 	in_port_t first = (in_port_t)ipport_firstauto;
1278 	in_port_t last  = (in_port_t)ipport_lastauto;
1279 	in_port_t rand_port;
1280 	in_port_t last_port;
1281 	in_port_t n_last_port;
1282 	struct ns *namespace;
1283 	boolean_t count_up = true;
1284 	boolean_t use_randomport = (proto == IPPROTO_TCP) ?
1285 	    tcp_use_randomport : udp_use_randomport;
1286 #if SK_LOG
1287 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1288 #endif /* SK_LOG */
1289 
1290 	if (__netns_inited == 0) {
1291 		*token = NULL;
1292 		return err;
1293 	}
1294 
1295 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1296 		NS_PORT_ERR("netns doesn't support non TCP/UDP protocol");
1297 		return ENOTSUP;
1298 	}
1299 
1300 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1301 	    "%s:%s:%d // flags 0x%x", sk_ntop(LEN_TO_AF(addr_len), addr,
1302 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1303 	    flags);
1304 
1305 	NETNS_LOCK_SPIN();
1306 
1307 	namespace = _netns_get_ns(addr, addr_len, proto, true);
1308 	if (namespace == NULL) {
1309 		err = ENOMEM;
1310 		NETNS_UNLOCK();
1311 		return err;
1312 	}
1313 
1314 	if (proto == IPPROTO_UDP) {
1315 		if (UINT16_MAX - namespace->ns_n_reservations <
1316 		    NETNS_NS_UDP_EPHEMERAL_RESERVE) {
1317 			NS_PORT_ERR("UDP ephemeral port not available"
1318 			    "(less than 4096 UDP ports left)");
1319 			err = EADDRNOTAVAIL;
1320 			NETNS_UNLOCK();
1321 			return err;
1322 		}
1323 	}
1324 
1325 	if (first == last) {
1326 		rand_port = first;
1327 	} else {
1328 		if (use_randomport) {
1329 			NETNS_LOCK_CONVERT();
1330 			read_frandom(&rand_port, sizeof(rand_port));
1331 
1332 			if (first > last) {
1333 				rand_port = last + (rand_port %
1334 				    (first - last));
1335 				count_up = false;
1336 			} else {
1337 				rand_port = first + (rand_port %
1338 				    (last - first));
1339 			}
1340 		} else {
1341 			if (first > last) {
1342 				rand_port =
1343 				    namespace->ns_last_ephemeral_port_down - 1;
1344 				if (rand_port < last || rand_port > first) {
1345 					rand_port = last;
1346 				}
1347 				count_up = false;
1348 			} else {
1349 				rand_port =
1350 				    namespace->ns_last_ephemeral_port_up + 1;
1351 				if (rand_port < first || rand_port > last) {
1352 					rand_port = first;
1353 				}
1354 			}
1355 		}
1356 	}
1357 	last_port = rand_port;
1358 	n_last_port = htons(last_port);
1359 
1360 	while (true) {
1361 		if (n_last_port == 0) {
1362 			NS_PORT_ERR("ephemeral port search range includes 0");
1363 			err = EINVAL;
1364 			break;
1365 		}
1366 
1367 		/*
1368 		 * Skip if this is a restricted port as we do not want to
1369 		 * restricted ports as ephemeral
1370 		 */
1371 		if (!IS_RESTRICTED_IN_PORT(n_last_port)) {
1372 			err = _netns_reserve_kpi_common(namespace, token, addr,
1373 			    addr_len, proto, &n_last_port, flags, nfi);
1374 			if (err == 0 || err != EADDRINUSE) {
1375 				break;
1376 			}
1377 		}
1378 		if (count_up) {
1379 			last_port++;
1380 			if (last_port < first || last_port > last) {
1381 				last_port = first;
1382 			}
1383 		} else {
1384 			last_port--;
1385 			if (last_port < last || last_port > first) {
1386 				last_port = last;
1387 			}
1388 		}
1389 		n_last_port = htons(last_port);
1390 
1391 		if (last_port == rand_port || first == last) {
1392 			NS_PORT_ERR("couldn't find free ephemeral port");
1393 			err = EADDRNOTAVAIL;
1394 			break;
1395 		}
1396 	}
1397 
1398 	if (err == 0) {
1399 		*pport = n_last_port;
1400 		if (count_up) {
1401 			namespace->ns_last_ephemeral_port_up = last_port;
1402 		} else {
1403 			namespace->ns_last_ephemeral_port_down = last_port;
1404 		}
1405 	} else {
1406 		netns_ns_cleanup(namespace);
1407 	}
1408 
1409 	NETNS_UNLOCK();
1410 
1411 	return err;
1412 }
1413 
1414 void
netns_release(netns_token * token)1415 netns_release(netns_token *token)
1416 {
1417 	struct ns *ns;
1418 	struct ns_token *nt;
1419 	uint8_t proto, addr_len;
1420 #if SK_LOG
1421 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1422 #endif /* SK_LOG */
1423 
1424 	if (!NETNS_TOKEN_VALID(token)) {
1425 		return;
1426 	}
1427 
1428 	if (__netns_inited == 0) {
1429 		*token = NULL;
1430 		return;
1431 	}
1432 
1433 	NETNS_LOCK_SPIN();
1434 
1435 	nt = *token;
1436 	*token = NULL;
1437 
1438 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
1439 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1440 	    nt->nt_addr_len == sizeof(struct in6_addr));
1441 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1442 
1443 	addr_len = nt->nt_addr_len;
1444 	proto = nt->nt_proto;
1445 
1446 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1447 	    "releasing %s:%s:%d",
1448 	    sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1449 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto),
1450 	    nt->nt_port);
1451 
1452 	if (!_netns_is_wildcard_addr(nt->nt_addr, addr_len)) {
1453 		/* Remove from global non-wild namespace */
1454 
1455 		ns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1456 		    addr_len)];
1457 		VERIFY(ns != NULL);
1458 
1459 		_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1460 	}
1461 
1462 	ns = _netns_get_ns(nt->nt_addr, addr_len, proto, false);
1463 	VERIFY(ns != NULL);
1464 	_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1465 
1466 	netns_clear_ifnet(nt);
1467 	netns_ns_token_free(nt);
1468 
1469 	NETNS_UNLOCK();
1470 }
1471 
1472 int
netns_change_addr(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len)1473 netns_change_addr(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1474     uint8_t addr_len)
1475 {
1476 	int err = 0;
1477 	struct ns *old_namespace;
1478 	struct ns *new_namespace;
1479 	struct ns *global_namespace;
1480 	struct ns_token *nt;
1481 	uint8_t proto;
1482 #if SK_LOG
1483 	char tmp_ip_str_1[MAX_IPv6_STR_LEN];
1484 	char tmp_ip_str_2[MAX_IPv6_STR_LEN];
1485 #endif /* SK_LOG */
1486 
1487 	if (__netns_inited == 0) {
1488 		return 0;
1489 	}
1490 
1491 	NETNS_LOCK();
1492 
1493 	VERIFY(NETNS_TOKEN_VALID(token));
1494 
1495 	nt = *token;
1496 
1497 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) == NETNS_BSD);
1498 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1499 	    nt->nt_addr_len == sizeof(struct in6_addr));
1500 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1501 
1502 	proto = nt->nt_proto;
1503 
1504 #if SK_LOG
1505 	sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1506 	    tmp_ip_str_1, sizeof(tmp_ip_str_1));
1507 	sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2,
1508 	    sizeof(tmp_ip_str_2));
1509 #endif /* SK_LOG */
1510 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1511 	    "changing address for %s:%d from %s to %s",
1512 	    PROTO_STR(proto), nt->nt_port, tmp_ip_str_1,
1513 	    tmp_ip_str_2);
1514 
1515 	if (nt->nt_addr_len == addr_len &&
1516 	    memcmp(nt->nt_addr, addr, nt->nt_addr_len) == 0) {
1517 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1518 		    "address didn't change, exiting early");
1519 		goto done;
1520 	}
1521 
1522 	old_namespace = _netns_get_ns(nt->nt_addr, nt->nt_addr_len, proto,
1523 	    false);
1524 	VERIFY(old_namespace != NULL);
1525 
1526 	new_namespace = _netns_get_ns(addr, addr_len, proto, true);
1527 	if (new_namespace == NULL) {
1528 		err = ENOMEM;
1529 		goto done;
1530 	}
1531 
1532 	/* Acquire reservation in new namespace */
1533 	if ((err = _netns_reserve_common(new_namespace, nt->nt_port,
1534 	    nt->nt_flags))) {
1535 		NETNS_LOCK_CONVERT();
1536 		netns_ns_cleanup(new_namespace);
1537 		SK_ERR("port %u reservation collision under new namespace",
1538 		    nt->nt_port);
1539 		goto done;
1540 	}
1541 
1542 	/* Release from old namespace */
1543 	_netns_release_common(old_namespace, nt->nt_port, nt->nt_flags);
1544 
1545 	if (!_netns_is_wildcard_addr(nt->nt_addr, nt->nt_addr_len)) {
1546 		/*
1547 		 * Old address is non-wildcard.
1548 		 * Remove old reservation from global non-wild namespace
1549 		 */
1550 		global_namespace = netns_global_non_wild[
1551 			NETNS_NS_GLOBAL_IDX(proto, nt->nt_addr_len)];
1552 		VERIFY(global_namespace != NULL);
1553 
1554 		_netns_release_common(global_namespace, nt->nt_port,
1555 		    nt->nt_flags);
1556 	}
1557 
1558 	if (!_netns_is_wildcard_addr(addr, addr_len)) {
1559 		/*
1560 		 * New address is non-wildcard.
1561 		 * Record new reservation in global non-wild namespace
1562 		 */
1563 		global_namespace = netns_global_non_wild[
1564 			NETNS_NS_GLOBAL_IDX(proto, addr_len)];
1565 		VERIFY(global_namespace != NULL);
1566 
1567 		if ((err = _netns_reserve_common(global_namespace,
1568 		    nt->nt_port, nt->nt_flags)) != 0) {
1569 			SK_ERR("port %u - reservation collision under new global namespace",
1570 			    nt->nt_port);
1571 			/* XXX: Should not fail. Maybe assert instead */
1572 			goto done;
1573 		}
1574 	}
1575 
1576 	memcpy(nt->nt_addr, addr, addr_len);
1577 	nt->nt_addr_len = addr_len;
1578 
1579 done:
1580 	NETNS_UNLOCK();
1581 	return err;
1582 }
1583 
1584 static void
_netns_set_ifnet_internal(struct ns_token * nt,struct ifnet * ifp)1585 _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp)
1586 {
1587 #if SK_LOG
1588 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1589 #endif /* SK_LOG */
1590 
1591 	NETNS_LOCK_ASSERT_HELD();
1592 
1593 	if (ifp != NULL && ifnet_get_ioref(ifp)) {
1594 		nt->nt_ifp = ifp;
1595 		LIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link);
1596 
1597 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1598 		    "%s:%s:%d // added to ifnet %d",
1599 		    sk_ntop(LEN_TO_AF(nt->nt_addr_len),
1600 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1601 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1602 		    ifp->if_index);
1603 	} else {
1604 		LIST_INSERT_HEAD(&netns_unbound_tokens, nt, nt_ifp_link);
1605 	}
1606 }
1607 
1608 void
netns_set_ifnet(netns_token * token,ifnet_t ifp)1609 netns_set_ifnet(netns_token *token, ifnet_t ifp)
1610 {
1611 	struct ns_token *nt;
1612 #if SK_LOG
1613 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1614 #endif /* SK_LOG */
1615 
1616 	if (__netns_inited == 0) {
1617 		return;
1618 	}
1619 
1620 	NETNS_LOCK();
1621 
1622 	VERIFY(NETNS_TOKEN_VALID(token));
1623 
1624 	nt = *token;
1625 
1626 	if (nt->nt_ifp == ifp) {
1627 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1628 		    "%s:%s:%d // ifnet already %d, exiting early",
1629 		    sk_ntop(LEN_TO_AF(nt->nt_addr_len),
1630 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1631 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1632 		    ifp ? ifp->if_index : -1);
1633 		NETNS_UNLOCK();
1634 		return;
1635 	}
1636 
1637 	netns_clear_ifnet(nt);
1638 
1639 	_netns_set_ifnet_internal(nt, ifp);
1640 
1641 	NETNS_UNLOCK();
1642 }
1643 
1644 void
netns_ifnet_detach(ifnet_t ifp)1645 netns_ifnet_detach(ifnet_t ifp)
1646 {
1647 	struct ns_token *token, *tmp_token;
1648 
1649 	if (__netns_inited == 0) {
1650 		return;
1651 	}
1652 
1653 	NETNS_LOCK();
1654 
1655 	LIST_FOREACH_SAFE(token, &ifp->if_netns_tokens, nt_ifp_link,
1656 	    tmp_token) {
1657 		netns_clear_ifnet(token);
1658 		LIST_INSERT_HEAD(&netns_unbound_tokens, token, nt_ifp_link);
1659 	}
1660 
1661 	NETNS_UNLOCK();
1662 }
1663 
1664 static void
_netns_set_state(netns_token * token,uint32_t state)1665 _netns_set_state(netns_token *token, uint32_t state)
1666 {
1667 	struct ns_token *nt;
1668 #if SK_LOG
1669 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1670 #endif /* SK_LOG */
1671 
1672 	if (__netns_inited == 0) {
1673 		return;
1674 	}
1675 
1676 	NETNS_LOCK();
1677 	VERIFY(NETNS_TOKEN_VALID(token));
1678 
1679 	nt = *token;
1680 	nt->nt_state |= state;
1681 
1682 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1683 	    "%s:%s:%d // state 0x%x",
1684 	    sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1685 	    tmp_ip_str, sizeof(tmp_ip_str)),
1686 	    PROTO_STR(nt->nt_proto), nt->nt_port, state);
1687 
1688 	NETNS_UNLOCK();
1689 }
1690 
1691 void
netns_half_close(netns_token * token)1692 netns_half_close(netns_token *token)
1693 {
1694 	_netns_set_state(token, NETNS_STATE_HALFCLOSED);
1695 }
1696 
1697 void
netns_withdraw(netns_token * token)1698 netns_withdraw(netns_token *token)
1699 {
1700 	_netns_set_state(token, NETNS_STATE_WITHDRAWN);
1701 }
1702 
1703 int
netns_get_flow_info(netns_token * token,struct ns_flow_info * nfi)1704 netns_get_flow_info(netns_token *token,
1705     struct ns_flow_info *nfi)
1706 {
1707 	if (__netns_inited == 0) {
1708 		return ENOTSUP;
1709 	}
1710 
1711 	NETNS_LOCK();
1712 	if (!NETNS_TOKEN_VALID(token) ||
1713 	    nfi == NULL) {
1714 		NETNS_UNLOCK();
1715 		return EINVAL;
1716 	}
1717 
1718 	struct ns_token *nt = *token;
1719 	if (nt->nt_flow_info == NULL) {
1720 		NETNS_UNLOCK();
1721 		return ENOENT;
1722 	}
1723 
1724 	memcpy(nfi, nt->nt_flow_info, sizeof(struct ns_flow_info));
1725 	NETNS_UNLOCK();
1726 
1727 	return 0;
1728 }
1729 
1730 void
netns_change_flags(netns_token * token,uint32_t set_flags,uint32_t clear_flags)1731 netns_change_flags(netns_token *token, uint32_t set_flags,
1732     uint32_t clear_flags)
1733 {
1734 	struct ns_token *nt;
1735 #if SK_LOG
1736 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1737 #endif /* SK_LOG */
1738 
1739 	if (__netns_inited == 0) {
1740 		return;
1741 	}
1742 
1743 	NETNS_LOCK();
1744 
1745 	VERIFY(NETNS_TOKEN_VALID(token));
1746 
1747 	nt = *token;
1748 
1749 	VERIFY(!((set_flags | clear_flags) & NETNS_RESERVATION_FLAGS));
1750 	/* TODO: verify set and clear flags don't overlap? */
1751 
1752 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1753 	    "%s:%s:%d // flags 0x%x -> 0x%x",
1754 	    sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1755 	    tmp_ip_str, sizeof(tmp_ip_str)),
1756 	    PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags,
1757 	    nt->nt_flags | set_flags & ~clear_flags);
1758 
1759 	nt->nt_flags |= set_flags;
1760 	nt->nt_flags &= ~clear_flags;
1761 
1762 	NETNS_UNLOCK();
1763 }
1764 
1765 /*
1766  * Port offloading KPI
1767  */
1768 static inline void
netns_local_port_scan_flow_entry(struct flow_entry * fe,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1769 netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protocol,
1770     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1771 {
1772 	struct ns_token *token;
1773 	boolean_t iswildcard = false;
1774 
1775 	if (fe == NULL) {
1776 		return;
1777 	}
1778 
1779 	if (fe->fe_flags & (FLOWENTF_EXTRL_PORT | FLOWENTF_AOP_OFFLOAD)) {
1780 		return;
1781 	}
1782 
1783 	token = fe->fe_port_reservation;
1784 	if (token == NULL) {
1785 		return;
1786 	}
1787 
1788 	/*
1789 	 * We are only interested in active flows over skywalk channels
1790 	 */
1791 	if ((token->nt_flags & NETNS_OWNER_MASK) != NETNS_SKYWALK) {
1792 		return;
1793 	}
1794 
1795 	if (token->nt_state & NETNS_STATE_WITHDRAWN) {
1796 		return;
1797 	}
1798 
1799 	if (!(flags & IFNET_GET_LOCAL_PORTS_ANYTCPSTATEOK) &&
1800 	    (flags & IFNET_GET_LOCAL_PORTS_ACTIVEONLY) &&
1801 	    (token->nt_state & NETNS_STATE_HALFCLOSED)) {
1802 		return;
1803 	}
1804 
1805 	VERIFY(token->nt_addr_len == sizeof(struct in_addr) ||
1806 	    token->nt_addr_len == sizeof(struct in6_addr));
1807 
1808 	if (token->nt_addr_len == sizeof(struct in_addr)) {
1809 		if (protocol == PF_INET6) {
1810 			return;
1811 		}
1812 
1813 		iswildcard = token->nt_inaddr.s_addr == INADDR_ANY;
1814 	} else if (token->nt_addr_len == sizeof(struct in6_addr)) {
1815 		if (protocol == PF_INET) {
1816 			return;
1817 		}
1818 
1819 		iswildcard = IN6_IS_ADDR_UNSPECIFIED(
1820 			&token->nt_in6addr);
1821 	}
1822 	if (!(flags & IFNET_GET_LOCAL_PORTS_WILDCARDOK) && iswildcard) {
1823 		return;
1824 	}
1825 
1826 	if ((flags & IFNET_GET_LOCAL_PORTS_TCPONLY) &&
1827 	    token->nt_proto == IPPROTO_UDP) {
1828 		return;
1829 	}
1830 	if ((flags & IFNET_GET_LOCAL_PORTS_UDPONLY) &&
1831 	    token->nt_proto == IPPROTO_TCP) {
1832 		return;
1833 	}
1834 
1835 	if ((flags & IFNET_GET_LOCAL_PORTS_RECVANYIFONLY) &&
1836 	    !(token->nt_flags & NETNS_RECVANYIF)) {
1837 		return;
1838 	}
1839 
1840 	if ((flags & IFNET_GET_LOCAL_PORTS_EXTBGIDLEONLY) &&
1841 	    !(token->nt_flags & NETNS_EXTBGIDLE)) {
1842 		return;
1843 	}
1844 
1845 	if (token->nt_ifp != NULL && (token->nt_ifp->if_eflags & IFEF_AWDL) != 0) {
1846 		struct flow_route *fr = fe->fe_route;
1847 
1848 		if (fr == NULL || fr->fr_rt_dst == NULL ||
1849 		    (fr->fr_rt_dst->rt_flags & (RTF_UP | RTF_CONDEMNED)) != RTF_UP) {
1850 #if DEBUG || DEVELOPMENT
1851 			char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1852 			char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1853 			in_port_t lport;
1854 			in_port_t fport;
1855 			char pname[MAXCOMLEN + 1];
1856 			const struct ns_flow_info *nfi = token->nt_flow_info;
1857 
1858 			proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1859 
1860 			if (protocol == PF_INET) {
1861 				sk_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1862 				    lbuf, sizeof(lbuf));
1863 				sk_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1864 				    fbuf, sizeof(fbuf));
1865 				lport = nfi->nfi_laddr.sin.sin_port;
1866 				fport = nfi->nfi_faddr.sin.sin_port;
1867 			} else {
1868 				sk_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1869 				    lbuf, sizeof(lbuf));
1870 				sk_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1871 				    fbuf, sizeof(fbuf));
1872 				lport = nfi->nfi_laddr.sin6.sin6_port;
1873 				fport = nfi->nfi_faddr.sin6.sin6_port;
1874 			}
1875 
1876 			os_log(wake_packet_log_handle,
1877 			    "netns_local_port_scan_flow_entry: route is down %s %s:%u %s:%u ifp %s proc %s:%d",
1878 			    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1879 			    lbuf, ntohs(lport), fbuf, ntohs(fport),
1880 			    token->nt_ifp->if_xname, pname, nfi->nfi_owner_pid);
1881 #endif /* DEBUG || DEVELOPMENT */
1882 
1883 			return;
1884 		}
1885 	}
1886 
1887 #if DEBUG || DEVELOPMENT
1888 	if (!(flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) &&
1889 	    (token->nt_flags & NETNS_NOWAKEFROMSLEEP)) {
1890 		char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1891 		char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1892 		in_port_t lport;
1893 		in_port_t fport;
1894 		char pname[MAXCOMLEN + 1];
1895 		const struct ns_flow_info *nfi = token->nt_flow_info;
1896 
1897 		proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1898 
1899 		if (protocol == PF_INET) {
1900 			sk_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1901 			    lbuf, sizeof(lbuf));
1902 			sk_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1903 			    fbuf, sizeof(fbuf));
1904 			lport = nfi->nfi_laddr.sin.sin_port;
1905 			fport = nfi->nfi_faddr.sin.sin_port;
1906 		} else {
1907 			sk_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1908 			    lbuf, sizeof(lbuf));
1909 			sk_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1910 			    fbuf, sizeof(fbuf));
1911 			lport = nfi->nfi_laddr.sin6.sin6_port;
1912 			fport = nfi->nfi_faddr.sin6.sin6_port;
1913 		}
1914 
1915 		os_log(wake_packet_log_handle,
1916 		    "netns_local_port_scan_flow_entry: no wake from sleep %s %s:%u %s:%u ifp %s proc %s:%d",
1917 		    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1918 		    lbuf, ntohs(lport), fbuf, ntohs(fport),
1919 		    token->nt_ifp != NULL ? token->nt_ifp->if_xname : "",
1920 		    pname, nfi->nfi_owner_pid);
1921 	}
1922 #endif /* DEBUG || DEVELOPMENT */
1923 
1924 	if (token->nt_ifp != NULL && token->nt_flow_info != NULL) {
1925 		/*
1926 		 * When the flow has "no wake from sleep" option, do not set the port in the bitmap
1927 		 * except if explicetely requested by the driver.
1928 		 * We always add the flow to the list of port in order to report spurious wakes
1929 		 */
1930 		if ((flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) ||
1931 		    (token->nt_flags & NETNS_NOWAKEFROMSLEEP) == 0) {
1932 			bitstr_set(bitfield, token->nt_port);
1933 		}
1934 		(void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index,
1935 		    token->nt_flow_info, token->nt_flags);
1936 	} else {
1937 		SK_ERR("unknown owner port %u"
1938 		    " nt_flags 0x%x ifindex %u nt_flow_info %p\n",
1939 		    token->nt_port, token->nt_flags,
1940 		    token->nt_ifp != NULL ? token->nt_ifp->if_index : 0,
1941 		    token->nt_flow_info);
1942 	}
1943 }
1944 
1945 static void
netns_get_if_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1946 netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol,
1947     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1948 {
1949 	struct nx_flowswitch *fsw = NULL;
1950 
1951 	if (ifp == NULL || ifp->if_na == NULL) {
1952 		return;
1953 	}
1954 	/* Ensure that the interface is attached and won't detach */
1955 	if (!ifnet_get_ioref(ifp)) {
1956 		return;
1957 	}
1958 	fsw = fsw_ifp_to_fsw(ifp);
1959 	if (fsw == NULL) {
1960 		goto done;
1961 	}
1962 	FSW_RLOCK(fsw);
1963 	NETNS_LOCK();
1964 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1965 		netns_local_port_scan_flow_entry(_fe, protocol, flags,
1966 		bitfield);
1967 	});
1968 	NETNS_UNLOCK();
1969 	FSW_UNLOCK(fsw);
1970 done:
1971 	ifnet_decr_iorefcnt(ifp);
1972 }
1973 
1974 errno_t
netns_get_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1975 netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol,
1976     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1977 {
1978 	if (__netns_inited == 0) {
1979 		return 0;
1980 	}
1981 	if (ifp != NULL) {
1982 		netns_get_if_local_ports(ifp, protocol, flags, bitfield);
1983 	} else {
1984 		errno_t error;
1985 		uint32_t count, i;
1986 		ifnet_t *__counted_by(count) ifp_list;
1987 
1988 		error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count);
1989 		if (error != 0) {
1990 			os_log_error(wake_packet_log_handle,
1991 			    "%s: ifnet_list_get_all() failed %d",
1992 			    __func__, error);
1993 			return error;
1994 		}
1995 		for (i = 0; i < count; i++) {
1996 			if (TAILQ_EMPTY(&ifp_list[i]->if_addrhead)) {
1997 				continue;
1998 			}
1999 			netns_get_if_local_ports(ifp_list[i], protocol, flags,
2000 			    bitfield);
2001 		}
2002 		ifnet_list_free_counted_by(ifp_list, count);
2003 	}
2004 
2005 	return 0;
2006 }
2007 
2008 uint32_t
netns_find_anyres_byaddr(struct ifaddr * ifa,uint8_t proto)2009 netns_find_anyres_byaddr(struct ifaddr *ifa, uint8_t proto)
2010 {
2011 	int result = 0;
2012 	int ifa_addr_len;
2013 	struct ns_token *token;
2014 	struct ifnet *ifp = ifa->ifa_ifp;
2015 	struct sockaddr *ifa_addr = ifa->ifa_addr;
2016 
2017 	if (__netns_inited == 0) {
2018 		return ENOTSUP;
2019 	}
2020 
2021 	if ((ifa_addr->sa_family != AF_INET) &&
2022 	    (ifa_addr->sa_family != AF_INET6)) {
2023 		return 0;
2024 	}
2025 
2026 	ifa_addr_len = (ifa_addr->sa_family == AF_INET) ?
2027 	    sizeof(struct in_addr) : sizeof(struct in6_addr);
2028 
2029 	NETNS_LOCK();
2030 
2031 	LIST_FOREACH(token, &ifp->if_netns_tokens, nt_ifp_link) {
2032 		if ((token->nt_flags & NETNS_OWNER_MASK) == NETNS_PF) {
2033 			continue;
2034 		}
2035 		if (token->nt_addr_len != ifa_addr_len) {
2036 			continue;
2037 		}
2038 		if (token->nt_proto != proto) {
2039 			continue;
2040 		}
2041 		if (ifa_addr->sa_family == AF_INET) {
2042 			if (token->nt_inaddr.s_addr ==
2043 			    (satosin(ifa->ifa_addr))->sin_addr.s_addr) {
2044 				result = 1;
2045 				break;
2046 			}
2047 		} else if (ifa_addr->sa_family == AF_INET6) {
2048 			if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa),
2049 			    &token->nt_in6addr)) {
2050 				result = 1;
2051 				break;
2052 			}
2053 		}
2054 	}
2055 
2056 	NETNS_UNLOCK();
2057 	return result;
2058 }
2059 
2060 static uint32_t
_netns_lookup_ns_n_reservations(uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto)2061 _netns_lookup_ns_n_reservations(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto)
2062 {
2063 	uint32_t ns_n_reservations = 0;
2064 	NETNS_LOCK_SPIN();
2065 	struct ns *namespace = _netns_get_ns(addr, addr_len, proto, true);
2066 	if (namespace != NULL) {
2067 		ns_n_reservations = namespace->ns_n_reservations;
2068 	}
2069 	NETNS_UNLOCK();
2070 	return ns_n_reservations;
2071 }
2072 
/*
 * IPv4 front end for _netns_lookup_ns_n_reservations(): passes the 32-bit
 * s_addr of `addr`, the size of struct in_addr and `proto`, and returns
 * whatever count the helper reports.
 */
uint32_t
netns_lookup_reservations_count_in(struct in_addr addr, uint8_t proto)
{
	const uint8_t addr_len = sizeof(struct in_addr);

	return _netns_lookup_ns_n_reservations(&addr.s_addr, addr_len, proto);
}
2078 
2079 uint32_t
netns_lookup_reservations_count_in6(struct in6_addr addr,uint8_t proto)2080 netns_lookup_reservations_count_in6(struct in6_addr addr, uint8_t proto)
2081 {
2082 	if (IN6_IS_SCOPE_EMBED(&addr)) {
2083 		addr.s6_addr16[1] = 0;
2084 	}
2085 	return _netns_lookup_ns_n_reservations(&addr.s6_addr32[0], sizeof(struct in6_addr), proto);
2086 }
2087 
2088 /*
2089  * Sysctl interface
2090  */
2091 
/*
 * Forward declaration of the sysctl handler; it is referenced by the
 * SYSCTL_PROC() registration below and defined later in this file.
 */
static int netns_ctl_dump_all SYSCTL_HANDLER_ARGS;

/* "netns" node registered under the _kern_skywalk parent. */
SYSCTL_NODE(_kern_skywalk, OID_AUTO, netns, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "Netns interface");

/*
 * "netns" entry registered under the _kern_skywalk_stats parent and served
 * by netns_ctl_dump_all().  It is declared as an opaque structure
 * (CTLTYPE_STRUCT, format "-"); the description string names the record
 * layout and the header that declares it.
 */
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netns,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, netns_ctl_dump_all, "-",
    "Namespace contents (struct netns_ctl_dump_header, "
    "skywalk/os_stats_private.h)");
2102 
2103 static int
netns_ctl_write_ns(struct sysctl_req * req,struct ns * namespace,boolean_t is_global)2104 netns_ctl_write_ns(struct sysctl_req *req, struct ns *namespace,
2105     boolean_t is_global)
2106 {
2107 	struct ns_reservation *res;
2108 	struct netns_ctl_dump_header response_header;
2109 	struct netns_ctl_dump_record response_record;
2110 	int err;
2111 
2112 	/* Fill out header */
2113 	memset(&response_header, 0, sizeof(response_header));
2114 	response_header.ncdh_n_records = namespace->ns_n_reservations;
2115 	response_header.ncdh_proto = namespace->ns_proto;
2116 
2117 	if (is_global) {
2118 		response_header.ncdh_addr_len = 0;
2119 	} else {
2120 		response_header.ncdh_addr_len = namespace->ns_addr_len;
2121 	}
2122 	memcpy(response_header.ncdh_addr, namespace->ns_addr,
2123 	    namespace->ns_addr_len);
2124 
2125 	err = SYSCTL_OUT(req, &response_header, sizeof(response_header));
2126 	if (err) {
2127 		return err;
2128 	}
2129 
2130 	/* Fill out records */
2131 	RB_FOREACH(res, ns_reservation_tree, &namespace->ns_reservations) {
2132 		memset(&response_record, 0, sizeof(response_record));
2133 		response_record.ncdr_port = res->nsr_port;
2134 		response_record.ncdr_port_end = 0;
2135 		response_record.ncdr_listener_refs =
2136 		    NETNS_REF_COUNT(res, NETNS_LISTENER);
2137 		response_record.ncdr_skywalk_refs =
2138 		    NETNS_REF_COUNT(res, NETNS_SKYWALK);
2139 		response_record.ncdr_bsd_refs =
2140 		    NETNS_REF_COUNT(res, NETNS_BSD);
2141 		response_record.ncdr_pf_refs =
2142 		    NETNS_REF_COUNT(res, NETNS_PF);
2143 		err = SYSCTL_OUT(req, &response_record,
2144 		    sizeof(response_record));
2145 		if (err) {
2146 			return err;
2147 		}
2148 	}
2149 
2150 	return 0;
2151 }
2152 
2153 static int
2154 netns_ctl_dump_all SYSCTL_HANDLER_ARGS
2155 {
2156 #pragma unused(oidp, arg1, arg2)
2157 	struct ns *namespace;
2158 	int i, err = 0;
2159 
2160 	if (!kauth_cred_issuser(kauth_cred_get())) {
2161 		return EPERM;
2162 	}
2163 
2164 	if (__netns_inited == 0) {
2165 		return ENOTSUP;
2166 	}
2167 
2168 	NETNS_LOCK();
2169 
2170 	for (i = 0; i < NETNS_N_GLOBAL; i++) {
2171 		err = netns_ctl_write_ns(req, netns_global_non_wild[i], true);
2172 		if (err) {
2173 			goto done;
2174 		}
2175 	}
2176 
2177 	RB_FOREACH(namespace, netns_namespaces_tree, &netns_namespaces) {
2178 		err = netns_ctl_write_ns(req, namespace, false);
2179 		if (err) {
2180 			goto done;
2181 		}
2182 	}
2183 
2184 	/*
2185 	 * If this is just a request for length, add slop because
2186 	 * this is dynamically changing data
2187 	 */
2188 	if (req->oldptr == USER_ADDR_NULL) {
2189 		req->oldidx += 20 * sizeof(struct netns_ctl_dump_record);
2190 	}
2191 
2192 done:
2193 	NETNS_UNLOCK();
2194 	return err;
2195 }
2196 /* CSTYLED */
2197