xref: /xnu-8792.61.2/bsd/skywalk/namespace/netns.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/assert.h>
30 #include <kern/locks.h>
31 #include <kern/zalloc.h>
32 #include <libkern/tree.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/bitstring.h>
36 #include <net/if.h>
37 #include <net/kpi_interface.h>
38 #include <net/restricted_in_port.h>
39 
40 #include <netinet/in.h>
41 #include <netinet/in_pcb.h>
42 #include <netinet/tcp_fsm.h>
43 #include <netinet/tcp_var.h>
44 
45 #include <netinet6/in6_var.h>
46 #include <string.h>
47 
48 #include <skywalk/os_skywalk.h>
49 #include <skywalk/os_skywalk_private.h>
50 #include <skywalk/os_stats_private.h>
51 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
52 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
53 
54 #include <net/if_ports_used.h>
55 
/*
 * Module-level flag, initialised to 0.  Nothing in this part of the file
 * reads or writes it; presumably it records whether netns has been
 * initialised -- TODO confirm against the init/teardown routines.
 */
56 static int __netns_inited = 0;
57 
58 /*
59  * Logging
60  */
61 
/*
 * Verbosity bit for a protocol: SK_VERB_NS_TCP when proto is IPPROTO_TCP,
 * SK_VERB_NS_UDP for every other value.  The argument is expanded without
 * surrounding parentheses, so callers should pass a simple expression.
 */
62 #define NS_VERB_PROTO(proto)    ((proto == IPPROTO_TCP) ? SK_VERB_NS_TCP : \
63 	                                    SK_VERB_NS_UDP)
/*
 * Verbosity bit for an address family, chosen by address length:
 * SK_VERB_NS_IPV4 when the length equals sizeof(struct in_addr),
 * SK_VERB_NS_IPV6 otherwise.
 */
64 #define NS_VERB_IP(addr_len)    ((addr_len == sizeof (struct in_addr)) ? \
65 	                                    SK_VERB_NS_IPV4 : SK_VERB_NS_IPV6)
/* Printable protocol name for log messages; anything but TCP prints "udp". */
66 #define PROTO_STR(proto)        ((proto == IPPROTO_TCP) ? "tcp" : "udp")
/*
 * Map an address length to an address family: AF_INET when the length
 * equals sizeof(struct in_addr), AF_INET6 otherwise.  Used in this file as
 * the family argument to inet_ntop() when logging.
 */
67 #define LEN_TO_AF(len)          (((len == sizeof (struct in_addr)) ? \
68 	                            AF_INET : AF_INET6))
69 /*
70  * Locking
71  * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is
72  * acquired at the entry of every kernel-facing function, and released at the
73  * end. Data within netns_token structures is also protected under this lock.
74  */
75 
/* Acquire the global netns mutex with lck_mtx_lock(). */
76 #define NETNS_LOCK()                    \
77 	lck_mtx_lock(&netns_lock)
/* Acquire the global netns mutex with lck_mtx_lock_spin(). */
78 #define NETNS_LOCK_SPIN()               \
79 	lck_mtx_lock_spin(&netns_lock)
/*
 * Assert that the current thread owns the lock, then call
 * lck_mtx_convert_spin() on it.  The allocation and namespace-creation
 * paths in this file invoke this before calling allocators that are
 * allowed to sleep (SKMEM_SLEEP / Z_WAITOK).
 */
80 #define NETNS_LOCK_CONVERT() do {       \
81 	NETNS_LOCK_ASSERT_HELD();       \
82 	lck_mtx_convert_spin(&netns_lock); \
83 } while (0)
/* Release the global netns mutex. */
84 #define NETNS_UNLOCK()                  \
85 	lck_mtx_unlock(&netns_lock)
/* Assert that the calling thread owns the global netns mutex. */
86 #define NETNS_LOCK_ASSERT_HELD()        \
87 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_OWNED)
/* Assert that the calling thread does NOT own the global netns mutex. */
88 #define NETNS_LOCK_ASSERT_NOTHELD()     \
89 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_NOTOWNED)
90 
/* Lock group and the single mutex that every NETNS_* lock macro operates on. */
91 static LCK_GRP_DECLARE(netns_lock_group, "netns_lock");
92 static LCK_MTX_DECLARE(netns_lock, &netns_lock_group);
93 
94 /*
95  * Internal data structures and parameters
96  */
97 
98 /*
99  * Local ports are kept track of by reference counts kept in a tree specific to
100  * an <IP, protocol> tuple (see struct ns).
101  *
102  * Note: port numbers are stored in host byte order.
103  */
/*
 * One node per reserved port inside a namespace's reservation tree.
 * The node is keyed by nsr_port (see nsr_cmp) and is removed once every
 * per-owner reference count has dropped back to zero
 * (see _netns_release_common).
 */
104 struct ns_reservation {
105 	RB_ENTRY(ns_reservation) nsr_link;	/* linkage in ns_reservation_tree */
106 	uint32_t nsr_refs[NETNS_OWNER_MAX + 1];	/* ref count per owner, indexed by (flags & NETNS_OWNER_MASK) */
107 	in_port_t nsr_port;			/* reserved port, host byte order */
108 	bool nsr_reuseport:1;			/* set at allocation when NETNS_REUSEPORT was requested */
109 };
110 
/*
 * Lvalue for the reference count that belongs to the owner encoded in
 * `flags`; usable for both reading and incrementing/decrementing.
 */
111 #define NETNS_REF_COUNT(nsr, flags)     \
112 	(nsr)->nsr_refs[((flags) & NETNS_OWNER_MASK)]
113 
/* Ordering function for the reservation tree (compares ports only). */
114 static inline int nsr_cmp(const struct ns_reservation *,
115     const struct ns_reservation *);
116 
/* Red-black tree of reservations, ordered by nsr_cmp. */
117 RB_HEAD(ns_reservation_tree, ns_reservation);
118 RB_PROTOTYPE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
119 RB_GENERATE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
120 
/* Port lookup helper; defined further down in this file. */
121 static inline struct ns_reservation *ns_reservation_tree_find(
122 	struct ns_reservation_tree *, const in_port_t);
123 
124 /*
125  * A namespace keeps track of the local port numbers in use for a given
126  * <IP, protocol> tuple. There are also global namespaces for each
127  * protocol to accommodate INADDR_ANY behavior and diagnostics.
128  */
129 struct ns {
130 	RB_ENTRY(ns)    ns_link;	/* linkage in netns_namespaces */
131 
	/*
	 * Address pointer handed to flow_ip_cmp() by ns_cmp().  For a
	 * namespace created by _netns_get_ns() it points at this structure's
	 * own ns_addr; for the on-stack lookup key it points at the caller's
	 * address buffer.
	 */
132 	void            *ns_addr_key;
133 
	/* The namespace's IP address; only the first ns_addr_len bytes are meaningful. */
134 	union {
135 		uint32_t        ns_addr[4];
136 		struct in_addr  ns_inaddr;
137 		struct in6_addr ns_in6addr;
138 	};
139 	uint8_t         ns_addr_len;	/* sizeof(struct in_addr) or sizeof(struct in6_addr) */
140 	uint8_t         ns_proto;	/* IP protocol number (callers in this file verify TCP or UDP) */
141 
	/*
	 * Both cursors start at the same random port inside the
	 * ipport_firstauto..ipport_lastauto range (see netns_ns_alloc).
	 * NOTE(review): how they advance is not visible in this part of the
	 * file.
	 */
142 	in_port_t       ns_last_ephemeral_port_down;
143 	in_port_t       ns_last_ephemeral_port_up;
144 
	/*
	 * Non-zero when netns_ns_cleanup() may free this namespace once it
	 * holds no reservations; netns_init_global_ns() clears it.
	 */
145 	uint8_t         ns_is_freeable;
146 
147 	uint32_t        ns_n_reservations;	/* number of nodes in ns_reservations */
148 	struct ns_reservation_tree ns_reservations;	/* reserved ports, keyed by port */
149 };
150 
/* Number of namespaces currently linked into netns_namespaces. */
151 static uint32_t netns_n_namespaces;
152 
/* Ordering function for the namespace tree: address length, protocol, address. */
153 static inline int ns_cmp(const struct ns *, const struct ns *);
154 
/* Red-black tree of every per-<IP, protocol> namespace, ordered by ns_cmp. */
155 RB_HEAD(netns_namespaces_tree, ns) netns_namespaces =
156     RB_INITIALIZER(netns_namespaces);
157 RB_PROTOTYPE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
158 RB_GENERATE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
159 
160 /*
161  * Declare pointers to global namespaces for each protocol.
162  * All non-wildcard reservations will have an entry here.
163  */
/* One slot per {TCP, UDP} x {IPv4, IPv6} combination. */
164 #define NETNS_N_GLOBAL  4
/*
 * Global non-wildcard namespaces.  _netns_reserve_common() skips collision
 * detection when reserving in one of these and treats them as reference
 * counts only.  NOTE(review): the code that populates this array is not in
 * this part of the file (netns_init_global_ns takes the slot as a pointer).
 */
165 static struct ns *netns_global_non_wild[NETNS_N_GLOBAL];
/*
 * Wildcard-address namespaces.  A slot is filled by _netns_get_ns() the
 * first time a wildcard namespace for that protocol/family is created and
 * is reset to NULL by netns_ns_free() when that namespace goes away.
 */
166 static struct ns *netns_global_wild[NETNS_N_GLOBAL];
167 #define NETNS_ADDRLEN_V4 (sizeof(struct in_addr))
168 #define NETNS_ADDRLEN_V6 (sizeof(struct in6_addr))
/* Bit 0 of the slot index selects the protocol, bit 1 the address family. */
169 #define NETNS_NS_TCP    0
170 #define NETNS_NS_UDP    1
171 #define NETNS_NS_V4     0
172 #define NETNS_NS_V6     2
/*
 * Slot index (0..3) into the two global arrays above.  Any protocol other
 * than IPPROTO_TCP maps to the UDP slot, and any address length other than
 * NETNS_ADDRLEN_V4 maps to the IPv6 slot.
 */
173 #define NETNS_NS_GLOBAL_IDX(proto, addrlen)     \
174 	((((proto) == IPPROTO_TCP) ? NETNS_NS_TCP : NETNS_NS_UDP) | \
175 	(((addrlen) == NETNS_ADDRLEN_V4) ? NETNS_NS_V4 : NETNS_NS_V6))
176 
/* NOTE(review): not referenced in this part of the file; meaning to be confirmed at its use site. */
177 #define NETNS_NS_UDP_EPHEMERAL_RESERVE  4096
178 
179 /*
180  * Internal token structure
181  *
182  * Note: port numbers are stored in host byte order.
183  */
184 struct ns_token {
185 	/* Reservation state */
186 	ifnet_t                 nt_ifp;		/* bound interface, or NULL while unbound (see netns_clear_ifnet) */
187 	SLIST_ENTRY(ns_token)   nt_ifp_link;	/* on nt_ifp->if_netns_tokens, or on netns_unbound_tokens */
188 	SLIST_ENTRY(ns_token)   nt_all_link;	/* on netns_all_tokens */
189 	uint32_t                nt_state;       /* NETNS_STATE_* */
190 
191 	/* Reservation context */
192 	union {
193 		uint32_t        nt_addr[4];
194 		struct in_addr  nt_inaddr;
195 		struct in6_addr nt_in6addr;
196 	};
197 	uint8_t                 nt_addr_len;	/* number of valid bytes in nt_addr */
198 	uint8_t                 nt_proto;	/* IP protocol of the reservation */
199 	in_port_t               nt_port;	/* reserved port, host byte order */
200 	uint32_t                nt_flags;	/* NETNS_* flags of the reservation */
201 
202 	/* Optional information about the flow */
203 	struct ns_flow_info     *nt_flow_info;	/* NULL unless the token was allocated with flow info */
204 };
205 
206 /* Valid values for nt_state */
207 #define NETNS_STATE_HALFCLOSED  0x1     /* half closed */
208 #define NETNS_STATE_WITHDRAWN   0x2     /* withdrawn; not offloadable */
209 
/*
 * Bit-name string for nt_state.  NOTE(review): the layout looks like the
 * kernel "%b" printf format (leading base byte, then bit-number/name
 * pairs) -- confirm at the use site.
 */
210 #define NETNS_STATE_BITS        "\020\01HALFCLOSED\02WITHDRAWN"
211 
212 /* List of tokens not bound to an ifnet */
213 SLIST_HEAD(, ns_token) netns_unbound_tokens = SLIST_HEAD_INITIALIZER(
214 	netns_unbound_tokens);
215 
216 /* List of all tokens currently allocated in the system */
217 SLIST_HEAD(, ns_token) netns_all_tokens = SLIST_HEAD_INITIALIZER(
218 	netns_all_tokens);
219 
220 /*
221  * Memory management
222  */
/* Zone backing struct ns; netns_ns_alloc()/netns_ns_free() are its only users here. */
223 static ZONE_DEFINE(netns_ns_zone, SKMEM_ZONE_PREFIX ".netns.ns",
224     sizeof(struct ns), ZC_ZFREE_CLEARMEM);
225 
/*
 * skmem caches for tokens, flow info and reservations.  The *_size
 * variables are what the allocators in this file pass to bzero() to clear a
 * freshly allocated object.  NOTE(review): the caches and sizes are set up
 * outside this part of the file.
 */
226 #define NETNS_NS_TOKEN_ZONE_NAME        "netns.ns_token"
227 static unsigned int netns_ns_token_size; /* size of zone element */
228 static struct skmem_cache *netns_ns_token_cache; /* for ns_token */
229 
230 #define NETNS_NS_FLOW_INFO_ZONE_NAME    "netns.ns_flow_info"
231 static unsigned int netns_ns_flow_info_size; /* size of zone element */
232 static struct skmem_cache *netns_ns_flow_info_cache; /* for ns_flow_info */
233 
234 #define NETNS_NS_RESERVATION_ZONE_NAME  "netns.ns_reservation"
235 static unsigned int netns_ns_reservation_size; /* size of zone element */
236 static struct skmem_cache *netns_ns_reservation_cache; /* for ns_reservation */
237 
/* Allocation / release helpers for the three object kinds above. */
238 static struct ns_reservation *netns_ns_reservation_alloc(in_port_t, uint32_t);
239 static void netns_ns_reservation_free(struct ns_reservation *);
240 static struct ns *netns_ns_alloc(zalloc_flags_t);
241 static void netns_ns_free(struct ns *);
242 static void netns_ns_cleanup(struct ns *);
243 static struct ns_token *netns_ns_token_alloc(boolean_t);
244 static void netns_ns_token_free(struct ns_token *);
245 
246 /*
247  * Utility/internal code
248  */
/* Arguments below: address words, address length, protocol, create-if-missing. */
249 static struct ns *_netns_get_ns(uint32_t *, uint8_t, uint8_t, bool);
250 static inline boolean_t _netns_is_wildcard_addr(const uint32_t *, uint8_t);
/* Arguments of the next two: namespace, port in host byte order, NETNS_* flags. */
251 static int _netns_reserve_common(struct ns *, in_port_t, uint32_t);
252 static void _netns_release_common(struct ns *, in_port_t, uint32_t);
253 static inline void netns_clear_ifnet(struct ns_token *);
254 static int _netns_reserve_kpi_common(struct ns *, netns_token *, uint32_t *,
255     uint8_t, uint8_t, in_port_t *, uint32_t, struct ns_flow_info *);
256 static void _netns_set_ifnet_internal(struct ns_token *, struct ifnet *);
257 
258 static struct ns_reservation *
netns_ns_reservation_alloc(in_port_t port,uint32_t flags)259 netns_ns_reservation_alloc(in_port_t port, uint32_t flags)
260 {
261 	struct ns_reservation *res;
262 
263 	VERIFY(port != 0);
264 
265 	res = skmem_cache_alloc(netns_ns_reservation_cache, SKMEM_SLEEP);
266 	ASSERT(res != NULL);
267 
268 	bzero(res, netns_ns_reservation_size);
269 	res->nsr_port = port;
270 	res->nsr_reuseport = ((flags & NETNS_REUSEPORT) != 0);
271 	return res;
272 }
273 
274 static void
netns_ns_reservation_free(struct ns_reservation * res)275 netns_ns_reservation_free(struct ns_reservation *res)
276 {
277 	skmem_cache_free(netns_ns_reservation_cache, res);
278 }
279 
280 static struct ns *
netns_ns_alloc(zalloc_flags_t how)281 netns_ns_alloc(zalloc_flags_t how)
282 {
283 	struct ns *namespace;
284 	in_port_t first = (in_port_t)ipport_firstauto;
285 	in_port_t last  = (in_port_t)ipport_lastauto;
286 	in_port_t rand_port;
287 
288 	namespace = zalloc_flags(netns_ns_zone, how | Z_ZERO);
289 	if (namespace == NULL) {
290 		return NULL;
291 	}
292 
293 	namespace->ns_is_freeable = 1;
294 
295 	RB_INIT(&namespace->ns_reservations);
296 
297 	/*
298 	 * Randomize the initial ephemeral port starting point, just in case
299 	 * this namespace is for an ipv6 address which gets brought up and
300 	 * down often.
301 	 */
302 	if (first == last) {
303 		rand_port = first;
304 	} else {
305 		read_frandom(&rand_port, sizeof(rand_port));
306 
307 		if (first > last) {
308 			rand_port = last + (rand_port % (first - last));
309 		} else {
310 			rand_port = first + (rand_port % (last - first));
311 		}
312 	}
313 	namespace->ns_last_ephemeral_port_down = rand_port;
314 	namespace->ns_last_ephemeral_port_up = rand_port;
315 
316 	return namespace;
317 }
318 
319 static void
netns_ns_free(struct ns * namespace)320 netns_ns_free(struct ns *namespace)
321 {
322 	struct ns_reservation *res;
323 	struct ns_reservation *tmp_res;
324 #if SK_LOG
325 	char tmp_ip_str[MAX_IPv6_STR_LEN];
326 #endif /* SK_LOG */
327 
328 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
329 	    NS_VERB_PROTO(namespace->ns_proto),
330 	    "freeing %s ns for IP %s",
331 	    PROTO_STR(namespace->ns_proto),
332 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
333 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)));
334 
335 	RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations,
336 	    tmp_res) {
337 		netns_ns_reservation_free(res);
338 		namespace->ns_n_reservations--;
339 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
340 		    res);
341 	}
342 
343 	VERIFY(RB_EMPTY(&namespace->ns_reservations));
344 
345 	if (netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
346 	    namespace->ns_addr_len)] == namespace) {
347 		netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
348 		namespace->ns_addr_len)] = NULL;
349 	}
350 	if (netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
351 	    namespace->ns_addr_len)] == namespace) {
352 		netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
353 		namespace->ns_addr_len)] = NULL;
354 	}
355 
356 	zfree(netns_ns_zone, namespace);
357 }
358 
359 static void
netns_ns_cleanup(struct ns * namespace)360 netns_ns_cleanup(struct ns *namespace)
361 {
362 	if (namespace->ns_is_freeable &&
363 	    RB_EMPTY(&namespace->ns_reservations)) {
364 		RB_REMOVE(netns_namespaces_tree, &netns_namespaces, namespace);
365 		netns_n_namespaces--;
366 		netns_ns_free(namespace);
367 	}
368 }
369 
370 static struct ns_token *
netns_ns_token_alloc(boolean_t with_nfi)371 netns_ns_token_alloc(boolean_t with_nfi)
372 {
373 	struct ns_token *token;
374 
375 	NETNS_LOCK_ASSERT_HELD();
376 	NETNS_LOCK_CONVERT();
377 
378 	token = skmem_cache_alloc(netns_ns_token_cache, SKMEM_SLEEP);
379 	ASSERT(token != NULL);
380 
381 	bzero(token, netns_ns_token_size);
382 
383 	if (with_nfi) {
384 		token->nt_flow_info =  skmem_cache_alloc(netns_ns_flow_info_cache,
385 		    SKMEM_SLEEP);
386 		ASSERT(token->nt_flow_info != NULL);
387 	}
388 	SLIST_INSERT_HEAD(&netns_all_tokens, token, nt_all_link);
389 
390 	return token;
391 }
392 
393 static void
netns_ns_token_free(struct ns_token * token)394 netns_ns_token_free(struct ns_token *token)
395 {
396 	NETNS_LOCK_ASSERT_HELD();
397 	NETNS_LOCK_CONVERT();
398 	SLIST_REMOVE(&netns_all_tokens, token, ns_token, nt_all_link);
399 
400 	if (token->nt_flow_info != NULL) {
401 		skmem_cache_free(netns_ns_flow_info_cache, token->nt_flow_info);
402 	}
403 	skmem_cache_free(netns_ns_token_cache, token);
404 }
405 
406 __attribute__((always_inline))
407 static inline int
nsr_cmp(const struct ns_reservation * nsr1,const struct ns_reservation * nsr2)408 nsr_cmp(const struct ns_reservation *nsr1, const struct ns_reservation *nsr2)
409 {
410 #define NSR_COMPARE(r1, r2)     ((int)(r1)->nsr_port - (int)(r2)->nsr_port)
411 	return NSR_COMPARE(nsr1, nsr2);
412 }
413 
414 __attribute__((always_inline))
415 static inline int
ns_cmp(const struct ns * a,const struct ns * b)416 ns_cmp(const struct ns *a, const struct ns *b)
417 {
418 	int d;
419 
420 	if ((d = (a->ns_addr_len - b->ns_addr_len)) != 0) {
421 		return d;
422 	}
423 	if ((d = (a->ns_proto - b->ns_proto)) != 0) {
424 		return d;
425 	}
426 	if ((d = flow_ip_cmp(a->ns_addr_key, b->ns_addr_key,
427 	    b->ns_addr_len)) != 0) {
428 		return d;
429 	}
430 
431 	return 0;
432 }
433 
434 /*
435  * Common routine to look up a reservation.
436  *
437  * NOTE: Assumes the caller holds the NETNS global lock
438  */
439 __attribute__((always_inline))
440 static inline struct ns_reservation *
ns_reservation_tree_find(struct ns_reservation_tree * tree,const in_port_t port)441 ns_reservation_tree_find(struct ns_reservation_tree *tree, const in_port_t port)
442 {
443 	struct ns_reservation res;
444 	res.nsr_port = port;
445 	return RB_FIND(ns_reservation_tree, tree, &res);
446 }
447 
448 /*
449  * Retrieve the namespace for the supplied <address, protocol> tuple.
450  * If create is set and such a namespace doesn't already exist, one will be
451  * created.
452  */
453 static struct ns *
_netns_get_ns(uint32_t * addr,uint8_t addr_len,uint8_t proto,bool create)454 _netns_get_ns(uint32_t *addr, uint8_t addr_len, uint8_t proto, bool create)
455 {
456 	struct ns *namespace = NULL;
457 	struct ns find = {
458 		.ns_addr_key = addr,
459 		.ns_addr_len = addr_len,
460 		.ns_proto = proto,
461 	};
462 #if SK_LOG
463 	char tmp_ip_str[MAX_IPv6_STR_LEN];
464 #endif /* SK_LOG */
465 
466 	VERIFY(addr_len == sizeof(struct in_addr) ||
467 	    addr_len == sizeof(struct in6_addr));
468 
469 	NETNS_LOCK_ASSERT_HELD();
470 
471 	namespace = RB_FIND(netns_namespaces_tree, &netns_namespaces, &find);
472 
473 	if (create && namespace == NULL) {
474 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
475 		    "allocating %s ns for IP %s",
476 		    PROTO_STR(proto), inet_ntop(LEN_TO_AF(addr_len), addr,
477 		    tmp_ip_str, sizeof(tmp_ip_str)));
478 		NETNS_LOCK_CONVERT();
479 		namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL);
480 		__builtin_assume(namespace != NULL);
481 		memcpy(namespace->ns_addr, addr, addr_len);
482 		namespace->ns_addr_key = &namespace->ns_addr;
483 		namespace->ns_addr_len = addr_len;
484 		namespace->ns_proto = proto;
485 		RB_INSERT(netns_namespaces_tree, &netns_namespaces, namespace);
486 		netns_n_namespaces++;
487 
488 		if (_netns_is_wildcard_addr(addr, addr_len) &&
489 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
490 		    addr_len)] == NULL) {
491 			netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
492 			addr_len)] = namespace;
493 		}
494 	}
495 
496 	return namespace;
497 }
498 
499 /*
500  * Return true if the supplied address is a wildcard (INADDR_ANY)
501  */
502 __attribute__((always_inline))
503 static boolean_t
_netns_is_wildcard_addr(const uint32_t * addr,uint8_t addr_len)504 _netns_is_wildcard_addr(const uint32_t *addr, uint8_t addr_len)
505 {
506 	boolean_t wildcard;
507 
508 	switch (addr_len) {
509 	case sizeof(struct in_addr):
510 		wildcard = (addr[0] == 0);
511 		break;
512 
513 	case sizeof(struct in6_addr):
514 		wildcard = (addr[0] == 0 && addr[1] == 0 &&
515 		    addr[2] == 0 && addr[3] == 0);
516 		break;
517 
518 	default:
519 		wildcard = FALSE;
520 		break;
521 	}
522 
523 	return wildcard;
524 }
525 
526 __attribute__((always_inline))
527 static boolean_t
_netns_is_port_used(struct ns * gns,struct ns_reservation * curr_res,in_port_t port)528 _netns_is_port_used(struct ns * gns, struct ns_reservation *curr_res, in_port_t port)
529 {
530 	struct ns_reservation *res = NULL;
531 
532 	if (gns == NULL) {
533 		return FALSE;
534 	}
535 
536 	res = ns_reservation_tree_find(&gns->ns_reservations, port);
537 	if (res != NULL && res != curr_res) {
538 		if (!res->nsr_reuseport) {
539 			return TRUE;
540 		}
541 	}
542 
543 	return FALSE;
544 }
545 
546 /*
547  * Internal shared code to reserve ports within a specific namespace.
548  *
549  * Note: port numbers are in host byte-order here.
 *
 * namespace: namespace to reserve in.
 * port:      port to reserve; must be non-zero.
 * flags:     NETNS_* flags; the owner bits select which per-owner
 *            reference count is taken.
 *
 * A reservation node is allocated and inserted up front.  If the port
 * already has a node, the new one is freed and the existing node is used.
 * Unless `namespace` is the global non-wildcard namespace for its
 * protocol/family, the owner-specific collision checks below then decide
 * whether the reservation may be taken.
 *
 * Returns 0 after incrementing the owner's reference count, or EADDRINUSE
 * when a check fails.  On failure a node created by this call is removed
 * and freed again; a pre-existing node is left untouched.
550  */
551 static int
_netns_reserve_common(struct ns * namespace,in_port_t port,uint32_t flags)552 _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags)
553 {
554 	struct ns_reservation *res = NULL, *exist = NULL;
555 	uint8_t proto, addr_len;
556 	int err = 0;
557 #if SK_LOG
558 	char tmp_ip_str[MAX_IPv6_STR_LEN];
559 #endif /* SK_LOG */
560 
561 	VERIFY(port != 0);
562 	proto = namespace->ns_proto;
563 	addr_len = namespace->ns_addr_len;
564 	NETNS_LOCK_CONVERT();
565 	res = netns_ns_reservation_alloc(port, flags);
	/*
	 * NOTE(review): netns_ns_reservation_alloc() allocates with
	 * SKMEM_SLEEP and asserts a non-NULL result, so this branch looks
	 * purely defensive.
	 */
566 	if (res == NULL) {
567 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
568 		    "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY",
569 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
570 		    namespace->ns_addr, tmp_ip_str,
571 		    sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags);
572 		return ENOMEM;
573 	}
	/*
	 * RB_INSERT hands back the node already stored for this port, or
	 * NULL when the new node was linked in.  From here on `exist` tells
	 * whether this call created the node that `res` points at.
	 */
574 	exist = RB_INSERT(ns_reservation_tree, &namespace->ns_reservations,
575 	    res);
576 	if (__probable(exist == NULL)) {
577 		namespace->ns_n_reservations++;
578 	} else {
579 		netns_ns_reservation_free(res);
580 		res = exist;
581 	}
582 
583 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
584 	    "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, "
585 	    "%d bsd %d pf", inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
586 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
587 	    PROTO_STR(proto), port, flags,
588 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
589 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
590 	    NETNS_REF_COUNT(res, NETNS_BSD),
591 	    NETNS_REF_COUNT(res, NETNS_PF));
592 
593 	/* Make reservation */
594 	/*
595 	 * Bypass collision detection for reservations in the global non-wild
596 	 * namespace. We use that namespace for reference counts only.
597 	 */
598 	if (namespace !=
599 	    netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)]) {
600 		struct ns_reservation *skres;
601 		boolean_t is_wild = _netns_is_wildcard_addr(namespace->ns_addr,
602 		    addr_len);
603 		struct ns *gns =
604 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)];
605 
		/*
		 * First stage: cross-namespace checks against the global
		 * wildcard / non-wildcard namespaces.  NETNS_IS_SKYWALK is
		 * defined outside this file.
		 */
606 		if (NETNS_IS_SKYWALK(flags)) {
607 			if ((!is_wild || exist != NULL) && gns != NULL &&
608 			    (skres = ns_reservation_tree_find(
609 				    &gns->ns_reservations, port)) != NULL &&
610 			    NETNS_REF_COUNT(skres, NETNS_LISTENER) == 0) {
611 				/*
612 				 * The mere existence of any non-skywalk
613 				 * listener wildcard entry for this
614 				 * protocol/port number means this must fail.
615 				 */
616 				SK_ERR("ADDRINUSE: Duplicate wildcard");
617 				err = EADDRINUSE;
618 				goto done;
619 			}
620 
621 			if (is_wild) {
622 				gns = netns_global_non_wild[
623 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
624 				VERIFY(gns != NULL);
625 
626 				if (_netns_is_port_used(netns_global_non_wild[
627 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
628 				    _netns_is_port_used(netns_global_non_wild[
629 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
630 					/*
631 					 * If Skywalk is trying to reserve a
632 					 * wildcard, then the mere existence of
633 					 * any entry in either v4/v6 non-wild
634 					 * namespace for this port means this
635 					 * must fail.
636 					 */
637 					SK_ERR("ADDRINUSE: Wildcard with non-wild.");
638 					err = EADDRINUSE;
639 					goto done;
640 				}
641 			}
642 		} else {
643 			/*
644 			 * Check if Skywalk has reserved a wildcard entry.
645 			 * Note that the arithmetic OR here is intentional.
646 			 */
647 			if ((!is_wild || exist != NULL) && gns != NULL &&
648 			    (skres = ns_reservation_tree_find(
649 				    &gns->ns_reservations, port)) != NULL &&
650 			    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
651 			    NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) {
652 				/*
653 				 * BSD is trying to reserve a proto/port for
654 				 * which Skywalk already has a wildcard
655 				 * reservation.
656 				 */
657 				SK_ERR("ADDRINUSE: BSD requesting Skywalk port");
658 				err = EADDRINUSE;
659 				goto done;
660 			}
661 
662 			/*
663 			 * If BSD is trying to reserve a wildcard,
664 			 * ensure Skywalk has not already reserved
665 			 * a non-wildcard.
666 			 */
667 			if (is_wild) {
668 				gns = netns_global_non_wild[
669 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
670 				VERIFY(gns != NULL);
671 
672 				/*
673 				 * Note that the arithmetic OR here is
674 				 * intentional.
675 				 */
676 				if ((skres = ns_reservation_tree_find(
677 					    &gns->ns_reservations, port)) != NULL &&
678 				    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
679 				    NETNS_REF_COUNT(skres,
680 				    NETNS_LISTENER)) != 0) {
681 					SK_ERR("ADDRINUSE: BSD wildcard with non-wild.");
682 					err = EADDRINUSE;
683 					goto done;
684 				}
685 			}
686 		}
687 
		/*
		 * Second stage: checks against the other owners' reference
		 * counts on this very node.
		 */
688 		switch (flags & NETNS_OWNER_MASK) {
689 		case NETNS_SKYWALK:
690 			/* check collision w/ BSD */
691 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
692 			    NETNS_REF_COUNT(res, NETNS_PF) > 0) {
693 				SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)");
694 				err = EADDRINUSE;
695 				goto done;
696 			}
697 
698 			/* BEGIN CSTYLED */
699 			/*
700 			 * Scenarios with new Skywalk connected flow:
701 			 * 1. With existing Skywalk connected flow,
702 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
703 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 1
704 			 *    reject by failing the wild gns lookup below.
705 			 * 2. With existing Skywalk 3-tuple listener,
706 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 1
707 			 *    bypass the check below.
708 			 * 3. With existing Skywalk 2-tuple listener,
709 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
710 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 0
711 			 *    pass with successful wild gns lookup.
712 			 */
713 			/* END CSTYLED */
714 			if (NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
715 			    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0) {
716 				/* check if covered by wild Skywalk listener */
717 				gns = netns_global_wild[
718 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
719 				if (gns != NULL &&
720 				    (skres = ns_reservation_tree_find(
721 					    &gns->ns_reservations, port)) != NULL &&
722 				    NETNS_REF_COUNT(skres, NETNS_LISTENER)
723 				    != 0) {
724 					err = 0;
725 					goto done;
726 				}
727 				if (addr_len == sizeof(struct in_addr)) {
728 					/* If address is IPv4, also check for wild IPv6 registration */
729 					gns = netns_global_wild[
730 						NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)];
731 					if (gns != NULL &&
732 					    (skres = ns_reservation_tree_find(
733 						    &gns->ns_reservations, port)) != NULL &&
734 					    NETNS_REF_COUNT(skres, NETNS_LISTENER)
735 					    != 0) {
736 						err = 0;
737 						goto done;
738 					}
739 				}
740 				SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)");
741 				err = EADDRINUSE;
742 			}
743 			/*
744 			 * XXX: Duplicate 5-tuple flows under a Skywalk
745 			 * listener are currently detected by flow manager,
746 			 * till we implement 5-tuple-aware netns.
747 			 */
748 			break;
749 
750 		case NETNS_LISTENER:
			/*
			 * A listener fails if BSD, PF or another listener
			 * already references this node, or if any of the
			 * four global namespaces for this protocol holds a
			 * different, non-reuse-port node for the port.
			 */
751 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
752 			    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
753 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
754 			    _netns_is_port_used(netns_global_wild[
755 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
756 			    _netns_is_port_used(netns_global_wild[
757 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port) ||
758 			    _netns_is_port_used(netns_global_non_wild[
759 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
760 			    _netns_is_port_used(netns_global_non_wild[
761 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
762 				SK_ERR("ERROR - Listener got ADDRINUSE");
763 				err = EADDRINUSE;
764 			}
765 			break;
766 
767 		case NETNS_BSD:
768 		case NETNS_PF:
			/* BSD and PF only collide with Skywalk-side owners. */
769 			if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 ||
770 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) {
771 				SK_ERR("ERROR - %s got ADDRINUSE",
772 				    ((flags & NETNS_OWNER_MASK) == NETNS_PF) ?
773 				    "PF" : "BSD");
774 				err = EADDRINUSE;
775 			}
776 			break;
777 
778 		default:
779 			panic("_netns_reserve_common: invalid owner 0x%x",
780 			    flags & NETNS_OWNER_MASK);
781 			/* NOTREACHED */
782 			__builtin_unreachable();
783 		}
784 	}
785 
786 done:
787 	ASSERT(res != NULL);
788 	if (__probable(err == 0)) {
		/* Success: take the reference for this owner. */
789 		NETNS_REF_COUNT(res, flags)++;
790 		/* Check for wrap around */
791 		VERIFY(NETNS_REF_COUNT(res, flags) != 0);
792 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
793 		    NS_VERB_PROTO(namespace->ns_proto),
794 		    "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, "
795 		    "%d ls, %d bsd %d pf",
796 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
797 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
798 		    PROTO_STR(namespace->ns_proto), port, err, flags,
799 		    NETNS_REF_COUNT(res, NETNS_SKYWALK),
800 		    NETNS_REF_COUNT(res, NETNS_LISTENER),
801 		    NETNS_REF_COUNT(res, NETNS_BSD),
802 		    NETNS_REF_COUNT(res, NETNS_PF));
803 	} else {
		/*
		 * Failure: undo the insertion, but only when this call
		 * created the node.  A pre-existing node still carries other
		 * owners' references and must stay in the tree.
		 */
804 		if (exist == NULL) {
805 			RB_REMOVE(ns_reservation_tree,
806 			    &namespace->ns_reservations, res);
807 			namespace->ns_n_reservations--;
808 			netns_ns_reservation_free(res);
809 		}
810 	}
811 	return err;
812 }
813 
814 /*
815  * Internal shared code to release ports within a specific namespace.
816  */
817 static void
_netns_release_common(struct ns * namespace,in_port_t port,uint32_t flags)818 _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags)
819 {
820 	struct ns_reservation *res;
821 	uint32_t refs;
822 	int i;
823 #if SK_LOG
824 	char tmp_ip_str[MAX_IPv6_STR_LEN];
825 #endif /* SK_LOG */
826 
827 	NETNS_LOCK_ASSERT_HELD();
828 
829 	res = ns_reservation_tree_find(&namespace->ns_reservations, port);
830 	if (res == NULL) {
831 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
832 		    NS_VERB_PROTO(namespace->ns_proto),
833 		    "ERROR %s:%s:%d // flags 0x%x // not found",
834 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
835 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
836 		    PROTO_STR(namespace->ns_proto), port, flags);
837 		VERIFY(res != NULL);
838 	}
839 
840 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
841 	    NS_VERB_PROTO(namespace->ns_proto),
842 	    "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf",
843 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
844 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
845 	    PROTO_STR(namespace->ns_proto), port, flags,
846 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
847 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
848 	    NETNS_REF_COUNT(res, NETNS_BSD),
849 	    NETNS_REF_COUNT(res, NETNS_PF));
850 
851 	/* Release reservation */
852 	VERIFY(NETNS_REF_COUNT(res, flags) > 0);
853 	NETNS_REF_COUNT(res, flags) -= 1;
854 
855 	/* Clean up memory, if appropriate */
856 	for (i = 0, refs = 0; i <= NETNS_OWNER_MAX && refs == 0; i++) {
857 		refs |= res->nsr_refs[i];
858 	}
859 	if (refs == 0) {
860 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
861 		    res);
862 		namespace->ns_n_reservations--;
863 		NETNS_LOCK_CONVERT();
864 		netns_ns_reservation_free(res);
865 		netns_ns_cleanup(namespace);
866 	}
867 }
868 
869 __attribute__((always_inline))
870 static inline void
netns_init_global_ns(struct ns ** global_ptr,uint8_t proto,uint8_t addrlen)871 netns_init_global_ns(struct ns **global_ptr, uint8_t proto, uint8_t addrlen)
872 {
873 	struct ns *namespace;
874 
875 	namespace = *global_ptr = netns_ns_alloc(Z_WAITOK);
876 	memset(namespace->ns_addr, 0xFF, addrlen);
877 	namespace->ns_addr_len = addrlen;
878 	namespace->ns_proto = proto;
879 	namespace->ns_is_freeable = 0;
880 }
881 
882 __attribute__((always_inline))
883 static inline void
netns_clear_ifnet(struct ns_token * nstoken)884 netns_clear_ifnet(struct ns_token *nstoken)
885 {
886 #if SK_LOG
887 	char tmp_ip_str[MAX_IPv6_STR_LEN];
888 #endif /* SK_LOG */
889 
890 	NETNS_LOCK_ASSERT_HELD();
891 
892 	if (nstoken->nt_ifp != NULL) {
893 		SLIST_REMOVE(&nstoken->nt_ifp->if_netns_tokens, nstoken,
894 		    ns_token, nt_ifp_link);
895 
896 		SK_DF(NS_VERB_IP(nstoken->nt_addr_len) |
897 		    NS_VERB_PROTO(nstoken->nt_proto),
898 		    "%s:%s:%d // removed from ifnet %d",
899 		    inet_ntop(LEN_TO_AF(nstoken->nt_addr_len),
900 		    nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
901 		    PROTO_STR(nstoken->nt_proto), nstoken->nt_port,
902 		    nstoken->nt_ifp->if_index);
903 
904 		NETNS_LOCK_CONVERT();
905 		ifnet_decr_iorefcnt(nstoken->nt_ifp);
906 		nstoken->nt_ifp = NULL;
907 	} else {
908 		SLIST_REMOVE(&netns_unbound_tokens, nstoken, ns_token,
909 		    nt_ifp_link);
910 	}
911 }
912 
/*
 * Internal shared code to perform a port[-range] reservation, along with all
 * the boilerplate and sanity checks expected for a call coming in from the
 * surrounding kernel code.
 *
 * Parameters:
 *   ns       - namespace to reserve in, or NULL to have the namespace for
 *              addr/addr_len/proto fetched (and created if necessary).  Only
 *              when NULL is passed does this function call
 *              netns_ns_cleanup() on the namespace after a failed
 *              reservation.
 *   token    - in/out.  If it already holds a valid token, flags must carry
 *              NETNS_PRERESERVED (otherwise this panics) and the existing
 *              token is verified against the arguments.  Otherwise a new
 *              token is allocated and stored here on success.
 *   addr     - address to reserve on; addr_len bytes are read.
 *   addr_len - sizeof(struct in_addr) or sizeof(struct in6_addr).
 *   proto    - IPPROTO_TCP or IPPROTO_UDP.
 *   port     - pointer to the port; converted with ntohs() before use and
 *              copied unconverted into the token's flow info.
 *   flags    - NETNS_* reservation/configuration flags.
 *   nfi      - optional flow info copied into the new token; its nfi_ifp is
 *              the interface the token gets attached to.
 *
 * Returns 0 on success, ENOMEM if no namespace could be obtained, or the
 * error reported by _netns_reserve_common().
 *
 * The netns lock must be held by the caller.
 */
static int
_netns_reserve_kpi_common(struct ns *ns, netns_token *token, uint32_t *addr,
    uint8_t addr_len, uint8_t proto, in_port_t *port, uint32_t flags,
    struct ns_flow_info *nfi)
{
	boolean_t ns_want_cleanup = (ns == NULL);
	struct ns_token *nt;
	int err = 0;
	in_port_t hport;
#if SK_LOG
	char tmp_ip_str[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = (nfi != NULL) ? nfi->nfi_ifp : NULL;

	NETNS_LOCK_ASSERT_HELD();

	hport = ntohs(*port);

	VERIFY((flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
	VERIFY(addr_len == sizeof(struct in_addr) ||
	    addr_len == sizeof(struct in6_addr));
	VERIFY(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
	VERIFY(hport != 0);

	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
	    "reserving %s:%s:%d // flags 0x%x // token %svalid",
	    inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str,
	    sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags,
	    NETNS_TOKEN_VALID(token) ? "" : "in");

	/*
	 * See the documentation for NETNS_PRERESERVED in netns.h for an
	 * explanation of this block.
	 */
	if (NETNS_TOKEN_VALID(token)) {
		if (flags & NETNS_PRERESERVED) {
			nt = *token;
			VERIFY(nt->nt_addr_len == addr_len);
			VERIFY(memcmp(nt->nt_addr, addr, addr_len) == 0);
			VERIFY(nt->nt_proto == proto);
			VERIFY(nt->nt_port == hport);
			/*
			 * Note on precedence: '&' binds tighter than '|', so
			 * the left-hand side is
			 * (nt_flags & NETNS_RESERVATION_FLAGS) | NETNS_PRERESERVED.
			 */
			VERIFY((nt->nt_flags &
			    NETNS_RESERVATION_FLAGS | NETNS_PRERESERVED) ==
			    (flags & NETNS_RESERVATION_FLAGS));

			/*
			 * NOTE(review): this branch is taken only when the
			 * configuration flags are already equal, which makes
			 * the update below a no-op; the "0x%x -> 0x%x" log
			 * suggests '!=' may have been intended - confirm.
			 */
			if ((nt->nt_flags & NETNS_CONFIGURATION_FLAGS) ==
			    (flags & NETNS_CONFIGURATION_FLAGS)) {
				SK_DF(NS_VERB_IP(nt->nt_addr_len) |
				    NS_VERB_PROTO(nt->nt_proto),
				    "%s:%s:%d // flags 0x%x -> 0x%x",
				    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
				    nt->nt_addr, tmp_ip_str,
				    sizeof(tmp_ip_str)),
				    PROTO_STR(nt->nt_proto),
				    nt->nt_port, nt->nt_flags, flags);
				nt->nt_flags &= ~NETNS_CONFIGURATION_FLAGS;
				nt->nt_flags |=
				    flags & NETNS_CONFIGURATION_FLAGS;
			}
			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
			    "token was prereserved");
			goto done;
		} else {
			panic("Request to overwrite valid netns token");
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}

	/*
	 * TODO: Check range against bitmap
	 */
	/*
	 * NOTE(review): hport == 0 has already been rejected by the
	 * VERIFY(hport != 0) above, so this branch presumably never runs
	 * while VERIFY is active - confirm before relying on it.
	 */
	if (hport == 0) {
		/*
		 * Caller requested an arbitrary range of ports
		 * TODO: Need to figure out how to allocate
		 * ephemeral ports only.
		 */
		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
		    "ERROR - wildcard port not yet supported");
		err = ENOMEM;
		goto done;
	}

	/*
	 * Fetch namespace for the specified address/protocol, creating
	 * a new namespace if necessary.
	 */
	if (ns == NULL) {
		ASSERT(ns_want_cleanup);
		ns = _netns_get_ns(addr, addr_len, proto, true);
	}
	if (__improbable(ns == NULL)) {
		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
		    "ERROR - couldn't create namespace");
		err = ENOMEM;
		goto done;
	}

	/*
	 * Make a reservation in the namespace
	 * This will return an error if an incompatible reservation
	 * already exists.
	 */
	err = _netns_reserve_common(ns, hport, flags);
	if (__improbable(err != 0)) {
		NETNS_LOCK_CONVERT();
		/* Only clean up a namespace that was fetched by this call */
		if (ns_want_cleanup) {
			netns_ns_cleanup(ns);
		}
		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
		    "ERROR - reservation collision");
		goto done;
	}

	if (!_netns_is_wildcard_addr(ns->ns_addr, addr_len)) {
		/* Record the reservation in the non-wild namespace */
		struct ns *nwns;

		nwns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
		    addr_len)];
		err = _netns_reserve_common(nwns, hport, flags);
		if (__improbable(err != 0)) {
			/* Need to free the specific namespace entry */
			NETNS_LOCK_CONVERT();
			_netns_release_common(ns, hport, flags);
			if (ns_want_cleanup) {
				netns_ns_cleanup(ns);
			}
			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
			    "ERROR - reservation collision");
			goto done;
		}
	}

	/* Flow info storage is requested only when the caller supplied nfi */
	nt = netns_ns_token_alloc(nfi != NULL ? true : false);
	ASSERT(nt->nt_ifp == NULL);
	_netns_set_ifnet_internal(nt, ifp);

	memcpy(nt->nt_addr, addr, addr_len);
	nt->nt_addr_len = addr_len;
	nt->nt_proto = proto;
	nt->nt_port = hport;
	nt->nt_flags = flags;

	if (nfi != NULL) {
		VERIFY(nt->nt_flow_info != NULL);

		memcpy(nt->nt_flow_info, nfi, sizeof(struct ns_flow_info));
		/*
		 * The local port is passed as a separate argument
		 */
		if (nfi->nfi_laddr.sa.sa_family == AF_INET) {
			nt->nt_flow_info->nfi_laddr.sin.sin_port = *port;
		} else if (nfi->nfi_laddr.sa.sa_family == AF_INET6) {
			nt->nt_flow_info->nfi_laddr.sin6.sin6_port = *port;
		}
	}
	*token = nt;

done:
	return err;
}
1081 
1082 /*
1083  * Kernel-facing functions
1084  */
1085 
1086 int
netns_init(void)1087 netns_init(void)
1088 {
1089 	VERIFY(__netns_inited == 0);
1090 
1091 	netns_ns_reservation_size = sizeof(struct ns_reservation);
1092 	netns_ns_reservation_cache = skmem_cache_create(NETNS_NS_RESERVATION_ZONE_NAME,
1093 	    netns_ns_reservation_size, sizeof(uint64_t), NULL, NULL, NULL,
1094 	    NULL, NULL, 0);
1095 	if (netns_ns_reservation_cache == NULL) {
1096 		panic("%s: skmem_cache create failed (%s)", __func__,
1097 		    NETNS_NS_RESERVATION_ZONE_NAME);
1098 		/* NOTREACHED */
1099 		__builtin_unreachable();
1100 	}
1101 
1102 	netns_ns_token_size = sizeof(struct ns_token);
1103 	netns_ns_token_cache = skmem_cache_create(NETNS_NS_TOKEN_ZONE_NAME,
1104 	    netns_ns_token_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
1105 	    NULL, 0);
1106 	if (netns_ns_token_cache == NULL) {
1107 		panic("%s: skmem_cache create failed (%s)", __func__,
1108 		    NETNS_NS_TOKEN_ZONE_NAME);
1109 		/* NOTREACHED */
1110 		__builtin_unreachable();
1111 	}
1112 
1113 	netns_ns_flow_info_size = sizeof(struct ns_flow_info);
1114 	netns_ns_flow_info_cache = skmem_cache_create(NETNS_NS_FLOW_INFO_ZONE_NAME,
1115 	    netns_ns_flow_info_size, sizeof(uint64_t), NULL, NULL, NULL,
1116 	    NULL, NULL, 0);
1117 	if (netns_ns_flow_info_cache == NULL) {
1118 		panic("%s: skmem_cache create failed (%s)", __func__,
1119 		    NETNS_NS_FLOW_INFO_ZONE_NAME);
1120 		/* NOTREACHED */
1121 		__builtin_unreachable();
1122 	}
1123 
1124 	SLIST_INIT(&netns_unbound_tokens);
1125 	SLIST_INIT(&netns_all_tokens);
1126 
1127 	netns_n_namespaces = 0;
1128 	RB_INIT(&netns_namespaces);
1129 
1130 	SK_D("initializing global namespaces");
1131 
1132 	netns_init_global_ns(
1133 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1134 		NETNS_ADDRLEN_V4)], IPPROTO_TCP, sizeof(struct in_addr));
1135 
1136 	netns_init_global_ns(
1137 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1138 		NETNS_ADDRLEN_V4)], IPPROTO_UDP, sizeof(struct in_addr));
1139 
1140 	netns_init_global_ns(
1141 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1142 		NETNS_ADDRLEN_V6)], IPPROTO_TCP, sizeof(struct in6_addr));
1143 
1144 	netns_init_global_ns(
1145 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1146 		NETNS_ADDRLEN_V6)], IPPROTO_UDP, sizeof(struct in6_addr));
1147 
1148 	/* Done */
1149 
1150 	__netns_inited = 1;
1151 	sk_features |= SK_FEATURE_NETNS;
1152 
1153 	SK_D("initialized netns");
1154 
1155 	return 0;
1156 }
1157 
1158 void
netns_uninit(void)1159 netns_uninit(void)
1160 {
1161 	if (__netns_inited == 1) {
1162 		struct ns *namespace;
1163 		struct ns *temp_namespace;
1164 		int i;
1165 
1166 		RB_FOREACH_SAFE(namespace, netns_namespaces_tree,
1167 		    &netns_namespaces, temp_namespace) {
1168 			RB_REMOVE(netns_namespaces_tree, &netns_namespaces,
1169 			    namespace);
1170 			netns_n_namespaces--;
1171 			netns_ns_free(namespace);
1172 		}
1173 
1174 		for (i = 0; i < NETNS_N_GLOBAL; i++) {
1175 			netns_ns_free(netns_global_non_wild[i]);
1176 		}
1177 
1178 		if (netns_ns_flow_info_cache != NULL) {
1179 			skmem_cache_destroy(netns_ns_flow_info_cache);
1180 			netns_ns_flow_info_cache = NULL;
1181 		}
1182 		if (netns_ns_token_cache != NULL) {
1183 			skmem_cache_destroy(netns_ns_token_cache);
1184 			netns_ns_token_cache = NULL;
1185 		}
1186 		if (netns_ns_reservation_cache != NULL) {
1187 			skmem_cache_destroy(netns_ns_reservation_cache);
1188 			netns_ns_reservation_cache = NULL;
1189 		}
1190 
1191 		__netns_inited = 0;
1192 		sk_features &= ~SK_FEATURE_NETNS;
1193 
1194 		SK_D("uninitialized netns");
1195 	}
1196 }
1197 
1198 void
netns_reap_caches(boolean_t purge)1199 netns_reap_caches(boolean_t purge)
1200 {
1201 	/* these aren't created unless netns is enabled */
1202 	if (netns_ns_token_cache != NULL) {
1203 		skmem_cache_reap_now(netns_ns_token_cache, purge);
1204 	}
1205 	if (netns_ns_reservation_cache != NULL) {
1206 		skmem_cache_reap_now(netns_ns_reservation_cache, purge);
1207 	}
1208 	if (netns_ns_flow_info_cache != NULL) {
1209 		skmem_cache_reap_now(netns_ns_flow_info_cache, purge);
1210 	}
1211 }
1212 
1213 boolean_t
netns_is_enabled(void)1214 netns_is_enabled(void)
1215 {
1216 	return __netns_inited == 1;
1217 }
1218 
1219 int
netns_reserve(netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t port,uint32_t flags,struct ns_flow_info * nfi)1220 netns_reserve(netns_token *token, uint32_t *addr, uint8_t addr_len,
1221     uint8_t proto, in_port_t port, uint32_t flags, struct ns_flow_info *nfi)
1222 {
1223 	int err = 0;
1224 #if SK_LOG
1225 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1226 #endif /* SK_LOG */
1227 
1228 	if (__netns_inited == 0) {
1229 		*token = NULL;
1230 		return err;
1231 	}
1232 
1233 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1234 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1235 		return ENOTSUP;
1236 	}
1237 
1238 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1239 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1240 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1241 	    flags);
1242 
1243 	/*
1244 	 * Check wether the process is allowed to bind to a restricted port
1245 	 */
1246 	if (!current_task_can_use_restricted_in_port(port,
1247 	    proto, flags)) {
1248 		*token = NULL;
1249 		return EADDRINUSE;
1250 	}
1251 
1252 	NETNS_LOCK_SPIN();
1253 	err = _netns_reserve_kpi_common(NULL, token, addr, addr_len,
1254 	    proto, &port, flags, nfi);
1255 	NETNS_UNLOCK();
1256 
1257 	return err;
1258 }
1259 
/*
 * Import net.inet.{tcp,udp}.randomize_ports sysctls.
 * netns_reserve_ephemeral() reads these to decide whether the search for an
 * ephemeral port starts at a random position.
 */
extern int      udp_use_randomport;
extern int      tcp_use_randomport;
1263 
1264 int
netns_reserve_ephemeral(netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)1265 netns_reserve_ephemeral(netns_token *token, uint32_t *addr, uint8_t addr_len,
1266     uint8_t proto, in_port_t *port, uint32_t flags, struct ns_flow_info *nfi)
1267 {
1268 	int err = 0;
1269 	in_port_t first = (in_port_t)ipport_firstauto;
1270 	in_port_t last  = (in_port_t)ipport_lastauto;
1271 	in_port_t rand_port;
1272 	in_port_t last_port;
1273 	in_port_t n_last_port;
1274 	struct ns *namespace;
1275 	boolean_t count_up = true;
1276 	boolean_t use_randomport = (proto == IPPROTO_TCP) ?
1277 	    tcp_use_randomport : udp_use_randomport;
1278 #if SK_LOG
1279 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1280 #endif /* SK_LOG */
1281 
1282 	if (__netns_inited == 0) {
1283 		*token = NULL;
1284 		return err;
1285 	}
1286 
1287 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1288 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1289 		return ENOTSUP;
1290 	}
1291 
1292 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1293 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1294 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(*port),
1295 	    flags);
1296 
1297 	NETNS_LOCK_SPIN();
1298 
1299 	namespace = _netns_get_ns(addr, addr_len, proto, true);
1300 	if (namespace == NULL) {
1301 		err = ENOMEM;
1302 		NETNS_UNLOCK();
1303 		return err;
1304 	}
1305 
1306 	if (proto == IPPROTO_UDP) {
1307 		if (UINT16_MAX - namespace->ns_n_reservations <
1308 		    NETNS_NS_UDP_EPHEMERAL_RESERVE) {
1309 			SK_ERR("UDP ephemeral port not available"
1310 			    "(less than 4096 UDP ports left)");
1311 			err = EADDRNOTAVAIL;
1312 			NETNS_UNLOCK();
1313 			return err;
1314 		}
1315 	}
1316 
1317 	if (first == last) {
1318 		rand_port = first;
1319 	} else {
1320 		if (use_randomport) {
1321 			NETNS_LOCK_CONVERT();
1322 			read_frandom(&rand_port, sizeof(rand_port));
1323 
1324 			if (first > last) {
1325 				rand_port = last + (rand_port %
1326 				    (first - last));
1327 				count_up = false;
1328 			} else {
1329 				rand_port = first + (rand_port %
1330 				    (last - first));
1331 			}
1332 		} else {
1333 			if (first > last) {
1334 				rand_port =
1335 				    namespace->ns_last_ephemeral_port_down - 1;
1336 				if (rand_port < last || rand_port > first) {
1337 					rand_port = last;
1338 				}
1339 				count_up = false;
1340 			} else {
1341 				rand_port =
1342 				    namespace->ns_last_ephemeral_port_up + 1;
1343 				if (rand_port < first || rand_port > last) {
1344 					rand_port = first;
1345 				}
1346 			}
1347 		}
1348 	}
1349 	last_port = rand_port;
1350 	n_last_port = htons(last_port);
1351 
1352 	while (true) {
1353 		if (n_last_port == 0) {
1354 			SK_ERR("ephemeral port search range includes 0");
1355 			err = EINVAL;
1356 			break;
1357 		}
1358 
1359 		/*
1360 		 * Skip if this is a restricted port as we do not want to
1361 		 * restricted ports as ephemeral
1362 		 */
1363 		if (!IS_RESTRICTED_IN_PORT(n_last_port)) {
1364 			err = _netns_reserve_kpi_common(namespace, token, addr,
1365 			    addr_len, proto, &n_last_port, flags, nfi);
1366 			if (err == 0 || err != EADDRINUSE) {
1367 				break;
1368 			}
1369 		}
1370 		if (count_up) {
1371 			last_port++;
1372 			if (last_port < first || last_port > last) {
1373 				last_port = first;
1374 			}
1375 		} else {
1376 			last_port--;
1377 			if (last_port < last || last_port > first) {
1378 				last_port = last;
1379 			}
1380 		}
1381 		n_last_port = htons(last_port);
1382 
1383 		if (last_port == rand_port || first == last) {
1384 			SK_ERR("couldn't find free ephemeral port");
1385 			err = EADDRNOTAVAIL;
1386 			break;
1387 		}
1388 	}
1389 
1390 	if (err == 0) {
1391 		*port = n_last_port;
1392 		if (count_up) {
1393 			namespace->ns_last_ephemeral_port_up = last_port;
1394 		} else {
1395 			namespace->ns_last_ephemeral_port_down = last_port;
1396 		}
1397 	} else {
1398 		netns_ns_cleanup(namespace);
1399 	}
1400 
1401 	NETNS_UNLOCK();
1402 
1403 	return err;
1404 }
1405 
1406 void
netns_release(netns_token * token)1407 netns_release(netns_token *token)
1408 {
1409 	struct ns *ns;
1410 	struct ns_token *nt;
1411 	uint8_t proto, addr_len;
1412 #if SK_LOG
1413 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1414 #endif /* SK_LOG */
1415 
1416 	if (!NETNS_TOKEN_VALID(token)) {
1417 		return;
1418 	}
1419 
1420 	if (__netns_inited == 0) {
1421 		*token = NULL;
1422 		return;
1423 	}
1424 
1425 	NETNS_LOCK_SPIN();
1426 
1427 	nt = *token;
1428 	*token = NULL;
1429 
1430 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
1431 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1432 	    nt->nt_addr_len == sizeof(struct in6_addr));
1433 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1434 
1435 	addr_len = nt->nt_addr_len;
1436 	proto = nt->nt_proto;
1437 
1438 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1439 	    "releasing %s:%s:%d",
1440 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1441 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto),
1442 	    nt->nt_port);
1443 
1444 	if (!_netns_is_wildcard_addr(nt->nt_addr, addr_len)) {
1445 		/* Remove from global non-wild namespace */
1446 
1447 		ns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1448 		    addr_len)];
1449 		VERIFY(ns != NULL);
1450 
1451 		_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1452 	}
1453 
1454 	ns = _netns_get_ns(nt->nt_addr, addr_len, proto, false);
1455 	VERIFY(ns != NULL);
1456 	_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1457 
1458 	netns_clear_ifnet(nt);
1459 	netns_ns_token_free(nt);
1460 
1461 	NETNS_UNLOCK();
1462 }
1463 
/*
 * Move an existing reservation to a different local address.
 *
 * The token must be valid and owned by NETNS_BSD.  The port is first
 * reserved in the namespace of the new address, then released from the
 * namespace of the old address; the global non-wildcard namespaces are
 * updated to match, and finally the token's address is rewritten.
 *
 * Returns 0 on success (also when netns is not initialized or the address
 * is unchanged), ENOMEM if the new namespace cannot be obtained, or the
 * error from _netns_reserve_common() on a collision.
 */
int
netns_change_addr(netns_token *token, uint32_t *addr, uint8_t addr_len)
{
	int err = 0;
	struct ns *old_namespace;
	struct ns *new_namespace;
	struct ns *global_namespace;
	struct ns_token *nt;
	uint8_t proto;
#if SK_LOG
	char tmp_ip_str_1[MAX_IPv6_STR_LEN];
	char tmp_ip_str_2[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */

	if (__netns_inited == 0) {
		return 0;
	}

	NETNS_LOCK();

	VERIFY(NETNS_TOKEN_VALID(token));

	nt = *token;

	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) == NETNS_BSD);
	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
	    nt->nt_addr_len == sizeof(struct in6_addr));
	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);

	proto = nt->nt_proto;

#if SK_LOG
	/* Old address goes into tmp_ip_str_1, new address into tmp_ip_str_2 */
	inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
	    tmp_ip_str_1, sizeof(tmp_ip_str_1));
	inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2,
	    sizeof(tmp_ip_str_2));
#endif /* SK_LOG */
	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
	    "changing address for %s:%d from %s to %s",
	    PROTO_STR(proto), nt->nt_port, tmp_ip_str_1,
	    tmp_ip_str_2);

	if (nt->nt_addr_len == addr_len &&
	    memcmp(nt->nt_addr, addr, nt->nt_addr_len) == 0) {
		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
		    "address didn't change, exiting early");
		goto done;
	}

	/* The namespace of the current address must already exist */
	old_namespace = _netns_get_ns(nt->nt_addr, nt->nt_addr_len, proto,
	    false);
	VERIFY(old_namespace != NULL);

	/* The namespace of the new address is created on demand */
	new_namespace = _netns_get_ns(addr, addr_len, proto, true);
	if (new_namespace == NULL) {
		err = ENOMEM;
		goto done;
	}

	/* Acquire reservation in new namespace */
	if ((err = _netns_reserve_common(new_namespace, nt->nt_port,
	    nt->nt_flags))) {
		NETNS_LOCK_CONVERT();
		netns_ns_cleanup(new_namespace);
		SK_ERR("ERROR - reservation collision under new namespace");
		goto done;
	}

	/* Release from old namespace */
	_netns_release_common(old_namespace, nt->nt_port, nt->nt_flags);

	if (!_netns_is_wildcard_addr(nt->nt_addr, nt->nt_addr_len)) {
		/*
		 * Old address is non-wildcard.
		 * Remove old reservation from global non-wild namespace
		 */
		global_namespace = netns_global_non_wild[
			NETNS_NS_GLOBAL_IDX(proto, nt->nt_addr_len)];
		VERIFY(global_namespace != NULL);

		_netns_release_common(global_namespace, nt->nt_port,
		    nt->nt_flags);
	}

	if (!_netns_is_wildcard_addr(addr, addr_len)) {
		/*
		 * New address is non-wildcard.
		 * Record new reservation in global non-wild namespace
		 */
		global_namespace = netns_global_non_wild[
			NETNS_NS_GLOBAL_IDX(proto, addr_len)];
		VERIFY(global_namespace != NULL);

		if ((err = _netns_reserve_common(global_namespace,
		    nt->nt_port, nt->nt_flags)) != 0) {
			SK_ERR("ERROR - reservation collision under new global namespace");
			/* XXX: Should not fail. Maybe assert instead */
			/*
			 * NOTE(review): on this path the old reservations
			 * have already been released and the reservation in
			 * the new namespace is kept, yet the token still
			 * carries the old address - nothing is rolled back.
			 */
			goto done;
		}
	}

	/* All reservations moved: the token can now carry the new address */
	memcpy(nt->nt_addr, addr, addr_len);
	nt->nt_addr_len = addr_len;

done:
	NETNS_UNLOCK();
	return err;
}
1572 
1573 static void
_netns_set_ifnet_internal(struct ns_token * nt,struct ifnet * ifp)1574 _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp)
1575 {
1576 #if SK_LOG
1577 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1578 #endif /* SK_LOG */
1579 
1580 	NETNS_LOCK_ASSERT_HELD();
1581 
1582 	if (ifp != NULL && ifnet_is_attached(ifp, 1)) {
1583 		nt->nt_ifp = ifp;
1584 		SLIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link);
1585 
1586 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1587 		    "%s:%s:%d // added to ifnet %d",
1588 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1589 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1590 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1591 		    ifp->if_index);
1592 	} else {
1593 		SLIST_INSERT_HEAD(&netns_unbound_tokens, nt, nt_ifp_link);
1594 	}
1595 }
1596 
1597 void
netns_set_ifnet(netns_token * token,ifnet_t ifp)1598 netns_set_ifnet(netns_token *token, ifnet_t ifp)
1599 {
1600 	struct ns_token *nt;
1601 #if SK_LOG
1602 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1603 #endif /* SK_LOG */
1604 
1605 	if (__netns_inited == 0) {
1606 		return;
1607 	}
1608 
1609 	NETNS_LOCK();
1610 
1611 	VERIFY(NETNS_TOKEN_VALID(token));
1612 
1613 	nt = *token;
1614 
1615 	if (nt->nt_ifp == ifp) {
1616 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1617 		    "%s:%s:%d // ifnet already %d, exiting early",
1618 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1619 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1620 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1621 		    ifp ? ifp->if_index : -1);
1622 		NETNS_UNLOCK();
1623 		return;
1624 	}
1625 
1626 	netns_clear_ifnet(nt);
1627 
1628 	_netns_set_ifnet_internal(nt, ifp);
1629 
1630 	NETNS_UNLOCK();
1631 }
1632 
1633 void
netns_ifnet_detach(ifnet_t ifp)1634 netns_ifnet_detach(ifnet_t ifp)
1635 {
1636 	struct ns_token *token, *tmp_token;
1637 
1638 	if (__netns_inited == 0) {
1639 		return;
1640 	}
1641 
1642 	NETNS_LOCK();
1643 
1644 	SLIST_FOREACH_SAFE(token, &ifp->if_netns_tokens, nt_ifp_link,
1645 	    tmp_token) {
1646 		netns_clear_ifnet(token);
1647 		SLIST_INSERT_HEAD(&netns_unbound_tokens, token, nt_ifp_link);
1648 	}
1649 
1650 	NETNS_UNLOCK();
1651 }
1652 
1653 static void
_netns_set_state(netns_token * token,uint32_t state)1654 _netns_set_state(netns_token *token, uint32_t state)
1655 {
1656 	struct ns_token *nt;
1657 #if SK_LOG
1658 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1659 #endif /* SK_LOG */
1660 
1661 	if (__netns_inited == 0) {
1662 		return;
1663 	}
1664 
1665 	NETNS_LOCK();
1666 	VERIFY(NETNS_TOKEN_VALID(token));
1667 
1668 	nt = *token;
1669 	nt->nt_state |= state;
1670 
1671 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1672 	    "%s:%s:%d // state 0x%b",
1673 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1674 	    tmp_ip_str, sizeof(tmp_ip_str)),
1675 	    PROTO_STR(nt->nt_proto), nt->nt_port, state, NETNS_STATE_BITS);
1676 
1677 	NETNS_UNLOCK();
1678 }
1679 
1680 void
netns_half_close(netns_token * token)1681 netns_half_close(netns_token *token)
1682 {
1683 	_netns_set_state(token, NETNS_STATE_HALFCLOSED);
1684 }
1685 
1686 void
netns_withdraw(netns_token * token)1687 netns_withdraw(netns_token *token)
1688 {
1689 	_netns_set_state(token, NETNS_STATE_WITHDRAWN);
1690 }
1691 
1692 int
netns_get_flow_info(netns_token * token,struct ns_flow_info * nfi)1693 netns_get_flow_info(netns_token *token,
1694     struct ns_flow_info *nfi)
1695 {
1696 	if (__netns_inited == 0) {
1697 		return ENOTSUP;
1698 	}
1699 
1700 	NETNS_LOCK();
1701 	if (!NETNS_TOKEN_VALID(token) ||
1702 	    nfi == NULL) {
1703 		NETNS_UNLOCK();
1704 		return EINVAL;
1705 	}
1706 
1707 	struct ns_token *nt = *token;
1708 	if (nt->nt_flow_info == NULL) {
1709 		NETNS_UNLOCK();
1710 		return ENOENT;
1711 	}
1712 
1713 	memcpy(nfi, nt->nt_flow_info, sizeof(struct ns_flow_info));
1714 	NETNS_UNLOCK();
1715 
1716 	return 0;
1717 }
1718 
1719 void
netns_change_flags(netns_token * token,uint32_t set_flags,uint32_t clear_flags)1720 netns_change_flags(netns_token *token, uint32_t set_flags,
1721     uint32_t clear_flags)
1722 {
1723 	struct ns_token *nt;
1724 #if SK_LOG
1725 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1726 #endif /* SK_LOG */
1727 
1728 	if (__netns_inited == 0) {
1729 		return;
1730 	}
1731 
1732 	NETNS_LOCK();
1733 
1734 	VERIFY(NETNS_TOKEN_VALID(token));
1735 
1736 	nt = *token;
1737 
1738 	VERIFY(!((set_flags | clear_flags) & NETNS_RESERVATION_FLAGS));
1739 	/* TODO: verify set and clear flags don't overlap? */
1740 
1741 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1742 	    "%s:%s:%d // flags 0x%x -> 0x%x",
1743 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1744 	    tmp_ip_str, sizeof(tmp_ip_str)),
1745 	    PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags,
1746 	    nt->nt_flags | set_flags & ~clear_flags);
1747 
1748 	nt->nt_flags |= set_flags;
1749 	nt->nt_flags &= ~clear_flags;
1750 
1751 	NETNS_UNLOCK();
1752 }
1753 
1754 /*
1755  * Port offloading KPI
1756  */
1757 static inline void
netns_local_port_scan_flow_entry(struct flow_entry * fe,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1758 netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protocol,
1759     u_int32_t flags, u_int8_t *bitfield)
1760 {
1761 	struct ns_token *token;
1762 	boolean_t iswildcard = false;
1763 
1764 	if (fe == NULL) {
1765 		return;
1766 	}
1767 
1768 	if (fe->fe_flags & FLOWENTF_EXTRL_PORT) {
1769 		return;
1770 	}
1771 
1772 	token = fe->fe_port_reservation;
1773 	if (token == NULL) {
1774 		return;
1775 	}
1776 
1777 	/*
1778 	 * We are only interested in active flows over skywalk channels
1779 	 */
1780 	if ((token->nt_flags & NETNS_OWNER_MASK) != NETNS_SKYWALK) {
1781 		return;
1782 	}
1783 
1784 	if (token->nt_state & NETNS_STATE_WITHDRAWN) {
1785 		return;
1786 	}
1787 
1788 	if (!(flags & IFNET_GET_LOCAL_PORTS_ANYTCPSTATEOK) &&
1789 	    (flags & IFNET_GET_LOCAL_PORTS_ACTIVEONLY) &&
1790 	    (token->nt_state & NETNS_STATE_HALFCLOSED)) {
1791 		return;
1792 	}
1793 
1794 	VERIFY(token->nt_addr_len == sizeof(struct in_addr) ||
1795 	    token->nt_addr_len == sizeof(struct in6_addr));
1796 
1797 	if (token->nt_addr_len == sizeof(struct in_addr)) {
1798 		if (protocol == PF_INET6) {
1799 			return;
1800 		}
1801 
1802 		iswildcard = token->nt_inaddr.s_addr == INADDR_ANY;
1803 	} else if (token->nt_addr_len == sizeof(struct in6_addr)) {
1804 		if (protocol == PF_INET) {
1805 			return;
1806 		}
1807 
1808 		iswildcard = IN6_IS_ADDR_UNSPECIFIED(
1809 			&token->nt_in6addr);
1810 	}
1811 	if (!(flags & IFNET_GET_LOCAL_PORTS_WILDCARDOK) && iswildcard) {
1812 		return;
1813 	}
1814 
1815 	if ((flags & IFNET_GET_LOCAL_PORTS_TCPONLY) &&
1816 	    token->nt_proto == IPPROTO_UDP) {
1817 		return;
1818 	}
1819 	if ((flags & IFNET_GET_LOCAL_PORTS_UDPONLY) &&
1820 	    token->nt_proto == IPPROTO_TCP) {
1821 		return;
1822 	}
1823 
1824 	if (!(flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) &&
1825 	    (token->nt_flags & NETNS_NOWAKEFROMSLEEP)) {
1826 		return;
1827 	}
1828 
1829 	if ((flags & IFNET_GET_LOCAL_PORTS_RECVANYIFONLY) &&
1830 	    !(token->nt_flags & NETNS_RECVANYIF)) {
1831 		return;
1832 	}
1833 
1834 	if ((flags & IFNET_GET_LOCAL_PORTS_EXTBGIDLEONLY) &&
1835 	    !(token->nt_flags & NETNS_EXTBGIDLE)) {
1836 		return;
1837 	}
1838 
1839 	if (token->nt_ifp != NULL && token->nt_flow_info != NULL) {
1840 		bitstr_set(bitfield, token->nt_port);
1841 		(void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index,
1842 		    token->nt_flow_info, token->nt_flags);
1843 	} else {
1844 		SK_ERR("%s: unknown owner port %u"
1845 		    " nt_flags 0x%x ifindex %u nt_flow_info %p\n",
1846 		    __func__, token->nt_port,
1847 		    token->nt_flags,
1848 		    token->nt_ifp != NULL ? token->nt_ifp->if_index : 0,
1849 		    token->nt_flow_info);
1850 	}
1851 }
1852 
1853 static void
netns_get_if_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1854 netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol,
1855     u_int32_t flags, u_int8_t *bitfield)
1856 {
1857 	struct nx_flowswitch *fsw = NULL;
1858 
1859 	if (ifp == NULL || ifp->if_na == NULL) {
1860 		return;
1861 	}
1862 	/* Ensure that the interface is attached and won't detach */
1863 	if (!ifnet_is_attached(ifp, 1)) {
1864 		return;
1865 	}
1866 	fsw = fsw_ifp_to_fsw(ifp);
1867 	if (fsw == NULL) {
1868 		goto done;
1869 	}
1870 	FSW_RLOCK(fsw);
1871 	NETNS_LOCK();
1872 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1873 		netns_local_port_scan_flow_entry(_fe, protocol, flags,
1874 		bitfield);
1875 	});
1876 	NETNS_UNLOCK();
1877 	FSW_UNLOCK(fsw);
1878 done:
1879 	ifnet_decr_iorefcnt(ifp);
1880 }
1881 
1882 errno_t
netns_get_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1883 netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol,
1884     u_int32_t flags, u_int8_t *bitfield)
1885 {
1886 	if (__netns_inited == 0) {
1887 		return 0;
1888 	}
1889 	if (ifp != NULL) {
1890 		netns_get_if_local_ports(ifp, protocol, flags, bitfield);
1891 	} else {
1892 		errno_t error;
1893 		ifnet_t *ifp_list;
1894 		uint32_t count, i;
1895 
1896 		error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count);
1897 		if (error != 0) {
1898 			os_log_error(OS_LOG_DEFAULT,
1899 			    "%s: ifnet_list_get_all() failed %d",
1900 			    __func__, error);
1901 			return error;
1902 		}
1903 		for (i = 0; i < count; i++) {
1904 			if (TAILQ_EMPTY(&ifp_list[i]->if_addrhead)) {
1905 				continue;
1906 			}
1907 			netns_get_if_local_ports(ifp_list[i], protocol, flags,
1908 			    bitfield);
1909 		}
1910 		ifnet_list_free(ifp_list);
1911 	}
1912 
1913 	return 0;
1914 }
1915 
1916 uint32_t
netns_find_anyres_byaddr(struct ifaddr * ifa,uint8_t proto)1917 netns_find_anyres_byaddr(struct ifaddr *ifa, uint8_t proto)
1918 {
1919 	int result = 0;
1920 	int ifa_addr_len;
1921 	struct ns_token *token;
1922 	struct ifnet *ifp = ifa->ifa_ifp;
1923 	struct sockaddr *ifa_addr = ifa->ifa_addr;
1924 
1925 	if (__netns_inited == 0) {
1926 		return ENOTSUP;
1927 	}
1928 
1929 	if ((ifa_addr->sa_family != AF_INET) &&
1930 	    (ifa_addr->sa_family != AF_INET6)) {
1931 		return 0;
1932 	}
1933 
1934 	ifa_addr_len = (ifa_addr->sa_family == AF_INET) ?
1935 	    sizeof(struct in_addr) : sizeof(struct in6_addr);
1936 
1937 	NETNS_LOCK();
1938 
1939 	SLIST_FOREACH(token, &ifp->if_netns_tokens, nt_ifp_link) {
1940 		if ((token->nt_flags & NETNS_OWNER_MASK) == NETNS_PF) {
1941 			continue;
1942 		}
1943 		if (token->nt_addr_len != ifa_addr_len) {
1944 			continue;
1945 		}
1946 		if (token->nt_proto != proto) {
1947 			continue;
1948 		}
1949 		if (ifa_addr->sa_family == AF_INET) {
1950 			if (token->nt_inaddr.s_addr ==
1951 			    (satosin(ifa->ifa_addr))->sin_addr.s_addr) {
1952 				result = 1;
1953 				break;
1954 			}
1955 		} else if (ifa_addr->sa_family == AF_INET6) {
1956 			if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa),
1957 			    &token->nt_in6addr)) {
1958 				result = 1;
1959 				break;
1960 			}
1961 		}
1962 	}
1963 
1964 	NETNS_UNLOCK();
1965 	return result;
1966 }
1967 
1968 static uint32_t
_netns_lookup_ns_n_reservations(uint32_t * addr,uint8_t addr_len,uint8_t proto)1969 _netns_lookup_ns_n_reservations(uint32_t *addr, uint8_t addr_len, uint8_t proto)
1970 {
1971 	uint32_t ns_n_reservations = 0;
1972 	NETNS_LOCK_SPIN();
1973 	struct ns *namespace = _netns_get_ns(addr, addr_len, proto, true);
1974 	if (namespace != NULL) {
1975 		ns_n_reservations = namespace->ns_n_reservations;
1976 	}
1977 	NETNS_UNLOCK();
1978 	return ns_n_reservations;
1979 }
1980 
/*
 * IPv4 entry point: report the reservation count of the namespace keyed
 * by `addr` and `proto`.
 */
uint32_t
netns_lookup_reservations_count_in(struct in_addr addr, uint8_t proto)
{
	uint32_t count;

	count = _netns_lookup_ns_n_reservations(&addr.s_addr, sizeof(addr),
	    proto);
	return count;
}
1986 
1987 uint32_t
netns_lookup_reservations_count_in6(struct in6_addr addr,uint8_t proto)1988 netns_lookup_reservations_count_in6(struct in6_addr addr, uint8_t proto)
1989 {
1990 	if (IN6_IS_SCOPE_EMBED(&addr)) {
1991 		addr.s6_addr16[1] = 0;
1992 	}
1993 	return _netns_lookup_ns_n_reservations(&addr.s6_addr32[0], sizeof(struct in6_addr), proto);
1994 }
1995 
1996 /*
1997  * Sysctl interface
1998  */
1999 
2000 static int netns_ctl_dump_all SYSCTL_HANDLER_ARGS;
2001 
2002 SYSCTL_NODE(_kern_skywalk, OID_AUTO, netns, CTLFLAG_RW | CTLFLAG_LOCKED,
2003     0, "Netns interface");
2004 
2005 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netns,
2006     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
2007     0, 0, netns_ctl_dump_all, "-",
2008     "Namespace contents (struct netns_ctl_dump_header, "
2009     "skywalk/os_stats_private.h)");
2010 
2011 static int
netns_ctl_write_ns(struct sysctl_req * req,struct ns * namespace,boolean_t is_global)2012 netns_ctl_write_ns(struct sysctl_req *req, struct ns *namespace,
2013     boolean_t is_global)
2014 {
2015 	struct ns_reservation *res;
2016 	struct netns_ctl_dump_header response_header;
2017 	struct netns_ctl_dump_record response_record;
2018 	int err;
2019 
2020 	/* Fill out header */
2021 	memset(&response_header, 0, sizeof(response_header));
2022 	response_header.ncdh_n_records = namespace->ns_n_reservations;
2023 	response_header.ncdh_proto = namespace->ns_proto;
2024 
2025 	if (is_global) {
2026 		response_header.ncdh_addr_len = 0;
2027 	} else {
2028 		response_header.ncdh_addr_len = namespace->ns_addr_len;
2029 	}
2030 	memcpy(response_header.ncdh_addr, namespace->ns_addr,
2031 	    namespace->ns_addr_len);
2032 
2033 	err = SYSCTL_OUT(req, &response_header, sizeof(response_header));
2034 	if (err) {
2035 		return err;
2036 	}
2037 
2038 	/* Fill out records */
2039 	RB_FOREACH(res, ns_reservation_tree, &namespace->ns_reservations) {
2040 		memset(&response_record, 0, sizeof(response_record));
2041 		response_record.ncdr_port = res->nsr_port;
2042 		response_record.ncdr_port_end = 0;
2043 		response_record.ncdr_listener_refs =
2044 		    NETNS_REF_COUNT(res, NETNS_LISTENER);
2045 		response_record.ncdr_skywalk_refs =
2046 		    NETNS_REF_COUNT(res, NETNS_SKYWALK);
2047 		response_record.ncdr_bsd_refs =
2048 		    NETNS_REF_COUNT(res, NETNS_BSD);
2049 		response_record.ncdr_pf_refs =
2050 		    NETNS_REF_COUNT(res, NETNS_PF);
2051 		err = SYSCTL_OUT(req, &response_record,
2052 		    sizeof(response_record));
2053 		if (err) {
2054 			return err;
2055 		}
2056 	}
2057 
2058 	return 0;
2059 }
2060 
2061 static int
2062 netns_ctl_dump_all SYSCTL_HANDLER_ARGS
2063 {
2064 #pragma unused(oidp, arg1, arg2)
2065 	struct ns *namespace;
2066 	int i, err = 0;
2067 
2068 	if (!kauth_cred_issuser(kauth_cred_get())) {
2069 		return EPERM;
2070 	}
2071 
2072 	if (__netns_inited == 0) {
2073 		return ENOTSUP;
2074 	}
2075 
2076 	NETNS_LOCK();
2077 
2078 	for (i = 0; i < NETNS_N_GLOBAL; i++) {
2079 		err = netns_ctl_write_ns(req, netns_global_non_wild[i], true);
2080 		if (err) {
2081 			goto done;
2082 		}
2083 	}
2084 
2085 	RB_FOREACH(namespace, netns_namespaces_tree, &netns_namespaces) {
2086 		err = netns_ctl_write_ns(req, namespace, false);
2087 		if (err) {
2088 			goto done;
2089 		}
2090 	}
2091 
2092 	/*
2093 	 * If this is just a request for length, add slop because
2094 	 * this is dynamically changing data
2095 	 */
2096 	if (req->oldptr == USER_ADDR_NULL) {
2097 		req->oldidx += 20 * sizeof(struct netns_ctl_dump_record);
2098 	}
2099 
2100 done:
2101 	NETNS_UNLOCK();
2102 	return err;
2103 }
2104 /* CSTYLED */
2105