xref: /xnu-8792.41.9/bsd/skywalk/namespace/netns.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/assert.h>
30 #include <kern/locks.h>
31 #include <kern/zalloc.h>
32 #include <libkern/tree.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/bitstring.h>
36 #include <net/if.h>
37 #include <net/kpi_interface.h>
38 #include <net/restricted_in_port.h>
39 
40 #include <netinet/in.h>
41 #include <netinet/in_pcb.h>
42 #include <netinet/tcp_fsm.h>
43 #include <netinet/tcp_var.h>
44 
45 #include <netinet6/in6_var.h>
46 #include <string.h>
47 
48 #include <skywalk/os_skywalk.h>
49 #include <skywalk/os_skywalk_private.h>
50 #include <skywalk/os_stats_private.h>
51 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
52 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
53 
54 #include <net/if_ports_used.h>
55 
/*
 * Module-wide flag, starting at 0.  NOTE(review): presumably set once netns
 * initialization has completed; the code that writes and tests it is not
 * visible in this part of the file -- confirm.
 */
56 static int __netns_inited = 0;
57 
58 /*
59  * Logging
 *
 * Helper macros used when building SK_DF() log statements:
 *   NS_VERB_PROTO(proto)  - SK_VERB_NS_TCP for IPPROTO_TCP, otherwise
 *                           SK_VERB_NS_UDP.
 *   NS_VERB_IP(addr_len)  - SK_VERB_NS_IPV4 when the length equals
 *                           sizeof (struct in_addr), otherwise
 *                           SK_VERB_NS_IPV6.
 *   PROTO_STR(proto)      - "tcp" for IPPROTO_TCP, otherwise "udp".
 *   LEN_TO_AF(len)        - AF_INET when the length equals
 *                           sizeof (struct in_addr), otherwise AF_INET6.
 *
 * Each macro is a two-way choice: any value other than the one tested
 * falls into the second (UDP / IPv6) arm.
60  */
61 
62 #define NS_VERB_PROTO(proto)    ((proto == IPPROTO_TCP) ? SK_VERB_NS_TCP : \
63 	                                    SK_VERB_NS_UDP)
64 #define NS_VERB_IP(addr_len)    ((addr_len == sizeof (struct in_addr)) ? \
65 	                                    SK_VERB_NS_IPV4 : SK_VERB_NS_IPV6)
66 #define PROTO_STR(proto)        ((proto == IPPROTO_TCP) ? "tcp" : "udp")
67 #define LEN_TO_AF(len)          (((len == sizeof (struct in_addr)) ? \
68 	                            AF_INET : AF_INET6))
69 /*
70  * Locking
71  * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is
72  * acquired at the entry of every kernel-facing function, and released at the
73  * end. Data within netns_token structures is also protected under this lock.
 *
 * The macros below all operate on the single file-scope mutex netns_lock:
 *   NETNS_LOCK()               - lck_mtx_lock()
 *   NETNS_LOCK_SPIN()          - lck_mtx_lock_spin()
 *   NETNS_LOCK_CONVERT()       - asserts ownership, then calls
 *                                lck_mtx_convert_spin()
 *   NETNS_UNLOCK()             - lck_mtx_unlock()
 *   NETNS_LOCK_ASSERT_HELD()   - LCK_MTX_ASSERT(..., LCK_MTX_ASSERT_OWNED)
 *   NETNS_LOCK_ASSERT_NOTHELD()- LCK_MTX_ASSERT(..., LCK_MTX_ASSERT_NOTOWNED)
 *
 * Code in this file invokes NETNS_LOCK_CONVERT() ahead of allocator calls
 * such as skmem_cache_alloc(..., SKMEM_SLEEP) and zalloc_flags(Z_WAITOK).
74  */
75 
76 #define NETNS_LOCK()                    \
77 	lck_mtx_lock(&netns_lock)
78 #define NETNS_LOCK_SPIN()               \
79 	lck_mtx_lock_spin(&netns_lock)
80 #define NETNS_LOCK_CONVERT() do {       \
81 	NETNS_LOCK_ASSERT_HELD();       \
82 	lck_mtx_convert_spin(&netns_lock); \
83 } while (0)
84 #define NETNS_UNLOCK()                  \
85 	lck_mtx_unlock(&netns_lock)
86 #define NETNS_LOCK_ASSERT_HELD()        \
87 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_OWNED)
88 #define NETNS_LOCK_ASSERT_NOTHELD()     \
89 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_NOTOWNED)
90 
/* Lock group and the global netns mutex referenced by the macros above. */
91 static LCK_GRP_DECLARE(netns_lock_group, "netns_lock");
92 static LCK_MTX_DECLARE(netns_lock, &netns_lock_group);
93 
94 /*
95  * Internal data structures and parameters
96  */
97 
98 /*
99  * Local ports are kept track of by reference counts kept in a tree specific to
100  * an <IP, protocol> tuple (see struct ns).
101  *
102  * Note: port numbers are stored in host byte order.
 *
 * One ns_reservation exists per reserved port within a namespace.  It is
 * removed from the tree and freed once every per-owner count in nsr_refs[]
 * has dropped to zero (see _netns_release_common()).
103  */
104 struct ns_reservation {
105 	RB_ENTRY(ns_reservation) nsr_link;	/* linkage in ns_reservation_tree */
106 	uint32_t nsr_refs[NETNS_OWNER_MAX + 1];	/* one reference count per owner */
107 	in_port_t nsr_port;			/* reserved port; the tree key */
108 	bool nsr_reuseport:1;			/* NETNS_REUSEPORT was set at allocation */
109 };
110 
/*
 * Yields the reference-count slot (an lvalue) of `nsr` belonging to the owner
 * encoded in `flags`; only the NETNS_OWNER_MASK bits of `flags` are used.
 */
111 #define NETNS_REF_COUNT(nsr, flags)     \
112 	(nsr)->nsr_refs[((flags) & NETNS_OWNER_MASK)]
113 
114 static inline int nsr_cmp(const struct ns_reservation *,
115     const struct ns_reservation *);
116 
/* Red-black tree of reservations, ordered by nsr_cmp() (i.e. by port). */
117 RB_HEAD(ns_reservation_tree, ns_reservation);
118 RB_PROTOTYPE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
119 RB_GENERATE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
120 
121 static inline struct ns_reservation *ns_reservation_tree_find(
122 	struct ns_reservation_tree *, const in_port_t);
123 
124 /*
125  * A namespace keeps track of the local port numbers in use for a given
126  * <IP, protocol> tuple. There are also global namespaces for each
127  * protocol to accommodate INADDR_ANY behavior and diagnostics.
128  */
129 struct ns {
130 	RB_ENTRY(ns)    ns_link;	/* linkage in netns_namespaces_tree */
131 
	/*
	 * Address pointer handed to flow_ip_cmp() by ns_cmp().  For a
	 * namespace created by _netns_get_ns() it points at ns_addr below;
	 * for the on-stack lookup key it points at the caller's address.
	 */
132 	void            *ns_addr_key;
133 
	/* The address itself; ns_addr_len bytes of it are meaningful. */
134 	union {
135 		uint32_t        ns_addr[4];
136 		struct in_addr  ns_inaddr;
137 		struct in6_addr ns_in6addr;
138 	};
139 	uint8_t         ns_addr_len;	/* sizeof in_addr or sizeof in6_addr */
140 	uint8_t         ns_proto;	/* IP protocol number (IPPROTO_*) */
141 
	/*
	 * Both are seeded by netns_ns_alloc() with the same random port drawn
	 * from the ipport_firstauto/ipport_lastauto range.  NOTE(review): the
	 * ephemeral-port search that updates them is outside this view.
	 */
142 	in_port_t       ns_last_ephemeral_port_down;
143 	in_port_t       ns_last_ephemeral_port_up;
144 
	/*
	 * Non-zero by default; netns_init_global_ns() clears it.
	 * netns_ns_cleanup() only frees a namespace when this is set.
	 */
145 	uint8_t         ns_is_freeable;
146 
147 	uint32_t        ns_n_reservations;	/* entries in ns_reservations */
148 	struct ns_reservation_tree ns_reservations;	/* reserved ports */
149 };
150 
/* Number of namespaces currently linked into netns_namespaces. */
151 static uint32_t netns_n_namespaces;
152 
153 static inline int ns_cmp(const struct ns *, const struct ns *);
154 
/* Red-black tree of all per-<IP, protocol> namespaces, ordered by ns_cmp(). */
155 RB_HEAD(netns_namespaces_tree, ns) netns_namespaces =
156     RB_INITIALIZER(netns_namespaces);
157 RB_PROTOTYPE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
158 RB_GENERATE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
159 
160 /*
161  * Declare pointers to global namespaces for each protocol.
162  * All non-wildcard reservations will have an entry here.
 *
 * Both arrays are indexed by NETNS_NS_GLOBAL_IDX(proto, addrlen), which ORs
 * a protocol bit (NETNS_NS_TCP = 0, NETNS_NS_UDP = 1) with an address-family
 * bit (NETNS_NS_V4 = 0, NETNS_NS_V6 = 2), giving an index in 0..3.  Any
 * protocol other than IPPROTO_TCP selects the UDP slot and any length other
 * than sizeof (struct in_addr) selects the V6 slot.
163  */
164 #define NETNS_N_GLOBAL  4
165 static struct ns *netns_global_non_wild[NETNS_N_GLOBAL];
166 static struct ns *netns_global_wild[NETNS_N_GLOBAL];
167 #define NETNS_NS_TCP    0
168 #define NETNS_NS_UDP    1
169 #define NETNS_NS_V4     0
170 #define NETNS_NS_V6     2
171 #define NETNS_NS_GLOBAL_IDX(proto, addrlen)     \
172 	((((proto) == IPPROTO_TCP) ? NETNS_NS_TCP : NETNS_NS_UDP) | \
173 	(((addrlen) == sizeof (struct in_addr)) ? NETNS_NS_V4 : NETNS_NS_V6))
174 
/* NOTE(review): the code using this constant is outside this view. */
175 #define NETNS_NS_UDP_EPHEMERAL_RESERVE  4096
176 
177 /*
178  * Internal token structure
179  *
180  * Note: port numbers are stored in host byte order.
181  */
182 struct ns_token {
183 	/* Reservation state */
	/* interface the token is attached to; NULL while unbound */
184 	ifnet_t                 nt_ifp;
	/* linkage on nt_ifp->if_netns_tokens, or on netns_unbound_tokens */
185 	SLIST_ENTRY(ns_token)   nt_ifp_link;
	/* linkage on netns_all_tokens */
186 	SLIST_ENTRY(ns_token)   nt_all_link;
187 	uint32_t                nt_state;       /* NETNS_STATE_* */
188 
189 	/* Reservation context */
	/* reserved address; nt_addr_len bytes of it are meaningful */
190 	union {
191 		uint32_t        nt_addr[4];
192 		struct in_addr  nt_inaddr;
193 		struct in6_addr nt_in6addr;
194 	};
195 	uint8_t                 nt_addr_len;
196 	uint8_t                 nt_proto;
197 	in_port_t               nt_port;
198 	uint32_t                nt_flags;
199 
200 	/* Optional information about the flow */
	/* taken from netns_ns_flow_info_cache when requested; may be NULL */
201 	struct ns_flow_info     *nt_flow_info;
202 };
203 
204 /* Valid values for nt_state */
205 #define NETNS_STATE_HALFCLOSED  0x1     /* half closed */
206 #define NETNS_STATE_WITHDRAWN   0x2     /* withdrawn; not offloadable */
207 
/*
 * Names for the NETNS_STATE_* bits.  NOTE(review): the layout looks like a
 * "%b"-style bit-description string for log output -- confirm at the use site.
 */
208 #define NETNS_STATE_BITS        "\020\01HALFCLOSED\02WITHDRAWN"
209 
210 /* List of tokens not bound to an ifnet */
211 SLIST_HEAD(, ns_token) netns_unbound_tokens = SLIST_HEAD_INITIALIZER(
212 	netns_unbound_tokens);
213 
214 /* List of all tokens currently allocated in the system */
215 SLIST_HEAD(, ns_token) netns_all_tokens = SLIST_HEAD_INITIALIZER(
216 	netns_all_tokens);
217 
218 /*
219  * Memory management
 *
 * struct ns objects come from the zone netns_ns_zone.  Tokens, flow-info
 * records and reservations come from the three skmem caches declared below.
 * NOTE(review): the caches and their *_size variables are created/assigned
 * outside this view (presumably by the netns init routine) -- confirm.
220  */
221 static ZONE_DEFINE(netns_ns_zone, SKMEM_ZONE_PREFIX ".netns.ns",
222     sizeof(struct ns), ZC_ZFREE_CLEARMEM);
223 
224 #define NETNS_NS_TOKEN_ZONE_NAME        "netns.ns_token"
225 static unsigned int netns_ns_token_size; /* size of zone element */
226 static struct skmem_cache *netns_ns_token_cache; /* for ns_token */
227 
228 #define NETNS_NS_FLOW_INFO_ZONE_NAME    "netns.ns_flow_info"
229 static unsigned int netns_ns_flow_info_size; /* size of zone element */
230 static struct skmem_cache *netns_ns_flow_info_cache; /* for ns_flow_info */
231 
232 #define NETNS_NS_RESERVATION_ZONE_NAME  "netns.ns_reservation"
233 static unsigned int netns_ns_reservation_size; /* size of zone element */
234 static struct skmem_cache *netns_ns_reservation_cache; /* for ns_reservation */
235 
/* Allocation and release helpers for the object types above. */
236 static struct ns_reservation *netns_ns_reservation_alloc(in_port_t, uint32_t);
237 static void netns_ns_reservation_free(struct ns_reservation *);
238 static struct ns *netns_ns_alloc(zalloc_flags_t);
239 static void netns_ns_free(struct ns *);
240 static void netns_ns_cleanup(struct ns *);
241 static struct ns_token *netns_ns_token_alloc(boolean_t);
242 static void netns_ns_token_free(struct ns_token *);
243 
244 /*
245  * Utility/internal code
 *
 * Forward declarations for the file-local helpers defined further down.
246  */
247 static struct ns *_netns_get_ns(uint32_t *, uint8_t, uint8_t, bool);
248 static inline boolean_t _netns_is_wildcard_addr(const uint32_t *, uint8_t);
249 static int _netns_reserve_common(struct ns *, in_port_t, uint32_t);
250 static void _netns_release_common(struct ns *, in_port_t, uint32_t);
251 static inline void netns_clear_ifnet(struct ns_token *);
252 static int _netns_reserve_kpi_common(struct ns *, netns_token *, uint32_t *,
253     uint8_t, uint8_t, in_port_t *, uint32_t, struct ns_flow_info *);
254 static void _netns_set_ifnet_internal(struct ns_token *, struct ifnet *);
255 
256 static struct ns_reservation *
netns_ns_reservation_alloc(in_port_t port,uint32_t flags)257 netns_ns_reservation_alloc(in_port_t port, uint32_t flags)
258 {
259 	struct ns_reservation *res;
260 
261 	VERIFY(port != 0);
262 
263 	res = skmem_cache_alloc(netns_ns_reservation_cache, SKMEM_SLEEP);
264 	ASSERT(res != NULL);
265 
266 	bzero(res, netns_ns_reservation_size);
267 	res->nsr_port = port;
268 	res->nsr_reuseport = ((flags & NETNS_REUSEPORT) != 0);
269 	return res;
270 }
271 
272 static void
netns_ns_reservation_free(struct ns_reservation * res)273 netns_ns_reservation_free(struct ns_reservation *res)
274 {
275 	skmem_cache_free(netns_ns_reservation_cache, res);
276 }
277 
278 static struct ns *
netns_ns_alloc(zalloc_flags_t how)279 netns_ns_alloc(zalloc_flags_t how)
280 {
281 	struct ns *namespace;
282 	in_port_t first = (in_port_t)ipport_firstauto;
283 	in_port_t last  = (in_port_t)ipport_lastauto;
284 	in_port_t rand_port;
285 
286 	namespace = zalloc_flags(netns_ns_zone, how | Z_ZERO);
287 	if (namespace == NULL) {
288 		return NULL;
289 	}
290 
291 	namespace->ns_is_freeable = 1;
292 
293 	RB_INIT(&namespace->ns_reservations);
294 
295 	/*
296 	 * Randomize the initial ephemeral port starting point, just in case
297 	 * this namespace is for an ipv6 address which gets brought up and
298 	 * down often.
299 	 */
300 	if (first == last) {
301 		rand_port = first;
302 	} else {
303 		read_frandom(&rand_port, sizeof(rand_port));
304 
305 		if (first > last) {
306 			rand_port = last + (rand_port % (first - last));
307 		} else {
308 			rand_port = first + (rand_port % (last - first));
309 		}
310 	}
311 	namespace->ns_last_ephemeral_port_down = rand_port;
312 	namespace->ns_last_ephemeral_port_up = rand_port;
313 
314 	return namespace;
315 }
316 
317 static void
netns_ns_free(struct ns * namespace)318 netns_ns_free(struct ns *namespace)
319 {
320 	struct ns_reservation *res;
321 	struct ns_reservation *tmp_res;
322 #if SK_LOG
323 	char tmp_ip_str[MAX_IPv6_STR_LEN];
324 #endif /* SK_LOG */
325 
326 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
327 	    NS_VERB_PROTO(namespace->ns_proto),
328 	    "freeing %s ns for IP %s",
329 	    PROTO_STR(namespace->ns_proto),
330 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
331 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)));
332 
333 	RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations,
334 	    tmp_res) {
335 		netns_ns_reservation_free(res);
336 		namespace->ns_n_reservations--;
337 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
338 		    res);
339 	}
340 
341 	VERIFY(RB_EMPTY(&namespace->ns_reservations));
342 
343 	if (netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
344 	    namespace->ns_addr_len)] == namespace) {
345 		netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
346 		namespace->ns_addr_len)] = NULL;
347 	}
348 	if (netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
349 	    namespace->ns_addr_len)] == namespace) {
350 		netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
351 		namespace->ns_addr_len)] = NULL;
352 	}
353 
354 	zfree(netns_ns_zone, namespace);
355 }
356 
357 static void
netns_ns_cleanup(struct ns * namespace)358 netns_ns_cleanup(struct ns *namespace)
359 {
360 	if (namespace->ns_is_freeable &&
361 	    RB_EMPTY(&namespace->ns_reservations)) {
362 		RB_REMOVE(netns_namespaces_tree, &netns_namespaces, namespace);
363 		netns_n_namespaces--;
364 		netns_ns_free(namespace);
365 	}
366 }
367 
368 static struct ns_token *
netns_ns_token_alloc(boolean_t with_nfi)369 netns_ns_token_alloc(boolean_t with_nfi)
370 {
371 	struct ns_token *token;
372 
373 	NETNS_LOCK_ASSERT_HELD();
374 	NETNS_LOCK_CONVERT();
375 
376 	token = skmem_cache_alloc(netns_ns_token_cache, SKMEM_SLEEP);
377 	ASSERT(token != NULL);
378 
379 	bzero(token, netns_ns_token_size);
380 
381 	if (with_nfi) {
382 		token->nt_flow_info =  skmem_cache_alloc(netns_ns_flow_info_cache,
383 		    SKMEM_SLEEP);
384 		ASSERT(token->nt_flow_info != NULL);
385 	}
386 	SLIST_INSERT_HEAD(&netns_all_tokens, token, nt_all_link);
387 
388 	return token;
389 }
390 
391 static void
netns_ns_token_free(struct ns_token * token)392 netns_ns_token_free(struct ns_token *token)
393 {
394 	NETNS_LOCK_ASSERT_HELD();
395 	NETNS_LOCK_CONVERT();
396 	SLIST_REMOVE(&netns_all_tokens, token, ns_token, nt_all_link);
397 
398 	if (token->nt_flow_info != NULL) {
399 		skmem_cache_free(netns_ns_flow_info_cache, token->nt_flow_info);
400 	}
401 	skmem_cache_free(netns_ns_token_cache, token);
402 }
403 
404 __attribute__((always_inline))
405 static inline int
nsr_cmp(const struct ns_reservation * nsr1,const struct ns_reservation * nsr2)406 nsr_cmp(const struct ns_reservation *nsr1, const struct ns_reservation *nsr2)
407 {
408 #define NSR_COMPARE(r1, r2)     ((int)(r1)->nsr_port - (int)(r2)->nsr_port)
409 	return NSR_COMPARE(nsr1, nsr2);
410 }
411 
412 __attribute__((always_inline))
413 static inline int
ns_cmp(const struct ns * a,const struct ns * b)414 ns_cmp(const struct ns *a, const struct ns *b)
415 {
416 	int d;
417 
418 	if ((d = (a->ns_addr_len - b->ns_addr_len)) != 0) {
419 		return d;
420 	}
421 	if ((d = (a->ns_proto - b->ns_proto)) != 0) {
422 		return d;
423 	}
424 	if ((d = flow_ip_cmp(a->ns_addr_key, b->ns_addr_key,
425 	    b->ns_addr_len)) != 0) {
426 		return d;
427 	}
428 
429 	return 0;
430 }
431 
432 /*
433  * Common routine to look up a reservation.
434  *
435  * NOTE: Assumes the caller holds the NETNS global lock
436  */
437 __attribute__((always_inline))
438 static inline struct ns_reservation *
ns_reservation_tree_find(struct ns_reservation_tree * tree,const in_port_t port)439 ns_reservation_tree_find(struct ns_reservation_tree *tree, const in_port_t port)
440 {
441 	struct ns_reservation res;
442 	res.nsr_port = port;
443 	return RB_FIND(ns_reservation_tree, tree, &res);
444 }
445 
446 /*
447  * Retrieve the namespace for the supplied <address, protocol> tuple.
448  * If create is set and such a namespace doesn't already exist, one will be
449  * created.
450  */
451 static struct ns *
_netns_get_ns(uint32_t * addr,uint8_t addr_len,uint8_t proto,bool create)452 _netns_get_ns(uint32_t *addr, uint8_t addr_len, uint8_t proto, bool create)
453 {
454 	struct ns *namespace = NULL;
455 	struct ns find = {
456 		.ns_addr_key = addr,
457 		.ns_addr_len = addr_len,
458 		.ns_proto = proto,
459 	};
460 #if SK_LOG
461 	char tmp_ip_str[MAX_IPv6_STR_LEN];
462 #endif /* SK_LOG */
463 
464 	VERIFY(addr_len == sizeof(struct in_addr) ||
465 	    addr_len == sizeof(struct in6_addr));
466 
467 	NETNS_LOCK_ASSERT_HELD();
468 
469 	namespace = RB_FIND(netns_namespaces_tree, &netns_namespaces, &find);
470 
471 	if (create && namespace == NULL) {
472 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
473 		    "allocating %s ns for IP %s",
474 		    PROTO_STR(proto), inet_ntop(LEN_TO_AF(addr_len), addr,
475 		    tmp_ip_str, sizeof(tmp_ip_str)));
476 		NETNS_LOCK_CONVERT();
477 		namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL);
478 		__builtin_assume(namespace != NULL);
479 		memcpy(namespace->ns_addr, addr, addr_len);
480 		namespace->ns_addr_key = &namespace->ns_addr;
481 		namespace->ns_addr_len = addr_len;
482 		namespace->ns_proto = proto;
483 		RB_INSERT(netns_namespaces_tree, &netns_namespaces, namespace);
484 		netns_n_namespaces++;
485 
486 		if (_netns_is_wildcard_addr(addr, addr_len) &&
487 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
488 		    addr_len)] == NULL) {
489 			netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
490 			addr_len)] = namespace;
491 		}
492 	}
493 
494 	return namespace;
495 }
496 
497 /*
498  * Return true if the supplied address is a wildcard (INADDR_ANY)
499  */
500 __attribute__((always_inline))
501 static boolean_t
_netns_is_wildcard_addr(const uint32_t * addr,uint8_t addr_len)502 _netns_is_wildcard_addr(const uint32_t *addr, uint8_t addr_len)
503 {
504 	boolean_t wildcard;
505 
506 	switch (addr_len) {
507 	case sizeof(struct in_addr):
508 		wildcard = (addr[0] == 0);
509 		break;
510 
511 	case sizeof(struct in6_addr):
512 		wildcard = (addr[0] == 0 && addr[1] == 0 &&
513 		    addr[2] == 0 && addr[3] == 0);
514 		break;
515 
516 	default:
517 		wildcard = FALSE;
518 		break;
519 	}
520 
521 	return wildcard;
522 }
523 
524 __attribute__((always_inline))
525 static boolean_t
_netns_is_port_used(struct ns * gns,struct ns_reservation * curr_res,in_port_t port)526 _netns_is_port_used(struct ns * gns, struct ns_reservation *curr_res, in_port_t port)
527 {
528 	struct ns_reservation *res = NULL;
529 
530 	if (gns == NULL) {
531 		return FALSE;
532 	}
533 
534 	res = ns_reservation_tree_find(&gns->ns_reservations, port);
535 	if (res != NULL && res != curr_res) {
536 		if (!res->nsr_reuseport &&
537 		    (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
538 		    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
539 		    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
540 		    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0)) {
541 			return TRUE;
542 		}
543 	}
544 
545 	return FALSE;
546 }
547 
548 /*
549  * Internal shared code to reserve ports within a specific namespace.
550  *
551  * Note: port numbers are in host byte-order here.
 *
 * A reservation record for `port` is allocated and inserted into the
 * namespace's tree; if the port already has a record, the new one is freed
 * and the existing one is used instead.  Unless `namespace` is the global
 * non-wild namespace, the request is then checked for conflicts against the
 * global wild/non-wild namespaces and against the other owners' reference
 * counts on the record.  On success the reference count of the owner named
 * in `flags` is incremented; on failure a record that was newly inserted by
 * this call is removed and freed again.
 *
 * Returns 0 on success, EADDRINUSE on a conflict, or ENOMEM if no record
 * could be allocated.  An owner value not handled by the switch panics.
552  */
553 static int
_netns_reserve_common(struct ns * namespace,in_port_t port,uint32_t flags)554 _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags)
555 {
556 	struct ns_reservation *res = NULL, *exist = NULL;
557 	uint8_t proto, addr_len;
558 	int err = 0;
559 #if SK_LOG
560 	char tmp_ip_str[MAX_IPv6_STR_LEN];
561 #endif /* SK_LOG */
562 
563 	VERIFY(port != 0);
564 	proto = namespace->ns_proto;
565 	addr_len = namespace->ns_addr_len;
566 	NETNS_LOCK_CONVERT();
	/* allocate up front; RB_INSERT below tells us whether it was needed */
567 	res = netns_ns_reservation_alloc(port, flags);
568 	if (res == NULL) {
569 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
570 		    "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY",
571 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
572 		    namespace->ns_addr, tmp_ip_str,
573 		    sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags);
574 		return ENOMEM;
575 	}
	/*
	 * `exist` is non-NULL when the port already had a record; in that
	 * case the fresh record is discarded and `res` refers to the old one.
	 */
576 	exist = RB_INSERT(ns_reservation_tree, &namespace->ns_reservations,
577 	    res);
578 	if (__probable(exist == NULL)) {
579 		namespace->ns_n_reservations++;
580 	} else {
581 		netns_ns_reservation_free(res);
582 		res = exist;
583 	}
584 
585 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
586 	    "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, "
587 	    "%d bsd %d pf", inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
588 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
589 	    PROTO_STR(proto), port, flags,
590 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
591 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
592 	    NETNS_REF_COUNT(res, NETNS_BSD),
593 	    NETNS_REF_COUNT(res, NETNS_PF));
594 
595 	/* Make reservation */
596 	/*
597 	 * Bypass collision detection for reservations in the global non-wild
598 	 * namespace. We use that namespace for reference counts only.
599 	 */
600 	if (namespace !=
601 	    netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)]) {
602 		struct ns_reservation *skres;
603 		boolean_t is_wild = _netns_is_wildcard_addr(namespace->ns_addr,
604 		    addr_len);
605 		struct ns *gns =
606 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)];
607 
608 		if (NETNS_IS_SKYWALK(flags)) {
609 			if ((!is_wild || exist != NULL) && gns != NULL &&
610 			    (skres = ns_reservation_tree_find(
611 				    &gns->ns_reservations, port)) != NULL &&
612 			    NETNS_REF_COUNT(skres, NETNS_LISTENER) == 0) {
613 				/*
614 				 * The mere existence of any non-skywalk
615 				 * listener wildcard entry for this
616 				 * protocol/port number means this must fail.
617 				 */
618 				SK_DF(NS_VERB_IP(addr_len) |
619 				    NS_VERB_PROTO(proto),
620 				    "ADDRINUSE: Duplicate wildcard");
621 				err = EADDRINUSE;
622 				goto done;
623 			}
624 
625 			if (is_wild) {
626 				gns = netns_global_non_wild[
627 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
628 				VERIFY(gns != NULL);
629 
630 				if (ns_reservation_tree_find(
631 					    &gns->ns_reservations, port) != NULL) {
632 					/*
633 					 * If Skywalk is trying to reserve a
634 					 * wildcard, then the mere existence of
635 					 * any entry in the non-wild namespace
636 					 * for this port means this must fail.
637 					 */
638 					SK_DF(NS_VERB_IP(addr_len) |
639 					    NS_VERB_PROTO(proto), "ADDRINUSE: "
640 					    "Wildcard with non-wild.");
641 					err = EADDRINUSE;
642 					goto done;
643 				}
644 			}
645 		} else {
646 			/*
647 			 * Check if Skywalk has reserved a wildcard entry.
648 			 * Note that the arithmetic OR here is intentional.
649 			 */
650 			if ((!is_wild || exist != NULL) && gns != NULL &&
651 			    (skres = ns_reservation_tree_find(
652 				    &gns->ns_reservations, port)) != NULL &&
653 			    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
654 			    NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) {
655 				/*
656 				 * BSD is trying to reserve a proto/port for
657 				 * which Skywalk already has a wildcard
658 				 * reservation.
659 				 */
660 				SK_DF(NS_VERB_IP(addr_len) |
661 				    NS_VERB_PROTO(proto),
662 				    "ADDRINUSE: BSD requesting Skywalk port");
663 				err = EADDRINUSE;
664 				goto done;
665 			}
666 
667 			/*
668 			 * If BSD is trying to reserve a wildcard,
669 			 * ensure Skywalk has not already reserved
670 			 * a non-wildcard.
671 			 */
672 			if (is_wild) {
673 				gns = netns_global_non_wild[
674 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
675 				VERIFY(gns != NULL);
676 
677 				/*
678 				 * Note that the arithmetic OR here is
679 				 * intentional.
680 				 */
681 				if ((skres = ns_reservation_tree_find(
682 					    &gns->ns_reservations, port)) != NULL &&
683 				    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
684 				    NETNS_REF_COUNT(skres,
685 				    NETNS_LISTENER)) != 0) {
686 					SK_DF(NS_VERB_IP(addr_len) |
687 					    NS_VERB_PROTO(proto), "ADDRINUSE: "
688 					    "BSD wildcard with non-wild.");
689 					err = EADDRINUSE;
690 					goto done;
691 				}
692 			}
693 		}
694 
		/* per-owner checks against the counts on this namespace's record */
695 		switch (flags & NETNS_OWNER_MASK) {
696 		case NETNS_SKYWALK:
697 			/* check collision w/ BSD */
698 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
699 			    NETNS_REF_COUNT(res, NETNS_PF) > 0) {
700 				SK_DF(NS_VERB_IP(addr_len) |
701 				    NS_VERB_PROTO(proto),
702 				    "ERROR - Skywalk got ADDRINUSE (w/ BSD)");
703 				err = EADDRINUSE;
704 				goto done;
705 			}
706 
707 			/* BEGIN CSTYLED */
708 			/*
709 			 * Scenarios with new Skywalk connected flow:
710 			 * 1. With existing Skywalk connected flow,
711 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
712 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 1
713 			 *    reject by failing the wild gns lookup below.
714 			 * 2. With existing Skywalk 3-tuple listener,
715 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 1
716 			 *    bypass the check below.
717 			 * 3. With existing Skywalk 2-tuple listener,
718 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
719 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 0
720 			 *    pass with successful wild gns lookup.
721 			 */
722 			/* END CSTYLED */
723 			if (NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
724 			    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0) {
725 				/* check if covered by wild Skywalk listener */
726 				gns = netns_global_wild[
727 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
728 				if (gns != NULL &&
729 				    (skres = ns_reservation_tree_find(
730 					    &gns->ns_reservations, port)) != NULL &&
731 				    NETNS_REF_COUNT(skres, NETNS_LISTENER)
732 				    != 0) {
733 					err = 0;
734 					goto done;
735 				}
736 				if (addr_len == sizeof(struct in_addr)) {
737 					/* If address is IPv4, also check for wild IPv6 registration */
738 					gns = netns_global_wild[
739 						NETNS_NS_GLOBAL_IDX(proto, sizeof(struct in6_addr))];
740 					if (gns != NULL &&
741 					    (skres = ns_reservation_tree_find(
742 						    &gns->ns_reservations, port)) != NULL &&
743 					    NETNS_REF_COUNT(skres, NETNS_LISTENER)
744 					    != 0) {
745 						err = 0;
746 						goto done;
747 					}
748 				}
749 				SK_DF(NS_VERB_IP(addr_len) |
750 				    NS_VERB_PROTO(proto),
751 				    "ERROR - Skywalk got ADDRINUSE "
752 				    "(w/ SK connected flow)");
753 				err = EADDRINUSE;
754 			}
755 			/*
756 			 * XXX: Duplicate 5-tuple flows under a Skywalk
757 			 * listener are currently detected by flow manager,
758 			 * till we implement 5-tuple-aware netns.
759 			 */
760 			break;
761 
762 		case NETNS_LISTENER:
			/*
			 * A listener conflicts with any BSD, PF or listener
			 * reference on this record, and with the port being
			 * in use in any of the four global namespaces of
			 * this protocol (wild and non-wild, IPv4 and IPv6).
			 */
763 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
764 			    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
765 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
766 			    _netns_is_port_used(netns_global_wild[
767 				    NETNS_NS_GLOBAL_IDX(proto, sizeof(struct in_addr))], res, port) ||
768 			    _netns_is_port_used(netns_global_wild[
769 				    NETNS_NS_GLOBAL_IDX(proto, sizeof(struct in6_addr))], res, port) ||
770 			    _netns_is_port_used(netns_global_non_wild[
771 				    NETNS_NS_GLOBAL_IDX(proto, sizeof(struct in_addr))], res, port) ||
772 			    _netns_is_port_used(netns_global_non_wild[
773 				    NETNS_NS_GLOBAL_IDX(proto, sizeof(struct in6_addr))], res, port)) {
774 				SK_DF(NS_VERB_IP(addr_len) |
775 				    NS_VERB_PROTO(proto),
776 				    "ERROR - Listener got ADDRINUSE");
777 				err = EADDRINUSE;
778 			}
779 			break;
780 
781 		case NETNS_BSD:
782 		case NETNS_PF:
783 			if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 ||
784 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) {
785 				SK_DF(NS_VERB_IP(addr_len) |
786 				    NS_VERB_PROTO(proto),
787 				    "ERROR - %s got ADDRINUSE",
788 				    ((flags & NETNS_OWNER_MASK) == NETNS_PF) ?
789 				    "PF" : "BSD");
790 				err = EADDRINUSE;
791 			}
792 			break;
793 
794 		default:
795 			panic("_netns_reserve_common: invalid owner 0x%x",
796 			    flags & NETNS_OWNER_MASK);
797 			/* NOTREACHED */
798 			__builtin_unreachable();
799 		}
800 	}
801 
	/*
	 * Common exit: take the owner's reference on success, or undo the
	 * tree insertion made above if this call created the record.
	 */
802 done:
803 	ASSERT(res != NULL);
804 	if (__probable(err == 0)) {
805 		NETNS_REF_COUNT(res, flags)++;
806 		/* Check for wrap around */
807 		VERIFY(NETNS_REF_COUNT(res, flags) != 0);
808 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
809 		    NS_VERB_PROTO(namespace->ns_proto),
810 		    "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, "
811 		    "%d ls, %d bsd %d pf",
812 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
813 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
814 		    PROTO_STR(namespace->ns_proto), port, err, flags,
815 		    NETNS_REF_COUNT(res, NETNS_SKYWALK),
816 		    NETNS_REF_COUNT(res, NETNS_LISTENER),
817 		    NETNS_REF_COUNT(res, NETNS_BSD),
818 		    NETNS_REF_COUNT(res, NETNS_PF));
819 	} else {
820 		if (exist == NULL) {
821 			RB_REMOVE(ns_reservation_tree,
822 			    &namespace->ns_reservations, res);
823 			namespace->ns_n_reservations--;
824 			netns_ns_reservation_free(res);
825 		}
826 	}
827 	return err;
828 }
829 
830 /*
831  * Internal shared code to release ports within a specific namespace.
832  */
833 static void
_netns_release_common(struct ns * namespace,in_port_t port,uint32_t flags)834 _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags)
835 {
836 	struct ns_reservation *res;
837 	uint32_t refs;
838 	int i;
839 #if SK_LOG
840 	char tmp_ip_str[MAX_IPv6_STR_LEN];
841 #endif /* SK_LOG */
842 
843 	NETNS_LOCK_ASSERT_HELD();
844 
845 	res = ns_reservation_tree_find(&namespace->ns_reservations, port);
846 	if (res == NULL) {
847 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
848 		    NS_VERB_PROTO(namespace->ns_proto),
849 		    "ERROR %s:%s:%d // flags 0x%x // not found",
850 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
851 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
852 		    PROTO_STR(namespace->ns_proto), port, flags);
853 		VERIFY(res != NULL);
854 	}
855 
856 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
857 	    NS_VERB_PROTO(namespace->ns_proto),
858 	    "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf",
859 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
860 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
861 	    PROTO_STR(namespace->ns_proto), port, flags,
862 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
863 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
864 	    NETNS_REF_COUNT(res, NETNS_BSD),
865 	    NETNS_REF_COUNT(res, NETNS_PF));
866 
867 	/* Release reservation */
868 	VERIFY(NETNS_REF_COUNT(res, flags) > 0);
869 	NETNS_REF_COUNT(res, flags) -= 1;
870 
871 	/* Clean up memory, if appropriate */
872 	for (i = 0, refs = 0; i <= NETNS_OWNER_MAX && refs == 0; i++) {
873 		refs |= res->nsr_refs[i];
874 	}
875 	if (refs == 0) {
876 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
877 		    res);
878 		namespace->ns_n_reservations--;
879 		NETNS_LOCK_CONVERT();
880 		netns_ns_reservation_free(res);
881 		netns_ns_cleanup(namespace);
882 	}
883 }
884 
885 __attribute__((always_inline))
886 static inline void
netns_init_global_ns(struct ns ** global_ptr,uint8_t proto,uint8_t addrlen)887 netns_init_global_ns(struct ns **global_ptr, uint8_t proto, uint8_t addrlen)
888 {
889 	struct ns *namespace;
890 
891 	namespace = *global_ptr = netns_ns_alloc(Z_WAITOK);
892 	memset(namespace->ns_addr, 0xFF, addrlen);
893 	namespace->ns_addr_len = addrlen;
894 	namespace->ns_proto = proto;
895 	namespace->ns_is_freeable = 0;
896 }
897 
898 __attribute__((always_inline))
899 static inline void
netns_clear_ifnet(struct ns_token * nstoken)900 netns_clear_ifnet(struct ns_token *nstoken)
901 {
902 #if SK_LOG
903 	char tmp_ip_str[MAX_IPv6_STR_LEN];
904 #endif /* SK_LOG */
905 
906 	NETNS_LOCK_ASSERT_HELD();
907 
908 	if (nstoken->nt_ifp != NULL) {
909 		SLIST_REMOVE(&nstoken->nt_ifp->if_netns_tokens, nstoken,
910 		    ns_token, nt_ifp_link);
911 
912 		SK_DF(NS_VERB_IP(nstoken->nt_addr_len) |
913 		    NS_VERB_PROTO(nstoken->nt_proto),
914 		    "%s:%s:%d // removed from ifnet %d",
915 		    inet_ntop(LEN_TO_AF(nstoken->nt_addr_len),
916 		    nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
917 		    PROTO_STR(nstoken->nt_proto), nstoken->nt_port,
918 		    nstoken->nt_ifp->if_index);
919 
920 		NETNS_LOCK_CONVERT();
921 		ifnet_decr_iorefcnt(nstoken->nt_ifp);
922 		nstoken->nt_ifp = NULL;
923 	} else {
924 		SLIST_REMOVE(&netns_unbound_tokens, nstoken, ns_token,
925 		    nt_ifp_link);
926 	}
927 }
928 
929 /*
930  * Internal shared code to perform a port[-range] reservation, along with all
931  * the boilerplate and sanity checks expected for a call coming in from the
932  * surrounding kernel code.
933  */
934 static int
_netns_reserve_kpi_common(struct ns * ns,netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)935 _netns_reserve_kpi_common(struct ns *ns, netns_token *token, uint32_t *addr,
936     uint8_t addr_len, uint8_t proto, in_port_t *port, uint32_t flags,
937     struct ns_flow_info *nfi)
938 {
939 	boolean_t ns_want_cleanup = (ns == NULL);
940 	struct ns_token *nt;
941 	int err = 0;
942 	in_port_t hport;
943 #if SK_LOG
944 	char tmp_ip_str[MAX_IPv6_STR_LEN];
945 #endif /* SK_LOG */
946 	struct ifnet *ifp = (nfi != NULL) ? nfi->nfi_ifp : NULL;
947 
948 	NETNS_LOCK_ASSERT_HELD();
949 
950 	hport = ntohs(*port);
951 
952 	VERIFY((flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
953 	VERIFY(addr_len == sizeof(struct in_addr) ||
954 	    addr_len == sizeof(struct in6_addr));
955 	VERIFY(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
956 	VERIFY(hport != 0);
957 
958 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
959 	    "reserving %s:%s:%d // flags 0x%x // token %svalid",
960 	    inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str,
961 	    sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags,
962 	    NETNS_TOKEN_VALID(token) ? "" : "in");
963 
964 	/*
965 	 * See the documentation for NETNS_PRERESERVED in netns.h for an
966 	 * explanation of this block.
967 	 */
968 	if (NETNS_TOKEN_VALID(token)) {
969 		if (flags & NETNS_PRERESERVED) {
970 			nt = *token;
971 			VERIFY(nt->nt_addr_len == addr_len);
972 			VERIFY(memcmp(nt->nt_addr, addr, addr_len) == 0);
973 			VERIFY(nt->nt_proto == proto);
974 			VERIFY(nt->nt_port == hport);
975 			VERIFY((nt->nt_flags &
976 			    NETNS_RESERVATION_FLAGS | NETNS_PRERESERVED) ==
977 			    (flags & NETNS_RESERVATION_FLAGS));
978 
979 			if ((nt->nt_flags & NETNS_CONFIGURATION_FLAGS) ==
980 			    (flags & NETNS_CONFIGURATION_FLAGS)) {
981 				SK_DF(NS_VERB_IP(nt->nt_addr_len) |
982 				    NS_VERB_PROTO(nt->nt_proto),
983 				    "%s:%s:%d // flags 0x%x -> 0x%x",
984 				    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
985 				    nt->nt_addr, tmp_ip_str,
986 				    sizeof(tmp_ip_str)),
987 				    PROTO_STR(nt->nt_proto),
988 				    nt->nt_port, nt->nt_flags, flags);
989 				nt->nt_flags &= ~NETNS_CONFIGURATION_FLAGS;
990 				nt->nt_flags |=
991 				    flags & NETNS_CONFIGURATION_FLAGS;
992 			}
993 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
994 			    "token was prereserved");
995 			goto done;
996 		} else {
997 			panic("Request to overwrite valid netns token");
998 			/* NOTREACHED */
999 			__builtin_unreachable();
1000 		}
1001 	}
1002 
1003 	/*
1004 	 * TODO: Check range against bitmap
1005 	 */
1006 	if (hport == 0) {
1007 		/*
1008 		 * Caller request an arbitrary range of ports
1009 		 * TODO: Need to figure out how to allocate
1010 		 * emphemeral ports only.
1011 		 */
1012 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1013 		    "ERROR - wildcard port not yet supported");
1014 		err = ENOMEM;
1015 		goto done;
1016 	}
1017 
1018 	/*
1019 	 * Fetch namespace for the specified address/protocol, creating
1020 	 * a new namespace if necessary.
1021 	 */
1022 	if (ns == NULL) {
1023 		ASSERT(ns_want_cleanup);
1024 		ns = _netns_get_ns(addr, addr_len, proto, true);
1025 	}
1026 	if (__improbable(ns == NULL)) {
1027 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1028 		    "ERROR - couldn't create namespace");
1029 		err = ENOMEM;
1030 		goto done;
1031 	}
1032 
1033 	/*
1034 	 * Make a reservation in the namespace
1035 	 * This will return an error if an incompatible reservation
1036 	 * already exists.
1037 	 */
1038 	err = _netns_reserve_common(ns, hport, flags);
1039 	if (__improbable(err != 0)) {
1040 		NETNS_LOCK_CONVERT();
1041 		if (ns_want_cleanup) {
1042 			netns_ns_cleanup(ns);
1043 		}
1044 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1045 		    "ERROR - reservation collision");
1046 		goto done;
1047 	}
1048 
1049 	if (!_netns_is_wildcard_addr(ns->ns_addr, addr_len)) {
1050 		/* Record the reservation in the non-wild namespace */
1051 		struct ns *nwns;
1052 
1053 		nwns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1054 		    addr_len)];
1055 		err = _netns_reserve_common(nwns, hport, flags);
1056 		if (__improbable(err != 0)) {
1057 			/* Need to free the specific namespace entry */
1058 			NETNS_LOCK_CONVERT();
1059 			_netns_release_common(ns, hport, flags);
1060 			if (ns_want_cleanup) {
1061 				netns_ns_cleanup(ns);
1062 			}
1063 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1064 			    "ERROR - reservation collision");
1065 			goto done;
1066 		}
1067 	}
1068 
1069 	nt = netns_ns_token_alloc(nfi != NULL ? true : false);
1070 	ASSERT(nt->nt_ifp == NULL);
1071 	_netns_set_ifnet_internal(nt, ifp);
1072 
1073 	memcpy(nt->nt_addr, addr, addr_len);
1074 	nt->nt_addr_len = addr_len;
1075 	nt->nt_proto = proto;
1076 	nt->nt_port = hport;
1077 	nt->nt_flags = flags;
1078 
1079 	if (nfi != NULL) {
1080 		VERIFY(nt->nt_flow_info != NULL);
1081 
1082 		memcpy(nt->nt_flow_info, nfi, sizeof(struct ns_flow_info));
1083 		/*
1084 		 * The local port is passed as a separate argument
1085 		 */
1086 		if (nfi->nfi_laddr.sa.sa_family == AF_INET) {
1087 			nt->nt_flow_info->nfi_laddr.sin.sin_port = *port;
1088 		} else if (nfi->nfi_laddr.sa.sa_family == AF_INET6) {
1089 			nt->nt_flow_info->nfi_laddr.sin6.sin6_port = *port;
1090 		}
1091 	}
1092 	*token = nt;
1093 
1094 done:
1095 	return err;
1096 }
1097 
1098 /*
1099  * Kernel-facing functions
1100  */
1101 
1102 int
netns_init(void)1103 netns_init(void)
1104 {
1105 	VERIFY(__netns_inited == 0);
1106 
1107 	netns_ns_reservation_size = sizeof(struct ns_reservation);
1108 	netns_ns_reservation_cache = skmem_cache_create(NETNS_NS_RESERVATION_ZONE_NAME,
1109 	    netns_ns_reservation_size, sizeof(uint64_t), NULL, NULL, NULL,
1110 	    NULL, NULL, 0);
1111 	if (netns_ns_reservation_cache == NULL) {
1112 		panic("%s: skmem_cache create failed (%s)", __func__,
1113 		    NETNS_NS_RESERVATION_ZONE_NAME);
1114 		/* NOTREACHED */
1115 		__builtin_unreachable();
1116 	}
1117 
1118 	netns_ns_token_size = sizeof(struct ns_token);
1119 	netns_ns_token_cache = skmem_cache_create(NETNS_NS_TOKEN_ZONE_NAME,
1120 	    netns_ns_token_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
1121 	    NULL, 0);
1122 	if (netns_ns_token_cache == NULL) {
1123 		panic("%s: skmem_cache create failed (%s)", __func__,
1124 		    NETNS_NS_TOKEN_ZONE_NAME);
1125 		/* NOTREACHED */
1126 		__builtin_unreachable();
1127 	}
1128 
1129 	netns_ns_flow_info_size = sizeof(struct ns_flow_info);
1130 	netns_ns_flow_info_cache = skmem_cache_create(NETNS_NS_FLOW_INFO_ZONE_NAME,
1131 	    netns_ns_flow_info_size, sizeof(uint64_t), NULL, NULL, NULL,
1132 	    NULL, NULL, 0);
1133 	if (netns_ns_flow_info_cache == NULL) {
1134 		panic("%s: skmem_cache create failed (%s)", __func__,
1135 		    NETNS_NS_FLOW_INFO_ZONE_NAME);
1136 		/* NOTREACHED */
1137 		__builtin_unreachable();
1138 	}
1139 
1140 	SLIST_INIT(&netns_unbound_tokens);
1141 	SLIST_INIT(&netns_all_tokens);
1142 
1143 	netns_n_namespaces = 0;
1144 	RB_INIT(&netns_namespaces);
1145 
1146 	SK_D("initializing global namespaces");
1147 
1148 	netns_init_global_ns(
1149 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1150 		sizeof(struct in_addr))], IPPROTO_TCP, sizeof(struct in_addr));
1151 
1152 	netns_init_global_ns(
1153 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1154 		sizeof(struct in_addr))], IPPROTO_UDP, sizeof(struct in_addr));
1155 
1156 	netns_init_global_ns(
1157 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1158 		sizeof(struct in6_addr))], IPPROTO_TCP, sizeof(struct in6_addr));
1159 
1160 	netns_init_global_ns(
1161 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1162 		sizeof(struct in6_addr))], IPPROTO_UDP, sizeof(struct in6_addr));
1163 
1164 	/* Done */
1165 
1166 	__netns_inited = 1;
1167 	sk_features |= SK_FEATURE_NETNS;
1168 
1169 	SK_D("initialized netns");
1170 
1171 	return 0;
1172 }
1173 
1174 void
netns_uninit(void)1175 netns_uninit(void)
1176 {
1177 	if (__netns_inited == 1) {
1178 		struct ns *namespace;
1179 		struct ns *temp_namespace;
1180 		int i;
1181 
1182 		RB_FOREACH_SAFE(namespace, netns_namespaces_tree,
1183 		    &netns_namespaces, temp_namespace) {
1184 			RB_REMOVE(netns_namespaces_tree, &netns_namespaces,
1185 			    namespace);
1186 			netns_n_namespaces--;
1187 			netns_ns_free(namespace);
1188 		}
1189 
1190 		for (i = 0; i < NETNS_N_GLOBAL; i++) {
1191 			netns_ns_free(netns_global_non_wild[i]);
1192 		}
1193 
1194 		if (netns_ns_flow_info_cache != NULL) {
1195 			skmem_cache_destroy(netns_ns_flow_info_cache);
1196 			netns_ns_flow_info_cache = NULL;
1197 		}
1198 		if (netns_ns_token_cache != NULL) {
1199 			skmem_cache_destroy(netns_ns_token_cache);
1200 			netns_ns_token_cache = NULL;
1201 		}
1202 		if (netns_ns_reservation_cache != NULL) {
1203 			skmem_cache_destroy(netns_ns_reservation_cache);
1204 			netns_ns_reservation_cache = NULL;
1205 		}
1206 
1207 		__netns_inited = 0;
1208 		sk_features &= ~SK_FEATURE_NETNS;
1209 
1210 		SK_D("uninitialized netns");
1211 	}
1212 }
1213 
1214 void
netns_reap_caches(boolean_t purge)1215 netns_reap_caches(boolean_t purge)
1216 {
1217 	/* these aren't created unless netns is enabled */
1218 	if (netns_ns_token_cache != NULL) {
1219 		skmem_cache_reap_now(netns_ns_token_cache, purge);
1220 	}
1221 	if (netns_ns_reservation_cache != NULL) {
1222 		skmem_cache_reap_now(netns_ns_reservation_cache, purge);
1223 	}
1224 	if (netns_ns_flow_info_cache != NULL) {
1225 		skmem_cache_reap_now(netns_ns_flow_info_cache, purge);
1226 	}
1227 }
1228 
1229 boolean_t
netns_is_enabled(void)1230 netns_is_enabled(void)
1231 {
1232 	return __netns_inited == 1;
1233 }
1234 
1235 int
netns_reserve(netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t port,uint32_t flags,struct ns_flow_info * nfi)1236 netns_reserve(netns_token *token, uint32_t *addr, uint8_t addr_len,
1237     uint8_t proto, in_port_t port, uint32_t flags, struct ns_flow_info *nfi)
1238 {
1239 	int err = 0;
1240 #if SK_LOG
1241 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1242 #endif /* SK_LOG */
1243 
1244 	if (__netns_inited == 0) {
1245 		*token = NULL;
1246 		return err;
1247 	}
1248 
1249 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1250 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1251 		return ENOTSUP;
1252 	}
1253 
1254 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1255 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1256 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1257 	    flags);
1258 
1259 	/*
1260 	 * Check wether the process is allowed to bind to a restricted port
1261 	 */
1262 	if (!current_task_can_use_restricted_in_port(port,
1263 	    proto, flags)) {
1264 		*token = NULL;
1265 		return EADDRINUSE;
1266 	}
1267 
1268 	NETNS_LOCK_SPIN();
1269 	err = _netns_reserve_kpi_common(NULL, token, addr, addr_len,
1270 	    proto, &port, flags, nfi);
1271 	NETNS_UNLOCK();
1272 
1273 	return err;
1274 }
1275 
1276 /* Import net.inet.{tcp,udp}.randomize_ports sysctls */
1277 extern int      udp_use_randomport;
1278 extern int      tcp_use_randomport;
1279 
1280 int
netns_reserve_ephemeral(netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)1281 netns_reserve_ephemeral(netns_token *token, uint32_t *addr, uint8_t addr_len,
1282     uint8_t proto, in_port_t *port, uint32_t flags, struct ns_flow_info *nfi)
1283 {
1284 	int err = 0;
1285 	in_port_t first = (in_port_t)ipport_firstauto;
1286 	in_port_t last  = (in_port_t)ipport_lastauto;
1287 	in_port_t rand_port;
1288 	in_port_t last_port;
1289 	in_port_t n_last_port;
1290 	struct ns *namespace;
1291 	boolean_t count_up = true;
1292 	boolean_t use_randomport = (proto == IPPROTO_TCP) ?
1293 	    tcp_use_randomport : udp_use_randomport;
1294 #if SK_LOG
1295 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1296 #endif /* SK_LOG */
1297 
1298 	if (__netns_inited == 0) {
1299 		*token = NULL;
1300 		return err;
1301 	}
1302 
1303 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1304 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1305 		return ENOTSUP;
1306 	}
1307 
1308 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1309 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1310 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(*port),
1311 	    flags);
1312 
1313 	NETNS_LOCK_SPIN();
1314 
1315 	namespace = _netns_get_ns(addr, addr_len, proto, true);
1316 	if (namespace == NULL) {
1317 		err = ENOMEM;
1318 		NETNS_UNLOCK();
1319 		return err;
1320 	}
1321 
1322 	if (proto == IPPROTO_UDP) {
1323 		if (UINT16_MAX - namespace->ns_n_reservations <
1324 		    NETNS_NS_UDP_EPHEMERAL_RESERVE) {
1325 			SK_ERR("UDP ephemeral port not available"
1326 			    "(less than 4096 UDP ports left)");
1327 			err = EADDRNOTAVAIL;
1328 			NETNS_UNLOCK();
1329 			return err;
1330 		}
1331 	}
1332 
1333 	if (first == last) {
1334 		rand_port = first;
1335 	} else {
1336 		if (use_randomport) {
1337 			NETNS_LOCK_CONVERT();
1338 			read_frandom(&rand_port, sizeof(rand_port));
1339 
1340 			if (first > last) {
1341 				rand_port = last + (rand_port %
1342 				    (first - last));
1343 				count_up = false;
1344 			} else {
1345 				rand_port = first + (rand_port %
1346 				    (last - first));
1347 			}
1348 		} else {
1349 			if (first > last) {
1350 				rand_port =
1351 				    namespace->ns_last_ephemeral_port_down - 1;
1352 				if (rand_port < last || rand_port > first) {
1353 					rand_port = last;
1354 				}
1355 				count_up = false;
1356 			} else {
1357 				rand_port =
1358 				    namespace->ns_last_ephemeral_port_up + 1;
1359 				if (rand_port < first || rand_port > last) {
1360 					rand_port = first;
1361 				}
1362 			}
1363 		}
1364 	}
1365 	last_port = rand_port;
1366 	n_last_port = htons(last_port);
1367 
1368 	while (true) {
1369 		if (n_last_port == 0) {
1370 			SK_ERR("ephemeral port search range includes 0");
1371 			err = EINVAL;
1372 			break;
1373 		}
1374 
1375 		/*
1376 		 * Skip if this is a restricted port as we do not want to
1377 		 * restricted ports as ephemeral
1378 		 */
1379 		if (!IS_RESTRICTED_IN_PORT(n_last_port)) {
1380 			err = _netns_reserve_kpi_common(namespace, token, addr,
1381 			    addr_len, proto, &n_last_port, flags, nfi);
1382 			if (err == 0 || err != EADDRINUSE) {
1383 				break;
1384 			}
1385 		}
1386 		if (count_up) {
1387 			last_port++;
1388 			if (last_port < first || last_port > last) {
1389 				last_port = first;
1390 			}
1391 		} else {
1392 			last_port--;
1393 			if (last_port < last || last_port > first) {
1394 				last_port = last;
1395 			}
1396 		}
1397 		n_last_port = htons(last_port);
1398 
1399 		if (last_port == rand_port || first == last) {
1400 			SK_ERR("couldn't find free ephemeral port");
1401 			err = EADDRNOTAVAIL;
1402 			break;
1403 		}
1404 	}
1405 
1406 	if (err == 0) {
1407 		*port = n_last_port;
1408 		if (count_up) {
1409 			namespace->ns_last_ephemeral_port_up = last_port;
1410 		} else {
1411 			namespace->ns_last_ephemeral_port_down = last_port;
1412 		}
1413 	} else {
1414 		netns_ns_cleanup(namespace);
1415 	}
1416 
1417 	NETNS_UNLOCK();
1418 
1419 	return err;
1420 }
1421 
1422 void
netns_release(netns_token * token)1423 netns_release(netns_token *token)
1424 {
1425 	struct ns *ns;
1426 	struct ns_token *nt;
1427 	uint8_t proto, addr_len;
1428 #if SK_LOG
1429 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1430 #endif /* SK_LOG */
1431 
1432 	if (!NETNS_TOKEN_VALID(token)) {
1433 		return;
1434 	}
1435 
1436 	if (__netns_inited == 0) {
1437 		*token = NULL;
1438 		return;
1439 	}
1440 
1441 	NETNS_LOCK_SPIN();
1442 
1443 	nt = *token;
1444 	*token = NULL;
1445 
1446 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
1447 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1448 	    nt->nt_addr_len == sizeof(struct in6_addr));
1449 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1450 
1451 	addr_len = nt->nt_addr_len;
1452 	proto = nt->nt_proto;
1453 
1454 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1455 	    "releasing %s:%s:%d",
1456 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1457 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto),
1458 	    nt->nt_port);
1459 
1460 	if (!_netns_is_wildcard_addr(nt->nt_addr, addr_len)) {
1461 		/* Remove from global non-wild namespace */
1462 
1463 		ns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1464 		    addr_len)];
1465 		VERIFY(ns != NULL);
1466 
1467 		_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1468 	}
1469 
1470 	ns = _netns_get_ns(nt->nt_addr, addr_len, proto, false);
1471 	VERIFY(ns != NULL);
1472 	_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1473 
1474 	netns_clear_ifnet(nt);
1475 	netns_ns_token_free(nt);
1476 
1477 	NETNS_UNLOCK();
1478 }
1479 
1480 int
netns_change_addr(netns_token * token,uint32_t * addr,uint8_t addr_len)1481 netns_change_addr(netns_token *token, uint32_t *addr, uint8_t addr_len)
1482 {
1483 	int err = 0;
1484 	struct ns *old_namespace;
1485 	struct ns *new_namespace;
1486 	struct ns *global_namespace;
1487 	struct ns_token *nt;
1488 	uint8_t proto;
1489 #if SK_LOG
1490 	char tmp_ip_str_1[MAX_IPv6_STR_LEN];
1491 	char tmp_ip_str_2[MAX_IPv6_STR_LEN];
1492 #endif /* SK_LOG */
1493 
1494 	if (__netns_inited == 0) {
1495 		return 0;
1496 	}
1497 
1498 	NETNS_LOCK();
1499 
1500 	VERIFY(NETNS_TOKEN_VALID(token));
1501 
1502 	nt = *token;
1503 
1504 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) == NETNS_BSD);
1505 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1506 	    nt->nt_addr_len == sizeof(struct in6_addr));
1507 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1508 
1509 	proto = nt->nt_proto;
1510 
1511 #if SK_LOG
1512 	inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1513 	    tmp_ip_str_1, sizeof(tmp_ip_str_1));
1514 	inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2,
1515 	    sizeof(tmp_ip_str_2));
1516 #endif /* SK_LOG */
1517 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1518 	    "changing address for %s:%d from %s to %s",
1519 	    PROTO_STR(proto), nt->nt_port, tmp_ip_str_1,
1520 	    tmp_ip_str_2);
1521 
1522 	if (nt->nt_addr_len == addr_len &&
1523 	    memcmp(nt->nt_addr, addr, nt->nt_addr_len) == 0) {
1524 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1525 		    "address didn't change, exiting early");
1526 		goto done;
1527 	}
1528 
1529 	old_namespace = _netns_get_ns(nt->nt_addr, nt->nt_addr_len, proto,
1530 	    false);
1531 	VERIFY(old_namespace != NULL);
1532 
1533 	new_namespace = _netns_get_ns(addr, addr_len, proto, true);
1534 	if (new_namespace == NULL) {
1535 		err = ENOMEM;
1536 		goto done;
1537 	}
1538 
1539 	/* Acquire reservation in new namespace */
1540 	if ((err = _netns_reserve_common(new_namespace, nt->nt_port,
1541 	    nt->nt_flags))) {
1542 		NETNS_LOCK_CONVERT();
1543 		netns_ns_cleanup(new_namespace);
1544 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1545 		    "ERROR - reservation collision under new namespace");
1546 		goto done;
1547 	}
1548 
1549 	/* Release from old namespace */
1550 	_netns_release_common(old_namespace, nt->nt_port, nt->nt_flags);
1551 
1552 	if (!_netns_is_wildcard_addr(nt->nt_addr, nt->nt_addr_len)) {
1553 		/*
1554 		 * Old address is non-wildcard.
1555 		 * Remove old reservation from global non-wild namespace
1556 		 */
1557 		global_namespace = netns_global_non_wild[
1558 			NETNS_NS_GLOBAL_IDX(proto, nt->nt_addr_len)];
1559 		VERIFY(global_namespace != NULL);
1560 
1561 		_netns_release_common(global_namespace, nt->nt_port,
1562 		    nt->nt_flags);
1563 	}
1564 
1565 	if (!_netns_is_wildcard_addr(addr, addr_len)) {
1566 		/*
1567 		 * New address is non-wildcard.
1568 		 * Record new reservation in global non-wild namespace
1569 		 */
1570 		global_namespace = netns_global_non_wild[
1571 			NETNS_NS_GLOBAL_IDX(proto, addr_len)];
1572 		VERIFY(global_namespace != NULL);
1573 
1574 		if ((err = _netns_reserve_common(global_namespace,
1575 		    nt->nt_port, nt->nt_flags)) != 0) {
1576 			SK_DF(NS_VERB_IP(addr_len) |
1577 			    NS_VERB_PROTO(proto),
1578 			    "ERROR - reservation collision under new "
1579 			    "global namespace");
1580 			/* XXX: Should not fail. Maybe assert instead */
1581 			goto done;
1582 		}
1583 	}
1584 
1585 	memcpy(nt->nt_addr, addr, addr_len);
1586 	nt->nt_addr_len = addr_len;
1587 
1588 done:
1589 	NETNS_UNLOCK();
1590 	return err;
1591 }
1592 
1593 static void
_netns_set_ifnet_internal(struct ns_token * nt,struct ifnet * ifp)1594 _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp)
1595 {
1596 #if SK_LOG
1597 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1598 #endif /* SK_LOG */
1599 
1600 	NETNS_LOCK_ASSERT_HELD();
1601 
1602 	if (ifp != NULL && ifnet_is_attached(ifp, 1)) {
1603 		nt->nt_ifp = ifp;
1604 		SLIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link);
1605 
1606 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1607 		    "%s:%s:%d // added to ifnet %d",
1608 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1609 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1610 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1611 		    ifp->if_index);
1612 	} else {
1613 		SLIST_INSERT_HEAD(&netns_unbound_tokens, nt, nt_ifp_link);
1614 	}
1615 }
1616 
1617 void
netns_set_ifnet(netns_token * token,ifnet_t ifp)1618 netns_set_ifnet(netns_token *token, ifnet_t ifp)
1619 {
1620 	struct ns_token *nt;
1621 #if SK_LOG
1622 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1623 #endif /* SK_LOG */
1624 
1625 	if (__netns_inited == 0) {
1626 		return;
1627 	}
1628 
1629 	NETNS_LOCK();
1630 
1631 	VERIFY(NETNS_TOKEN_VALID(token));
1632 
1633 	nt = *token;
1634 
1635 	if (nt->nt_ifp == ifp) {
1636 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1637 		    "%s:%s:%d // ifnet already %d, exiting early",
1638 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1639 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1640 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1641 		    ifp ? ifp->if_index : -1);
1642 		NETNS_UNLOCK();
1643 		return;
1644 	}
1645 
1646 	netns_clear_ifnet(nt);
1647 
1648 	_netns_set_ifnet_internal(nt, ifp);
1649 
1650 	NETNS_UNLOCK();
1651 }
1652 
1653 void
netns_ifnet_detach(ifnet_t ifp)1654 netns_ifnet_detach(ifnet_t ifp)
1655 {
1656 	struct ns_token *token, *tmp_token;
1657 
1658 	if (__netns_inited == 0) {
1659 		return;
1660 	}
1661 
1662 	NETNS_LOCK();
1663 
1664 	SLIST_FOREACH_SAFE(token, &ifp->if_netns_tokens, nt_ifp_link,
1665 	    tmp_token) {
1666 		netns_clear_ifnet(token);
1667 		SLIST_INSERT_HEAD(&netns_unbound_tokens, token, nt_ifp_link);
1668 	}
1669 
1670 	NETNS_UNLOCK();
1671 }
1672 
1673 static void
_netns_set_state(netns_token * token,uint32_t state)1674 _netns_set_state(netns_token *token, uint32_t state)
1675 {
1676 	struct ns_token *nt;
1677 #if SK_LOG
1678 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1679 #endif /* SK_LOG */
1680 
1681 	if (__netns_inited == 0) {
1682 		return;
1683 	}
1684 
1685 	NETNS_LOCK();
1686 	VERIFY(NETNS_TOKEN_VALID(token));
1687 
1688 	nt = *token;
1689 	nt->nt_state |= state;
1690 
1691 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1692 	    "%s:%s:%d // state 0x%b",
1693 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1694 	    tmp_ip_str, sizeof(tmp_ip_str)),
1695 	    PROTO_STR(nt->nt_proto), nt->nt_port, state, NETNS_STATE_BITS);
1696 
1697 	NETNS_UNLOCK();
1698 }
1699 
1700 void
netns_half_close(netns_token * token)1701 netns_half_close(netns_token *token)
1702 {
1703 	_netns_set_state(token, NETNS_STATE_HALFCLOSED);
1704 }
1705 
1706 void
netns_withdraw(netns_token * token)1707 netns_withdraw(netns_token *token)
1708 {
1709 	_netns_set_state(token, NETNS_STATE_WITHDRAWN);
1710 }
1711 
1712 int
netns_get_flow_info(netns_token * token,struct ns_flow_info * nfi)1713 netns_get_flow_info(netns_token *token,
1714     struct ns_flow_info *nfi)
1715 {
1716 	if (__netns_inited == 0) {
1717 		return ENOTSUP;
1718 	}
1719 
1720 	NETNS_LOCK();
1721 	if (!NETNS_TOKEN_VALID(token) ||
1722 	    nfi == NULL) {
1723 		NETNS_UNLOCK();
1724 		return EINVAL;
1725 	}
1726 
1727 	struct ns_token *nt = *token;
1728 	if (nt->nt_flow_info == NULL) {
1729 		NETNS_UNLOCK();
1730 		return ENOENT;
1731 	}
1732 
1733 	memcpy(nfi, nt->nt_flow_info, sizeof(struct ns_flow_info));
1734 	NETNS_UNLOCK();
1735 
1736 	return 0;
1737 }
1738 
1739 void
netns_change_flags(netns_token * token,uint32_t set_flags,uint32_t clear_flags)1740 netns_change_flags(netns_token *token, uint32_t set_flags,
1741     uint32_t clear_flags)
1742 {
1743 	struct ns_token *nt;
1744 #if SK_LOG
1745 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1746 #endif /* SK_LOG */
1747 
1748 	if (__netns_inited == 0) {
1749 		return;
1750 	}
1751 
1752 	NETNS_LOCK();
1753 
1754 	VERIFY(NETNS_TOKEN_VALID(token));
1755 
1756 	nt = *token;
1757 
1758 	VERIFY(!((set_flags | clear_flags) & NETNS_RESERVATION_FLAGS));
1759 	/* TODO: verify set and clear flags don't overlap? */
1760 
1761 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1762 	    "%s:%s:%d // flags 0x%x -> 0x%x",
1763 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1764 	    tmp_ip_str, sizeof(tmp_ip_str)),
1765 	    PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags,
1766 	    nt->nt_flags | set_flags & ~clear_flags);
1767 
1768 	nt->nt_flags |= set_flags;
1769 	nt->nt_flags &= ~clear_flags;
1770 
1771 	NETNS_UNLOCK();
1772 }
1773 
1774 /*
1775  * Port offloading KPI
1776  */
1777 static inline void
netns_local_port_scan_flow_entry(struct flow_entry * fe,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1778 netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protocol,
1779     u_int32_t flags, u_int8_t *bitfield)
1780 {
1781 	struct ns_token *token;
1782 	boolean_t iswildcard = false;
1783 
1784 	if (fe == NULL) {
1785 		return;
1786 	}
1787 
1788 	if (fe->fe_flags & FLOWENTF_EXTRL_PORT) {
1789 		return;
1790 	}
1791 
1792 	token = fe->fe_port_reservation;
1793 	if (token == NULL) {
1794 		return;
1795 	}
1796 
1797 	/*
1798 	 * We are only interested in active flows over skywalk channels
1799 	 */
1800 	if ((token->nt_flags & NETNS_OWNER_MASK) != NETNS_SKYWALK) {
1801 		return;
1802 	}
1803 
1804 	if (token->nt_state & NETNS_STATE_WITHDRAWN) {
1805 		return;
1806 	}
1807 
1808 	if (!(flags & IFNET_GET_LOCAL_PORTS_ANYTCPSTATEOK) &&
1809 	    (flags & IFNET_GET_LOCAL_PORTS_ACTIVEONLY) &&
1810 	    (token->nt_state & NETNS_STATE_HALFCLOSED)) {
1811 		return;
1812 	}
1813 
1814 	VERIFY(token->nt_addr_len == sizeof(struct in_addr) ||
1815 	    token->nt_addr_len == sizeof(struct in6_addr));
1816 
1817 	if (token->nt_addr_len == sizeof(struct in_addr)) {
1818 		if (protocol == PF_INET6) {
1819 			return;
1820 		}
1821 
1822 		iswildcard = token->nt_inaddr.s_addr == INADDR_ANY;
1823 	} else if (token->nt_addr_len == sizeof(struct in6_addr)) {
1824 		if (protocol == PF_INET) {
1825 			return;
1826 		}
1827 
1828 		iswildcard = IN6_IS_ADDR_UNSPECIFIED(
1829 			&token->nt_in6addr);
1830 	}
1831 	if (!(flags & IFNET_GET_LOCAL_PORTS_WILDCARDOK) && iswildcard) {
1832 		return;
1833 	}
1834 
1835 	if ((flags & IFNET_GET_LOCAL_PORTS_TCPONLY) &&
1836 	    token->nt_proto == IPPROTO_UDP) {
1837 		return;
1838 	}
1839 	if ((flags & IFNET_GET_LOCAL_PORTS_UDPONLY) &&
1840 	    token->nt_proto == IPPROTO_TCP) {
1841 		return;
1842 	}
1843 
1844 	if (!(flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) &&
1845 	    (token->nt_flags & NETNS_NOWAKEFROMSLEEP)) {
1846 		return;
1847 	}
1848 
1849 	if ((flags & IFNET_GET_LOCAL_PORTS_RECVANYIFONLY) &&
1850 	    !(token->nt_flags & NETNS_RECVANYIF)) {
1851 		return;
1852 	}
1853 
1854 	if ((flags & IFNET_GET_LOCAL_PORTS_EXTBGIDLEONLY) &&
1855 	    !(token->nt_flags & NETNS_EXTBGIDLE)) {
1856 		return;
1857 	}
1858 
1859 	if (token->nt_ifp != NULL && token->nt_flow_info != NULL) {
1860 		bitstr_set(bitfield, token->nt_port);
1861 		(void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index,
1862 		    token->nt_flow_info, token->nt_flags);
1863 	} else {
1864 		SK_ERR("%s: unknown owner port %u"
1865 		    " nt_flags 0x%x ifindex %u nt_flow_info %p\n",
1866 		    __func__, token->nt_port,
1867 		    token->nt_flags,
1868 		    token->nt_ifp != NULL ? token->nt_ifp->if_index : 0,
1869 		    token->nt_flow_info);
1870 	}
1871 }
1872 
1873 static void
netns_get_if_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1874 netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol,
1875     u_int32_t flags, u_int8_t *bitfield)
1876 {
1877 	struct nx_flowswitch *fsw = NULL;
1878 
1879 	if (ifp == NULL || ifp->if_na == NULL) {
1880 		return;
1881 	}
1882 	/* Ensure that the interface is attached and won't detach */
1883 	if (!ifnet_is_attached(ifp, 1)) {
1884 		return;
1885 	}
1886 	fsw = fsw_ifp_to_fsw(ifp);
1887 	if (fsw == NULL) {
1888 		goto done;
1889 	}
1890 	FSW_RLOCK(fsw);
1891 	NETNS_LOCK();
1892 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1893 		netns_local_port_scan_flow_entry(_fe, protocol, flags,
1894 		bitfield);
1895 	});
1896 	NETNS_UNLOCK();
1897 	FSW_UNLOCK(fsw);
1898 done:
1899 	ifnet_decr_iorefcnt(ifp);
1900 }
1901 
1902 errno_t
netns_get_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1903 netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol,
1904     u_int32_t flags, u_int8_t *bitfield)
1905 {
1906 	if (__netns_inited == 0) {
1907 		return 0;
1908 	}
1909 	if (ifp != NULL) {
1910 		netns_get_if_local_ports(ifp, protocol, flags, bitfield);
1911 	} else {
1912 		errno_t error;
1913 		ifnet_t *ifp_list;
1914 		uint32_t count, i;
1915 
1916 		error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count);
1917 		if (error != 0) {
1918 			os_log_error(OS_LOG_DEFAULT,
1919 			    "%s: ifnet_list_get_all() failed %d",
1920 			    __func__, error);
1921 			return error;
1922 		}
1923 		for (i = 0; i < count; i++) {
1924 			if (TAILQ_EMPTY(&ifp_list[i]->if_addrhead)) {
1925 				continue;
1926 			}
1927 			netns_get_if_local_ports(ifp_list[i], protocol, flags,
1928 			    bitfield);
1929 		}
1930 		ifnet_list_free(ifp_list);
1931 	}
1932 
1933 	return 0;
1934 }
1935 
1936 uint32_t
netns_find_anyres_byaddr(struct ifaddr * ifa,uint8_t proto)1937 netns_find_anyres_byaddr(struct ifaddr *ifa, uint8_t proto)
1938 {
1939 	int result = 0;
1940 	int ifa_addr_len;
1941 	struct ns_token *token;
1942 	struct ifnet *ifp = ifa->ifa_ifp;
1943 	struct sockaddr *ifa_addr = ifa->ifa_addr;
1944 
1945 	if (__netns_inited == 0) {
1946 		return ENOTSUP;
1947 	}
1948 
1949 	if ((ifa_addr->sa_family != AF_INET) &&
1950 	    (ifa_addr->sa_family != AF_INET6)) {
1951 		return 0;
1952 	}
1953 
1954 	ifa_addr_len = (ifa_addr->sa_family == AF_INET) ?
1955 	    sizeof(struct in_addr) : sizeof(struct in6_addr);
1956 
1957 	NETNS_LOCK();
1958 
1959 	SLIST_FOREACH(token, &ifp->if_netns_tokens, nt_ifp_link) {
1960 		if ((token->nt_flags & NETNS_OWNER_MASK) == NETNS_PF) {
1961 			continue;
1962 		}
1963 		if (token->nt_addr_len != ifa_addr_len) {
1964 			continue;
1965 		}
1966 		if (token->nt_proto != proto) {
1967 			continue;
1968 		}
1969 		if (ifa_addr->sa_family == AF_INET) {
1970 			if (token->nt_inaddr.s_addr ==
1971 			    (satosin(ifa->ifa_addr))->sin_addr.s_addr) {
1972 				result = 1;
1973 				break;
1974 			}
1975 		} else if (ifa_addr->sa_family == AF_INET6) {
1976 			if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa),
1977 			    &token->nt_in6addr)) {
1978 				result = 1;
1979 				break;
1980 			}
1981 		}
1982 	}
1983 
1984 	NETNS_UNLOCK();
1985 	return result;
1986 }
1987 
1988 static uint32_t
_netns_lookup_ns_n_reservations(uint32_t * addr,uint8_t addr_len,uint8_t proto)1989 _netns_lookup_ns_n_reservations(uint32_t *addr, uint8_t addr_len, uint8_t proto)
1990 {
1991 	uint32_t ns_n_reservations = 0;
1992 	NETNS_LOCK_SPIN();
1993 	struct ns *namespace = _netns_get_ns(addr, addr_len, proto, true);
1994 	if (namespace != NULL) {
1995 		ns_n_reservations = namespace->ns_n_reservations;
1996 	}
1997 	NETNS_UNLOCK();
1998 	return ns_n_reservations;
1999 }
2000 
/*
 * Return the reservation count of the namespace keyed by the IPv4
 * address `addr` and protocol `proto` (0 when there is no such namespace).
 */
uint32_t
netns_lookup_reservations_count_in(struct in_addr addr, uint8_t proto)
{
	/* `addr` is passed by value, so the key points at our local copy. */
	uint32_t *key = &addr.s_addr;

	return _netns_lookup_ns_n_reservations(key, sizeof(struct in_addr),
	           proto);
}
2006 
2007 uint32_t
netns_lookup_reservations_count_in6(struct in6_addr addr,uint8_t proto)2008 netns_lookup_reservations_count_in6(struct in6_addr addr, uint8_t proto)
2009 {
2010 	if (IN6_IS_SCOPE_EMBED(&addr)) {
2011 		addr.s6_addr16[1] = 0;
2012 	}
2013 	return _netns_lookup_ns_n_reservations(&addr.s6_addr32[0], sizeof(struct in6_addr), proto);
2014 }
2015 
2016 /*
2017  * Sysctl interface
2018  */
2019 
2020 static int netns_ctl_dump_all SYSCTL_HANDLER_ARGS;
2021 
2022 SYSCTL_NODE(_kern_skywalk, OID_AUTO, netns, CTLFLAG_RW | CTLFLAG_LOCKED,
2023     0, "Netns interface");
2024 
2025 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netns,
2026     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
2027     0, 0, netns_ctl_dump_all, "-",
2028     "Namespace contents (struct netns_ctl_dump_header, "
2029     "skywalk/os_stats_private.h)");
2030 
2031 static int
netns_ctl_write_ns(struct sysctl_req * req,struct ns * namespace,boolean_t is_global)2032 netns_ctl_write_ns(struct sysctl_req *req, struct ns *namespace,
2033     boolean_t is_global)
2034 {
2035 	struct ns_reservation *res;
2036 	struct netns_ctl_dump_header response_header;
2037 	struct netns_ctl_dump_record response_record;
2038 	int err;
2039 
2040 	/* Fill out header */
2041 	memset(&response_header, 0, sizeof(response_header));
2042 	response_header.ncdh_n_records = namespace->ns_n_reservations;
2043 	response_header.ncdh_proto = namespace->ns_proto;
2044 
2045 	if (is_global) {
2046 		response_header.ncdh_addr_len = 0;
2047 	} else {
2048 		response_header.ncdh_addr_len = namespace->ns_addr_len;
2049 	}
2050 	memcpy(response_header.ncdh_addr, namespace->ns_addr,
2051 	    namespace->ns_addr_len);
2052 
2053 	err = SYSCTL_OUT(req, &response_header, sizeof(response_header));
2054 	if (err) {
2055 		return err;
2056 	}
2057 
2058 	/* Fill out records */
2059 	RB_FOREACH(res, ns_reservation_tree, &namespace->ns_reservations) {
2060 		memset(&response_record, 0, sizeof(response_record));
2061 		response_record.ncdr_port = res->nsr_port;
2062 		response_record.ncdr_port_end = 0;
2063 		response_record.ncdr_listener_refs =
2064 		    NETNS_REF_COUNT(res, NETNS_LISTENER);
2065 		response_record.ncdr_skywalk_refs =
2066 		    NETNS_REF_COUNT(res, NETNS_SKYWALK);
2067 		response_record.ncdr_bsd_refs =
2068 		    NETNS_REF_COUNT(res, NETNS_BSD);
2069 		response_record.ncdr_pf_refs =
2070 		    NETNS_REF_COUNT(res, NETNS_PF);
2071 		err = SYSCTL_OUT(req, &response_record,
2072 		    sizeof(response_record));
2073 		if (err) {
2074 			return err;
2075 		}
2076 	}
2077 
2078 	return 0;
2079 }
2080 
2081 static int
2082 netns_ctl_dump_all SYSCTL_HANDLER_ARGS
2083 {
2084 #pragma unused(oidp, arg1, arg2)
2085 	struct ns *namespace;
2086 	int i, err = 0;
2087 
2088 	if (!kauth_cred_issuser(kauth_cred_get())) {
2089 		return EPERM;
2090 	}
2091 
2092 	if (__netns_inited == 0) {
2093 		return ENOTSUP;
2094 	}
2095 
2096 	NETNS_LOCK();
2097 
2098 	for (i = 0; i < NETNS_N_GLOBAL; i++) {
2099 		err = netns_ctl_write_ns(req, netns_global_non_wild[i], true);
2100 		if (err) {
2101 			goto done;
2102 		}
2103 	}
2104 
2105 	RB_FOREACH(namespace, netns_namespaces_tree, &netns_namespaces) {
2106 		err = netns_ctl_write_ns(req, namespace, false);
2107 		if (err) {
2108 			goto done;
2109 		}
2110 	}
2111 
2112 	/*
2113 	 * If this is just a request for length, add slop because
2114 	 * this is dynamically changing data
2115 	 */
2116 	if (req->oldptr == USER_ADDR_NULL) {
2117 		req->oldidx += 20 * sizeof(struct netns_ctl_dump_record);
2118 	}
2119 
2120 done:
2121 	NETNS_UNLOCK();
2122 	return err;
2123 }
2124 /* CSTYLED */
2125