xref: /xnu-10002.81.5/bsd/skywalk/namespace/netns.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/assert.h>
30 #include <kern/locks.h>
31 #include <kern/zalloc.h>
32 #include <libkern/tree.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/bitstring.h>
36 #include <net/if.h>
37 #include <net/kpi_interface.h>
38 #include <net/restricted_in_port.h>
39 
40 #include <netinet/in.h>
41 #include <netinet/in_pcb.h>
42 #include <netinet/tcp_fsm.h>
43 #include <netinet/tcp_var.h>
44 
45 #include <netinet6/in6_var.h>
46 #include <string.h>
47 
48 #include <skywalk/os_skywalk.h>
49 #include <skywalk/os_skywalk_private.h>
50 #include <skywalk/os_stats_private.h>
51 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
52 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
53 
54 #include <net/if_ports_used.h>
55 
/*
 * Initialization flag for the netns subsystem.
 * NOTE(review): its readers and writers are outside this part of the file;
 * presumably it is set once initialization has run -- confirm.
 */
static int __netns_inited = 0;

/*
 * Logging
 *
 * Helpers that turn a protocol number or an address length into the
 * verbosity bit, printable name or address family used by the log
 * statements in this file.
 *
 * Every macro argument is parenthesized so that a compound argument (for
 * example a conditional expression) is compared as a single unit.
 */

/* SK_VERB_NS_TCP for IPPROTO_TCP, SK_VERB_NS_UDP for anything else. */
#define NS_VERB_PROTO(proto)    (((proto) == IPPROTO_TCP) ? SK_VERB_NS_TCP : \
	                                    SK_VERB_NS_UDP)
/* SK_VERB_NS_IPV4 for an in_addr-sized length, SK_VERB_NS_IPV6 otherwise. */
#define NS_VERB_IP(addr_len)    (((addr_len) == sizeof (struct in_addr)) ? \
	                                    SK_VERB_NS_IPV4 : SK_VERB_NS_IPV6)
/* "tcp" for IPPROTO_TCP, "udp" for anything else. */
#define PROTO_STR(proto)        (((proto) == IPPROTO_TCP) ? "tcp" : "udp")
/* AF_INET for an in_addr-sized length, AF_INET6 otherwise. */
#define LEN_TO_AF(len)          ((((len) == sizeof (struct in_addr)) ? \
	                            AF_INET : AF_INET6))
69 /*
70  * Locking
71  * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is
72  * aquired at the entry of every kernel-facing function, and released at the
73  * end. Data within netns_token structures is also protected under this lock.
74  */
75 
76 #define NETNS_LOCK()                    \
77 	lck_mtx_lock(&netns_lock)
78 #define NETNS_LOCK_SPIN()               \
79 	lck_mtx_lock_spin(&netns_lock)
80 #define NETNS_LOCK_CONVERT() do {       \
81 	NETNS_LOCK_ASSERT_HELD();       \
82 	lck_mtx_convert_spin(&netns_lock); \
83 } while (0)
84 #define NETNS_UNLOCK()                  \
85 	lck_mtx_unlock(&netns_lock)
86 #define NETNS_LOCK_ASSERT_HELD()        \
87 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_OWNED)
88 #define NETNS_LOCK_ASSERT_NOTHELD()     \
89 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_NOTOWNED)
90 
91 static LCK_GRP_DECLARE(netns_lock_group, "netns_lock");
92 static LCK_MTX_DECLARE(netns_lock, &netns_lock_group);
93 
94 /*
95  * Internal data structures and parameters
96  */
97 
98 /*
99  * Local ports are kept track of by reference counts kept in a tree specific to
100  * an <IP, protocol> tuple (see struct ns).
101  *
102  * Note: port numbers are stored in host byte order.
103  */
104 struct ns_reservation {
105 	RB_ENTRY(ns_reservation) nsr_link;
106 	uint32_t nsr_refs[NETNS_OWNER_MAX + 1];
107 	in_port_t nsr_port;
108 	bool nsr_reuseport:1;
109 };
110 
111 #define NETNS_REF_COUNT(nsr, flags)     \
112 	(nsr)->nsr_refs[((flags) & NETNS_OWNER_MASK)]
113 
114 static inline int nsr_cmp(const struct ns_reservation *,
115     const struct ns_reservation *);
116 
117 RB_HEAD(ns_reservation_tree, ns_reservation);
118 RB_PROTOTYPE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
119 RB_GENERATE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
120 
121 static inline struct ns_reservation *ns_reservation_tree_find(
122 	struct ns_reservation_tree *, const in_port_t);
123 
124 /*
125  * A namespace keeps track of the local port numbers in use for a given
126  * <IP, protocol> tuple. There are also global namespaces for each
127  * protocol to accomodate INADDR_ANY behavior and diagnostics.
128  */
129 struct ns {
130 	RB_ENTRY(ns)    ns_link;
131 
132 	void            *ns_addr_key;
133 
134 	union {
135 		uint32_t        ns_addr[4];
136 		struct in_addr  ns_inaddr;
137 		struct in6_addr ns_in6addr;
138 	};
139 	uint8_t         ns_addr_len;
140 	uint8_t         ns_proto;
141 
142 	in_port_t       ns_last_ephemeral_port_down;
143 	in_port_t       ns_last_ephemeral_port_up;
144 
145 	uint8_t         ns_is_freeable;
146 
147 	uint32_t        ns_n_reservations;
148 	struct ns_reservation_tree ns_reservations;
149 };
150 
151 static uint32_t netns_n_namespaces;
152 
153 static inline int ns_cmp(const struct ns *, const struct ns *);
154 
155 RB_HEAD(netns_namespaces_tree, ns) netns_namespaces =
156     RB_INITIALIZER(netns_namespaces);
157 RB_PROTOTYPE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
158 RB_GENERATE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
159 
/*
 * Pointers to the global namespaces, one slot per <protocol, address
 * family> pair; use NETNS_NS_GLOBAL_IDX() to pick the slot.
 *
 * netns_global_non_wild: all non-wildcard reservations will have an entry
 *	here.
 * netns_global_wild: filled in by _netns_get_ns() with the first
 *	wildcard-address namespace created for that slot.
 */
#define NETNS_N_GLOBAL  4
static struct ns *netns_global_non_wild[NETNS_N_GLOBAL];
static struct ns *netns_global_wild[NETNS_N_GLOBAL];

/* Address lengths that select the family half of the index. */
#define NETNS_ADDRLEN_V4 (sizeof(struct in_addr))
#define NETNS_ADDRLEN_V6 (sizeof(struct in6_addr))

/* Index bits: bit 0 carries the protocol, bit 1 the address family. */
#define NETNS_NS_TCP    0
#define NETNS_NS_UDP    1
#define NETNS_NS_V4     0
#define NETNS_NS_V6     2

/* TCP maps to NETNS_NS_TCP, any other protocol to NETNS_NS_UDP. */
#define NETNS_NS_GLOBAL_IDX(proto, addrlen)     \
	((((proto) == IPPROTO_TCP) ? NETNS_NS_TCP : NETNS_NS_UDP) | \
	(((addrlen) == NETNS_ADDRLEN_V4) ? NETNS_NS_V4 : NETNS_NS_V6))

/* NOTE(review): consumers of this value are outside this part of the file. */
#define NETNS_NS_UDP_EPHEMERAL_RESERVE  4096
178 
179 /*
180  * Internal token structure
181  *
182  * Note: port numbers are stored in host byte order.
183  */
184 struct ns_token {
185 	/* Reservation state */
186 	ifnet_t                 nt_ifp;
187 	SLIST_ENTRY(ns_token)   nt_ifp_link;
188 	SLIST_ENTRY(ns_token)   nt_all_link;
189 	uint32_t                nt_state;       /* NETNS_STATE_* */
190 
191 	/* Reservation context */
192 	union {
193 		uint32_t        nt_addr[4];
194 		struct in_addr  nt_inaddr;
195 		struct in6_addr nt_in6addr;
196 	};
197 	uint8_t                 nt_addr_len;
198 	uint8_t                 nt_proto;
199 	in_port_t               nt_port;
200 	uint32_t                nt_flags;
201 
202 	/* Optional information about the flow */
203 	struct ns_flow_info     *nt_flow_info;
204 };
205 
206 /* Valid values for nt_state */
207 #define NETNS_STATE_HALFCLOSED  0x1     /* half closed */
208 #define NETNS_STATE_WITHDRAWN   0x2     /* withdrawn; not offloadable */
209 
210 #define NETNS_STATE_BITS        "\020\01HALFCLOSED\02WITHDRAWN"
211 
212 /* List of tokens not bound to an ifnet */
213 SLIST_HEAD(, ns_token) netns_unbound_tokens = SLIST_HEAD_INITIALIZER(
214 	netns_unbound_tokens);
215 
216 /* List of all tokens currently allocated in the system */
217 SLIST_HEAD(, ns_token) netns_all_tokens = SLIST_HEAD_INITIALIZER(
218 	netns_all_tokens);
219 
220 /*
221  * Memory management
222  */
223 static SKMEM_TYPE_DEFINE(netns_ns_zone, struct ns);
224 
225 #define NETNS_NS_TOKEN_ZONE_NAME        "netns.ns_token"
226 static unsigned int netns_ns_token_size; /* size of zone element */
227 static struct skmem_cache *netns_ns_token_cache; /* for ns_token */
228 
229 #define NETNS_NS_FLOW_INFO_ZONE_NAME    "netns.ns_flow_info"
230 static unsigned int netns_ns_flow_info_size; /* size of zone element */
231 static struct skmem_cache *netns_ns_flow_info_cache; /* for ns_flow_info */
232 
233 #define NETNS_NS_RESERVATION_ZONE_NAME  "netns.ns_reservation"
234 static unsigned int netns_ns_reservation_size; /* size of zone element */
235 static struct skmem_cache *netns_ns_reservation_cache; /* for ns_reservation */
236 
237 static struct ns_reservation *netns_ns_reservation_alloc(in_port_t, uint32_t);
238 static void netns_ns_reservation_free(struct ns_reservation *);
239 static struct ns *netns_ns_alloc(zalloc_flags_t);
240 static void netns_ns_free(struct ns *);
241 static void netns_ns_cleanup(struct ns *);
242 static struct ns_token *netns_ns_token_alloc(boolean_t);
243 static void netns_ns_token_free(struct ns_token *);
244 
245 /*
246  * Utility/internal code
247  */
248 static struct ns *_netns_get_ns(uint32_t *, uint8_t, uint8_t, bool);
249 static inline boolean_t _netns_is_wildcard_addr(const uint32_t *, uint8_t);
250 static int _netns_reserve_common(struct ns *, in_port_t, uint32_t);
251 static void _netns_release_common(struct ns *, in_port_t, uint32_t);
252 static inline void netns_clear_ifnet(struct ns_token *);
253 static int _netns_reserve_kpi_common(struct ns *, netns_token *, uint32_t *,
254     uint8_t, uint8_t, in_port_t *, uint32_t, struct ns_flow_info *);
255 static void _netns_set_ifnet_internal(struct ns_token *, struct ifnet *);
256 
257 static struct ns_reservation *
netns_ns_reservation_alloc(in_port_t port,uint32_t flags)258 netns_ns_reservation_alloc(in_port_t port, uint32_t flags)
259 {
260 	struct ns_reservation *res;
261 
262 	VERIFY(port != 0);
263 
264 	res = skmem_cache_alloc(netns_ns_reservation_cache, SKMEM_SLEEP);
265 	ASSERT(res != NULL);
266 
267 	bzero(res, netns_ns_reservation_size);
268 	res->nsr_port = port;
269 	res->nsr_reuseport = ((flags & NETNS_REUSEPORT) != 0);
270 	return res;
271 }
272 
273 static void
netns_ns_reservation_free(struct ns_reservation * res)274 netns_ns_reservation_free(struct ns_reservation *res)
275 {
276 	skmem_cache_free(netns_ns_reservation_cache, res);
277 }
278 
279 static struct ns *
netns_ns_alloc(zalloc_flags_t how)280 netns_ns_alloc(zalloc_flags_t how)
281 {
282 	struct ns *namespace;
283 	in_port_t first = (in_port_t)ipport_firstauto;
284 	in_port_t last  = (in_port_t)ipport_lastauto;
285 	in_port_t rand_port;
286 
287 	namespace = zalloc_flags(netns_ns_zone, how | Z_ZERO);
288 	if (namespace == NULL) {
289 		return NULL;
290 	}
291 
292 	namespace->ns_is_freeable = 1;
293 
294 	RB_INIT(&namespace->ns_reservations);
295 
296 	/*
297 	 * Randomize the initial ephemeral port starting point, just in case
298 	 * this namespace is for an ipv6 address which gets brought up and
299 	 * down often.
300 	 */
301 	if (first == last) {
302 		rand_port = first;
303 	} else {
304 		read_frandom(&rand_port, sizeof(rand_port));
305 
306 		if (first > last) {
307 			rand_port = last + (rand_port % (first - last));
308 		} else {
309 			rand_port = first + (rand_port % (last - first));
310 		}
311 	}
312 	namespace->ns_last_ephemeral_port_down = rand_port;
313 	namespace->ns_last_ephemeral_port_up = rand_port;
314 
315 	return namespace;
316 }
317 
318 static void
netns_ns_free(struct ns * namespace)319 netns_ns_free(struct ns *namespace)
320 {
321 	struct ns_reservation *res;
322 	struct ns_reservation *tmp_res;
323 #if SK_LOG
324 	char tmp_ip_str[MAX_IPv6_STR_LEN];
325 #endif /* SK_LOG */
326 
327 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
328 	    NS_VERB_PROTO(namespace->ns_proto),
329 	    "freeing %s ns for IP %s",
330 	    PROTO_STR(namespace->ns_proto),
331 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
332 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)));
333 
334 	RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations,
335 	    tmp_res) {
336 		netns_ns_reservation_free(res);
337 		namespace->ns_n_reservations--;
338 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
339 		    res);
340 	}
341 
342 	VERIFY(RB_EMPTY(&namespace->ns_reservations));
343 
344 	if (netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
345 	    namespace->ns_addr_len)] == namespace) {
346 		netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
347 		namespace->ns_addr_len)] = NULL;
348 	}
349 	if (netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
350 	    namespace->ns_addr_len)] == namespace) {
351 		netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
352 		namespace->ns_addr_len)] = NULL;
353 	}
354 
355 	zfree(netns_ns_zone, namespace);
356 }
357 
358 static void
netns_ns_cleanup(struct ns * namespace)359 netns_ns_cleanup(struct ns *namespace)
360 {
361 	if (namespace->ns_is_freeable &&
362 	    RB_EMPTY(&namespace->ns_reservations)) {
363 		RB_REMOVE(netns_namespaces_tree, &netns_namespaces, namespace);
364 		netns_n_namespaces--;
365 		netns_ns_free(namespace);
366 	}
367 }
368 
369 static struct ns_token *
netns_ns_token_alloc(boolean_t with_nfi)370 netns_ns_token_alloc(boolean_t with_nfi)
371 {
372 	struct ns_token *token;
373 
374 	NETNS_LOCK_ASSERT_HELD();
375 	NETNS_LOCK_CONVERT();
376 
377 	token = skmem_cache_alloc(netns_ns_token_cache, SKMEM_SLEEP);
378 	ASSERT(token != NULL);
379 
380 	bzero(token, netns_ns_token_size);
381 
382 	if (with_nfi) {
383 		token->nt_flow_info =  skmem_cache_alloc(netns_ns_flow_info_cache,
384 		    SKMEM_SLEEP);
385 		ASSERT(token->nt_flow_info != NULL);
386 	}
387 	SLIST_INSERT_HEAD(&netns_all_tokens, token, nt_all_link);
388 
389 	return token;
390 }
391 
392 static void
netns_ns_token_free(struct ns_token * token)393 netns_ns_token_free(struct ns_token *token)
394 {
395 	NETNS_LOCK_ASSERT_HELD();
396 	NETNS_LOCK_CONVERT();
397 	SLIST_REMOVE(&netns_all_tokens, token, ns_token, nt_all_link);
398 
399 	if (token->nt_flow_info != NULL) {
400 		skmem_cache_free(netns_ns_flow_info_cache, token->nt_flow_info);
401 	}
402 	skmem_cache_free(netns_ns_token_cache, token);
403 }
404 
405 __attribute__((always_inline))
406 static inline int
nsr_cmp(const struct ns_reservation * nsr1,const struct ns_reservation * nsr2)407 nsr_cmp(const struct ns_reservation *nsr1, const struct ns_reservation *nsr2)
408 {
409 #define NSR_COMPARE(r1, r2)     ((int)(r1)->nsr_port - (int)(r2)->nsr_port)
410 	return NSR_COMPARE(nsr1, nsr2);
411 }
412 
413 __attribute__((always_inline))
414 static inline int
ns_cmp(const struct ns * a,const struct ns * b)415 ns_cmp(const struct ns *a, const struct ns *b)
416 {
417 	int d;
418 
419 	if ((d = (a->ns_addr_len - b->ns_addr_len)) != 0) {
420 		return d;
421 	}
422 	if ((d = (a->ns_proto - b->ns_proto)) != 0) {
423 		return d;
424 	}
425 	if ((d = flow_ip_cmp(a->ns_addr_key, b->ns_addr_key,
426 	    b->ns_addr_len)) != 0) {
427 		return d;
428 	}
429 
430 	return 0;
431 }
432 
433 /*
434  * Common routine to look up a reservation.
435  *
436  * NOTE: Assumes the caller holds the NETNS global lock
437  */
438 __attribute__((always_inline))
439 static inline struct ns_reservation *
ns_reservation_tree_find(struct ns_reservation_tree * tree,const in_port_t port)440 ns_reservation_tree_find(struct ns_reservation_tree *tree, const in_port_t port)
441 {
442 	struct ns_reservation res;
443 	res.nsr_port = port;
444 	return RB_FIND(ns_reservation_tree, tree, &res);
445 }
446 
447 /*
448  * Retrieve the namespace for the supplied <address, protocol> tuple.
449  * If create is set and such a namespace doesn't already exist, one will be
450  * created.
451  */
452 static struct ns *
_netns_get_ns(uint32_t * addr,uint8_t addr_len,uint8_t proto,bool create)453 _netns_get_ns(uint32_t *addr, uint8_t addr_len, uint8_t proto, bool create)
454 {
455 	struct ns *namespace = NULL;
456 	struct ns find = {
457 		.ns_addr_key = addr,
458 		.ns_addr_len = addr_len,
459 		.ns_proto = proto,
460 	};
461 #if SK_LOG
462 	char tmp_ip_str[MAX_IPv6_STR_LEN];
463 #endif /* SK_LOG */
464 
465 	VERIFY(addr_len == sizeof(struct in_addr) ||
466 	    addr_len == sizeof(struct in6_addr));
467 
468 	NETNS_LOCK_ASSERT_HELD();
469 
470 	namespace = RB_FIND(netns_namespaces_tree, &netns_namespaces, &find);
471 
472 	if (create && namespace == NULL) {
473 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
474 		    "allocating %s ns for IP %s",
475 		    PROTO_STR(proto), inet_ntop(LEN_TO_AF(addr_len), addr,
476 		    tmp_ip_str, sizeof(tmp_ip_str)));
477 		NETNS_LOCK_CONVERT();
478 		namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL);
479 		__builtin_assume(namespace != NULL);
480 		memcpy(namespace->ns_addr, addr, addr_len);
481 		namespace->ns_addr_key = &namespace->ns_addr;
482 		namespace->ns_addr_len = addr_len;
483 		namespace->ns_proto = proto;
484 		RB_INSERT(netns_namespaces_tree, &netns_namespaces, namespace);
485 		netns_n_namespaces++;
486 
487 		if (_netns_is_wildcard_addr(addr, addr_len) &&
488 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
489 		    addr_len)] == NULL) {
490 			netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
491 			addr_len)] = namespace;
492 		}
493 	}
494 
495 	return namespace;
496 }
497 
498 /*
499  * Return true if the supplied address is a wildcard (INADDR_ANY)
500  */
501 __attribute__((always_inline))
502 static boolean_t
_netns_is_wildcard_addr(const uint32_t * addr,uint8_t addr_len)503 _netns_is_wildcard_addr(const uint32_t *addr, uint8_t addr_len)
504 {
505 	boolean_t wildcard;
506 
507 	switch (addr_len) {
508 	case sizeof(struct in_addr):
509 		wildcard = (addr[0] == 0);
510 		break;
511 
512 	case sizeof(struct in6_addr):
513 		wildcard = (addr[0] == 0 && addr[1] == 0 &&
514 		    addr[2] == 0 && addr[3] == 0);
515 		break;
516 
517 	default:
518 		wildcard = FALSE;
519 		break;
520 	}
521 
522 	return wildcard;
523 }
524 
525 __attribute__((always_inline))
526 static boolean_t
_netns_is_port_used(struct ns * gns,struct ns_reservation * curr_res,in_port_t port)527 _netns_is_port_used(struct ns * gns, struct ns_reservation *curr_res, in_port_t port)
528 {
529 	struct ns_reservation *res = NULL;
530 
531 	if (gns == NULL) {
532 		return FALSE;
533 	}
534 
535 	res = ns_reservation_tree_find(&gns->ns_reservations, port);
536 	if (res != NULL && res != curr_res) {
537 		if (!res->nsr_reuseport) {
538 			return TRUE;
539 		}
540 	}
541 
542 	return FALSE;
543 }
544 
545 /*
546  * Internal shared code to reserve ports within a specific namespace.
547  *
548  * Note: port numbers are in host byte-order here.
549  */
550 static int
_netns_reserve_common(struct ns * namespace,in_port_t port,uint32_t flags)551 _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags)
552 {
553 	struct ns_reservation *res = NULL, *exist = NULL;
554 	uint8_t proto, addr_len;
555 	int err = 0;
556 #if SK_LOG
557 	char tmp_ip_str[MAX_IPv6_STR_LEN];
558 #endif /* SK_LOG */
559 
560 	VERIFY(port != 0);
561 	proto = namespace->ns_proto;
562 	addr_len = namespace->ns_addr_len;
563 	NETNS_LOCK_CONVERT();
564 	res = netns_ns_reservation_alloc(port, flags);
565 	if (res == NULL) {
566 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
567 		    "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY",
568 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
569 		    namespace->ns_addr, tmp_ip_str,
570 		    sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags);
571 		return ENOMEM;
572 	}
573 	exist = RB_INSERT(ns_reservation_tree, &namespace->ns_reservations,
574 	    res);
575 	if (__probable(exist == NULL)) {
576 		namespace->ns_n_reservations++;
577 	} else {
578 		netns_ns_reservation_free(res);
579 		res = exist;
580 	}
581 
582 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
583 	    "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, "
584 	    "%d bsd %d pf", inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
585 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
586 	    PROTO_STR(proto), port, flags,
587 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
588 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
589 	    NETNS_REF_COUNT(res, NETNS_BSD),
590 	    NETNS_REF_COUNT(res, NETNS_PF));
591 
592 	/* Make reservation */
593 	/*
594 	 * Bypass collision detection for reservations in the global non-wild
595 	 * namespace. We use that namespace for reference counts only.
596 	 */
597 	if (namespace !=
598 	    netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)]) {
599 		struct ns_reservation *skres;
600 		boolean_t is_wild = _netns_is_wildcard_addr(namespace->ns_addr,
601 		    addr_len);
602 		struct ns *gns =
603 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)];
604 
605 		if (NETNS_IS_SKYWALK(flags)) {
606 			if ((!is_wild || exist != NULL) && gns != NULL &&
607 			    (skres = ns_reservation_tree_find(
608 				    &gns->ns_reservations, port)) != NULL &&
609 			    NETNS_REF_COUNT(skres, NETNS_LISTENER) == 0) {
610 				/*
611 				 * The mere existence of any non-skywalk
612 				 * listener wildcard entry for this
613 				 * protocol/port number means this must fail.
614 				 */
615 				SK_ERR("ADDRINUSE: Duplicate wildcard");
616 				err = EADDRINUSE;
617 				goto done;
618 			}
619 
620 			if (is_wild) {
621 				gns = netns_global_non_wild[
622 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
623 				VERIFY(gns != NULL);
624 
625 				if (_netns_is_port_used(netns_global_non_wild[
626 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
627 				    _netns_is_port_used(netns_global_non_wild[
628 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
629 					/*
630 					 * If Skywalk is trying to reserve a
631 					 * wildcard, then the mere existance of
632 					 * any entry in either v4/v6 non-wild
633 					 * namespace for this port means this
634 					 * must fail.
635 					 */
636 					SK_ERR("ADDRINUSE: Wildcard with non-wild.");
637 					err = EADDRINUSE;
638 					goto done;
639 				}
640 			}
641 		} else {
642 			/*
643 			 * Check if Skywalk has reserved a wildcard entry.
644 			 * Note that the arithmetic OR here is intentional.
645 			 */
646 			if ((!is_wild || exist != NULL) && gns != NULL &&
647 			    (skres = ns_reservation_tree_find(
648 				    &gns->ns_reservations, port)) != NULL &&
649 			    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
650 			    NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) {
651 				/*
652 				 * BSD is trying to reserve a proto/port for
653 				 * which Skywalk already has a wildcard
654 				 * reservation.
655 				 */
656 				SK_ERR("ADDRINUSE: BSD requesting Skywalk port");
657 				err = EADDRINUSE;
658 				goto done;
659 			}
660 
661 			/*
662 			 * If BSD is trying to reserve a wildcard,
663 			 * ensure Skywalk has not already reserved
664 			 * a non-wildcard.
665 			 */
666 			if (is_wild) {
667 				gns = netns_global_non_wild[
668 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
669 				VERIFY(gns != NULL);
670 
671 				/*
672 				 * Note that the arithmetic OR here is
673 				 * intentional.
674 				 */
675 				if ((skres = ns_reservation_tree_find(
676 					    &gns->ns_reservations, port)) != NULL &&
677 				    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
678 				    NETNS_REF_COUNT(skres,
679 				    NETNS_LISTENER)) != 0) {
680 					SK_ERR("ADDRINUSE: BSD wildcard with non-wild.");
681 					err = EADDRINUSE;
682 					goto done;
683 				}
684 			}
685 		}
686 
687 		switch (flags & NETNS_OWNER_MASK) {
688 		case NETNS_SKYWALK:
689 			/* check collision w/ BSD */
690 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
691 			    NETNS_REF_COUNT(res, NETNS_PF) > 0) {
692 				SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)");
693 				err = EADDRINUSE;
694 				goto done;
695 			}
696 
697 			/* BEGIN CSTYLED */
698 			/*
699 			 * Scenarios with new Skywalk connected flow:
700 			 * 1. With existing Skywalk connected flow,
701 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
702 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 1
703 			 *    reject by failing the wild gns lookup below.
704 			 * 2. With existing Skywalk 3-tuple listener,
705 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 1
706 			 *    bypass the check below.
707 			 * 3. With existing Skywalk 2-tuple listener,
708 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
709 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 0
710 			 *    pass with successful wild gns lookup.
711 			 */
712 			/* END CSTYLED */
713 			if (NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
714 			    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0) {
715 				/* check if covered by wild Skywalk listener */
716 				gns = netns_global_wild[
717 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
718 				if (gns != NULL &&
719 				    (skres = ns_reservation_tree_find(
720 					    &gns->ns_reservations, port)) != NULL &&
721 				    NETNS_REF_COUNT(skres, NETNS_LISTENER)
722 				    != 0) {
723 					err = 0;
724 					goto done;
725 				}
726 				if (addr_len == sizeof(struct in_addr)) {
727 					/* If address is IPv4, also check for wild IPv6 registration */
728 					gns = netns_global_wild[
729 						NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)];
730 					if (gns != NULL &&
731 					    (skres = ns_reservation_tree_find(
732 						    &gns->ns_reservations, port)) != NULL &&
733 					    NETNS_REF_COUNT(skres, NETNS_LISTENER)
734 					    != 0) {
735 						err = 0;
736 						goto done;
737 					}
738 				}
739 				SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)");
740 				err = EADDRINUSE;
741 			}
742 			/*
743 			 * XXX: Duplicate 5-tuple flows under a Skywalk
744 			 * listener are currently detected by flow manager,
745 			 * till we implement 5-tuple-aware netns.
746 			 */
747 			break;
748 
749 		case NETNS_LISTENER:
750 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
751 			    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
752 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
753 			    _netns_is_port_used(netns_global_wild[
754 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
755 			    _netns_is_port_used(netns_global_wild[
756 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port) ||
757 			    _netns_is_port_used(netns_global_non_wild[
758 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
759 			    _netns_is_port_used(netns_global_non_wild[
760 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
761 				SK_ERR("ERROR - Listener got ADDRINUSE");
762 				err = EADDRINUSE;
763 			}
764 			break;
765 
766 		case NETNS_BSD:
767 		case NETNS_PF:
768 			if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 ||
769 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) {
770 				SK_ERR("ERROR - %s got ADDRINUSE",
771 				    ((flags & NETNS_OWNER_MASK) == NETNS_PF) ?
772 				    "PF" : "BSD");
773 				err = EADDRINUSE;
774 			}
775 			break;
776 
777 		default:
778 			panic("_netns_reserve_common: invalid owner 0x%x",
779 			    flags & NETNS_OWNER_MASK);
780 			/* NOTREACHED */
781 			__builtin_unreachable();
782 		}
783 	}
784 
785 done:
786 	ASSERT(res != NULL);
787 	if (__probable(err == 0)) {
788 		NETNS_REF_COUNT(res, flags)++;
789 		/* Check for wrap around */
790 		VERIFY(NETNS_REF_COUNT(res, flags) != 0);
791 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
792 		    NS_VERB_PROTO(namespace->ns_proto),
793 		    "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, "
794 		    "%d ls, %d bsd %d pf",
795 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
796 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
797 		    PROTO_STR(namespace->ns_proto), port, err, flags,
798 		    NETNS_REF_COUNT(res, NETNS_SKYWALK),
799 		    NETNS_REF_COUNT(res, NETNS_LISTENER),
800 		    NETNS_REF_COUNT(res, NETNS_BSD),
801 		    NETNS_REF_COUNT(res, NETNS_PF));
802 	} else {
803 		if (exist == NULL) {
804 			RB_REMOVE(ns_reservation_tree,
805 			    &namespace->ns_reservations, res);
806 			namespace->ns_n_reservations--;
807 			netns_ns_reservation_free(res);
808 		}
809 	}
810 	return err;
811 }
812 
813 /*
814  * Internal shared code to release ports within a specific namespace.
815  */
816 static void
_netns_release_common(struct ns * namespace,in_port_t port,uint32_t flags)817 _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags)
818 {
819 	struct ns_reservation *res;
820 	uint32_t refs;
821 	int i;
822 #if SK_LOG
823 	char tmp_ip_str[MAX_IPv6_STR_LEN];
824 #endif /* SK_LOG */
825 
826 	NETNS_LOCK_ASSERT_HELD();
827 
828 	res = ns_reservation_tree_find(&namespace->ns_reservations, port);
829 	if (res == NULL) {
830 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
831 		    NS_VERB_PROTO(namespace->ns_proto),
832 		    "ERROR %s:%s:%d // flags 0x%x // not found",
833 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
834 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
835 		    PROTO_STR(namespace->ns_proto), port, flags);
836 		VERIFY(res != NULL);
837 	}
838 
839 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
840 	    NS_VERB_PROTO(namespace->ns_proto),
841 	    "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf",
842 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
843 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
844 	    PROTO_STR(namespace->ns_proto), port, flags,
845 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
846 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
847 	    NETNS_REF_COUNT(res, NETNS_BSD),
848 	    NETNS_REF_COUNT(res, NETNS_PF));
849 
850 	/* Release reservation */
851 	VERIFY(NETNS_REF_COUNT(res, flags) > 0);
852 	NETNS_REF_COUNT(res, flags) -= 1;
853 
854 	/* Clean up memory, if appropriate */
855 	for (i = 0, refs = 0; i <= NETNS_OWNER_MAX && refs == 0; i++) {
856 		refs |= res->nsr_refs[i];
857 	}
858 	if (refs == 0) {
859 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
860 		    res);
861 		namespace->ns_n_reservations--;
862 		NETNS_LOCK_CONVERT();
863 		netns_ns_reservation_free(res);
864 		netns_ns_cleanup(namespace);
865 	}
866 }
867 
868 __attribute__((always_inline))
869 static inline void
netns_init_global_ns(struct ns ** global_ptr,uint8_t proto,uint8_t addrlen)870 netns_init_global_ns(struct ns **global_ptr, uint8_t proto, uint8_t addrlen)
871 {
872 	struct ns *namespace;
873 
874 	namespace = *global_ptr = netns_ns_alloc(Z_WAITOK);
875 	memset(namespace->ns_addr, 0xFF, addrlen);
876 	namespace->ns_addr_len = addrlen;
877 	namespace->ns_proto = proto;
878 	namespace->ns_is_freeable = 0;
879 }
880 
881 __attribute__((always_inline))
882 static inline void
netns_clear_ifnet(struct ns_token * nstoken)883 netns_clear_ifnet(struct ns_token *nstoken)
884 {
885 #if SK_LOG
886 	char tmp_ip_str[MAX_IPv6_STR_LEN];
887 #endif /* SK_LOG */
888 
889 	NETNS_LOCK_ASSERT_HELD();
890 
891 	if (nstoken->nt_ifp != NULL) {
892 		SLIST_REMOVE(&nstoken->nt_ifp->if_netns_tokens, nstoken,
893 		    ns_token, nt_ifp_link);
894 
895 		SK_DF(NS_VERB_IP(nstoken->nt_addr_len) |
896 		    NS_VERB_PROTO(nstoken->nt_proto),
897 		    "%s:%s:%d // removed from ifnet %d",
898 		    inet_ntop(LEN_TO_AF(nstoken->nt_addr_len),
899 		    nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
900 		    PROTO_STR(nstoken->nt_proto), nstoken->nt_port,
901 		    nstoken->nt_ifp->if_index);
902 
903 		NETNS_LOCK_CONVERT();
904 		ifnet_decr_iorefcnt(nstoken->nt_ifp);
905 		nstoken->nt_ifp = NULL;
906 	} else {
907 		SLIST_REMOVE(&netns_unbound_tokens, nstoken, ns_token,
908 		    nt_ifp_link);
909 	}
910 }
911 
912 /*
913  * Internal shared code to perform a port[-range] reservation, along with all
914  * the boilerplate and sanity checks expected for a call coming in from the
915  * surrounding kernel code.
916  */
917 static int
_netns_reserve_kpi_common(struct ns * ns,netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)918 _netns_reserve_kpi_common(struct ns *ns, netns_token *token, uint32_t *addr,
919     uint8_t addr_len, uint8_t proto, in_port_t *port, uint32_t flags,
920     struct ns_flow_info *nfi)
921 {
922 	boolean_t ns_want_cleanup = (ns == NULL);
923 	struct ns_token *nt;
924 	int err = 0;
925 	in_port_t hport;
926 #if SK_LOG
927 	char tmp_ip_str[MAX_IPv6_STR_LEN];
928 #endif /* SK_LOG */
929 	struct ifnet *ifp = (nfi != NULL) ? nfi->nfi_ifp : NULL;
930 
931 	NETNS_LOCK_ASSERT_HELD();
932 
933 	hport = ntohs(*port);
934 
935 	VERIFY((flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
936 	VERIFY(addr_len == sizeof(struct in_addr) ||
937 	    addr_len == sizeof(struct in6_addr));
938 	VERIFY(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
939 	VERIFY(hport != 0);
940 
941 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
942 	    "reserving %s:%s:%d // flags 0x%x // token %svalid",
943 	    inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str,
944 	    sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags,
945 	    NETNS_TOKEN_VALID(token) ? "" : "in");
946 
947 	/*
948 	 * See the documentation for NETNS_PRERESERVED in netns.h for an
949 	 * explanation of this block.
950 	 */
951 	if (NETNS_TOKEN_VALID(token)) {
952 		if (flags & NETNS_PRERESERVED) {
953 			nt = *token;
954 			VERIFY(nt->nt_addr_len == addr_len);
955 			VERIFY(memcmp(nt->nt_addr, addr, addr_len) == 0);
956 			VERIFY(nt->nt_proto == proto);
957 			VERIFY(nt->nt_port == hport);
958 			VERIFY((nt->nt_flags &
959 			    NETNS_RESERVATION_FLAGS | NETNS_PRERESERVED) ==
960 			    (flags & NETNS_RESERVATION_FLAGS));
961 
962 			if ((nt->nt_flags & NETNS_CONFIGURATION_FLAGS) ==
963 			    (flags & NETNS_CONFIGURATION_FLAGS)) {
964 				SK_DF(NS_VERB_IP(nt->nt_addr_len) |
965 				    NS_VERB_PROTO(nt->nt_proto),
966 				    "%s:%s:%d // flags 0x%x -> 0x%x",
967 				    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
968 				    nt->nt_addr, tmp_ip_str,
969 				    sizeof(tmp_ip_str)),
970 				    PROTO_STR(nt->nt_proto),
971 				    nt->nt_port, nt->nt_flags, flags);
972 				nt->nt_flags &= ~NETNS_CONFIGURATION_FLAGS;
973 				nt->nt_flags |=
974 				    flags & NETNS_CONFIGURATION_FLAGS;
975 			}
976 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
977 			    "token was prereserved");
978 			goto done;
979 		} else {
980 			panic("Request to overwrite valid netns token");
981 			/* NOTREACHED */
982 			__builtin_unreachable();
983 		}
984 	}
985 
986 	/*
987 	 * TODO: Check range against bitmap
988 	 */
989 	if (hport == 0) {
990 		/*
991 		 * Caller request an arbitrary range of ports
992 		 * TODO: Need to figure out how to allocate
993 		 * emphemeral ports only.
994 		 */
995 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
996 		    "ERROR - wildcard port not yet supported");
997 		err = ENOMEM;
998 		goto done;
999 	}
1000 
1001 	/*
1002 	 * Fetch namespace for the specified address/protocol, creating
1003 	 * a new namespace if necessary.
1004 	 */
1005 	if (ns == NULL) {
1006 		ASSERT(ns_want_cleanup);
1007 		ns = _netns_get_ns(addr, addr_len, proto, true);
1008 	}
1009 	if (__improbable(ns == NULL)) {
1010 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1011 		    "ERROR - couldn't create namespace");
1012 		err = ENOMEM;
1013 		goto done;
1014 	}
1015 
1016 	/*
1017 	 * Make a reservation in the namespace
1018 	 * This will return an error if an incompatible reservation
1019 	 * already exists.
1020 	 */
1021 	err = _netns_reserve_common(ns, hport, flags);
1022 	if (__improbable(err != 0)) {
1023 		NETNS_LOCK_CONVERT();
1024 		if (ns_want_cleanup) {
1025 			netns_ns_cleanup(ns);
1026 		}
1027 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1028 		    "ERROR - reservation collision");
1029 		goto done;
1030 	}
1031 
1032 	if (!_netns_is_wildcard_addr(ns->ns_addr, addr_len)) {
1033 		/* Record the reservation in the non-wild namespace */
1034 		struct ns *nwns;
1035 
1036 		nwns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1037 		    addr_len)];
1038 		err = _netns_reserve_common(nwns, hport, flags);
1039 		if (__improbable(err != 0)) {
1040 			/* Need to free the specific namespace entry */
1041 			NETNS_LOCK_CONVERT();
1042 			_netns_release_common(ns, hport, flags);
1043 			if (ns_want_cleanup) {
1044 				netns_ns_cleanup(ns);
1045 			}
1046 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1047 			    "ERROR - reservation collision");
1048 			goto done;
1049 		}
1050 	}
1051 
1052 	nt = netns_ns_token_alloc(nfi != NULL ? true : false);
1053 	ASSERT(nt->nt_ifp == NULL);
1054 	_netns_set_ifnet_internal(nt, ifp);
1055 
1056 	memcpy(nt->nt_addr, addr, addr_len);
1057 	nt->nt_addr_len = addr_len;
1058 	nt->nt_proto = proto;
1059 	nt->nt_port = hport;
1060 	nt->nt_flags = flags;
1061 
1062 	if (nfi != NULL) {
1063 		VERIFY(nt->nt_flow_info != NULL);
1064 
1065 		memcpy(nt->nt_flow_info, nfi, sizeof(struct ns_flow_info));
1066 		/*
1067 		 * The local port is passed as a separate argument
1068 		 */
1069 		if (nfi->nfi_laddr.sa.sa_family == AF_INET) {
1070 			nt->nt_flow_info->nfi_laddr.sin.sin_port = *port;
1071 		} else if (nfi->nfi_laddr.sa.sa_family == AF_INET6) {
1072 			nt->nt_flow_info->nfi_laddr.sin6.sin6_port = *port;
1073 		}
1074 	}
1075 	*token = nt;
1076 
1077 done:
1078 	return err;
1079 }
1080 
1081 /*
1082  * Kernel-facing functions
1083  */
1084 
1085 int
netns_init(void)1086 netns_init(void)
1087 {
1088 	VERIFY(__netns_inited == 0);
1089 
1090 	netns_ns_reservation_size = sizeof(struct ns_reservation);
1091 	netns_ns_reservation_cache = skmem_cache_create(NETNS_NS_RESERVATION_ZONE_NAME,
1092 	    netns_ns_reservation_size, sizeof(uint64_t), NULL, NULL, NULL,
1093 	    NULL, NULL, 0);
1094 	if (netns_ns_reservation_cache == NULL) {
1095 		panic("%s: skmem_cache create failed (%s)", __func__,
1096 		    NETNS_NS_RESERVATION_ZONE_NAME);
1097 		/* NOTREACHED */
1098 		__builtin_unreachable();
1099 	}
1100 
1101 	netns_ns_token_size = sizeof(struct ns_token);
1102 	netns_ns_token_cache = skmem_cache_create(NETNS_NS_TOKEN_ZONE_NAME,
1103 	    netns_ns_token_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
1104 	    NULL, 0);
1105 	if (netns_ns_token_cache == NULL) {
1106 		panic("%s: skmem_cache create failed (%s)", __func__,
1107 		    NETNS_NS_TOKEN_ZONE_NAME);
1108 		/* NOTREACHED */
1109 		__builtin_unreachable();
1110 	}
1111 
1112 	netns_ns_flow_info_size = sizeof(struct ns_flow_info);
1113 	netns_ns_flow_info_cache = skmem_cache_create(NETNS_NS_FLOW_INFO_ZONE_NAME,
1114 	    netns_ns_flow_info_size, sizeof(uint64_t), NULL, NULL, NULL,
1115 	    NULL, NULL, 0);
1116 	if (netns_ns_flow_info_cache == NULL) {
1117 		panic("%s: skmem_cache create failed (%s)", __func__,
1118 		    NETNS_NS_FLOW_INFO_ZONE_NAME);
1119 		/* NOTREACHED */
1120 		__builtin_unreachable();
1121 	}
1122 
1123 	SLIST_INIT(&netns_unbound_tokens);
1124 	SLIST_INIT(&netns_all_tokens);
1125 
1126 	netns_n_namespaces = 0;
1127 	RB_INIT(&netns_namespaces);
1128 
1129 	SK_D("initializing global namespaces");
1130 
1131 	netns_init_global_ns(
1132 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1133 		NETNS_ADDRLEN_V4)], IPPROTO_TCP, sizeof(struct in_addr));
1134 
1135 	netns_init_global_ns(
1136 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1137 		NETNS_ADDRLEN_V4)], IPPROTO_UDP, sizeof(struct in_addr));
1138 
1139 	netns_init_global_ns(
1140 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1141 		NETNS_ADDRLEN_V6)], IPPROTO_TCP, sizeof(struct in6_addr));
1142 
1143 	netns_init_global_ns(
1144 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1145 		NETNS_ADDRLEN_V6)], IPPROTO_UDP, sizeof(struct in6_addr));
1146 
1147 	/* Done */
1148 
1149 	__netns_inited = 1;
1150 	sk_features |= SK_FEATURE_NETNS;
1151 
1152 	SK_D("initialized netns");
1153 
1154 	return 0;
1155 }
1156 
1157 void
netns_uninit(void)1158 netns_uninit(void)
1159 {
1160 	if (__netns_inited == 1) {
1161 		struct ns *namespace;
1162 		struct ns *temp_namespace;
1163 		int i;
1164 
1165 		RB_FOREACH_SAFE(namespace, netns_namespaces_tree,
1166 		    &netns_namespaces, temp_namespace) {
1167 			RB_REMOVE(netns_namespaces_tree, &netns_namespaces,
1168 			    namespace);
1169 			netns_n_namespaces--;
1170 			netns_ns_free(namespace);
1171 		}
1172 
1173 		for (i = 0; i < NETNS_N_GLOBAL; i++) {
1174 			netns_ns_free(netns_global_non_wild[i]);
1175 		}
1176 
1177 		if (netns_ns_flow_info_cache != NULL) {
1178 			skmem_cache_destroy(netns_ns_flow_info_cache);
1179 			netns_ns_flow_info_cache = NULL;
1180 		}
1181 		if (netns_ns_token_cache != NULL) {
1182 			skmem_cache_destroy(netns_ns_token_cache);
1183 			netns_ns_token_cache = NULL;
1184 		}
1185 		if (netns_ns_reservation_cache != NULL) {
1186 			skmem_cache_destroy(netns_ns_reservation_cache);
1187 			netns_ns_reservation_cache = NULL;
1188 		}
1189 
1190 		__netns_inited = 0;
1191 		sk_features &= ~SK_FEATURE_NETNS;
1192 
1193 		SK_D("uninitialized netns");
1194 	}
1195 }
1196 
1197 void
netns_reap_caches(boolean_t purge)1198 netns_reap_caches(boolean_t purge)
1199 {
1200 	/* these aren't created unless netns is enabled */
1201 	if (netns_ns_token_cache != NULL) {
1202 		skmem_cache_reap_now(netns_ns_token_cache, purge);
1203 	}
1204 	if (netns_ns_reservation_cache != NULL) {
1205 		skmem_cache_reap_now(netns_ns_reservation_cache, purge);
1206 	}
1207 	if (netns_ns_flow_info_cache != NULL) {
1208 		skmem_cache_reap_now(netns_ns_flow_info_cache, purge);
1209 	}
1210 }
1211 
1212 boolean_t
netns_is_enabled(void)1213 netns_is_enabled(void)
1214 {
1215 	return __netns_inited == 1;
1216 }
1217 
1218 int
netns_reserve(netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t port,uint32_t flags,struct ns_flow_info * nfi)1219 netns_reserve(netns_token *token, uint32_t *addr, uint8_t addr_len,
1220     uint8_t proto, in_port_t port, uint32_t flags, struct ns_flow_info *nfi)
1221 {
1222 	int err = 0;
1223 #if SK_LOG
1224 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1225 #endif /* SK_LOG */
1226 
1227 	if (__netns_inited == 0) {
1228 		*token = NULL;
1229 		return err;
1230 	}
1231 
1232 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1233 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1234 		return ENOTSUP;
1235 	}
1236 
1237 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1238 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1239 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1240 	    flags);
1241 
1242 	/*
1243 	 * Check wether the process is allowed to bind to a restricted port
1244 	 */
1245 	if (!current_task_can_use_restricted_in_port(port,
1246 	    proto, flags)) {
1247 		*token = NULL;
1248 		return EADDRINUSE;
1249 	}
1250 
1251 	NETNS_LOCK_SPIN();
1252 	err = _netns_reserve_kpi_common(NULL, token, addr, addr_len,
1253 	    proto, &port, flags, nfi);
1254 	NETNS_UNLOCK();
1255 
1256 	return err;
1257 }
1258 
1259 /* Import net.inet.{tcp,udp}.randomize_ports sysctls */
1260 extern int      udp_use_randomport;
1261 extern int      tcp_use_randomport;
1262 
1263 int
netns_reserve_ephemeral(netns_token * token,uint32_t * addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)1264 netns_reserve_ephemeral(netns_token *token, uint32_t *addr, uint8_t addr_len,
1265     uint8_t proto, in_port_t *port, uint32_t flags, struct ns_flow_info *nfi)
1266 {
1267 	int err = 0;
1268 	in_port_t first = (in_port_t)ipport_firstauto;
1269 	in_port_t last  = (in_port_t)ipport_lastauto;
1270 	in_port_t rand_port;
1271 	in_port_t last_port;
1272 	in_port_t n_last_port;
1273 	struct ns *namespace;
1274 	boolean_t count_up = true;
1275 	boolean_t use_randomport = (proto == IPPROTO_TCP) ?
1276 	    tcp_use_randomport : udp_use_randomport;
1277 #if SK_LOG
1278 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1279 #endif /* SK_LOG */
1280 
1281 	if (__netns_inited == 0) {
1282 		*token = NULL;
1283 		return err;
1284 	}
1285 
1286 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1287 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1288 		return ENOTSUP;
1289 	}
1290 
1291 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1292 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1293 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(*port),
1294 	    flags);
1295 
1296 	NETNS_LOCK_SPIN();
1297 
1298 	namespace = _netns_get_ns(addr, addr_len, proto, true);
1299 	if (namespace == NULL) {
1300 		err = ENOMEM;
1301 		NETNS_UNLOCK();
1302 		return err;
1303 	}
1304 
1305 	if (proto == IPPROTO_UDP) {
1306 		if (UINT16_MAX - namespace->ns_n_reservations <
1307 		    NETNS_NS_UDP_EPHEMERAL_RESERVE) {
1308 			SK_ERR("UDP ephemeral port not available"
1309 			    "(less than 4096 UDP ports left)");
1310 			err = EADDRNOTAVAIL;
1311 			NETNS_UNLOCK();
1312 			return err;
1313 		}
1314 	}
1315 
1316 	if (first == last) {
1317 		rand_port = first;
1318 	} else {
1319 		if (use_randomport) {
1320 			NETNS_LOCK_CONVERT();
1321 			read_frandom(&rand_port, sizeof(rand_port));
1322 
1323 			if (first > last) {
1324 				rand_port = last + (rand_port %
1325 				    (first - last));
1326 				count_up = false;
1327 			} else {
1328 				rand_port = first + (rand_port %
1329 				    (last - first));
1330 			}
1331 		} else {
1332 			if (first > last) {
1333 				rand_port =
1334 				    namespace->ns_last_ephemeral_port_down - 1;
1335 				if (rand_port < last || rand_port > first) {
1336 					rand_port = last;
1337 				}
1338 				count_up = false;
1339 			} else {
1340 				rand_port =
1341 				    namespace->ns_last_ephemeral_port_up + 1;
1342 				if (rand_port < first || rand_port > last) {
1343 					rand_port = first;
1344 				}
1345 			}
1346 		}
1347 	}
1348 	last_port = rand_port;
1349 	n_last_port = htons(last_port);
1350 
1351 	while (true) {
1352 		if (n_last_port == 0) {
1353 			SK_ERR("ephemeral port search range includes 0");
1354 			err = EINVAL;
1355 			break;
1356 		}
1357 
1358 		/*
1359 		 * Skip if this is a restricted port as we do not want to
1360 		 * restricted ports as ephemeral
1361 		 */
1362 		if (!IS_RESTRICTED_IN_PORT(n_last_port)) {
1363 			err = _netns_reserve_kpi_common(namespace, token, addr,
1364 			    addr_len, proto, &n_last_port, flags, nfi);
1365 			if (err == 0 || err != EADDRINUSE) {
1366 				break;
1367 			}
1368 		}
1369 		if (count_up) {
1370 			last_port++;
1371 			if (last_port < first || last_port > last) {
1372 				last_port = first;
1373 			}
1374 		} else {
1375 			last_port--;
1376 			if (last_port < last || last_port > first) {
1377 				last_port = last;
1378 			}
1379 		}
1380 		n_last_port = htons(last_port);
1381 
1382 		if (last_port == rand_port || first == last) {
1383 			SK_ERR("couldn't find free ephemeral port");
1384 			err = EADDRNOTAVAIL;
1385 			break;
1386 		}
1387 	}
1388 
1389 	if (err == 0) {
1390 		*port = n_last_port;
1391 		if (count_up) {
1392 			namespace->ns_last_ephemeral_port_up = last_port;
1393 		} else {
1394 			namespace->ns_last_ephemeral_port_down = last_port;
1395 		}
1396 	} else {
1397 		netns_ns_cleanup(namespace);
1398 	}
1399 
1400 	NETNS_UNLOCK();
1401 
1402 	return err;
1403 }
1404 
1405 void
netns_release(netns_token * token)1406 netns_release(netns_token *token)
1407 {
1408 	struct ns *ns;
1409 	struct ns_token *nt;
1410 	uint8_t proto, addr_len;
1411 #if SK_LOG
1412 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1413 #endif /* SK_LOG */
1414 
1415 	if (!NETNS_TOKEN_VALID(token)) {
1416 		return;
1417 	}
1418 
1419 	if (__netns_inited == 0) {
1420 		*token = NULL;
1421 		return;
1422 	}
1423 
1424 	NETNS_LOCK_SPIN();
1425 
1426 	nt = *token;
1427 	*token = NULL;
1428 
1429 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
1430 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1431 	    nt->nt_addr_len == sizeof(struct in6_addr));
1432 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1433 
1434 	addr_len = nt->nt_addr_len;
1435 	proto = nt->nt_proto;
1436 
1437 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1438 	    "releasing %s:%s:%d",
1439 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1440 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto),
1441 	    nt->nt_port);
1442 
1443 	if (!_netns_is_wildcard_addr(nt->nt_addr, addr_len)) {
1444 		/* Remove from global non-wild namespace */
1445 
1446 		ns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1447 		    addr_len)];
1448 		VERIFY(ns != NULL);
1449 
1450 		_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1451 	}
1452 
1453 	ns = _netns_get_ns(nt->nt_addr, addr_len, proto, false);
1454 	VERIFY(ns != NULL);
1455 	_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1456 
1457 	netns_clear_ifnet(nt);
1458 	netns_ns_token_free(nt);
1459 
1460 	NETNS_UNLOCK();
1461 }
1462 
1463 int
netns_change_addr(netns_token * token,uint32_t * addr,uint8_t addr_len)1464 netns_change_addr(netns_token *token, uint32_t *addr, uint8_t addr_len)
1465 {
1466 	int err = 0;
1467 	struct ns *old_namespace;
1468 	struct ns *new_namespace;
1469 	struct ns *global_namespace;
1470 	struct ns_token *nt;
1471 	uint8_t proto;
1472 #if SK_LOG
1473 	char tmp_ip_str_1[MAX_IPv6_STR_LEN];
1474 	char tmp_ip_str_2[MAX_IPv6_STR_LEN];
1475 #endif /* SK_LOG */
1476 
1477 	if (__netns_inited == 0) {
1478 		return 0;
1479 	}
1480 
1481 	NETNS_LOCK();
1482 
1483 	VERIFY(NETNS_TOKEN_VALID(token));
1484 
1485 	nt = *token;
1486 
1487 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) == NETNS_BSD);
1488 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1489 	    nt->nt_addr_len == sizeof(struct in6_addr));
1490 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1491 
1492 	proto = nt->nt_proto;
1493 
1494 #if SK_LOG
1495 	inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1496 	    tmp_ip_str_1, sizeof(tmp_ip_str_1));
1497 	inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2,
1498 	    sizeof(tmp_ip_str_2));
1499 #endif /* SK_LOG */
1500 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1501 	    "changing address for %s:%d from %s to %s",
1502 	    PROTO_STR(proto), nt->nt_port, tmp_ip_str_1,
1503 	    tmp_ip_str_2);
1504 
1505 	if (nt->nt_addr_len == addr_len &&
1506 	    memcmp(nt->nt_addr, addr, nt->nt_addr_len) == 0) {
1507 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1508 		    "address didn't change, exiting early");
1509 		goto done;
1510 	}
1511 
1512 	old_namespace = _netns_get_ns(nt->nt_addr, nt->nt_addr_len, proto,
1513 	    false);
1514 	VERIFY(old_namespace != NULL);
1515 
1516 	new_namespace = _netns_get_ns(addr, addr_len, proto, true);
1517 	if (new_namespace == NULL) {
1518 		err = ENOMEM;
1519 		goto done;
1520 	}
1521 
1522 	/* Acquire reservation in new namespace */
1523 	if ((err = _netns_reserve_common(new_namespace, nt->nt_port,
1524 	    nt->nt_flags))) {
1525 		NETNS_LOCK_CONVERT();
1526 		netns_ns_cleanup(new_namespace);
1527 		SK_ERR("ERROR - reservation collision under new namespace");
1528 		goto done;
1529 	}
1530 
1531 	/* Release from old namespace */
1532 	_netns_release_common(old_namespace, nt->nt_port, nt->nt_flags);
1533 
1534 	if (!_netns_is_wildcard_addr(nt->nt_addr, nt->nt_addr_len)) {
1535 		/*
1536 		 * Old address is non-wildcard.
1537 		 * Remove old reservation from global non-wild namespace
1538 		 */
1539 		global_namespace = netns_global_non_wild[
1540 			NETNS_NS_GLOBAL_IDX(proto, nt->nt_addr_len)];
1541 		VERIFY(global_namespace != NULL);
1542 
1543 		_netns_release_common(global_namespace, nt->nt_port,
1544 		    nt->nt_flags);
1545 	}
1546 
1547 	if (!_netns_is_wildcard_addr(addr, addr_len)) {
1548 		/*
1549 		 * New address is non-wildcard.
1550 		 * Record new reservation in global non-wild namespace
1551 		 */
1552 		global_namespace = netns_global_non_wild[
1553 			NETNS_NS_GLOBAL_IDX(proto, addr_len)];
1554 		VERIFY(global_namespace != NULL);
1555 
1556 		if ((err = _netns_reserve_common(global_namespace,
1557 		    nt->nt_port, nt->nt_flags)) != 0) {
1558 			SK_ERR("ERROR - reservation collision under new global namespace");
1559 			/* XXX: Should not fail. Maybe assert instead */
1560 			goto done;
1561 		}
1562 	}
1563 
1564 	memcpy(nt->nt_addr, addr, addr_len);
1565 	nt->nt_addr_len = addr_len;
1566 
1567 done:
1568 	NETNS_UNLOCK();
1569 	return err;
1570 }
1571 
1572 static void
_netns_set_ifnet_internal(struct ns_token * nt,struct ifnet * ifp)1573 _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp)
1574 {
1575 #if SK_LOG
1576 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1577 #endif /* SK_LOG */
1578 
1579 	NETNS_LOCK_ASSERT_HELD();
1580 
1581 	if (ifp != NULL && ifnet_is_attached(ifp, 1)) {
1582 		nt->nt_ifp = ifp;
1583 		SLIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link);
1584 
1585 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1586 		    "%s:%s:%d // added to ifnet %d",
1587 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1588 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1589 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1590 		    ifp->if_index);
1591 	} else {
1592 		SLIST_INSERT_HEAD(&netns_unbound_tokens, nt, nt_ifp_link);
1593 	}
1594 }
1595 
1596 void
netns_set_ifnet(netns_token * token,ifnet_t ifp)1597 netns_set_ifnet(netns_token *token, ifnet_t ifp)
1598 {
1599 	struct ns_token *nt;
1600 #if SK_LOG
1601 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1602 #endif /* SK_LOG */
1603 
1604 	if (__netns_inited == 0) {
1605 		return;
1606 	}
1607 
1608 	NETNS_LOCK();
1609 
1610 	VERIFY(NETNS_TOKEN_VALID(token));
1611 
1612 	nt = *token;
1613 
1614 	if (nt->nt_ifp == ifp) {
1615 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1616 		    "%s:%s:%d // ifnet already %d, exiting early",
1617 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1618 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1619 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1620 		    ifp ? ifp->if_index : -1);
1621 		NETNS_UNLOCK();
1622 		return;
1623 	}
1624 
1625 	netns_clear_ifnet(nt);
1626 
1627 	_netns_set_ifnet_internal(nt, ifp);
1628 
1629 	NETNS_UNLOCK();
1630 }
1631 
1632 void
netns_ifnet_detach(ifnet_t ifp)1633 netns_ifnet_detach(ifnet_t ifp)
1634 {
1635 	struct ns_token *token, *tmp_token;
1636 
1637 	if (__netns_inited == 0) {
1638 		return;
1639 	}
1640 
1641 	NETNS_LOCK();
1642 
1643 	SLIST_FOREACH_SAFE(token, &ifp->if_netns_tokens, nt_ifp_link,
1644 	    tmp_token) {
1645 		netns_clear_ifnet(token);
1646 		SLIST_INSERT_HEAD(&netns_unbound_tokens, token, nt_ifp_link);
1647 	}
1648 
1649 	NETNS_UNLOCK();
1650 }
1651 
1652 static void
_netns_set_state(netns_token * token,uint32_t state)1653 _netns_set_state(netns_token *token, uint32_t state)
1654 {
1655 	struct ns_token *nt;
1656 #if SK_LOG
1657 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1658 #endif /* SK_LOG */
1659 
1660 	if (__netns_inited == 0) {
1661 		return;
1662 	}
1663 
1664 	NETNS_LOCK();
1665 	VERIFY(NETNS_TOKEN_VALID(token));
1666 
1667 	nt = *token;
1668 	nt->nt_state |= state;
1669 
1670 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1671 	    "%s:%s:%d // state 0x%b",
1672 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1673 	    tmp_ip_str, sizeof(tmp_ip_str)),
1674 	    PROTO_STR(nt->nt_proto), nt->nt_port, state, NETNS_STATE_BITS);
1675 
1676 	NETNS_UNLOCK();
1677 }
1678 
1679 void
netns_half_close(netns_token * token)1680 netns_half_close(netns_token *token)
1681 {
1682 	_netns_set_state(token, NETNS_STATE_HALFCLOSED);
1683 }
1684 
1685 void
netns_withdraw(netns_token * token)1686 netns_withdraw(netns_token *token)
1687 {
1688 	_netns_set_state(token, NETNS_STATE_WITHDRAWN);
1689 }
1690 
1691 int
netns_get_flow_info(netns_token * token,struct ns_flow_info * nfi)1692 netns_get_flow_info(netns_token *token,
1693     struct ns_flow_info *nfi)
1694 {
1695 	if (__netns_inited == 0) {
1696 		return ENOTSUP;
1697 	}
1698 
1699 	NETNS_LOCK();
1700 	if (!NETNS_TOKEN_VALID(token) ||
1701 	    nfi == NULL) {
1702 		NETNS_UNLOCK();
1703 		return EINVAL;
1704 	}
1705 
1706 	struct ns_token *nt = *token;
1707 	if (nt->nt_flow_info == NULL) {
1708 		NETNS_UNLOCK();
1709 		return ENOENT;
1710 	}
1711 
1712 	memcpy(nfi, nt->nt_flow_info, sizeof(struct ns_flow_info));
1713 	NETNS_UNLOCK();
1714 
1715 	return 0;
1716 }
1717 
1718 void
netns_change_flags(netns_token * token,uint32_t set_flags,uint32_t clear_flags)1719 netns_change_flags(netns_token *token, uint32_t set_flags,
1720     uint32_t clear_flags)
1721 {
1722 	struct ns_token *nt;
1723 #if SK_LOG
1724 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1725 #endif /* SK_LOG */
1726 
1727 	if (__netns_inited == 0) {
1728 		return;
1729 	}
1730 
1731 	NETNS_LOCK();
1732 
1733 	VERIFY(NETNS_TOKEN_VALID(token));
1734 
1735 	nt = *token;
1736 
1737 	VERIFY(!((set_flags | clear_flags) & NETNS_RESERVATION_FLAGS));
1738 	/* TODO: verify set and clear flags don't overlap? */
1739 
1740 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1741 	    "%s:%s:%d // flags 0x%x -> 0x%x",
1742 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1743 	    tmp_ip_str, sizeof(tmp_ip_str)),
1744 	    PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags,
1745 	    nt->nt_flags | set_flags & ~clear_flags);
1746 
1747 	nt->nt_flags |= set_flags;
1748 	nt->nt_flags &= ~clear_flags;
1749 
1750 	NETNS_UNLOCK();
1751 }
1752 
1753 /*
1754  * Port offloading KPI
1755  */
1756 static inline void
netns_local_port_scan_flow_entry(struct flow_entry * fe,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1757 netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protocol,
1758     u_int32_t flags, u_int8_t *bitfield)
1759 {
1760 	struct ns_token *token;
1761 	boolean_t iswildcard = false;
1762 
1763 	if (fe == NULL) {
1764 		return;
1765 	}
1766 
1767 	if (fe->fe_flags & FLOWENTF_EXTRL_PORT) {
1768 		return;
1769 	}
1770 
1771 	token = fe->fe_port_reservation;
1772 	if (token == NULL) {
1773 		return;
1774 	}
1775 
1776 	/*
1777 	 * We are only interested in active flows over skywalk channels
1778 	 */
1779 	if ((token->nt_flags & NETNS_OWNER_MASK) != NETNS_SKYWALK) {
1780 		return;
1781 	}
1782 
1783 	if (token->nt_state & NETNS_STATE_WITHDRAWN) {
1784 		return;
1785 	}
1786 
1787 	if (!(flags & IFNET_GET_LOCAL_PORTS_ANYTCPSTATEOK) &&
1788 	    (flags & IFNET_GET_LOCAL_PORTS_ACTIVEONLY) &&
1789 	    (token->nt_state & NETNS_STATE_HALFCLOSED)) {
1790 		return;
1791 	}
1792 
1793 	VERIFY(token->nt_addr_len == sizeof(struct in_addr) ||
1794 	    token->nt_addr_len == sizeof(struct in6_addr));
1795 
1796 	if (token->nt_addr_len == sizeof(struct in_addr)) {
1797 		if (protocol == PF_INET6) {
1798 			return;
1799 		}
1800 
1801 		iswildcard = token->nt_inaddr.s_addr == INADDR_ANY;
1802 	} else if (token->nt_addr_len == sizeof(struct in6_addr)) {
1803 		if (protocol == PF_INET) {
1804 			return;
1805 		}
1806 
1807 		iswildcard = IN6_IS_ADDR_UNSPECIFIED(
1808 			&token->nt_in6addr);
1809 	}
1810 	if (!(flags & IFNET_GET_LOCAL_PORTS_WILDCARDOK) && iswildcard) {
1811 		return;
1812 	}
1813 
1814 	if ((flags & IFNET_GET_LOCAL_PORTS_TCPONLY) &&
1815 	    token->nt_proto == IPPROTO_UDP) {
1816 		return;
1817 	}
1818 	if ((flags & IFNET_GET_LOCAL_PORTS_UDPONLY) &&
1819 	    token->nt_proto == IPPROTO_TCP) {
1820 		return;
1821 	}
1822 
1823 	if (!(flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) &&
1824 	    (token->nt_flags & NETNS_NOWAKEFROMSLEEP)) {
1825 #if DEBUG || DEVELOPMENT
1826 		char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1827 		char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1828 		in_port_t lport;
1829 		in_port_t fport;
1830 		char pname[MAXCOMLEN + 1];
1831 		const struct ns_flow_info *nfi = token->nt_flow_info;
1832 
1833 		proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1834 
1835 		if (protocol == PF_INET) {
1836 			inet_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1837 			    lbuf, sizeof(lbuf));
1838 			inet_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1839 			    fbuf, sizeof(fbuf));
1840 			lport = nfi->nfi_laddr.sin.sin_port;
1841 			fport = nfi->nfi_faddr.sin.sin_port;
1842 		} else {
1843 			inet_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1844 			    lbuf, sizeof(lbuf));
1845 			inet_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1846 			    fbuf, sizeof(fbuf));
1847 			lport = nfi->nfi_laddr.sin6.sin6_port;
1848 			fport = nfi->nfi_faddr.sin6.sin6_port;
1849 		}
1850 
1851 		os_log(OS_LOG_DEFAULT,
1852 		    "netns_local_port_scan_flow_entry: no wake from sleep %s %s:%u %s:%u ifp %s proc %s:%d",
1853 		    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1854 		    lbuf, ntohs(lport), fbuf, ntohs(fport),
1855 		    token->nt_ifp->if_xname, pname, nfi->nfi_owner_pid);
1856 #endif /* DEBUG || DEVELOPMENT */
1857 		return;
1858 	}
1859 
1860 	if ((flags & IFNET_GET_LOCAL_PORTS_RECVANYIFONLY) &&
1861 	    !(token->nt_flags & NETNS_RECVANYIF)) {
1862 		return;
1863 	}
1864 
1865 	if ((flags & IFNET_GET_LOCAL_PORTS_EXTBGIDLEONLY) &&
1866 	    !(token->nt_flags & NETNS_EXTBGIDLE)) {
1867 		return;
1868 	}
1869 
1870 	if (token->nt_ifp != NULL && token->nt_flow_info != NULL) {
1871 		bitstr_set(bitfield, token->nt_port);
1872 		(void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index,
1873 		    token->nt_flow_info, token->nt_flags);
1874 	} else {
1875 		SK_ERR("%s: unknown owner port %u"
1876 		    " nt_flags 0x%x ifindex %u nt_flow_info %p\n",
1877 		    __func__, token->nt_port,
1878 		    token->nt_flags,
1879 		    token->nt_ifp != NULL ? token->nt_ifp->if_index : 0,
1880 		    token->nt_flow_info);
1881 	}
1882 }
1883 
1884 static void
netns_get_if_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1885 netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol,
1886     u_int32_t flags, u_int8_t *bitfield)
1887 {
1888 	struct nx_flowswitch *fsw = NULL;
1889 
1890 	if (ifp == NULL || ifp->if_na == NULL) {
1891 		return;
1892 	}
1893 	/* Ensure that the interface is attached and won't detach */
1894 	if (!ifnet_is_attached(ifp, 1)) {
1895 		return;
1896 	}
1897 	fsw = fsw_ifp_to_fsw(ifp);
1898 	if (fsw == NULL) {
1899 		goto done;
1900 	}
1901 	FSW_RLOCK(fsw);
1902 	NETNS_LOCK();
1903 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1904 		netns_local_port_scan_flow_entry(_fe, protocol, flags,
1905 		bitfield);
1906 	});
1907 	NETNS_UNLOCK();
1908 	FSW_UNLOCK(fsw);
1909 done:
1910 	ifnet_decr_iorefcnt(ifp);
1911 }
1912 
1913 errno_t
netns_get_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t * bitfield)1914 netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol,
1915     u_int32_t flags, u_int8_t *bitfield)
1916 {
1917 	if (__netns_inited == 0) {
1918 		return 0;
1919 	}
1920 	if (ifp != NULL) {
1921 		netns_get_if_local_ports(ifp, protocol, flags, bitfield);
1922 	} else {
1923 		errno_t error;
1924 		ifnet_t *ifp_list;
1925 		uint32_t count, i;
1926 
1927 		error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count);
1928 		if (error != 0) {
1929 			os_log_error(OS_LOG_DEFAULT,
1930 			    "%s: ifnet_list_get_all() failed %d",
1931 			    __func__, error);
1932 			return error;
1933 		}
1934 		for (i = 0; i < count; i++) {
1935 			if (TAILQ_EMPTY(&ifp_list[i]->if_addrhead)) {
1936 				continue;
1937 			}
1938 			netns_get_if_local_ports(ifp_list[i], protocol, flags,
1939 			    bitfield);
1940 		}
1941 		ifnet_list_free(ifp_list);
1942 	}
1943 
1944 	return 0;
1945 }
1946 
1947 uint32_t
netns_find_anyres_byaddr(struct ifaddr * ifa,uint8_t proto)1948 netns_find_anyres_byaddr(struct ifaddr *ifa, uint8_t proto)
1949 {
1950 	int result = 0;
1951 	int ifa_addr_len;
1952 	struct ns_token *token;
1953 	struct ifnet *ifp = ifa->ifa_ifp;
1954 	struct sockaddr *ifa_addr = ifa->ifa_addr;
1955 
1956 	if (__netns_inited == 0) {
1957 		return ENOTSUP;
1958 	}
1959 
1960 	if ((ifa_addr->sa_family != AF_INET) &&
1961 	    (ifa_addr->sa_family != AF_INET6)) {
1962 		return 0;
1963 	}
1964 
1965 	ifa_addr_len = (ifa_addr->sa_family == AF_INET) ?
1966 	    sizeof(struct in_addr) : sizeof(struct in6_addr);
1967 
1968 	NETNS_LOCK();
1969 
1970 	SLIST_FOREACH(token, &ifp->if_netns_tokens, nt_ifp_link) {
1971 		if ((token->nt_flags & NETNS_OWNER_MASK) == NETNS_PF) {
1972 			continue;
1973 		}
1974 		if (token->nt_addr_len != ifa_addr_len) {
1975 			continue;
1976 		}
1977 		if (token->nt_proto != proto) {
1978 			continue;
1979 		}
1980 		if (ifa_addr->sa_family == AF_INET) {
1981 			if (token->nt_inaddr.s_addr ==
1982 			    (satosin(ifa->ifa_addr))->sin_addr.s_addr) {
1983 				result = 1;
1984 				break;
1985 			}
1986 		} else if (ifa_addr->sa_family == AF_INET6) {
1987 			if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa),
1988 			    &token->nt_in6addr)) {
1989 				result = 1;
1990 				break;
1991 			}
1992 		}
1993 	}
1994 
1995 	NETNS_UNLOCK();
1996 	return result;
1997 }
1998 
1999 static uint32_t
_netns_lookup_ns_n_reservations(uint32_t * addr,uint8_t addr_len,uint8_t proto)2000 _netns_lookup_ns_n_reservations(uint32_t *addr, uint8_t addr_len, uint8_t proto)
2001 {
2002 	uint32_t ns_n_reservations = 0;
2003 	NETNS_LOCK_SPIN();
2004 	struct ns *namespace = _netns_get_ns(addr, addr_len, proto, true);
2005 	if (namespace != NULL) {
2006 		ns_n_reservations = namespace->ns_n_reservations;
2007 	}
2008 	NETNS_UNLOCK();
2009 	return ns_n_reservations;
2010 }
2011 
/*
 * IPv4 front end for _netns_lookup_ns_n_reservations(): passes the
 * 32-bit address word and sizeof(struct in_addr) as the key.
 */
uint32_t
netns_lookup_reservations_count_in(struct in_addr addr, uint8_t proto)
{
	uint32_t *key = &addr.s_addr;

	return _netns_lookup_ns_n_reservations(key, sizeof(struct in_addr),
	           proto);
}
2017 
2018 uint32_t
netns_lookup_reservations_count_in6(struct in6_addr addr,uint8_t proto)2019 netns_lookup_reservations_count_in6(struct in6_addr addr, uint8_t proto)
2020 {
2021 	if (IN6_IS_SCOPE_EMBED(&addr)) {
2022 		addr.s6_addr16[1] = 0;
2023 	}
2024 	return _netns_lookup_ns_n_reservations(&addr.s6_addr32[0], sizeof(struct in6_addr), proto);
2025 }
2026 
2027 /*
2028  * Sysctl interface
2029  */
2030 
2031 static int netns_ctl_dump_all SYSCTL_HANDLER_ARGS;
2032 
2033 SYSCTL_NODE(_kern_skywalk, OID_AUTO, netns, CTLFLAG_RW | CTLFLAG_LOCKED,
2034     0, "Netns interface");
2035 
2036 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netns,
2037     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
2038     0, 0, netns_ctl_dump_all, "-",
2039     "Namespace contents (struct netns_ctl_dump_header, "
2040     "skywalk/os_stats_private.h)");
2041 
2042 static int
netns_ctl_write_ns(struct sysctl_req * req,struct ns * namespace,boolean_t is_global)2043 netns_ctl_write_ns(struct sysctl_req *req, struct ns *namespace,
2044     boolean_t is_global)
2045 {
2046 	struct ns_reservation *res;
2047 	struct netns_ctl_dump_header response_header;
2048 	struct netns_ctl_dump_record response_record;
2049 	int err;
2050 
2051 	/* Fill out header */
2052 	memset(&response_header, 0, sizeof(response_header));
2053 	response_header.ncdh_n_records = namespace->ns_n_reservations;
2054 	response_header.ncdh_proto = namespace->ns_proto;
2055 
2056 	if (is_global) {
2057 		response_header.ncdh_addr_len = 0;
2058 	} else {
2059 		response_header.ncdh_addr_len = namespace->ns_addr_len;
2060 	}
2061 	memcpy(response_header.ncdh_addr, namespace->ns_addr,
2062 	    namespace->ns_addr_len);
2063 
2064 	err = SYSCTL_OUT(req, &response_header, sizeof(response_header));
2065 	if (err) {
2066 		return err;
2067 	}
2068 
2069 	/* Fill out records */
2070 	RB_FOREACH(res, ns_reservation_tree, &namespace->ns_reservations) {
2071 		memset(&response_record, 0, sizeof(response_record));
2072 		response_record.ncdr_port = res->nsr_port;
2073 		response_record.ncdr_port_end = 0;
2074 		response_record.ncdr_listener_refs =
2075 		    NETNS_REF_COUNT(res, NETNS_LISTENER);
2076 		response_record.ncdr_skywalk_refs =
2077 		    NETNS_REF_COUNT(res, NETNS_SKYWALK);
2078 		response_record.ncdr_bsd_refs =
2079 		    NETNS_REF_COUNT(res, NETNS_BSD);
2080 		response_record.ncdr_pf_refs =
2081 		    NETNS_REF_COUNT(res, NETNS_PF);
2082 		err = SYSCTL_OUT(req, &response_record,
2083 		    sizeof(response_record));
2084 		if (err) {
2085 			return err;
2086 		}
2087 	}
2088 
2089 	return 0;
2090 }
2091 
2092 static int
2093 netns_ctl_dump_all SYSCTL_HANDLER_ARGS
2094 {
2095 #pragma unused(oidp, arg1, arg2)
2096 	struct ns *namespace;
2097 	int i, err = 0;
2098 
2099 	if (!kauth_cred_issuser(kauth_cred_get())) {
2100 		return EPERM;
2101 	}
2102 
2103 	if (__netns_inited == 0) {
2104 		return ENOTSUP;
2105 	}
2106 
2107 	NETNS_LOCK();
2108 
2109 	for (i = 0; i < NETNS_N_GLOBAL; i++) {
2110 		err = netns_ctl_write_ns(req, netns_global_non_wild[i], true);
2111 		if (err) {
2112 			goto done;
2113 		}
2114 	}
2115 
2116 	RB_FOREACH(namespace, netns_namespaces_tree, &netns_namespaces) {
2117 		err = netns_ctl_write_ns(req, namespace, false);
2118 		if (err) {
2119 			goto done;
2120 		}
2121 	}
2122 
2123 	/*
2124 	 * If this is just a request for length, add slop because
2125 	 * this is dynamically changing data
2126 	 */
2127 	if (req->oldptr == USER_ADDR_NULL) {
2128 		req->oldidx += 20 * sizeof(struct netns_ctl_dump_record);
2129 	}
2130 
2131 done:
2132 	NETNS_UNLOCK();
2133 	return err;
2134 }
2135 /* CSTYLED */
2136