xref: /xnu-11215.41.3/bsd/skywalk/namespace/netns.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/assert.h>
30 #include <kern/locks.h>
31 #include <kern/zalloc.h>
32 #include <libkern/tree.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/bitstring.h>
36 #include <net/if.h>
37 #include <net/kpi_interface.h>
38 #include <net/restricted_in_port.h>
39 
40 #include <netinet/in.h>
41 #include <netinet/in_pcb.h>
42 #include <netinet/tcp_fsm.h>
43 #include <netinet/tcp_var.h>
44 
45 #include <netinet6/in6_var.h>
46 #include <string.h>
47 
48 #include <skywalk/os_skywalk.h>
49 #include <skywalk/os_skywalk_private.h>
50 #include <skywalk/os_stats_private.h>
51 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
52 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
53 
54 #include <net/if_ports_used.h>
55 
/*
 * NOTE(review): presumably set once netns initialization has run; the
 * code that reads and writes this flag is outside this excerpt.
 */
static int __netns_inited = 0;

/*
 * Logging
 *
 * Helpers that map a protocol number or an address length onto the
 * matching verbosity bit, printable protocol name, or address family.
 * Any protocol other than IPPROTO_TCP is reported as UDP, and any length
 * other than sizeof (struct in_addr) is reported as IPv6.
 *
 * Every macro argument is parenthesized so that a compound argument
 * (for example a conditional expression) is compared as a whole.
 */

#define NS_VERB_PROTO(proto)    (((proto) == IPPROTO_TCP) ? SK_VERB_NS_TCP : \
	                                    SK_VERB_NS_UDP)
#define NS_VERB_IP(addr_len)    (((addr_len) == sizeof (struct in_addr)) ? \
	                                    SK_VERB_NS_IPV4 : SK_VERB_NS_IPV6)
#define PROTO_STR(proto)        (((proto) == IPPROTO_TCP) ? "tcp" : "udp")
#define LEN_TO_AF(len)          ((((len) == sizeof (struct in_addr)) ? \
	                            AF_INET : AF_INET6))
69 /*
70  * Locking
71  * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is
72  * aquired at the entry of every kernel-facing function, and released at the
73  * end. Data within netns_token structures is also protected under this lock.
74  */
75 
76 #define NETNS_LOCK()                    \
77 	lck_mtx_lock(&netns_lock)
78 #define NETNS_LOCK_SPIN()               \
79 	lck_mtx_lock_spin(&netns_lock)
80 #define NETNS_LOCK_CONVERT() do {       \
81 	NETNS_LOCK_ASSERT_HELD();       \
82 	lck_mtx_convert_spin(&netns_lock); \
83 } while (0)
84 #define NETNS_UNLOCK()                  \
85 	lck_mtx_unlock(&netns_lock)
86 #define NETNS_LOCK_ASSERT_HELD()        \
87 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_OWNED)
88 #define NETNS_LOCK_ASSERT_NOTHELD()     \
89 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_NOTOWNED)
90 
91 static LCK_GRP_DECLARE(netns_lock_group, "netns_lock");
92 static LCK_MTX_DECLARE(netns_lock, &netns_lock_group);
93 
94 /*
95  * Internal data structures and parameters
96  */
97 
98 /*
99  * Local ports are kept track of by reference counts kept in a tree specific to
100  * an <IP, protocol> tuple (see struct ns).
101  *
102  * Note: port numbers are stored in host byte order.
103  */
104 struct ns_reservation {
105 	RB_ENTRY(ns_reservation) nsr_link;
106 	uint32_t nsr_refs[NETNS_OWNER_MAX + 1];
107 	in_port_t nsr_port;
108 	bool nsr_reuseport:1;
109 };
110 
111 #define NETNS_REF_COUNT(nsr, flags)     \
112 	(nsr)->nsr_refs[((flags) & NETNS_OWNER_MASK)]
113 
114 static inline int nsr_cmp(const struct ns_reservation *,
115     const struct ns_reservation *);
116 
117 RB_HEAD(ns_reservation_tree, ns_reservation);
118 RB_PROTOTYPE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
119 RB_GENERATE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
120 
121 static inline struct ns_reservation *ns_reservation_tree_find(
122 	struct ns_reservation_tree *, const in_port_t);
123 
124 /*
125  * A namespace keeps track of the local port numbers in use for a given
126  * <IP, protocol> tuple. There are also global namespaces for each
127  * protocol to accomodate INADDR_ANY behavior and diagnostics.
128  */
129 struct ns {
130 	RB_ENTRY(ns)    ns_link;
131 
132 	void            *ns_addr_key;
133 
134 	union {
135 		uint32_t        ns_addr[4];
136 		struct in_addr  ns_inaddr;
137 		struct in6_addr ns_in6addr;
138 	};
139 	uint8_t         ns_addr_len;
140 	uint8_t         ns_proto;
141 
142 	in_port_t       ns_last_ephemeral_port_down;
143 	in_port_t       ns_last_ephemeral_port_up;
144 
145 	uint8_t         ns_is_freeable;
146 
147 	uint32_t        ns_n_reservations;
148 	struct ns_reservation_tree ns_reservations;
149 };
150 
151 static uint32_t netns_n_namespaces;
152 
153 static inline int ns_cmp(const struct ns *, const struct ns *);
154 
155 RB_HEAD(netns_namespaces_tree, ns) netns_namespaces =
156     RB_INITIALIZER(netns_namespaces);
157 RB_PROTOTYPE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
158 RB_GENERATE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
159 
/*
 * Declare pointers to global namespaces for each protocol.
 * All non-wildcard reservations will have an entry here.
 *
 * Both tables are indexed by NETNS_NS_GLOBAL_IDX(), which ORs a protocol
 * bit with an address-family bit:
 *
 *	0: TCP/IPv4	1: UDP/IPv4	2: TCP/IPv6	3: UDP/IPv6
 *
 * Any protocol other than IPPROTO_TCP selects the UDP slot, and any
 * address length other than NETNS_ADDRLEN_V4 selects the IPv6 slot.
 */
#define NETNS_ADDRLEN_V4 (sizeof(struct in_addr))
#define NETNS_ADDRLEN_V6 (sizeof(struct in6_addr))

#define NETNS_NS_TCP    0
#define NETNS_NS_UDP    1
#define NETNS_NS_V4     0
#define NETNS_NS_V6     2
#define NETNS_NS_GLOBAL_IDX(proto, addrlen)     \
	((((proto) == IPPROTO_TCP) ? NETNS_NS_TCP : NETNS_NS_UDP) | \
	(((addrlen) == NETNS_ADDRLEN_V4) ? NETNS_NS_V4 : NETNS_NS_V6))

#define NETNS_N_GLOBAL  4
static struct ns *netns_global_non_wild[NETNS_N_GLOBAL];
static struct ns *netns_global_wild[NETNS_N_GLOBAL];

/* NOTE(review): consumer is outside this excerpt. */
#define NETNS_NS_UDP_EPHEMERAL_RESERVE  4096
178 
179 /*
180  * Internal token structure
181  *
182  * Note: port numbers are stored in host byte order.
183  */
184 struct ns_token {
185 	/* Reservation state */
186 	ifnet_t                 nt_ifp;
187 	LIST_ENTRY(ns_token)    nt_ifp_link;
188 	LIST_ENTRY(ns_token)    nt_all_link;
189 	uint32_t                nt_state;       /* NETNS_STATE_* */
190 
191 	/* Reservation context */
192 	union {
193 		uint32_t        nt_addr[4];
194 		struct in_addr  nt_inaddr;
195 		struct in6_addr nt_in6addr;
196 	};
197 	uint8_t                 nt_addr_len;
198 	uint8_t                 nt_proto;
199 	in_port_t               nt_port;
200 	uint32_t                nt_flags;
201 
202 	/* Optional information about the flow */
203 	struct ns_flow_info     *nt_flow_info;
204 };
205 
206 /* Valid values for nt_state */
207 #define NETNS_STATE_HALFCLOSED  0x1     /* half closed */
208 #define NETNS_STATE_WITHDRAWN   0x2     /* withdrawn; not offloadable */
209 
210 #define NETNS_STATE_BITS        "\020\01HALFCLOSED\02WITHDRAWN"
211 
212 /* List of tokens not bound to an ifnet */
213 LIST_HEAD(, ns_token) netns_unbound_tokens = LIST_HEAD_INITIALIZER(
214 	netns_unbound_tokens);
215 
216 /* List of all tokens currently allocated in the system */
217 LIST_HEAD(, ns_token) netns_all_tokens = LIST_HEAD_INITIALIZER(
218 	netns_all_tokens);
219 
220 /*
221  * Memory management
222  */
223 static SKMEM_TYPE_DEFINE(netns_ns_zone, struct ns);
224 
225 #define NETNS_NS_TOKEN_ZONE_NAME        "netns.ns_token"
226 static unsigned int netns_ns_token_size; /* size of zone element */
227 static struct skmem_cache *netns_ns_token_cache; /* for ns_token */
228 
229 #define NETNS_NS_FLOW_INFO_ZONE_NAME    "netns.ns_flow_info"
230 static unsigned int netns_ns_flow_info_size; /* size of zone element */
231 static struct skmem_cache *netns_ns_flow_info_cache; /* for ns_flow_info */
232 
233 #define NETNS_NS_RESERVATION_ZONE_NAME  "netns.ns_reservation"
234 static unsigned int netns_ns_reservation_size; /* size of zone element */
235 static struct skmem_cache *netns_ns_reservation_cache; /* for ns_reservation */
236 
237 static struct ns_reservation *netns_ns_reservation_alloc(in_port_t, uint32_t);
238 static void netns_ns_reservation_free(struct ns_reservation *);
239 static struct ns *netns_ns_alloc(zalloc_flags_t);
240 static void netns_ns_free(struct ns *);
241 static void netns_ns_cleanup(struct ns *);
242 static struct ns_token *netns_ns_token_alloc(boolean_t);
243 static void netns_ns_token_free(struct ns_token *);
244 
245 /*
246  * Utility/internal code
247  */
248 static struct ns *_netns_get_ns(uint32_t *__sized_by(addr_len), uint8_t addr_len,
249     uint8_t, bool);
250 static inline boolean_t _netns_is_wildcard_addr(
251 	const uint32_t *__sized_by(addr_len), uint8_t addr_len);
252 static int _netns_reserve_common(struct ns *, in_port_t, uint32_t);
253 static void _netns_release_common(struct ns *, in_port_t, uint32_t);
254 static inline void netns_clear_ifnet(struct ns_token *);
255 static int _netns_reserve_kpi_common(struct ns *, netns_token *,
256     uint32_t *__sized_by(addr_len), uint8_t addr_len, uint8_t, in_port_t *,
257     uint32_t, struct ns_flow_info *);
258 static void _netns_set_ifnet_internal(struct ns_token *, struct ifnet *);
259 
260 static struct ns_reservation *
netns_ns_reservation_alloc(in_port_t port,uint32_t flags)261 netns_ns_reservation_alloc(in_port_t port, uint32_t flags)
262 {
263 	struct ns_reservation *res;
264 
265 	VERIFY(port != 0);
266 
267 	res = skmem_cache_alloc(netns_ns_reservation_cache, SKMEM_SLEEP);
268 	ASSERT(res != NULL);
269 
270 	bzero(res, netns_ns_reservation_size);
271 	res->nsr_port = port;
272 	res->nsr_reuseport = ((flags & NETNS_REUSEPORT) != 0);
273 	return res;
274 }
275 
276 static void
netns_ns_reservation_free(struct ns_reservation * res)277 netns_ns_reservation_free(struct ns_reservation *res)
278 {
279 	skmem_cache_free(netns_ns_reservation_cache, res);
280 }
281 
282 static struct ns *
netns_ns_alloc(zalloc_flags_t how)283 netns_ns_alloc(zalloc_flags_t how)
284 {
285 	struct ns *namespace;
286 	in_port_t first = (in_port_t)ipport_firstauto;
287 	in_port_t last  = (in_port_t)ipport_lastauto;
288 	in_port_t rand_port;
289 
290 	namespace = zalloc_flags(netns_ns_zone, how | Z_ZERO);
291 	if (namespace == NULL) {
292 		return NULL;
293 	}
294 
295 	namespace->ns_is_freeable = 1;
296 
297 	RB_INIT(&namespace->ns_reservations);
298 
299 	/*
300 	 * Randomize the initial ephemeral port starting point, just in case
301 	 * this namespace is for an ipv6 address which gets brought up and
302 	 * down often.
303 	 */
304 	if (first == last) {
305 		rand_port = first;
306 	} else {
307 		read_frandom(&rand_port, sizeof(rand_port));
308 
309 		if (first > last) {
310 			rand_port = last + (rand_port % (first - last));
311 		} else {
312 			rand_port = first + (rand_port % (last - first));
313 		}
314 	}
315 	namespace->ns_last_ephemeral_port_down = rand_port;
316 	namespace->ns_last_ephemeral_port_up = rand_port;
317 
318 	return namespace;
319 }
320 
321 static void
netns_ns_free(struct ns * namespace)322 netns_ns_free(struct ns *namespace)
323 {
324 	struct ns_reservation *res;
325 	struct ns_reservation *tmp_res;
326 #if SK_LOG
327 	char tmp_ip_str[MAX_IPv6_STR_LEN];
328 #endif /* SK_LOG */
329 
330 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
331 	    NS_VERB_PROTO(namespace->ns_proto),
332 	    "freeing %s ns for IP %s",
333 	    PROTO_STR(namespace->ns_proto),
334 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
335 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)));
336 
337 	RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations,
338 	    tmp_res) {
339 		netns_ns_reservation_free(res);
340 		namespace->ns_n_reservations--;
341 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
342 		    res);
343 	}
344 
345 	VERIFY(RB_EMPTY(&namespace->ns_reservations));
346 
347 	if (netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
348 	    namespace->ns_addr_len)] == namespace) {
349 		netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
350 		namespace->ns_addr_len)] = NULL;
351 	}
352 	if (netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
353 	    namespace->ns_addr_len)] == namespace) {
354 		netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
355 		namespace->ns_addr_len)] = NULL;
356 	}
357 
358 	zfree(netns_ns_zone, namespace);
359 }
360 
361 static void
netns_ns_cleanup(struct ns * namespace)362 netns_ns_cleanup(struct ns *namespace)
363 {
364 	if (namespace->ns_is_freeable &&
365 	    RB_EMPTY(&namespace->ns_reservations)) {
366 		RB_REMOVE(netns_namespaces_tree, &netns_namespaces, namespace);
367 		netns_n_namespaces--;
368 		netns_ns_free(namespace);
369 	}
370 }
371 
372 static struct ns_token *
netns_ns_token_alloc(boolean_t with_nfi)373 netns_ns_token_alloc(boolean_t with_nfi)
374 {
375 	struct ns_token *token;
376 
377 	NETNS_LOCK_ASSERT_HELD();
378 	NETNS_LOCK_CONVERT();
379 
380 	token = skmem_cache_alloc(netns_ns_token_cache, SKMEM_SLEEP);
381 	ASSERT(token != NULL);
382 
383 	bzero(token, netns_ns_token_size);
384 
385 	if (with_nfi) {
386 		token->nt_flow_info =  skmem_cache_alloc(netns_ns_flow_info_cache,
387 		    SKMEM_SLEEP);
388 		ASSERT(token->nt_flow_info != NULL);
389 	}
390 	LIST_INSERT_HEAD(&netns_all_tokens, token, nt_all_link);
391 
392 	return token;
393 }
394 
395 static void
netns_ns_token_free(struct ns_token * token)396 netns_ns_token_free(struct ns_token *token)
397 {
398 	NETNS_LOCK_ASSERT_HELD();
399 	NETNS_LOCK_CONVERT();
400 	LIST_REMOVE(token, nt_all_link);
401 
402 	if (token->nt_flow_info != NULL) {
403 		skmem_cache_free(netns_ns_flow_info_cache, token->nt_flow_info);
404 	}
405 	skmem_cache_free(netns_ns_token_cache, token);
406 }
407 
408 __attribute__((always_inline))
409 static inline int
nsr_cmp(const struct ns_reservation * nsr1,const struct ns_reservation * nsr2)410 nsr_cmp(const struct ns_reservation *nsr1, const struct ns_reservation *nsr2)
411 {
412 #define NSR_COMPARE(r1, r2)     ((int)(r1)->nsr_port - (int)(r2)->nsr_port)
413 	return NSR_COMPARE(nsr1, nsr2);
414 }
415 
416 __attribute__((always_inline))
417 static inline int
ns_cmp(const struct ns * a,const struct ns * b)418 ns_cmp(const struct ns *a, const struct ns *b)
419 {
420 	int d;
421 
422 	if ((d = (a->ns_addr_len - b->ns_addr_len)) != 0) {
423 		return d;
424 	}
425 	if ((d = (a->ns_proto - b->ns_proto)) != 0) {
426 		return d;
427 	}
428 	if ((d = flow_ip_cmp(a->ns_addr_key, b->ns_addr_key,
429 	    b->ns_addr_len)) != 0) {
430 		return d;
431 	}
432 
433 	return 0;
434 }
435 
436 /*
437  * Common routine to look up a reservation.
438  *
439  * NOTE: Assumes the caller holds the NETNS global lock
440  */
441 __attribute__((always_inline))
442 static inline struct ns_reservation *
ns_reservation_tree_find(struct ns_reservation_tree * tree,const in_port_t port)443 ns_reservation_tree_find(struct ns_reservation_tree *tree, const in_port_t port)
444 {
445 	struct ns_reservation res;
446 	res.nsr_port = port;
447 	return RB_FIND(ns_reservation_tree, tree, &res);
448 }
449 
450 /*
451  * Retrieve the namespace for the supplied <address, protocol> tuple.
452  * If create is set and such a namespace doesn't already exist, one will be
453  * created.
454  */
455 static struct ns *
_netns_get_ns(uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,bool create)456 _netns_get_ns(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto, bool create)
457 {
458 	struct ns *namespace = NULL;
459 	struct ns find = {
460 		.ns_addr_key = addr,
461 		.ns_addr_len = addr_len,
462 		.ns_proto = proto,
463 	};
464 #if SK_LOG
465 	char tmp_ip_str[MAX_IPv6_STR_LEN];
466 #endif /* SK_LOG */
467 
468 	VERIFY(addr_len == sizeof(struct in_addr) ||
469 	    addr_len == sizeof(struct in6_addr));
470 
471 	NETNS_LOCK_ASSERT_HELD();
472 
473 	namespace = RB_FIND(netns_namespaces_tree, &netns_namespaces, &find);
474 
475 	if (create && namespace == NULL) {
476 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
477 		    "allocating %s ns for IP %s",
478 		    PROTO_STR(proto), inet_ntop(LEN_TO_AF(addr_len), addr,
479 		    tmp_ip_str, sizeof(tmp_ip_str)));
480 		NETNS_LOCK_CONVERT();
481 		namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL);
482 		__builtin_assume(namespace != NULL);
483 		memcpy(namespace->ns_addr, addr, addr_len);
484 		namespace->ns_addr_key = &namespace->ns_addr;
485 		namespace->ns_addr_len = addr_len;
486 		namespace->ns_proto = proto;
487 		RB_INSERT(netns_namespaces_tree, &netns_namespaces, namespace);
488 		netns_n_namespaces++;
489 
490 		if (_netns_is_wildcard_addr(addr, addr_len) &&
491 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
492 		    addr_len)] == NULL) {
493 			netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
494 			addr_len)] = namespace;
495 		}
496 	}
497 
498 	return namespace;
499 }
500 
501 /*
502  * Return true if the supplied address is a wildcard (INADDR_ANY)
503  */
504 __attribute__((always_inline))
505 static boolean_t
_netns_is_wildcard_addr(const uint32_t * __sized_by (addr_len)addr,uint8_t addr_len)506 _netns_is_wildcard_addr(const uint32_t *__sized_by(addr_len)addr, uint8_t addr_len)
507 {
508 	boolean_t wildcard;
509 
510 	switch (addr_len) {
511 	case sizeof(struct in_addr):
512 		wildcard = (addr[0] == 0);
513 		break;
514 
515 	case sizeof(struct in6_addr):
516 		wildcard = (addr[0] == 0 && addr[1] == 0 &&
517 		    addr[2] == 0 && addr[3] == 0);
518 		break;
519 
520 	default:
521 		wildcard = FALSE;
522 		break;
523 	}
524 
525 	return wildcard;
526 }
527 
528 __attribute__((always_inline))
529 static boolean_t
_netns_is_port_used(struct ns * gns,struct ns_reservation * curr_res,in_port_t port)530 _netns_is_port_used(struct ns * gns, struct ns_reservation *curr_res, in_port_t port)
531 {
532 	struct ns_reservation *res = NULL;
533 
534 	if (gns == NULL) {
535 		return FALSE;
536 	}
537 
538 	res = ns_reservation_tree_find(&gns->ns_reservations, port);
539 	if (res != NULL && res != curr_res) {
540 		if (!res->nsr_reuseport) {
541 			return TRUE;
542 		}
543 	}
544 
545 	return FALSE;
546 }
547 
548 /*
549  * Internal shared code to reserve ports within a specific namespace.
550  *
551  * Note: port numbers are in host byte-order here.
552  */
553 static int
_netns_reserve_common(struct ns * namespace,in_port_t port,uint32_t flags)554 _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags)
555 {
556 	struct ns_reservation *res = NULL, *exist = NULL;
557 	uint8_t proto, addr_len;
558 	int err = 0;
559 #if SK_LOG
560 	char tmp_ip_str[MAX_IPv6_STR_LEN];
561 #endif /* SK_LOG */
562 
563 	VERIFY(port != 0);
564 	proto = namespace->ns_proto;
565 	addr_len = namespace->ns_addr_len;
566 	NETNS_LOCK_CONVERT();
567 	res = netns_ns_reservation_alloc(port, flags);
568 	if (res == NULL) {
569 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
570 		    "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY",
571 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
572 		    namespace->ns_addr, tmp_ip_str,
573 		    sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags);
574 		return ENOMEM;
575 	}
576 	exist = RB_INSERT(ns_reservation_tree, &namespace->ns_reservations,
577 	    res);
578 	if (__probable(exist == NULL)) {
579 		namespace->ns_n_reservations++;
580 	} else {
581 		netns_ns_reservation_free(res);
582 		res = exist;
583 	}
584 
585 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
586 	    "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, "
587 	    "%d bsd %d pf", inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
588 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
589 	    PROTO_STR(proto), port, flags,
590 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
591 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
592 	    NETNS_REF_COUNT(res, NETNS_BSD),
593 	    NETNS_REF_COUNT(res, NETNS_PF));
594 
595 	/* Make reservation */
596 	/*
597 	 * Bypass collision detection for reservations in the global non-wild
598 	 * namespace. We use that namespace for reference counts only.
599 	 */
600 	if (namespace !=
601 	    netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)]) {
602 		struct ns_reservation *skres;
603 		boolean_t is_wild = _netns_is_wildcard_addr(namespace->ns_addr,
604 		    addr_len);
605 		struct ns *gns =
606 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)];
607 
608 		if (NETNS_IS_SKYWALK(flags)) {
609 			if ((!is_wild || exist != NULL) && gns != NULL &&
610 			    (skres = ns_reservation_tree_find(
611 				    &gns->ns_reservations, port)) != NULL &&
612 			    NETNS_REF_COUNT(skres, NETNS_LISTENER) == 0) {
613 				/*
614 				 * The mere existence of any non-skywalk
615 				 * listener wildcard entry for this
616 				 * protocol/port number means this must fail.
617 				 */
618 				SK_ERR("ADDRINUSE: Duplicate wildcard");
619 				err = EADDRINUSE;
620 				goto done;
621 			}
622 
623 			if (is_wild) {
624 				gns = netns_global_non_wild[
625 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
626 				VERIFY(gns != NULL);
627 
628 				if (_netns_is_port_used(netns_global_non_wild[
629 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
630 				    _netns_is_port_used(netns_global_non_wild[
631 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
632 					/*
633 					 * If Skywalk is trying to reserve a
634 					 * wildcard, then the mere existance of
635 					 * any entry in either v4/v6 non-wild
636 					 * namespace for this port means this
637 					 * must fail.
638 					 */
639 					SK_ERR("ADDRINUSE: Wildcard with non-wild.");
640 					err = EADDRINUSE;
641 					goto done;
642 				}
643 			}
644 		} else {
645 			/*
646 			 * Check if Skywalk has reserved a wildcard entry.
647 			 * Note that the arithmetic OR here is intentional.
648 			 */
649 			if ((!is_wild || exist != NULL) && gns != NULL &&
650 			    (skres = ns_reservation_tree_find(
651 				    &gns->ns_reservations, port)) != NULL &&
652 			    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
653 			    NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) {
654 				/*
655 				 * BSD is trying to reserve a proto/port for
656 				 * which Skywalk already has a wildcard
657 				 * reservation.
658 				 */
659 				SK_ERR("ADDRINUSE: BSD requesting Skywalk port");
660 				err = EADDRINUSE;
661 				goto done;
662 			}
663 
664 			/*
665 			 * If BSD is trying to reserve a wildcard,
666 			 * ensure Skywalk has not already reserved
667 			 * a non-wildcard.
668 			 */
669 			if (is_wild) {
670 				gns = netns_global_non_wild[
671 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
672 				VERIFY(gns != NULL);
673 
674 				/*
675 				 * Note that the arithmetic OR here is
676 				 * intentional.
677 				 */
678 				if ((skres = ns_reservation_tree_find(
679 					    &gns->ns_reservations, port)) != NULL &&
680 				    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
681 				    NETNS_REF_COUNT(skres,
682 				    NETNS_LISTENER)) != 0) {
683 					SK_ERR("ADDRINUSE: BSD wildcard with non-wild.");
684 					err = EADDRINUSE;
685 					goto done;
686 				}
687 			}
688 		}
689 
690 		switch (flags & NETNS_OWNER_MASK) {
691 		case NETNS_SKYWALK:
692 			/* check collision w/ BSD */
693 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
694 			    NETNS_REF_COUNT(res, NETNS_PF) > 0) {
695 				SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)");
696 				err = EADDRINUSE;
697 				goto done;
698 			}
699 
700 			/* BEGIN CSTYLED */
701 			/*
702 			 * Scenarios with new Skywalk connected flow:
703 			 * 1. With existing Skywalk connected flow,
704 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
705 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 1
706 			 *    reject by failing the wild gns lookup below.
707 			 * 2. With existing Skywalk 3-tuple listener,
708 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 1
709 			 *    bypass the check below.
710 			 * 3. With existing Skywalk 2-tuple listener,
711 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
712 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 0
713 			 *    pass with successful wild gns lookup.
714 			 */
715 			/* END CSTYLED */
716 			if (NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
717 			    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0) {
718 				/* check if covered by wild Skywalk listener */
719 				gns = netns_global_wild[
720 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
721 				if (gns != NULL &&
722 				    (skres = ns_reservation_tree_find(
723 					    &gns->ns_reservations, port)) != NULL &&
724 				    NETNS_REF_COUNT(skres, NETNS_LISTENER)
725 				    != 0) {
726 					err = 0;
727 					goto done;
728 				}
729 				if (addr_len == sizeof(struct in_addr)) {
730 					/* If address is IPv4, also check for wild IPv6 registration */
731 					gns = netns_global_wild[
732 						NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)];
733 					if (gns != NULL &&
734 					    (skres = ns_reservation_tree_find(
735 						    &gns->ns_reservations, port)) != NULL &&
736 					    NETNS_REF_COUNT(skres, NETNS_LISTENER)
737 					    != 0) {
738 						err = 0;
739 						goto done;
740 					}
741 				}
742 				SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)");
743 				err = EADDRINUSE;
744 			}
745 			/*
746 			 * XXX: Duplicate 5-tuple flows under a Skywalk
747 			 * listener are currently detected by flow manager,
748 			 * till we implement 5-tuple-aware netns.
749 			 */
750 			break;
751 
752 		case NETNS_LISTENER:
753 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
754 			    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
755 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
756 			    _netns_is_port_used(netns_global_wild[
757 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
758 			    _netns_is_port_used(netns_global_wild[
759 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port) ||
760 			    _netns_is_port_used(netns_global_non_wild[
761 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
762 			    _netns_is_port_used(netns_global_non_wild[
763 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
764 				SK_ERR("ERROR - Listener got ADDRINUSE");
765 				err = EADDRINUSE;
766 			}
767 			break;
768 
769 		case NETNS_BSD:
770 		case NETNS_PF:
771 			if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 ||
772 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) {
773 				SK_ERR("ERROR - %s got ADDRINUSE",
774 				    ((flags & NETNS_OWNER_MASK) == NETNS_PF) ?
775 				    "PF" : "BSD");
776 				err = EADDRINUSE;
777 			}
778 			break;
779 
780 		default:
781 			panic("_netns_reserve_common: invalid owner 0x%x",
782 			    flags & NETNS_OWNER_MASK);
783 			/* NOTREACHED */
784 			__builtin_unreachable();
785 		}
786 	}
787 
788 done:
789 	ASSERT(res != NULL);
790 	if (__probable(err == 0)) {
791 		NETNS_REF_COUNT(res, flags)++;
792 		/* Check for wrap around */
793 		VERIFY(NETNS_REF_COUNT(res, flags) != 0);
794 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
795 		    NS_VERB_PROTO(namespace->ns_proto),
796 		    "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, "
797 		    "%d ls, %d bsd %d pf",
798 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
799 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
800 		    PROTO_STR(namespace->ns_proto), port, err, flags,
801 		    NETNS_REF_COUNT(res, NETNS_SKYWALK),
802 		    NETNS_REF_COUNT(res, NETNS_LISTENER),
803 		    NETNS_REF_COUNT(res, NETNS_BSD),
804 		    NETNS_REF_COUNT(res, NETNS_PF));
805 	} else {
806 		if (exist == NULL) {
807 			RB_REMOVE(ns_reservation_tree,
808 			    &namespace->ns_reservations, res);
809 			namespace->ns_n_reservations--;
810 			netns_ns_reservation_free(res);
811 		}
812 	}
813 	return err;
814 }
815 
816 /*
817  * Internal shared code to release ports within a specific namespace.
818  */
819 static void
_netns_release_common(struct ns * namespace,in_port_t port,uint32_t flags)820 _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags)
821 {
822 	struct ns_reservation *res;
823 	uint32_t refs;
824 	int i;
825 #if SK_LOG
826 	char tmp_ip_str[MAX_IPv6_STR_LEN];
827 #endif /* SK_LOG */
828 
829 	NETNS_LOCK_ASSERT_HELD();
830 
831 	res = ns_reservation_tree_find(&namespace->ns_reservations, port);
832 	if (res == NULL) {
833 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
834 		    NS_VERB_PROTO(namespace->ns_proto),
835 		    "ERROR %s:%s:%d // flags 0x%x // not found",
836 		    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
837 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
838 		    PROTO_STR(namespace->ns_proto), port, flags);
839 		VERIFY(res != NULL);
840 	}
841 
842 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
843 	    NS_VERB_PROTO(namespace->ns_proto),
844 	    "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf",
845 	    inet_ntop(LEN_TO_AF(namespace->ns_addr_len),
846 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
847 	    PROTO_STR(namespace->ns_proto), port, flags,
848 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
849 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
850 	    NETNS_REF_COUNT(res, NETNS_BSD),
851 	    NETNS_REF_COUNT(res, NETNS_PF));
852 
853 	/* Release reservation */
854 	VERIFY(NETNS_REF_COUNT(res, flags) > 0);
855 	NETNS_REF_COUNT(res, flags) -= 1;
856 
857 	/* Clean up memory, if appropriate */
858 	for (i = 0, refs = 0; i <= NETNS_OWNER_MAX && refs == 0; i++) {
859 		refs |= res->nsr_refs[i];
860 	}
861 	if (refs == 0) {
862 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
863 		    res);
864 		namespace->ns_n_reservations--;
865 		NETNS_LOCK_CONVERT();
866 		netns_ns_reservation_free(res);
867 		netns_ns_cleanup(namespace);
868 	}
869 }
870 
871 __attribute__((always_inline))
872 static inline void
netns_init_global_ns(struct ns ** global_ptr,uint8_t proto,uint8_t addrlen)873 netns_init_global_ns(struct ns **global_ptr, uint8_t proto, uint8_t addrlen)
874 {
875 	struct ns *namespace;
876 
877 	namespace = *global_ptr = netns_ns_alloc(Z_WAITOK);
878 	memset(namespace->ns_addr, 0xFF, addrlen);
879 	namespace->ns_addr_len = addrlen;
880 	namespace->ns_proto = proto;
881 	namespace->ns_is_freeable = 0;
882 }
883 
884 __attribute__((always_inline))
885 static inline void
netns_clear_ifnet(struct ns_token * nstoken)886 netns_clear_ifnet(struct ns_token *nstoken)
887 {
888 #if SK_LOG
889 	char tmp_ip_str[MAX_IPv6_STR_LEN];
890 #endif /* SK_LOG */
891 
892 	NETNS_LOCK_ASSERT_HELD();
893 
894 	if (nstoken->nt_ifp != NULL) {
895 		LIST_REMOVE(nstoken, nt_ifp_link);
896 
897 		SK_DF(NS_VERB_IP(nstoken->nt_addr_len) |
898 		    NS_VERB_PROTO(nstoken->nt_proto),
899 		    "%s:%s:%d // removed from ifnet %d",
900 		    inet_ntop(LEN_TO_AF(nstoken->nt_addr_len),
901 		    nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
902 		    PROTO_STR(nstoken->nt_proto), nstoken->nt_port,
903 		    nstoken->nt_ifp->if_index);
904 
905 		NETNS_LOCK_CONVERT();
906 		ifnet_decr_iorefcnt(nstoken->nt_ifp);
907 		nstoken->nt_ifp = NULL;
908 	} else {
909 		LIST_REMOVE(nstoken, nt_ifp_link);
910 	}
911 }
912 
913 /*
914  * Internal shared code to perform a port[-range] reservation, along with all
915  * the boilerplate and sanity checks expected for a call coming in from the
916  * surrounding kernel code.
917  */
918 static int
_netns_reserve_kpi_common(struct ns * ns,netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)919 _netns_reserve_kpi_common(struct ns *ns, netns_token *token,
920     uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto,
921     in_port_t *port, uint32_t flags, struct ns_flow_info *nfi)
922 {
923 	boolean_t ns_want_cleanup = (ns == NULL);
924 	struct ns_token *nt;
925 	int err = 0;
926 	in_port_t hport;
927 #if SK_LOG
928 	char tmp_ip_str[MAX_IPv6_STR_LEN];
929 #endif /* SK_LOG */
930 	struct ifnet *ifp = (nfi != NULL) ? nfi->nfi_ifp : NULL;
931 
932 	NETNS_LOCK_ASSERT_HELD();
933 
934 	hport = ntohs(*port);
935 
936 	VERIFY((flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
937 	VERIFY(addr_len == sizeof(struct in_addr) ||
938 	    addr_len == sizeof(struct in6_addr));
939 	VERIFY(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
940 	VERIFY(hport != 0);
941 
942 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
943 	    "reserving %s:%s:%d // flags 0x%x // token %svalid",
944 	    inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str,
945 	    sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags,
946 	    NETNS_TOKEN_VALID(token) ? "" : "in");
947 
948 	/*
949 	 * See the documentation for NETNS_PRERESERVED in netns.h for an
950 	 * explanation of this block.
951 	 */
952 	if (NETNS_TOKEN_VALID(token)) {
953 		if (flags & NETNS_PRERESERVED) {
954 			nt = *token;
955 			VERIFY(nt->nt_addr_len == addr_len);
956 			VERIFY(memcmp(nt->nt_addr, addr, addr_len) == 0);
957 			VERIFY(nt->nt_proto == proto);
958 			VERIFY(nt->nt_port == hport);
959 			VERIFY((nt->nt_flags &
960 			    NETNS_RESERVATION_FLAGS | NETNS_PRERESERVED) ==
961 			    (flags & NETNS_RESERVATION_FLAGS));
962 
963 			if ((nt->nt_flags & NETNS_CONFIGURATION_FLAGS) ==
964 			    (flags & NETNS_CONFIGURATION_FLAGS)) {
965 				SK_DF(NS_VERB_IP(nt->nt_addr_len) |
966 				    NS_VERB_PROTO(nt->nt_proto),
967 				    "%s:%s:%d // flags 0x%x -> 0x%x",
968 				    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
969 				    nt->nt_addr, tmp_ip_str,
970 				    sizeof(tmp_ip_str)),
971 				    PROTO_STR(nt->nt_proto),
972 				    nt->nt_port, nt->nt_flags, flags);
973 				nt->nt_flags &= ~NETNS_CONFIGURATION_FLAGS;
974 				nt->nt_flags |=
975 				    flags & NETNS_CONFIGURATION_FLAGS;
976 			}
977 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
978 			    "token was prereserved");
979 			goto done;
980 		} else {
981 			panic("Request to overwrite valid netns token");
982 			/* NOTREACHED */
983 			__builtin_unreachable();
984 		}
985 	}
986 
987 	/*
988 	 * TODO: Check range against bitmap
989 	 */
990 	if (hport == 0) {
991 		/*
992 		 * Caller request an arbitrary range of ports
993 		 * TODO: Need to figure out how to allocate
994 		 * emphemeral ports only.
995 		 */
996 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
997 		    "ERROR - wildcard port not yet supported");
998 		err = ENOMEM;
999 		goto done;
1000 	}
1001 
1002 	/*
1003 	 * Fetch namespace for the specified address/protocol, creating
1004 	 * a new namespace if necessary.
1005 	 */
1006 	if (ns == NULL) {
1007 		ASSERT(ns_want_cleanup);
1008 		ns = _netns_get_ns(addr, addr_len, proto, true);
1009 	}
1010 	if (__improbable(ns == NULL)) {
1011 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1012 		    "ERROR - couldn't create namespace");
1013 		err = ENOMEM;
1014 		goto done;
1015 	}
1016 
1017 	/*
1018 	 * Make a reservation in the namespace
1019 	 * This will return an error if an incompatible reservation
1020 	 * already exists.
1021 	 */
1022 	err = _netns_reserve_common(ns, hport, flags);
1023 	if (__improbable(err != 0)) {
1024 		NETNS_LOCK_CONVERT();
1025 		if (ns_want_cleanup) {
1026 			netns_ns_cleanup(ns);
1027 		}
1028 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1029 		    "ERROR - reservation collision");
1030 		goto done;
1031 	}
1032 
1033 	if (!_netns_is_wildcard_addr(ns->ns_addr, addr_len)) {
1034 		/* Record the reservation in the non-wild namespace */
1035 		struct ns *nwns;
1036 
1037 		nwns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1038 		    addr_len)];
1039 		err = _netns_reserve_common(nwns, hport, flags);
1040 		if (__improbable(err != 0)) {
1041 			/* Need to free the specific namespace entry */
1042 			NETNS_LOCK_CONVERT();
1043 			_netns_release_common(ns, hport, flags);
1044 			if (ns_want_cleanup) {
1045 				netns_ns_cleanup(ns);
1046 			}
1047 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1048 			    "ERROR - reservation collision");
1049 			goto done;
1050 		}
1051 	}
1052 
1053 	nt = netns_ns_token_alloc(nfi != NULL ? true : false);
1054 	ASSERT(nt->nt_ifp == NULL);
1055 	_netns_set_ifnet_internal(nt, ifp);
1056 
1057 	memcpy(nt->nt_addr, addr, addr_len);
1058 	nt->nt_addr_len = addr_len;
1059 	nt->nt_proto = proto;
1060 	nt->nt_port = hport;
1061 	nt->nt_flags = flags;
1062 
1063 	if (nfi != NULL) {
1064 		VERIFY(nt->nt_flow_info != NULL);
1065 
1066 		memcpy(nt->nt_flow_info, nfi, sizeof(struct ns_flow_info));
1067 		/*
1068 		 * The local port is passed as a separate argument
1069 		 */
1070 		if (nfi->nfi_laddr.sa.sa_family == AF_INET) {
1071 			nt->nt_flow_info->nfi_laddr.sin.sin_port = *port;
1072 		} else if (nfi->nfi_laddr.sa.sa_family == AF_INET6) {
1073 			nt->nt_flow_info->nfi_laddr.sin6.sin6_port = *port;
1074 		}
1075 	}
1076 	*token = nt;
1077 
1078 done:
1079 	return err;
1080 }
1081 
1082 /*
1083  * Kernel-facing functions
1084  */
1085 
1086 int
netns_init(void)1087 netns_init(void)
1088 {
1089 	VERIFY(__netns_inited == 0);
1090 
1091 	netns_ns_reservation_size = sizeof(struct ns_reservation);
1092 	netns_ns_reservation_cache = skmem_cache_create(NETNS_NS_RESERVATION_ZONE_NAME,
1093 	    netns_ns_reservation_size, sizeof(uint64_t), NULL, NULL, NULL,
1094 	    NULL, NULL, 0);
1095 	if (netns_ns_reservation_cache == NULL) {
1096 		panic("%s: skmem_cache create failed (%s)", __func__,
1097 		    NETNS_NS_RESERVATION_ZONE_NAME);
1098 		/* NOTREACHED */
1099 		__builtin_unreachable();
1100 	}
1101 
1102 	netns_ns_token_size = sizeof(struct ns_token);
1103 	netns_ns_token_cache = skmem_cache_create(NETNS_NS_TOKEN_ZONE_NAME,
1104 	    netns_ns_token_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
1105 	    NULL, 0);
1106 	if (netns_ns_token_cache == NULL) {
1107 		panic("%s: skmem_cache create failed (%s)", __func__,
1108 		    NETNS_NS_TOKEN_ZONE_NAME);
1109 		/* NOTREACHED */
1110 		__builtin_unreachable();
1111 	}
1112 
1113 	netns_ns_flow_info_size = sizeof(struct ns_flow_info);
1114 	netns_ns_flow_info_cache = skmem_cache_create(NETNS_NS_FLOW_INFO_ZONE_NAME,
1115 	    netns_ns_flow_info_size, sizeof(uint64_t), NULL, NULL, NULL,
1116 	    NULL, NULL, 0);
1117 	if (netns_ns_flow_info_cache == NULL) {
1118 		panic("%s: skmem_cache create failed (%s)", __func__,
1119 		    NETNS_NS_FLOW_INFO_ZONE_NAME);
1120 		/* NOTREACHED */
1121 		__builtin_unreachable();
1122 	}
1123 
1124 	LIST_INIT(&netns_unbound_tokens);
1125 	LIST_INIT(&netns_all_tokens);
1126 
1127 	netns_n_namespaces = 0;
1128 	RB_INIT(&netns_namespaces);
1129 
1130 	SK_D("initializing global namespaces");
1131 
1132 	netns_init_global_ns(
1133 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1134 		NETNS_ADDRLEN_V4)], IPPROTO_TCP, sizeof(struct in_addr));
1135 
1136 	netns_init_global_ns(
1137 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1138 		NETNS_ADDRLEN_V4)], IPPROTO_UDP, sizeof(struct in_addr));
1139 
1140 	netns_init_global_ns(
1141 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1142 		NETNS_ADDRLEN_V6)], IPPROTO_TCP, sizeof(struct in6_addr));
1143 
1144 	netns_init_global_ns(
1145 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1146 		NETNS_ADDRLEN_V6)], IPPROTO_UDP, sizeof(struct in6_addr));
1147 
1148 	/* Done */
1149 
1150 	__netns_inited = 1;
1151 	sk_features |= SK_FEATURE_NETNS;
1152 
1153 	SK_D("initialized netns");
1154 
1155 	return 0;
1156 }
1157 
1158 void
netns_uninit(void)1159 netns_uninit(void)
1160 {
1161 	if (__netns_inited == 1) {
1162 		struct ns *namespace;
1163 		struct ns *temp_namespace;
1164 		int i;
1165 
1166 		RB_FOREACH_SAFE(namespace, netns_namespaces_tree,
1167 		    &netns_namespaces, temp_namespace) {
1168 			RB_REMOVE(netns_namespaces_tree, &netns_namespaces,
1169 			    namespace);
1170 			netns_n_namespaces--;
1171 			netns_ns_free(namespace);
1172 		}
1173 
1174 		for (i = 0; i < NETNS_N_GLOBAL; i++) {
1175 			netns_ns_free(netns_global_non_wild[i]);
1176 		}
1177 
1178 		if (netns_ns_flow_info_cache != NULL) {
1179 			skmem_cache_destroy(netns_ns_flow_info_cache);
1180 			netns_ns_flow_info_cache = NULL;
1181 		}
1182 		if (netns_ns_token_cache != NULL) {
1183 			skmem_cache_destroy(netns_ns_token_cache);
1184 			netns_ns_token_cache = NULL;
1185 		}
1186 		if (netns_ns_reservation_cache != NULL) {
1187 			skmem_cache_destroy(netns_ns_reservation_cache);
1188 			netns_ns_reservation_cache = NULL;
1189 		}
1190 
1191 		__netns_inited = 0;
1192 		sk_features &= ~SK_FEATURE_NETNS;
1193 
1194 		SK_D("uninitialized netns");
1195 	}
1196 }
1197 
1198 void
netns_reap_caches(boolean_t purge)1199 netns_reap_caches(boolean_t purge)
1200 {
1201 	/* these aren't created unless netns is enabled */
1202 	if (netns_ns_token_cache != NULL) {
1203 		skmem_cache_reap_now(netns_ns_token_cache, purge);
1204 	}
1205 	if (netns_ns_reservation_cache != NULL) {
1206 		skmem_cache_reap_now(netns_ns_reservation_cache, purge);
1207 	}
1208 	if (netns_ns_flow_info_cache != NULL) {
1209 		skmem_cache_reap_now(netns_ns_flow_info_cache, purge);
1210 	}
1211 }
1212 
1213 boolean_t
netns_is_enabled(void)1214 netns_is_enabled(void)
1215 {
1216 	return __netns_inited == 1;
1217 }
1218 
1219 int
netns_reserve(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t port,uint32_t flags,struct ns_flow_info * nfi)1220 netns_reserve(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1221     uint8_t addr_len, uint8_t proto, in_port_t port, uint32_t flags,
1222     struct ns_flow_info *nfi)
1223 {
1224 	int err = 0;
1225 #if SK_LOG
1226 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1227 #endif /* SK_LOG */
1228 
1229 	if (__netns_inited == 0) {
1230 		*token = NULL;
1231 		return err;
1232 	}
1233 
1234 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1235 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1236 		return ENOTSUP;
1237 	}
1238 
1239 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1240 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1241 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1242 	    flags);
1243 
1244 	/*
1245 	 * Check wether the process is allowed to bind to a restricted port
1246 	 */
1247 	if (!current_task_can_use_restricted_in_port(port,
1248 	    proto, flags)) {
1249 		*token = NULL;
1250 		return EADDRINUSE;
1251 	}
1252 
1253 	NETNS_LOCK_SPIN();
1254 	err = _netns_reserve_kpi_common(NULL, token, addr, addr_len,
1255 	    proto, &port, flags, nfi);
1256 	NETNS_UNLOCK();
1257 
1258 	return err;
1259 }
1260 
1261 /* Import net.inet.{tcp,udp}.randomize_ports sysctls */
1262 extern int      udp_use_randomport;
1263 extern int      tcp_use_randomport;
1264 
1265 int
netns_reserve_ephemeral(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)1266 netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1267     uint8_t addr_len, uint8_t proto, in_port_t *port, uint32_t flags,
1268     struct ns_flow_info *nfi)
1269 {
1270 	int err = 0;
1271 	in_port_t first = (in_port_t)ipport_firstauto;
1272 	in_port_t last  = (in_port_t)ipport_lastauto;
1273 	in_port_t rand_port;
1274 	in_port_t last_port;
1275 	in_port_t n_last_port;
1276 	struct ns *namespace;
1277 	boolean_t count_up = true;
1278 	boolean_t use_randomport = (proto == IPPROTO_TCP) ?
1279 	    tcp_use_randomport : udp_use_randomport;
1280 #if SK_LOG
1281 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1282 #endif /* SK_LOG */
1283 
1284 	if (__netns_inited == 0) {
1285 		*token = NULL;
1286 		return err;
1287 	}
1288 
1289 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1290 		SK_ERR("netns doesn't support non TCP/UDP protocol");
1291 		return ENOTSUP;
1292 	}
1293 
1294 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1295 	    "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr,
1296 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(*port),
1297 	    flags);
1298 
1299 	NETNS_LOCK_SPIN();
1300 
1301 	namespace = _netns_get_ns(addr, addr_len, proto, true);
1302 	if (namespace == NULL) {
1303 		err = ENOMEM;
1304 		NETNS_UNLOCK();
1305 		return err;
1306 	}
1307 
1308 	if (proto == IPPROTO_UDP) {
1309 		if (UINT16_MAX - namespace->ns_n_reservations <
1310 		    NETNS_NS_UDP_EPHEMERAL_RESERVE) {
1311 			SK_ERR("UDP ephemeral port not available"
1312 			    "(less than 4096 UDP ports left)");
1313 			err = EADDRNOTAVAIL;
1314 			NETNS_UNLOCK();
1315 			return err;
1316 		}
1317 	}
1318 
1319 	if (first == last) {
1320 		rand_port = first;
1321 	} else {
1322 		if (use_randomport) {
1323 			NETNS_LOCK_CONVERT();
1324 			read_frandom(&rand_port, sizeof(rand_port));
1325 
1326 			if (first > last) {
1327 				rand_port = last + (rand_port %
1328 				    (first - last));
1329 				count_up = false;
1330 			} else {
1331 				rand_port = first + (rand_port %
1332 				    (last - first));
1333 			}
1334 		} else {
1335 			if (first > last) {
1336 				rand_port =
1337 				    namespace->ns_last_ephemeral_port_down - 1;
1338 				if (rand_port < last || rand_port > first) {
1339 					rand_port = last;
1340 				}
1341 				count_up = false;
1342 			} else {
1343 				rand_port =
1344 				    namespace->ns_last_ephemeral_port_up + 1;
1345 				if (rand_port < first || rand_port > last) {
1346 					rand_port = first;
1347 				}
1348 			}
1349 		}
1350 	}
1351 	last_port = rand_port;
1352 	n_last_port = htons(last_port);
1353 
1354 	while (true) {
1355 		if (n_last_port == 0) {
1356 			SK_ERR("ephemeral port search range includes 0");
1357 			err = EINVAL;
1358 			break;
1359 		}
1360 
1361 		/*
1362 		 * Skip if this is a restricted port as we do not want to
1363 		 * restricted ports as ephemeral
1364 		 */
1365 		if (!IS_RESTRICTED_IN_PORT(n_last_port)) {
1366 			err = _netns_reserve_kpi_common(namespace, token, addr,
1367 			    addr_len, proto, &n_last_port, flags, nfi);
1368 			if (err == 0 || err != EADDRINUSE) {
1369 				break;
1370 			}
1371 		}
1372 		if (count_up) {
1373 			last_port++;
1374 			if (last_port < first || last_port > last) {
1375 				last_port = first;
1376 			}
1377 		} else {
1378 			last_port--;
1379 			if (last_port < last || last_port > first) {
1380 				last_port = last;
1381 			}
1382 		}
1383 		n_last_port = htons(last_port);
1384 
1385 		if (last_port == rand_port || first == last) {
1386 			SK_ERR("couldn't find free ephemeral port");
1387 			err = EADDRNOTAVAIL;
1388 			break;
1389 		}
1390 	}
1391 
1392 	if (err == 0) {
1393 		*port = n_last_port;
1394 		if (count_up) {
1395 			namespace->ns_last_ephemeral_port_up = last_port;
1396 		} else {
1397 			namespace->ns_last_ephemeral_port_down = last_port;
1398 		}
1399 	} else {
1400 		netns_ns_cleanup(namespace);
1401 	}
1402 
1403 	NETNS_UNLOCK();
1404 
1405 	return err;
1406 }
1407 
1408 void
netns_release(netns_token * token)1409 netns_release(netns_token *token)
1410 {
1411 	struct ns *ns;
1412 	struct ns_token *nt;
1413 	uint8_t proto, addr_len;
1414 #if SK_LOG
1415 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1416 #endif /* SK_LOG */
1417 
1418 	if (!NETNS_TOKEN_VALID(token)) {
1419 		return;
1420 	}
1421 
1422 	if (__netns_inited == 0) {
1423 		*token = NULL;
1424 		return;
1425 	}
1426 
1427 	NETNS_LOCK_SPIN();
1428 
1429 	nt = *token;
1430 	*token = NULL;
1431 
1432 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
1433 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1434 	    nt->nt_addr_len == sizeof(struct in6_addr));
1435 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1436 
1437 	addr_len = nt->nt_addr_len;
1438 	proto = nt->nt_proto;
1439 
1440 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1441 	    "releasing %s:%s:%d",
1442 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1443 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto),
1444 	    nt->nt_port);
1445 
1446 	if (!_netns_is_wildcard_addr(nt->nt_addr, addr_len)) {
1447 		/* Remove from global non-wild namespace */
1448 
1449 		ns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1450 		    addr_len)];
1451 		VERIFY(ns != NULL);
1452 
1453 		_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1454 	}
1455 
1456 	ns = _netns_get_ns(nt->nt_addr, addr_len, proto, false);
1457 	VERIFY(ns != NULL);
1458 	_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1459 
1460 	netns_clear_ifnet(nt);
1461 	netns_ns_token_free(nt);
1462 
1463 	NETNS_UNLOCK();
1464 }
1465 
1466 int
netns_change_addr(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len)1467 netns_change_addr(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1468     uint8_t addr_len)
1469 {
1470 	int err = 0;
1471 	struct ns *old_namespace;
1472 	struct ns *new_namespace;
1473 	struct ns *global_namespace;
1474 	struct ns_token *nt;
1475 	uint8_t proto;
1476 #if SK_LOG
1477 	char tmp_ip_str_1[MAX_IPv6_STR_LEN];
1478 	char tmp_ip_str_2[MAX_IPv6_STR_LEN];
1479 #endif /* SK_LOG */
1480 
1481 	if (__netns_inited == 0) {
1482 		return 0;
1483 	}
1484 
1485 	NETNS_LOCK();
1486 
1487 	VERIFY(NETNS_TOKEN_VALID(token));
1488 
1489 	nt = *token;
1490 
1491 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) == NETNS_BSD);
1492 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1493 	    nt->nt_addr_len == sizeof(struct in6_addr));
1494 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1495 
1496 	proto = nt->nt_proto;
1497 
1498 #if SK_LOG
1499 	inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1500 	    tmp_ip_str_1, sizeof(tmp_ip_str_1));
1501 	inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2,
1502 	    sizeof(tmp_ip_str_2));
1503 #endif /* SK_LOG */
1504 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1505 	    "changing address for %s:%d from %s to %s",
1506 	    PROTO_STR(proto), nt->nt_port, tmp_ip_str_1,
1507 	    tmp_ip_str_2);
1508 
1509 	if (nt->nt_addr_len == addr_len &&
1510 	    memcmp(nt->nt_addr, addr, nt->nt_addr_len) == 0) {
1511 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1512 		    "address didn't change, exiting early");
1513 		goto done;
1514 	}
1515 
1516 	old_namespace = _netns_get_ns(nt->nt_addr, nt->nt_addr_len, proto,
1517 	    false);
1518 	VERIFY(old_namespace != NULL);
1519 
1520 	new_namespace = _netns_get_ns(addr, addr_len, proto, true);
1521 	if (new_namespace == NULL) {
1522 		err = ENOMEM;
1523 		goto done;
1524 	}
1525 
1526 	/* Acquire reservation in new namespace */
1527 	if ((err = _netns_reserve_common(new_namespace, nt->nt_port,
1528 	    nt->nt_flags))) {
1529 		NETNS_LOCK_CONVERT();
1530 		netns_ns_cleanup(new_namespace);
1531 		SK_ERR("ERROR - reservation collision under new namespace");
1532 		goto done;
1533 	}
1534 
1535 	/* Release from old namespace */
1536 	_netns_release_common(old_namespace, nt->nt_port, nt->nt_flags);
1537 
1538 	if (!_netns_is_wildcard_addr(nt->nt_addr, nt->nt_addr_len)) {
1539 		/*
1540 		 * Old address is non-wildcard.
1541 		 * Remove old reservation from global non-wild namespace
1542 		 */
1543 		global_namespace = netns_global_non_wild[
1544 			NETNS_NS_GLOBAL_IDX(proto, nt->nt_addr_len)];
1545 		VERIFY(global_namespace != NULL);
1546 
1547 		_netns_release_common(global_namespace, nt->nt_port,
1548 		    nt->nt_flags);
1549 	}
1550 
1551 	if (!_netns_is_wildcard_addr(addr, addr_len)) {
1552 		/*
1553 		 * New address is non-wildcard.
1554 		 * Record new reservation in global non-wild namespace
1555 		 */
1556 		global_namespace = netns_global_non_wild[
1557 			NETNS_NS_GLOBAL_IDX(proto, addr_len)];
1558 		VERIFY(global_namespace != NULL);
1559 
1560 		if ((err = _netns_reserve_common(global_namespace,
1561 		    nt->nt_port, nt->nt_flags)) != 0) {
1562 			SK_ERR("ERROR - reservation collision under new global namespace");
1563 			/* XXX: Should not fail. Maybe assert instead */
1564 			goto done;
1565 		}
1566 	}
1567 
1568 	memcpy(nt->nt_addr, addr, addr_len);
1569 	nt->nt_addr_len = addr_len;
1570 
1571 done:
1572 	NETNS_UNLOCK();
1573 	return err;
1574 }
1575 
1576 static void
_netns_set_ifnet_internal(struct ns_token * nt,struct ifnet * ifp)1577 _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp)
1578 {
1579 #if SK_LOG
1580 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1581 #endif /* SK_LOG */
1582 
1583 	NETNS_LOCK_ASSERT_HELD();
1584 
1585 	if (ifp != NULL && ifnet_is_attached(ifp, 1)) {
1586 		nt->nt_ifp = ifp;
1587 		LIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link);
1588 
1589 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1590 		    "%s:%s:%d // added to ifnet %d",
1591 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1592 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1593 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1594 		    ifp->if_index);
1595 	} else {
1596 		LIST_INSERT_HEAD(&netns_unbound_tokens, nt, nt_ifp_link);
1597 	}
1598 }
1599 
1600 void
netns_set_ifnet(netns_token * token,ifnet_t ifp)1601 netns_set_ifnet(netns_token *token, ifnet_t ifp)
1602 {
1603 	struct ns_token *nt;
1604 #if SK_LOG
1605 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1606 #endif /* SK_LOG */
1607 
1608 	if (__netns_inited == 0) {
1609 		return;
1610 	}
1611 
1612 	NETNS_LOCK();
1613 
1614 	VERIFY(NETNS_TOKEN_VALID(token));
1615 
1616 	nt = *token;
1617 
1618 	if (nt->nt_ifp == ifp) {
1619 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1620 		    "%s:%s:%d // ifnet already %d, exiting early",
1621 		    inet_ntop(LEN_TO_AF(nt->nt_addr_len),
1622 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1623 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1624 		    ifp ? ifp->if_index : -1);
1625 		NETNS_UNLOCK();
1626 		return;
1627 	}
1628 
1629 	netns_clear_ifnet(nt);
1630 
1631 	_netns_set_ifnet_internal(nt, ifp);
1632 
1633 	NETNS_UNLOCK();
1634 }
1635 
1636 void
netns_ifnet_detach(ifnet_t ifp)1637 netns_ifnet_detach(ifnet_t ifp)
1638 {
1639 	struct ns_token *token, *tmp_token;
1640 
1641 	if (__netns_inited == 0) {
1642 		return;
1643 	}
1644 
1645 	NETNS_LOCK();
1646 
1647 	LIST_FOREACH_SAFE(token, &ifp->if_netns_tokens, nt_ifp_link,
1648 	    tmp_token) {
1649 		netns_clear_ifnet(token);
1650 		LIST_INSERT_HEAD(&netns_unbound_tokens, token, nt_ifp_link);
1651 	}
1652 
1653 	NETNS_UNLOCK();
1654 }
1655 
1656 static void
_netns_set_state(netns_token * token,uint32_t state)1657 _netns_set_state(netns_token *token, uint32_t state)
1658 {
1659 	struct ns_token *nt;
1660 #if SK_LOG
1661 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1662 #endif /* SK_LOG */
1663 
1664 	if (__netns_inited == 0) {
1665 		return;
1666 	}
1667 
1668 	NETNS_LOCK();
1669 	VERIFY(NETNS_TOKEN_VALID(token));
1670 
1671 	nt = *token;
1672 	nt->nt_state |= state;
1673 
1674 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1675 	    "%s:%s:%d // state 0x%b",
1676 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1677 	    tmp_ip_str, sizeof(tmp_ip_str)),
1678 	    PROTO_STR(nt->nt_proto), nt->nt_port, state, NETNS_STATE_BITS);
1679 
1680 	NETNS_UNLOCK();
1681 }
1682 
1683 void
netns_half_close(netns_token * token)1684 netns_half_close(netns_token *token)
1685 {
1686 	_netns_set_state(token, NETNS_STATE_HALFCLOSED);
1687 }
1688 
1689 void
netns_withdraw(netns_token * token)1690 netns_withdraw(netns_token *token)
1691 {
1692 	_netns_set_state(token, NETNS_STATE_WITHDRAWN);
1693 }
1694 
1695 int
netns_get_flow_info(netns_token * token,struct ns_flow_info * nfi)1696 netns_get_flow_info(netns_token *token,
1697     struct ns_flow_info *nfi)
1698 {
1699 	if (__netns_inited == 0) {
1700 		return ENOTSUP;
1701 	}
1702 
1703 	NETNS_LOCK();
1704 	if (!NETNS_TOKEN_VALID(token) ||
1705 	    nfi == NULL) {
1706 		NETNS_UNLOCK();
1707 		return EINVAL;
1708 	}
1709 
1710 	struct ns_token *nt = *token;
1711 	if (nt->nt_flow_info == NULL) {
1712 		NETNS_UNLOCK();
1713 		return ENOENT;
1714 	}
1715 
1716 	memcpy(nfi, nt->nt_flow_info, sizeof(struct ns_flow_info));
1717 	NETNS_UNLOCK();
1718 
1719 	return 0;
1720 }
1721 
1722 void
netns_change_flags(netns_token * token,uint32_t set_flags,uint32_t clear_flags)1723 netns_change_flags(netns_token *token, uint32_t set_flags,
1724     uint32_t clear_flags)
1725 {
1726 	struct ns_token *nt;
1727 #if SK_LOG
1728 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1729 #endif /* SK_LOG */
1730 
1731 	if (__netns_inited == 0) {
1732 		return;
1733 	}
1734 
1735 	NETNS_LOCK();
1736 
1737 	VERIFY(NETNS_TOKEN_VALID(token));
1738 
1739 	nt = *token;
1740 
1741 	VERIFY(!((set_flags | clear_flags) & NETNS_RESERVATION_FLAGS));
1742 	/* TODO: verify set and clear flags don't overlap? */
1743 
1744 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1745 	    "%s:%s:%d // flags 0x%x -> 0x%x",
1746 	    inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1747 	    tmp_ip_str, sizeof(tmp_ip_str)),
1748 	    PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags,
1749 	    nt->nt_flags | set_flags & ~clear_flags);
1750 
1751 	nt->nt_flags |= set_flags;
1752 	nt->nt_flags &= ~clear_flags;
1753 
1754 	NETNS_UNLOCK();
1755 }
1756 
1757 /*
1758  * Port offloading KPI
1759  */
1760 static inline void
netns_local_port_scan_flow_entry(struct flow_entry * fe,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1761 netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protocol,
1762     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1763 {
1764 	struct ns_token *token;
1765 	boolean_t iswildcard = false;
1766 
1767 	if (fe == NULL) {
1768 		return;
1769 	}
1770 
1771 	if (fe->fe_flags & FLOWENTF_EXTRL_PORT) {
1772 		return;
1773 	}
1774 
1775 	token = fe->fe_port_reservation;
1776 	if (token == NULL) {
1777 		return;
1778 	}
1779 
1780 	/*
1781 	 * We are only interested in active flows over skywalk channels
1782 	 */
1783 	if ((token->nt_flags & NETNS_OWNER_MASK) != NETNS_SKYWALK) {
1784 		return;
1785 	}
1786 
1787 	if (token->nt_state & NETNS_STATE_WITHDRAWN) {
1788 		return;
1789 	}
1790 
1791 	if (!(flags & IFNET_GET_LOCAL_PORTS_ANYTCPSTATEOK) &&
1792 	    (flags & IFNET_GET_LOCAL_PORTS_ACTIVEONLY) &&
1793 	    (token->nt_state & NETNS_STATE_HALFCLOSED)) {
1794 		return;
1795 	}
1796 
1797 	VERIFY(token->nt_addr_len == sizeof(struct in_addr) ||
1798 	    token->nt_addr_len == sizeof(struct in6_addr));
1799 
1800 	if (token->nt_addr_len == sizeof(struct in_addr)) {
1801 		if (protocol == PF_INET6) {
1802 			return;
1803 		}
1804 
1805 		iswildcard = token->nt_inaddr.s_addr == INADDR_ANY;
1806 	} else if (token->nt_addr_len == sizeof(struct in6_addr)) {
1807 		if (protocol == PF_INET) {
1808 			return;
1809 		}
1810 
1811 		iswildcard = IN6_IS_ADDR_UNSPECIFIED(
1812 			&token->nt_in6addr);
1813 	}
1814 	if (!(flags & IFNET_GET_LOCAL_PORTS_WILDCARDOK) && iswildcard) {
1815 		return;
1816 	}
1817 
1818 	if ((flags & IFNET_GET_LOCAL_PORTS_TCPONLY) &&
1819 	    token->nt_proto == IPPROTO_UDP) {
1820 		return;
1821 	}
1822 	if ((flags & IFNET_GET_LOCAL_PORTS_UDPONLY) &&
1823 	    token->nt_proto == IPPROTO_TCP) {
1824 		return;
1825 	}
1826 
1827 	if ((flags & IFNET_GET_LOCAL_PORTS_RECVANYIFONLY) &&
1828 	    !(token->nt_flags & NETNS_RECVANYIF)) {
1829 		return;
1830 	}
1831 
1832 	if ((flags & IFNET_GET_LOCAL_PORTS_EXTBGIDLEONLY) &&
1833 	    !(token->nt_flags & NETNS_EXTBGIDLE)) {
1834 		return;
1835 	}
1836 
1837 	if (token->nt_ifp != NULL && (token->nt_ifp->if_eflags & IFEF_AWDL) != 0) {
1838 		struct flow_route *fr = fe->fe_route;
1839 
1840 		if (fr == NULL || fr->fr_rt_dst == NULL ||
1841 		    (fr->fr_rt_dst->rt_flags & (RTF_UP | RTF_CONDEMNED)) != RTF_UP) {
1842 #if DEBUG || DEVELOPMENT
1843 			char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1844 			char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1845 			in_port_t lport;
1846 			in_port_t fport;
1847 			char pname[MAXCOMLEN + 1];
1848 			const struct ns_flow_info *nfi = token->nt_flow_info;
1849 
1850 			proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1851 
1852 			if (protocol == PF_INET) {
1853 				inet_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1854 				    lbuf, sizeof(lbuf));
1855 				inet_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1856 				    fbuf, sizeof(fbuf));
1857 				lport = nfi->nfi_laddr.sin.sin_port;
1858 				fport = nfi->nfi_faddr.sin.sin_port;
1859 			} else {
1860 				inet_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1861 				    lbuf, sizeof(lbuf));
1862 				inet_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1863 				    fbuf, sizeof(fbuf));
1864 				lport = nfi->nfi_laddr.sin6.sin6_port;
1865 				fport = nfi->nfi_faddr.sin6.sin6_port;
1866 			}
1867 
1868 			os_log(OS_LOG_DEFAULT,
1869 			    "netns_local_port_scan_flow_entry: route is down %s %s:%u %s:%u ifp %s proc %s:%d",
1870 			    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1871 			    lbuf, ntohs(lport), fbuf, ntohs(fport),
1872 			    token->nt_ifp->if_xname, pname, nfi->nfi_owner_pid);
1873 #endif /* DEBUG || DEVELOPMENT */
1874 
1875 			return;
1876 		}
1877 	}
1878 
1879 #if DEBUG || DEVELOPMENT
1880 	if (!(flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) &&
1881 	    (token->nt_flags & NETNS_NOWAKEFROMSLEEP)) {
1882 		char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1883 		char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1884 		in_port_t lport;
1885 		in_port_t fport;
1886 		char pname[MAXCOMLEN + 1];
1887 		const struct ns_flow_info *nfi = token->nt_flow_info;
1888 
1889 		proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1890 
1891 		if (protocol == PF_INET) {
1892 			inet_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1893 			    lbuf, sizeof(lbuf));
1894 			inet_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1895 			    fbuf, sizeof(fbuf));
1896 			lport = nfi->nfi_laddr.sin.sin_port;
1897 			fport = nfi->nfi_faddr.sin.sin_port;
1898 		} else {
1899 			inet_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1900 			    lbuf, sizeof(lbuf));
1901 			inet_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1902 			    fbuf, sizeof(fbuf));
1903 			lport = nfi->nfi_laddr.sin6.sin6_port;
1904 			fport = nfi->nfi_faddr.sin6.sin6_port;
1905 		}
1906 
1907 		os_log(OS_LOG_DEFAULT,
1908 		    "netns_local_port_scan_flow_entry: no wake from sleep %s %s:%u %s:%u ifp %s proc %s:%d",
1909 		    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1910 		    lbuf, ntohs(lport), fbuf, ntohs(fport),
1911 		    token->nt_ifp != NULL ? token->nt_ifp->if_xname : "",
1912 		    pname, nfi->nfi_owner_pid);
1913 	}
1914 #endif /* DEBUG || DEVELOPMENT */
1915 
1916 	if (token->nt_ifp != NULL && token->nt_flow_info != NULL) {
1917 		/*
1918 		 * When the flow has "no wake from sleep" option, do not set the port in the bitmap
1919 		 * except if explicetely requested by the driver.
1920 		 * We always add the flow to the list of port in order to report spurious wakes
1921 		 */
1922 		if ((flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) ||
1923 		    (token->nt_flags & NETNS_NOWAKEFROMSLEEP) == 0) {
1924 			bitstr_set(bitfield, token->nt_port);
1925 		}
1926 		(void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index,
1927 		    token->nt_flow_info, token->nt_flags);
1928 	} else {
1929 		SK_ERR("%s: unknown owner port %u"
1930 		    " nt_flags 0x%x ifindex %u nt_flow_info %p\n",
1931 		    __func__, token->nt_port,
1932 		    token->nt_flags,
1933 		    token->nt_ifp != NULL ? token->nt_ifp->if_index : 0,
1934 		    token->nt_flow_info);
1935 	}
1936 }
1937 
1938 static void
netns_get_if_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1939 netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol,
1940     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1941 {
1942 	struct nx_flowswitch *fsw = NULL;
1943 
1944 	if (ifp == NULL || ifp->if_na == NULL) {
1945 		return;
1946 	}
1947 	/* Ensure that the interface is attached and won't detach */
1948 	if (!ifnet_is_attached(ifp, 1)) {
1949 		return;
1950 	}
1951 	fsw = fsw_ifp_to_fsw(ifp);
1952 	if (fsw == NULL) {
1953 		goto done;
1954 	}
1955 	FSW_RLOCK(fsw);
1956 	NETNS_LOCK();
1957 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1958 		netns_local_port_scan_flow_entry(_fe, protocol, flags,
1959 		bitfield);
1960 	});
1961 	NETNS_UNLOCK();
1962 	FSW_UNLOCK(fsw);
1963 done:
1964 	ifnet_decr_iorefcnt(ifp);
1965 }
1966 
1967 errno_t
netns_get_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1968 netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol,
1969     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1970 {
1971 	if (__netns_inited == 0) {
1972 		return 0;
1973 	}
1974 	if (ifp != NULL) {
1975 		netns_get_if_local_ports(ifp, protocol, flags, bitfield);
1976 	} else {
1977 		errno_t error;
1978 		uint32_t count, i;
1979 		ifnet_t *__counted_by(count) ifp_list;
1980 
1981 		error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count);
1982 		if (error != 0) {
1983 			os_log_error(OS_LOG_DEFAULT,
1984 			    "%s: ifnet_list_get_all() failed %d",
1985 			    __func__, error);
1986 			return error;
1987 		}
1988 		for (i = 0; i < count; i++) {
1989 			if (TAILQ_EMPTY(&ifp_list[i]->if_addrhead)) {
1990 				continue;
1991 			}
1992 			netns_get_if_local_ports(ifp_list[i], protocol, flags,
1993 			    bitfield);
1994 		}
1995 		ifnet_list_free_counted_by(ifp_list, count);
1996 	}
1997 
1998 	return 0;
1999 }
2000 
2001 uint32_t
netns_find_anyres_byaddr(struct ifaddr * ifa,uint8_t proto)2002 netns_find_anyres_byaddr(struct ifaddr *ifa, uint8_t proto)
2003 {
2004 	int result = 0;
2005 	int ifa_addr_len;
2006 	struct ns_token *token;
2007 	struct ifnet *ifp = ifa->ifa_ifp;
2008 	struct sockaddr *ifa_addr = ifa->ifa_addr;
2009 
2010 	if (__netns_inited == 0) {
2011 		return ENOTSUP;
2012 	}
2013 
2014 	if ((ifa_addr->sa_family != AF_INET) &&
2015 	    (ifa_addr->sa_family != AF_INET6)) {
2016 		return 0;
2017 	}
2018 
2019 	ifa_addr_len = (ifa_addr->sa_family == AF_INET) ?
2020 	    sizeof(struct in_addr) : sizeof(struct in6_addr);
2021 
2022 	NETNS_LOCK();
2023 
2024 	LIST_FOREACH(token, &ifp->if_netns_tokens, nt_ifp_link) {
2025 		if ((token->nt_flags & NETNS_OWNER_MASK) == NETNS_PF) {
2026 			continue;
2027 		}
2028 		if (token->nt_addr_len != ifa_addr_len) {
2029 			continue;
2030 		}
2031 		if (token->nt_proto != proto) {
2032 			continue;
2033 		}
2034 		if (ifa_addr->sa_family == AF_INET) {
2035 			if (token->nt_inaddr.s_addr ==
2036 			    (satosin(ifa->ifa_addr))->sin_addr.s_addr) {
2037 				result = 1;
2038 				break;
2039 			}
2040 		} else if (ifa_addr->sa_family == AF_INET6) {
2041 			if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa),
2042 			    &token->nt_in6addr)) {
2043 				result = 1;
2044 				break;
2045 			}
2046 		}
2047 	}
2048 
2049 	NETNS_UNLOCK();
2050 	return result;
2051 }
2052 
2053 static uint32_t
_netns_lookup_ns_n_reservations(uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto)2054 _netns_lookup_ns_n_reservations(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto)
2055 {
2056 	uint32_t ns_n_reservations = 0;
2057 	NETNS_LOCK_SPIN();
2058 	struct ns *namespace = _netns_get_ns(addr, addr_len, proto, true);
2059 	if (namespace != NULL) {
2060 		ns_n_reservations = namespace->ns_n_reservations;
2061 	}
2062 	NETNS_UNLOCK();
2063 	return ns_n_reservations;
2064 }
2065 
/*
 * Return the reservation count of the namespace keyed by the IPv4 address
 * `addr' and protocol `proto' (0 when the lookup finds no namespace).
 */
uint32_t
netns_lookup_reservations_count_in(struct in_addr addr, uint8_t proto)
{
	/* The key is the raw 32-bit address, sized as a struct in_addr. */
	return _netns_lookup_ns_n_reservations(&addr.s_addr,
	           sizeof(struct in_addr), proto);
}
2071 
2072 uint32_t
netns_lookup_reservations_count_in6(struct in6_addr addr,uint8_t proto)2073 netns_lookup_reservations_count_in6(struct in6_addr addr, uint8_t proto)
2074 {
2075 	if (IN6_IS_SCOPE_EMBED(&addr)) {
2076 		addr.s6_addr16[1] = 0;
2077 	}
2078 	return _netns_lookup_ns_n_reservations(&addr.s6_addr32[0], sizeof(struct in6_addr), proto);
2079 }
2080 
2081 /*
2082  * Sysctl interface
2083  */
2084 
2085 static int netns_ctl_dump_all SYSCTL_HANDLER_ARGS;
2086 
2087 SYSCTL_NODE(_kern_skywalk, OID_AUTO, netns, CTLFLAG_RW | CTLFLAG_LOCKED,
2088     0, "Netns interface");
2089 
2090 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netns,
2091     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
2092     0, 0, netns_ctl_dump_all, "-",
2093     "Namespace contents (struct netns_ctl_dump_header, "
2094     "skywalk/os_stats_private.h)");
2095 
2096 static int
netns_ctl_write_ns(struct sysctl_req * req,struct ns * namespace,boolean_t is_global)2097 netns_ctl_write_ns(struct sysctl_req *req, struct ns *namespace,
2098     boolean_t is_global)
2099 {
2100 	struct ns_reservation *res;
2101 	struct netns_ctl_dump_header response_header;
2102 	struct netns_ctl_dump_record response_record;
2103 	int err;
2104 
2105 	/* Fill out header */
2106 	memset(&response_header, 0, sizeof(response_header));
2107 	response_header.ncdh_n_records = namespace->ns_n_reservations;
2108 	response_header.ncdh_proto = namespace->ns_proto;
2109 
2110 	if (is_global) {
2111 		response_header.ncdh_addr_len = 0;
2112 	} else {
2113 		response_header.ncdh_addr_len = namespace->ns_addr_len;
2114 	}
2115 	memcpy(response_header.ncdh_addr, namespace->ns_addr,
2116 	    namespace->ns_addr_len);
2117 
2118 	err = SYSCTL_OUT(req, &response_header, sizeof(response_header));
2119 	if (err) {
2120 		return err;
2121 	}
2122 
2123 	/* Fill out records */
2124 	RB_FOREACH(res, ns_reservation_tree, &namespace->ns_reservations) {
2125 		memset(&response_record, 0, sizeof(response_record));
2126 		response_record.ncdr_port = res->nsr_port;
2127 		response_record.ncdr_port_end = 0;
2128 		response_record.ncdr_listener_refs =
2129 		    NETNS_REF_COUNT(res, NETNS_LISTENER);
2130 		response_record.ncdr_skywalk_refs =
2131 		    NETNS_REF_COUNT(res, NETNS_SKYWALK);
2132 		response_record.ncdr_bsd_refs =
2133 		    NETNS_REF_COUNT(res, NETNS_BSD);
2134 		response_record.ncdr_pf_refs =
2135 		    NETNS_REF_COUNT(res, NETNS_PF);
2136 		err = SYSCTL_OUT(req, &response_record,
2137 		    sizeof(response_record));
2138 		if (err) {
2139 			return err;
2140 		}
2141 	}
2142 
2143 	return 0;
2144 }
2145 
2146 static int
2147 netns_ctl_dump_all SYSCTL_HANDLER_ARGS
2148 {
2149 #pragma unused(oidp, arg1, arg2)
2150 	struct ns *namespace;
2151 	int i, err = 0;
2152 
2153 	if (!kauth_cred_issuser(kauth_cred_get())) {
2154 		return EPERM;
2155 	}
2156 
2157 	if (__netns_inited == 0) {
2158 		return ENOTSUP;
2159 	}
2160 
2161 	NETNS_LOCK();
2162 
2163 	for (i = 0; i < NETNS_N_GLOBAL; i++) {
2164 		err = netns_ctl_write_ns(req, netns_global_non_wild[i], true);
2165 		if (err) {
2166 			goto done;
2167 		}
2168 	}
2169 
2170 	RB_FOREACH(namespace, netns_namespaces_tree, &netns_namespaces) {
2171 		err = netns_ctl_write_ns(req, namespace, false);
2172 		if (err) {
2173 			goto done;
2174 		}
2175 	}
2176 
2177 	/*
2178 	 * If this is just a request for length, add slop because
2179 	 * this is dynamically changing data
2180 	 */
2181 	if (req->oldptr == USER_ADDR_NULL) {
2182 		req->oldidx += 20 * sizeof(struct netns_ctl_dump_record);
2183 	}
2184 
2185 done:
2186 	NETNS_UNLOCK();
2187 	return err;
2188 }
2189 /* CSTYLED */
2190