xref: /xnu-12377.41.6/bsd/skywalk/namespace/netns.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/assert.h>
30 #include <kern/locks.h>
31 #include <kern/zalloc.h>
32 #include <libkern/tree.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/bitstring.h>
36 #include <net/if.h>
37 #include <net/kpi_interface.h>
38 #include <net/restricted_in_port.h>
39 
40 #include <netinet/in.h>
41 #include <netinet/in_pcb.h>
42 #include <netinet/tcp_fsm.h>
43 #include <netinet/tcp_var.h>
44 
45 #include <netinet6/in6_var.h>
46 #include <string.h>
47 
48 #include <skywalk/os_skywalk.h>
49 #include <skywalk/os_skywalk_private.h>
50 #include <skywalk/os_stats_private.h>
51 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
52 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
53 
54 #include <net/if_ports_used.h>
55 
static int __netns_inited = 0;

/*
 * Logging
 */

/* Verbose-log flag for a protocol; anything other than IPPROTO_TCP maps to UDP. */
#define NS_VERB_PROTO(proto)    (((proto) == IPPROTO_TCP) ? SK_VERB_NS_TCP : \
	                                    SK_VERB_NS_UDP)
/* Verbose-log flag for an address length; anything other than IPv4's maps to IPv6. */
#define NS_VERB_IP(addr_len)    (((addr_len) == sizeof (struct in_addr)) ? \
	                                    SK_VERB_NS_IPV4 : SK_VERB_NS_IPV6)
/* Printable protocol name for log messages ("tcp", otherwise "udp"). */
#define PROTO_STR(proto)        (((proto) == IPPROTO_TCP) ? "tcp" : "udp")
/* Address family matching an address length (AF_INET, otherwise AF_INET6). */
#define LEN_TO_AF(len)          ((((len) == sizeof (struct in_addr)) ? \
	                            AF_INET : AF_INET6))
/*
 * Log a port-related error, prefixed with the current process name/pid and
 * the port number.
 *
 * The expansion reads a variable named `port` that must be in scope at the
 * call site. The macro intentionally does NOT end in a semicolon so that it
 * behaves like a single statement (e.g. inside an unbraced if/else); callers
 * supply the terminating ';'.
 */
#define NS_PORT_ERR(_fmt, ...) do { \
	proc_t _p = current_proc(); \
	SK_ERR("%s(%d) port %u: " _fmt, sk_proc_name(_p), sk_proc_pid(_p), \
	    port, ##__VA_ARGS__); \
} while (0)
74 
75 /*
76  * Locking
77  * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is
78  * aquired at the entry of every kernel-facing function, and released at the
79  * end. Data within netns_token structures is also protected under this lock.
80  */
81 
82 #define NETNS_LOCK()                    \
83 	lck_mtx_lock(&netns_lock)
84 #define NETNS_LOCK_SPIN()               \
85 	lck_mtx_lock_spin(&netns_lock)
86 #define NETNS_LOCK_CONVERT() do {       \
87 	NETNS_LOCK_ASSERT_HELD();       \
88 	lck_mtx_convert_spin(&netns_lock); \
89 } while (0)
90 #define NETNS_UNLOCK()                  \
91 	lck_mtx_unlock(&netns_lock)
92 #define NETNS_LOCK_ASSERT_HELD()        \
93 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_OWNED)
94 #define NETNS_LOCK_ASSERT_NOTHELD()     \
95 	LCK_MTX_ASSERT(&netns_lock, LCK_MTX_ASSERT_NOTOWNED)
96 
97 static LCK_GRP_DECLARE(netns_lock_group, "netns_lock");
98 static LCK_MTX_DECLARE(netns_lock, &netns_lock_group);
99 
100 /*
101  * Internal data structures and parameters
102  */
103 
104 /*
105  * Local ports are kept track of by reference counts kept in a tree specific to
106  * an <IP, protocol> tuple (see struct ns).
107  *
108  * Note: port numbers are stored in host byte order.
109  */
110 struct ns_reservation {
111 	RB_ENTRY(ns_reservation) nsr_link;
112 	uint32_t nsr_refs[NETNS_OWNER_MAX + 1];
113 	in_port_t nsr_port;
114 	bool nsr_reuseport:1;
115 };
116 
117 #define NETNS_REF_COUNT(nsr, flags)     \
118 	(nsr)->nsr_refs[((flags) & NETNS_OWNER_MASK)]
119 
120 static inline int nsr_cmp(const struct ns_reservation *,
121     const struct ns_reservation *);
122 
123 RB_HEAD(ns_reservation_tree, ns_reservation);
124 RB_PROTOTYPE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
125 RB_GENERATE(ns_reservation_tree, ns_reservation, nsr_link, nsr_cmp);
126 
127 static inline struct ns_reservation *ns_reservation_tree_find(
128 	struct ns_reservation_tree *, const in_port_t);
129 
130 /*
131  * A namespace keeps track of the local port numbers in use for a given
132  * <IP, protocol> tuple. There are also global namespaces for each
133  * protocol to accomodate INADDR_ANY behavior and diagnostics.
134  */
135 struct ns {
136 	RB_ENTRY(ns)    ns_link;
137 
138 	void            *ns_addr_key;
139 
140 	union {
141 		uint32_t        ns_addr[4];
142 		struct in_addr  ns_inaddr;
143 		struct in6_addr ns_in6addr;
144 	};
145 	uint8_t         ns_addr_len;
146 	uint8_t         ns_proto;
147 
148 	in_port_t       ns_last_ephemeral_port_down;
149 	in_port_t       ns_last_ephemeral_port_up;
150 
151 	uint8_t         ns_is_freeable;
152 
153 	uint32_t        ns_n_reservations;
154 	struct ns_reservation_tree ns_reservations;
155 };
156 
157 static uint32_t netns_n_namespaces;
158 
159 static inline int ns_cmp(const struct ns *, const struct ns *);
160 
161 RB_HEAD(netns_namespaces_tree, ns) netns_namespaces =
162     RB_INITIALIZER(netns_namespaces);
163 RB_PROTOTYPE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
164 RB_GENERATE_PREV(netns_namespaces_tree, ns, ns_link, ns_cmp);
165 
/*
 * Pointers to the global namespaces, one slot per <protocol, address family>
 * combination. All non-wildcard reservations will have an entry in the
 * non-wild table.
 */
#define NETNS_N_GLOBAL  4
static struct ns *netns_global_non_wild[NETNS_N_GLOBAL];
static struct ns *netns_global_wild[NETNS_N_GLOBAL];

/* Address lengths that select the IPv4 / IPv6 half of the tables. */
#define NETNS_ADDRLEN_V4 (sizeof(struct in_addr))
#define NETNS_ADDRLEN_V6 (sizeof(struct in6_addr))

/* Slot index = protocol bit OR'ed with address-family bit. */
#define NETNS_NS_TCP    0
#define NETNS_NS_UDP    1
#define NETNS_NS_V4     0
#define NETNS_NS_V6     2

/*
 * Map <proto, addrlen> to a slot in the global tables. Any protocol other
 * than IPPROTO_TCP is treated as UDP, and any length other than
 * NETNS_ADDRLEN_V4 is treated as IPv6.
 */
#define NETNS_NS_GLOBAL_IDX(proto, addrlen)     \
	((((proto) == IPPROTO_TCP) ? NETNS_NS_TCP : NETNS_NS_UDP) | \
	(((addrlen) == NETNS_ADDRLEN_V4) ? NETNS_NS_V4 : NETNS_NS_V6))

/* NOTE(review): consumer is outside this excerpt; presumably a UDP ephemeral port budget. */
#define NETNS_NS_UDP_EPHEMERAL_RESERVE  4096
184 
185 /*
186  * Internal token structure
187  *
188  * Note: port numbers are stored in host byte order.
189  */
190 struct ns_token {
191 	/* Reservation state */
192 	ifnet_t                 nt_ifp;
193 	LIST_ENTRY(ns_token)    nt_ifp_link;
194 	LIST_ENTRY(ns_token)    nt_all_link;
195 	uint32_t                nt_state;       /* NETNS_STATE_* */
196 
197 	/* Reservation context */
198 	union {
199 		uint32_t        nt_addr[4];
200 		struct in_addr  nt_inaddr;
201 		struct in6_addr nt_in6addr;
202 	};
203 	uint8_t                 nt_addr_len;
204 	uint8_t                 nt_proto;
205 	in_port_t               nt_port;
206 	uint32_t                nt_flags;
207 
208 	/* Optional information about the flow */
209 	struct ns_flow_info     *nt_flow_info;
210 };
211 
212 /* Valid values for nt_state */
213 #define NETNS_STATE_HALFCLOSED  0x1     /* half closed */
214 #define NETNS_STATE_WITHDRAWN   0x2     /* withdrawn; not offloadable */
215 
216 #define NETNS_STATE_BITS        "\020\01HALFCLOSED\02WITHDRAWN"
217 
218 /* List of tokens not bound to an ifnet */
219 LIST_HEAD(, ns_token) netns_unbound_tokens = LIST_HEAD_INITIALIZER(
220 	netns_unbound_tokens);
221 
222 /* List of all tokens currently allocated in the system */
223 LIST_HEAD(, ns_token) netns_all_tokens = LIST_HEAD_INITIALIZER(
224 	netns_all_tokens);
225 
226 /*
227  * Memory management
228  */
229 static SKMEM_TYPE_DEFINE(netns_ns_zone, struct ns);
230 
231 #define NETNS_NS_TOKEN_ZONE_NAME        "netns.ns_token"
232 static unsigned int netns_ns_token_size; /* size of zone element */
233 static struct skmem_cache *netns_ns_token_cache; /* for ns_token */
234 
235 #define NETNS_NS_FLOW_INFO_ZONE_NAME    "netns.ns_flow_info"
236 static unsigned int netns_ns_flow_info_size; /* size of zone element */
237 static struct skmem_cache *netns_ns_flow_info_cache; /* for ns_flow_info */
238 
239 #define NETNS_NS_RESERVATION_ZONE_NAME  "netns.ns_reservation"
240 static unsigned int netns_ns_reservation_size; /* size of zone element */
241 static struct skmem_cache *netns_ns_reservation_cache; /* for ns_reservation */
242 
243 static struct ns_reservation *netns_ns_reservation_alloc(in_port_t, uint32_t);
244 static void netns_ns_reservation_free(struct ns_reservation *);
245 static struct ns *netns_ns_alloc(zalloc_flags_t);
246 static void netns_ns_free(struct ns *);
247 static void netns_ns_cleanup(struct ns *);
248 static struct ns_token *netns_ns_token_alloc(boolean_t);
249 static void netns_ns_token_free(struct ns_token *);
250 
251 /*
252  * Utility/internal code
253  */
254 static struct ns *_netns_get_ns(uint32_t *__sized_by(addr_len), uint8_t addr_len,
255     uint8_t, bool);
256 static inline boolean_t _netns_is_wildcard_addr(
257 	const uint32_t *__sized_by(addr_len), uint8_t addr_len);
258 static int _netns_reserve_common(struct ns *, in_port_t, uint32_t);
259 static void _netns_release_common(struct ns *, in_port_t, uint32_t);
260 static inline void netns_clear_ifnet(struct ns_token *);
261 static int _netns_reserve_kpi_common(struct ns *, netns_token *,
262     uint32_t *__sized_by(addr_len), uint8_t addr_len, uint8_t, in_port_t *,
263     uint32_t, struct ns_flow_info *);
264 static void _netns_set_ifnet_internal(struct ns_token *, struct ifnet *);
265 
266 static struct ns_reservation *
netns_ns_reservation_alloc(in_port_t port,uint32_t flags)267 netns_ns_reservation_alloc(in_port_t port, uint32_t flags)
268 {
269 	struct ns_reservation *res;
270 
271 	VERIFY(port != 0);
272 
273 	res = skmem_cache_alloc(netns_ns_reservation_cache, SKMEM_SLEEP);
274 	ASSERT(res != NULL);
275 
276 	bzero(res, netns_ns_reservation_size);
277 	res->nsr_port = port;
278 	res->nsr_reuseport = ((flags & NETNS_REUSEPORT) != 0);
279 	return res;
280 }
281 
282 static void
netns_ns_reservation_free(struct ns_reservation * res)283 netns_ns_reservation_free(struct ns_reservation *res)
284 {
285 	skmem_cache_free(netns_ns_reservation_cache, res);
286 }
287 
288 static struct ns *
netns_ns_alloc(zalloc_flags_t how)289 netns_ns_alloc(zalloc_flags_t how)
290 {
291 	struct ns *namespace;
292 	in_port_t first = (in_port_t)ipport_firstauto;
293 	in_port_t last  = (in_port_t)ipport_lastauto;
294 	in_port_t rand_port;
295 
296 	namespace = zalloc_flags(netns_ns_zone, how | Z_ZERO);
297 	if (namespace == NULL) {
298 		return NULL;
299 	}
300 
301 	namespace->ns_is_freeable = 1;
302 
303 	RB_INIT(&namespace->ns_reservations);
304 
305 	/*
306 	 * Randomize the initial ephemeral port starting point, just in case
307 	 * this namespace is for an ipv6 address which gets brought up and
308 	 * down often.
309 	 */
310 	if (first == last) {
311 		rand_port = first;
312 	} else {
313 		read_frandom(&rand_port, sizeof(rand_port));
314 
315 		if (first > last) {
316 			rand_port = last + (rand_port % (first - last));
317 		} else {
318 			rand_port = first + (rand_port % (last - first));
319 		}
320 	}
321 	namespace->ns_last_ephemeral_port_down = rand_port;
322 	namespace->ns_last_ephemeral_port_up = rand_port;
323 
324 	return namespace;
325 }
326 
327 static void
netns_ns_free(struct ns * namespace)328 netns_ns_free(struct ns *namespace)
329 {
330 	struct ns_reservation *res;
331 	struct ns_reservation *tmp_res;
332 #if SK_LOG
333 	char tmp_ip_str[MAX_IPv6_STR_LEN];
334 #endif /* SK_LOG */
335 
336 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
337 	    NS_VERB_PROTO(namespace->ns_proto),
338 	    "freeing %s ns for IP %s",
339 	    PROTO_STR(namespace->ns_proto),
340 	    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
341 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)));
342 
343 	RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations,
344 	    tmp_res) {
345 		netns_ns_reservation_free(res);
346 		namespace->ns_n_reservations--;
347 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
348 		    res);
349 	}
350 
351 	VERIFY(RB_EMPTY(&namespace->ns_reservations));
352 
353 	if (netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
354 	    namespace->ns_addr_len)] == namespace) {
355 		netns_global_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
356 		namespace->ns_addr_len)] = NULL;
357 	}
358 	if (netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
359 	    namespace->ns_addr_len)] == namespace) {
360 		netns_global_non_wild[NETNS_NS_GLOBAL_IDX(namespace->ns_proto,
361 		namespace->ns_addr_len)] = NULL;
362 	}
363 
364 	zfree(netns_ns_zone, namespace);
365 }
366 
367 static void
netns_ns_cleanup(struct ns * namespace)368 netns_ns_cleanup(struct ns *namespace)
369 {
370 	if (namespace->ns_is_freeable &&
371 	    RB_EMPTY(&namespace->ns_reservations)) {
372 		RB_REMOVE(netns_namespaces_tree, &netns_namespaces, namespace);
373 		netns_n_namespaces--;
374 		netns_ns_free(namespace);
375 	}
376 }
377 
378 static struct ns_token *
netns_ns_token_alloc(boolean_t with_nfi)379 netns_ns_token_alloc(boolean_t with_nfi)
380 {
381 	struct ns_token *token;
382 
383 	NETNS_LOCK_ASSERT_HELD();
384 	NETNS_LOCK_CONVERT();
385 
386 	token = skmem_cache_alloc(netns_ns_token_cache, SKMEM_SLEEP);
387 	ASSERT(token != NULL);
388 
389 	bzero(token, netns_ns_token_size);
390 
391 	if (with_nfi) {
392 		token->nt_flow_info =  skmem_cache_alloc(netns_ns_flow_info_cache,
393 		    SKMEM_SLEEP);
394 		ASSERT(token->nt_flow_info != NULL);
395 	}
396 	LIST_INSERT_HEAD(&netns_all_tokens, token, nt_all_link);
397 
398 	return token;
399 }
400 
401 static void
netns_ns_token_free(struct ns_token * token)402 netns_ns_token_free(struct ns_token *token)
403 {
404 	NETNS_LOCK_ASSERT_HELD();
405 	NETNS_LOCK_CONVERT();
406 	LIST_REMOVE(token, nt_all_link);
407 
408 	if (token->nt_flow_info != NULL) {
409 		skmem_cache_free(netns_ns_flow_info_cache, token->nt_flow_info);
410 	}
411 	skmem_cache_free(netns_ns_token_cache, token);
412 }
413 
414 __attribute__((always_inline))
415 static inline int
nsr_cmp(const struct ns_reservation * nsr1,const struct ns_reservation * nsr2)416 nsr_cmp(const struct ns_reservation *nsr1, const struct ns_reservation *nsr2)
417 {
418 #define NSR_COMPARE(r1, r2)     ((int)(r1)->nsr_port - (int)(r2)->nsr_port)
419 	return NSR_COMPARE(nsr1, nsr2);
420 }
421 
422 __attribute__((always_inline))
423 static inline int
ns_cmp(const struct ns * a,const struct ns * b)424 ns_cmp(const struct ns *a, const struct ns *b)
425 {
426 	int d;
427 
428 	if ((d = (a->ns_addr_len - b->ns_addr_len)) != 0) {
429 		return d;
430 	}
431 	if ((d = (a->ns_proto - b->ns_proto)) != 0) {
432 		return d;
433 	}
434 	if ((d = flow_ip_cmp(a->ns_addr_key, b->ns_addr_key,
435 	    b->ns_addr_len)) != 0) {
436 		return d;
437 	}
438 
439 	return 0;
440 }
441 
442 /*
443  * Common routine to look up a reservation.
444  *
445  * NOTE: Assumes the caller holds the NETNS global lock
446  */
447 __attribute__((always_inline))
448 static inline struct ns_reservation *
ns_reservation_tree_find(struct ns_reservation_tree * tree,const in_port_t port)449 ns_reservation_tree_find(struct ns_reservation_tree *tree, const in_port_t port)
450 {
451 	struct ns_reservation res;
452 	res.nsr_port = port;
453 	return RB_FIND(ns_reservation_tree, tree, &res);
454 }
455 
456 /*
457  * Retrieve the namespace for the supplied <address, protocol> tuple.
458  * If create is set and such a namespace doesn't already exist, one will be
459  * created.
460  */
461 static struct ns *
_netns_get_ns(uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,bool create)462 _netns_get_ns(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto, bool create)
463 {
464 	struct ns *namespace = NULL;
465 	struct ns find = {
466 		.ns_addr_key = addr,
467 		.ns_addr_len = addr_len,
468 		.ns_proto = proto,
469 	};
470 #if SK_LOG
471 	char tmp_ip_str[MAX_IPv6_STR_LEN];
472 #endif /* SK_LOG */
473 
474 	VERIFY(addr_len == sizeof(struct in_addr) ||
475 	    addr_len == sizeof(struct in6_addr));
476 
477 	NETNS_LOCK_ASSERT_HELD();
478 
479 	namespace = RB_FIND(netns_namespaces_tree, &netns_namespaces, &find);
480 
481 	if (create && namespace == NULL) {
482 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
483 		    "allocating %s ns for IP %s",
484 		    PROTO_STR(proto), sk_ntop(LEN_TO_AF(addr_len), addr,
485 		    tmp_ip_str, sizeof(tmp_ip_str)));
486 		NETNS_LOCK_CONVERT();
487 		namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL);
488 		__builtin_assume(namespace != NULL);
489 		memcpy(namespace->ns_addr, addr, addr_len);
490 		namespace->ns_addr_key = &namespace->ns_addr;
491 		namespace->ns_addr_len = addr_len;
492 		namespace->ns_proto = proto;
493 		RB_INSERT(netns_namespaces_tree, &netns_namespaces, namespace);
494 		netns_n_namespaces++;
495 
496 		if (_netns_is_wildcard_addr(addr, addr_len) &&
497 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
498 		    addr_len)] == NULL) {
499 			netns_global_wild[NETNS_NS_GLOBAL_IDX(proto,
500 			addr_len)] = namespace;
501 		}
502 	}
503 
504 	return namespace;
505 }
506 
507 /*
508  * Return true if the supplied address is a wildcard (INADDR_ANY)
509  */
510 __attribute__((always_inline))
511 static boolean_t
_netns_is_wildcard_addr(const uint32_t * __sized_by (addr_len)addr,uint8_t addr_len)512 _netns_is_wildcard_addr(const uint32_t *__sized_by(addr_len)addr, uint8_t addr_len)
513 {
514 	boolean_t wildcard;
515 
516 	switch (addr_len) {
517 	case sizeof(struct in_addr):
518 		wildcard = (addr[0] == 0);
519 		break;
520 
521 	case sizeof(struct in6_addr):
522 		wildcard = (addr[0] == 0 && addr[1] == 0 &&
523 		    addr[2] == 0 && addr[3] == 0);
524 		break;
525 
526 	default:
527 		wildcard = FALSE;
528 		break;
529 	}
530 
531 	return wildcard;
532 }
533 
534 __attribute__((always_inline))
535 static boolean_t
_netns_is_port_used(struct ns * gns,struct ns_reservation * curr_res,in_port_t port)536 _netns_is_port_used(struct ns * gns, struct ns_reservation *curr_res, in_port_t port)
537 {
538 	struct ns_reservation *res = NULL;
539 
540 	if (gns == NULL) {
541 		return FALSE;
542 	}
543 
544 	res = ns_reservation_tree_find(&gns->ns_reservations, port);
545 	if (res != NULL && res != curr_res) {
546 		if (!res->nsr_reuseport) {
547 			return TRUE;
548 		}
549 	}
550 
551 	return FALSE;
552 }
553 
554 /*
555  * Internal shared code to reserve ports within a specific namespace.
556  *
557  * Note: port numbers are in host byte-order here.
558  */
559 static int
_netns_reserve_common(struct ns * namespace,in_port_t port,uint32_t flags)560 _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags)
561 {
562 	struct ns_reservation *res = NULL, *exist = NULL;
563 	uint8_t proto, addr_len;
564 	int err = 0;
565 #if SK_LOG
566 	char tmp_ip_str[MAX_IPv6_STR_LEN];
567 #endif /* SK_LOG */
568 
569 	VERIFY(port != 0);
570 	proto = namespace->ns_proto;
571 	addr_len = namespace->ns_addr_len;
572 	NETNS_LOCK_CONVERT();
573 	res = netns_ns_reservation_alloc(port, flags);
574 	if (res == NULL) {
575 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
576 		    "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY",
577 		    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
578 		    namespace->ns_addr, tmp_ip_str,
579 		    sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags);
580 		return ENOMEM;
581 	}
582 	exist = RB_INSERT(ns_reservation_tree, &namespace->ns_reservations,
583 	    res);
584 	if (__probable(exist == NULL)) {
585 		namespace->ns_n_reservations++;
586 	} else {
587 		netns_ns_reservation_free(res);
588 		res = exist;
589 	}
590 
591 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
592 	    "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, "
593 	    "%d bsd %d pf", sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
594 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
595 	    PROTO_STR(proto), port, flags,
596 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
597 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
598 	    NETNS_REF_COUNT(res, NETNS_BSD),
599 	    NETNS_REF_COUNT(res, NETNS_PF));
600 
601 	/* Make reservation */
602 	/*
603 	 * Bypass collision detection for reservations in the global non-wild
604 	 * namespace. We use that namespace for reference counts only.
605 	 */
606 	if (namespace !=
607 	    netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)]) {
608 		struct ns_reservation *skres;
609 		boolean_t is_wild = _netns_is_wildcard_addr(namespace->ns_addr,
610 		    addr_len);
611 		struct ns *gns =
612 		    netns_global_wild[NETNS_NS_GLOBAL_IDX(proto, addr_len)];
613 
614 		if (NETNS_IS_SKYWALK(flags)) {
615 			if ((!is_wild || exist != NULL) && gns != NULL &&
616 			    (skres = ns_reservation_tree_find(
617 				    &gns->ns_reservations, port)) != NULL &&
618 			    NETNS_REF_COUNT(skres, NETNS_LISTENER) == 0) {
619 				/*
620 				 * The mere existence of any non-skywalk
621 				 * listener wildcard entry for this
622 				 * protocol/port number means this must fail.
623 				 */
624 				NS_PORT_ERR("ADDRINUSE: Duplicate wildcard");
625 				err = EADDRINUSE;
626 				goto done;
627 			}
628 
629 			if (is_wild) {
630 				gns = netns_global_non_wild[
631 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
632 				VERIFY(gns != NULL);
633 
634 				if (_netns_is_port_used(netns_global_non_wild[
635 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
636 				    _netns_is_port_used(netns_global_non_wild[
637 					    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
638 					/*
639 					 * If Skywalk is trying to reserve a
640 					 * wildcard, then the mere existance of
641 					 * any entry in either v4/v6 non-wild
642 					 * namespace for this port means this
643 					 * must fail.
644 					 */
645 					NS_PORT_ERR("ADDRINUSE: Wildcard with non-wild.");
646 					err = EADDRINUSE;
647 					goto done;
648 				}
649 			}
650 		} else {
651 			/*
652 			 * Check if Skywalk has reserved a wildcard entry.
653 			 * Note that the arithmetic OR here is intentional.
654 			 */
655 			if ((!is_wild || exist != NULL) && gns != NULL &&
656 			    (skres = ns_reservation_tree_find(
657 				    &gns->ns_reservations, port)) != NULL &&
658 			    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
659 			    NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) {
660 				/*
661 				 * BSD is trying to reserve a proto/port for
662 				 * which Skywalk already has a wildcard
663 				 * reservation.
664 				 */
665 				NS_PORT_ERR("ADDRINUSE: BSD requesting Skywalk port");
666 				err = EADDRINUSE;
667 				goto done;
668 			}
669 
670 			/*
671 			 * If BSD is trying to reserve a wildcard,
672 			 * ensure Skywalk has not already reserved
673 			 * a non-wildcard.
674 			 */
675 			if (is_wild) {
676 				gns = netns_global_non_wild[
677 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
678 				VERIFY(gns != NULL);
679 
680 				/*
681 				 * Note that the arithmetic OR here is
682 				 * intentional.
683 				 */
684 				if ((skres = ns_reservation_tree_find(
685 					    &gns->ns_reservations, port)) != NULL &&
686 				    (NETNS_REF_COUNT(skres, NETNS_SKYWALK) |
687 				    NETNS_REF_COUNT(skres,
688 				    NETNS_LISTENER)) != 0) {
689 					NS_PORT_ERR("ADDRINUSE: BSD wildcard with non-wild.");
690 					err = EADDRINUSE;
691 					goto done;
692 				}
693 			}
694 		}
695 
696 		switch (flags & NETNS_OWNER_MASK) {
697 		case NETNS_SKYWALK:
698 			/* check collision w/ BSD */
699 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
700 			    NETNS_REF_COUNT(res, NETNS_PF) > 0) {
701 				NS_PORT_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)");
702 				err = EADDRINUSE;
703 				goto done;
704 			}
705 
706 			/* BEGIN CSTYLED */
707 			/*
708 			 * Scenarios with new Skywalk connected flow:
709 			 * 1. With existing Skywalk connected flow,
710 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
711 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 1
712 			 *    reject by failing the wild gns lookup below.
713 			 * 2. With existing Skywalk 3-tuple listener,
714 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 1
715 			 *    bypass the check below.
716 			 * 3. With existing Skywalk 2-tuple listener,
717 			 *      NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
718 			 *      NETNS_REF_COUNT(res, NETNS_SKYWALK) == 0
719 			 *    pass with successful wild gns lookup.
720 			 */
721 			/* END CSTYLED */
722 			if (NETNS_REF_COUNT(res, NETNS_LISTENER) == 0 &&
723 			    NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0) {
724 				/* check if covered by wild Skywalk listener */
725 				gns = netns_global_wild[
726 					NETNS_NS_GLOBAL_IDX(proto, addr_len)];
727 				if (gns != NULL &&
728 				    (skres = ns_reservation_tree_find(
729 					    &gns->ns_reservations, port)) != NULL &&
730 				    NETNS_REF_COUNT(skres, NETNS_LISTENER)
731 				    != 0) {
732 					err = 0;
733 					goto done;
734 				}
735 				if (addr_len == sizeof(struct in_addr)) {
736 					/* If address is IPv4, also check for wild IPv6 registration */
737 					gns = netns_global_wild[
738 						NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)];
739 					if (gns != NULL &&
740 					    (skres = ns_reservation_tree_find(
741 						    &gns->ns_reservations, port)) != NULL &&
742 					    NETNS_REF_COUNT(skres, NETNS_LISTENER)
743 					    != 0) {
744 						err = 0;
745 						goto done;
746 					}
747 				}
748 				NS_PORT_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)");
749 				err = EADDRINUSE;
750 			}
751 			/*
752 			 * XXX: Duplicate 5-tuple flows under a Skywalk
753 			 * listener are currently detected by flow manager,
754 			 * till we implement 5-tuple-aware netns.
755 			 */
756 			break;
757 
758 		case NETNS_LISTENER:
759 			if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 ||
760 			    NETNS_REF_COUNT(res, NETNS_PF) > 0 ||
761 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0 ||
762 			    _netns_is_port_used(netns_global_wild[
763 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
764 			    _netns_is_port_used(netns_global_wild[
765 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port) ||
766 			    _netns_is_port_used(netns_global_non_wild[
767 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) ||
768 			    _netns_is_port_used(netns_global_non_wild[
769 				    NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) {
770 				NS_PORT_ERR("ERROR - Listener got ADDRINUSE");
771 				err = EADDRINUSE;
772 			}
773 			break;
774 
775 		case NETNS_BSD:
776 		case NETNS_PF:
777 			if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 ||
778 			    NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) {
779 				NS_PORT_ERR("ERROR - %s got ADDRINUSE",
780 				    ((flags & NETNS_OWNER_MASK) == NETNS_PF) ?
781 				    "PF" : "BSD");
782 				err = EADDRINUSE;
783 			}
784 			break;
785 
786 		default:
787 			panic("_netns_reserve_common: invalid owner 0x%x",
788 			    flags & NETNS_OWNER_MASK);
789 			/* NOTREACHED */
790 			__builtin_unreachable();
791 		}
792 	}
793 
794 done:
795 	ASSERT(res != NULL);
796 	if (__probable(err == 0)) {
797 		NETNS_REF_COUNT(res, flags)++;
798 		/* Check for wrap around */
799 		VERIFY(NETNS_REF_COUNT(res, flags) != 0);
800 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
801 		    NS_VERB_PROTO(namespace->ns_proto),
802 		    "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, "
803 		    "%d ls, %d bsd %d pf",
804 		    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
805 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
806 		    PROTO_STR(namespace->ns_proto), port, err, flags,
807 		    NETNS_REF_COUNT(res, NETNS_SKYWALK),
808 		    NETNS_REF_COUNT(res, NETNS_LISTENER),
809 		    NETNS_REF_COUNT(res, NETNS_BSD),
810 		    NETNS_REF_COUNT(res, NETNS_PF));
811 	} else {
812 		if (exist == NULL) {
813 			RB_REMOVE(ns_reservation_tree,
814 			    &namespace->ns_reservations, res);
815 			namespace->ns_n_reservations--;
816 			netns_ns_reservation_free(res);
817 		}
818 	}
819 	return err;
820 }
821 
822 /*
823  * Internal shared code to release ports within a specific namespace.
824  */
825 static void
_netns_release_common(struct ns * namespace,in_port_t port,uint32_t flags)826 _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags)
827 {
828 	struct ns_reservation *res;
829 	uint32_t refs;
830 	int i;
831 #if SK_LOG
832 	char tmp_ip_str[MAX_IPv6_STR_LEN];
833 #endif /* SK_LOG */
834 
835 	NETNS_LOCK_ASSERT_HELD();
836 
837 	res = ns_reservation_tree_find(&namespace->ns_reservations, port);
838 	if (res == NULL) {
839 		SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
840 		    NS_VERB_PROTO(namespace->ns_proto),
841 		    "ERROR %s:%s:%d // flags 0x%x // not found",
842 		    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
843 		    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
844 		    PROTO_STR(namespace->ns_proto), port, flags);
845 		VERIFY(res != NULL);
846 	}
847 
848 	SK_DF(NS_VERB_IP(namespace->ns_addr_len) |
849 	    NS_VERB_PROTO(namespace->ns_proto),
850 	    "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf",
851 	    sk_ntop(LEN_TO_AF(namespace->ns_addr_len),
852 	    namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)),
853 	    PROTO_STR(namespace->ns_proto), port, flags,
854 	    NETNS_REF_COUNT(res, NETNS_SKYWALK),
855 	    NETNS_REF_COUNT(res, NETNS_LISTENER),
856 	    NETNS_REF_COUNT(res, NETNS_BSD),
857 	    NETNS_REF_COUNT(res, NETNS_PF));
858 
859 	/* Release reservation */
860 	VERIFY(NETNS_REF_COUNT(res, flags) > 0);
861 	NETNS_REF_COUNT(res, flags) -= 1;
862 
863 	/* Clean up memory, if appropriate */
864 	for (i = 0, refs = 0; i <= NETNS_OWNER_MAX && refs == 0; i++) {
865 		refs |= res->nsr_refs[i];
866 	}
867 	if (refs == 0) {
868 		RB_REMOVE(ns_reservation_tree, &namespace->ns_reservations,
869 		    res);
870 		namespace->ns_n_reservations--;
871 		NETNS_LOCK_CONVERT();
872 		netns_ns_reservation_free(res);
873 		netns_ns_cleanup(namespace);
874 	}
875 }
876 
877 __attribute__((always_inline))
878 static inline void
netns_init_global_ns(struct ns ** global_ptr,uint8_t proto,uint8_t addrlen)879 netns_init_global_ns(struct ns **global_ptr, uint8_t proto, uint8_t addrlen)
880 {
881 	struct ns *namespace;
882 
883 	namespace = *global_ptr = netns_ns_alloc(Z_WAITOK);
884 	memset(namespace->ns_addr, 0xFF, addrlen);
885 	namespace->ns_addr_len = addrlen;
886 	namespace->ns_proto = proto;
887 	namespace->ns_is_freeable = 0;
888 }
889 
890 __attribute__((always_inline))
891 static inline void
netns_clear_ifnet(struct ns_token * nstoken)892 netns_clear_ifnet(struct ns_token *nstoken)
893 {
894 #if SK_LOG
895 	char tmp_ip_str[MAX_IPv6_STR_LEN];
896 #endif /* SK_LOG */
897 
898 	NETNS_LOCK_ASSERT_HELD();
899 
900 	if (nstoken->nt_ifp != NULL) {
901 		LIST_REMOVE(nstoken, nt_ifp_link);
902 
903 		SK_DF(NS_VERB_IP(nstoken->nt_addr_len) |
904 		    NS_VERB_PROTO(nstoken->nt_proto),
905 		    "%s:%s:%d // removed from ifnet %d",
906 		    sk_ntop(LEN_TO_AF(nstoken->nt_addr_len),
907 		    nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
908 		    PROTO_STR(nstoken->nt_proto), nstoken->nt_port,
909 		    nstoken->nt_ifp->if_index);
910 
911 		NETNS_LOCK_CONVERT();
912 		ifnet_decr_iorefcnt(nstoken->nt_ifp);
913 		nstoken->nt_ifp = NULL;
914 	} else {
915 		LIST_REMOVE(nstoken, nt_ifp_link);
916 	}
917 }
918 
919 /*
920  * Internal shared code to perform a port[-range] reservation, along with all
921  * the boilerplate and sanity checks expected for a call coming in from the
922  * surrounding kernel code.
923  */
924 static int
_netns_reserve_kpi_common(struct ns * ns,netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t * port,uint32_t flags,struct ns_flow_info * nfi)925 _netns_reserve_kpi_common(struct ns *ns, netns_token *token,
926     uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto,
927     in_port_t *port, uint32_t flags, struct ns_flow_info *nfi)
928 {
929 	boolean_t ns_want_cleanup = (ns == NULL);
930 	struct ns_token *nt;
931 	int err = 0;
932 	in_port_t hport;
933 #if SK_LOG
934 	char tmp_ip_str[MAX_IPv6_STR_LEN];
935 #endif /* SK_LOG */
936 	struct ifnet *ifp = (nfi != NULL) ? nfi->nfi_ifp : NULL;
937 
938 	NETNS_LOCK_ASSERT_HELD();
939 
940 	hport = ntohs(*port);
941 
942 	VERIFY((flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
943 	VERIFY(addr_len == sizeof(struct in_addr) ||
944 	    addr_len == sizeof(struct in6_addr));
945 	VERIFY(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
946 	VERIFY(hport != 0);
947 
948 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
949 	    "reserving %s:%s:%d // flags 0x%x // token %svalid",
950 	    sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str,
951 	    sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags,
952 	    NETNS_TOKEN_VALID(token) ? "" : "in");
953 
954 	/*
955 	 * See the documentation for NETNS_PRERESERVED in netns.h for an
956 	 * explanation of this block.
957 	 */
958 	if (NETNS_TOKEN_VALID(token)) {
959 		if (flags & NETNS_PRERESERVED) {
960 			nt = *token;
961 			VERIFY(nt->nt_addr_len == addr_len);
962 			VERIFY(memcmp(nt->nt_addr, addr, addr_len) == 0);
963 			VERIFY(nt->nt_proto == proto);
964 			VERIFY(nt->nt_port == hport);
965 			VERIFY((nt->nt_flags &
966 			    NETNS_RESERVATION_FLAGS | NETNS_PRERESERVED) ==
967 			    (flags & NETNS_RESERVATION_FLAGS));
968 
969 			if ((nt->nt_flags & NETNS_CONFIGURATION_FLAGS) ==
970 			    (flags & NETNS_CONFIGURATION_FLAGS)) {
971 				SK_DF(NS_VERB_IP(nt->nt_addr_len) |
972 				    NS_VERB_PROTO(nt->nt_proto),
973 				    "%s:%s:%d // flags 0x%x -> 0x%x",
974 				    sk_ntop(LEN_TO_AF(nt->nt_addr_len),
975 				    nt->nt_addr, tmp_ip_str,
976 				    sizeof(tmp_ip_str)),
977 				    PROTO_STR(nt->nt_proto),
978 				    nt->nt_port, nt->nt_flags, flags);
979 				nt->nt_flags &= ~NETNS_CONFIGURATION_FLAGS;
980 				nt->nt_flags |=
981 				    flags & NETNS_CONFIGURATION_FLAGS;
982 			}
983 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
984 			    "token was prereserved");
985 			goto done;
986 		} else {
987 			panic("Request to overwrite valid netns token");
988 			/* NOTREACHED */
989 			__builtin_unreachable();
990 		}
991 	}
992 
993 	/*
994 	 * TODO: Check range against bitmap
995 	 */
996 	if (hport == 0) {
997 		/*
998 		 * Caller request an arbitrary range of ports
999 		 * TODO: Need to figure out how to allocate
1000 		 * emphemeral ports only.
1001 		 */
1002 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1003 		    "ERROR - wildcard port not yet supported");
1004 		err = ENOMEM;
1005 		goto done;
1006 	}
1007 
1008 	/*
1009 	 * Fetch namespace for the specified address/protocol, creating
1010 	 * a new namespace if necessary.
1011 	 */
1012 	if (ns == NULL) {
1013 		ASSERT(ns_want_cleanup);
1014 		ns = _netns_get_ns(addr, addr_len, proto, true);
1015 	}
1016 	if (__improbable(ns == NULL)) {
1017 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1018 		    "ERROR - couldn't create namespace");
1019 		err = ENOMEM;
1020 		goto done;
1021 	}
1022 
1023 	/*
1024 	 * Make a reservation in the namespace
1025 	 * This will return an error if an incompatible reservation
1026 	 * already exists.
1027 	 */
1028 	err = _netns_reserve_common(ns, hport, flags);
1029 	if (__improbable(err != 0)) {
1030 		NETNS_LOCK_CONVERT();
1031 		if (ns_want_cleanup) {
1032 			netns_ns_cleanup(ns);
1033 		}
1034 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1035 		    "ERROR - reservation collision");
1036 		goto done;
1037 	}
1038 
1039 	if (!_netns_is_wildcard_addr(ns->ns_addr, addr_len)) {
1040 		/* Record the reservation in the non-wild namespace */
1041 		struct ns *nwns;
1042 
1043 		nwns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1044 		    addr_len)];
1045 		err = _netns_reserve_common(nwns, hport, flags);
1046 		if (__improbable(err != 0)) {
1047 			/* Need to free the specific namespace entry */
1048 			NETNS_LOCK_CONVERT();
1049 			_netns_release_common(ns, hport, flags);
1050 			if (ns_want_cleanup) {
1051 				netns_ns_cleanup(ns);
1052 			}
1053 			SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1054 			    "ERROR - reservation collision");
1055 			goto done;
1056 		}
1057 	}
1058 
1059 	nt = netns_ns_token_alloc(nfi != NULL ? true : false);
1060 	ASSERT(nt->nt_ifp == NULL);
1061 	_netns_set_ifnet_internal(nt, ifp);
1062 
1063 	memcpy(nt->nt_addr, addr, addr_len);
1064 	nt->nt_addr_len = addr_len;
1065 	nt->nt_proto = proto;
1066 	nt->nt_port = hport;
1067 	nt->nt_flags = flags;
1068 
1069 	if (nfi != NULL) {
1070 		VERIFY(nt->nt_flow_info != NULL);
1071 
1072 		memcpy(nt->nt_flow_info, nfi, sizeof(struct ns_flow_info));
1073 		/*
1074 		 * The local port is passed as a separate argument
1075 		 */
1076 		if (nfi->nfi_laddr.sa.sa_family == AF_INET) {
1077 			nt->nt_flow_info->nfi_laddr.sin.sin_port = *port;
1078 		} else if (nfi->nfi_laddr.sa.sa_family == AF_INET6) {
1079 			nt->nt_flow_info->nfi_laddr.sin6.sin6_port = *port;
1080 		}
1081 	}
1082 	*token = nt;
1083 
1084 done:
1085 	return err;
1086 }
1087 
1088 /*
1089  * Kernel-facing functions
1090  */
1091 
1092 int
netns_init(void)1093 netns_init(void)
1094 {
1095 	VERIFY(__netns_inited == 0);
1096 
1097 	netns_ns_reservation_size = sizeof(struct ns_reservation);
1098 	netns_ns_reservation_cache = skmem_cache_create(NETNS_NS_RESERVATION_ZONE_NAME,
1099 	    netns_ns_reservation_size, sizeof(uint64_t), NULL, NULL, NULL,
1100 	    NULL, NULL, 0);
1101 	if (netns_ns_reservation_cache == NULL) {
1102 		panic("%s: skmem_cache create failed (%s)", __func__,
1103 		    NETNS_NS_RESERVATION_ZONE_NAME);
1104 		/* NOTREACHED */
1105 		__builtin_unreachable();
1106 	}
1107 
1108 	netns_ns_token_size = sizeof(struct ns_token);
1109 	netns_ns_token_cache = skmem_cache_create(NETNS_NS_TOKEN_ZONE_NAME,
1110 	    netns_ns_token_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
1111 	    NULL, 0);
1112 	if (netns_ns_token_cache == NULL) {
1113 		panic("%s: skmem_cache create failed (%s)", __func__,
1114 		    NETNS_NS_TOKEN_ZONE_NAME);
1115 		/* NOTREACHED */
1116 		__builtin_unreachable();
1117 	}
1118 
1119 	netns_ns_flow_info_size = sizeof(struct ns_flow_info);
1120 	netns_ns_flow_info_cache = skmem_cache_create(NETNS_NS_FLOW_INFO_ZONE_NAME,
1121 	    netns_ns_flow_info_size, sizeof(uint64_t), NULL, NULL, NULL,
1122 	    NULL, NULL, 0);
1123 	if (netns_ns_flow_info_cache == NULL) {
1124 		panic("%s: skmem_cache create failed (%s)", __func__,
1125 		    NETNS_NS_FLOW_INFO_ZONE_NAME);
1126 		/* NOTREACHED */
1127 		__builtin_unreachable();
1128 	}
1129 
1130 	LIST_INIT(&netns_unbound_tokens);
1131 	LIST_INIT(&netns_all_tokens);
1132 
1133 	netns_n_namespaces = 0;
1134 	RB_INIT(&netns_namespaces);
1135 
1136 	SK_D("initializing global namespaces");
1137 
1138 	netns_init_global_ns(
1139 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1140 		NETNS_ADDRLEN_V4)], IPPROTO_TCP, sizeof(struct in_addr));
1141 
1142 	netns_init_global_ns(
1143 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1144 		NETNS_ADDRLEN_V4)], IPPROTO_UDP, sizeof(struct in_addr));
1145 
1146 	netns_init_global_ns(
1147 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_TCP,
1148 		NETNS_ADDRLEN_V6)], IPPROTO_TCP, sizeof(struct in6_addr));
1149 
1150 	netns_init_global_ns(
1151 		&netns_global_non_wild[NETNS_NS_GLOBAL_IDX(IPPROTO_UDP,
1152 		NETNS_ADDRLEN_V6)], IPPROTO_UDP, sizeof(struct in6_addr));
1153 
1154 	/* Done */
1155 
1156 	__netns_inited = 1;
1157 	sk_features |= SK_FEATURE_NETNS;
1158 
1159 	SK_D("initialized netns");
1160 
1161 	return 0;
1162 }
1163 
1164 void
netns_uninit(void)1165 netns_uninit(void)
1166 {
1167 	if (__netns_inited == 1) {
1168 		struct ns *namespace;
1169 		struct ns *temp_namespace;
1170 		int i;
1171 
1172 		RB_FOREACH_SAFE(namespace, netns_namespaces_tree,
1173 		    &netns_namespaces, temp_namespace) {
1174 			RB_REMOVE(netns_namespaces_tree, &netns_namespaces,
1175 			    namespace);
1176 			netns_n_namespaces--;
1177 			netns_ns_free(namespace);
1178 		}
1179 
1180 		for (i = 0; i < NETNS_N_GLOBAL; i++) {
1181 			netns_ns_free(netns_global_non_wild[i]);
1182 		}
1183 
1184 		if (netns_ns_flow_info_cache != NULL) {
1185 			skmem_cache_destroy(netns_ns_flow_info_cache);
1186 			netns_ns_flow_info_cache = NULL;
1187 		}
1188 		if (netns_ns_token_cache != NULL) {
1189 			skmem_cache_destroy(netns_ns_token_cache);
1190 			netns_ns_token_cache = NULL;
1191 		}
1192 		if (netns_ns_reservation_cache != NULL) {
1193 			skmem_cache_destroy(netns_ns_reservation_cache);
1194 			netns_ns_reservation_cache = NULL;
1195 		}
1196 
1197 		__netns_inited = 0;
1198 		sk_features &= ~SK_FEATURE_NETNS;
1199 
1200 		SK_D("uninitialized netns");
1201 	}
1202 }
1203 
1204 void
netns_reap_caches(boolean_t purge)1205 netns_reap_caches(boolean_t purge)
1206 {
1207 	/* these aren't created unless netns is enabled */
1208 	if (netns_ns_token_cache != NULL) {
1209 		skmem_cache_reap_now(netns_ns_token_cache, purge);
1210 	}
1211 	if (netns_ns_reservation_cache != NULL) {
1212 		skmem_cache_reap_now(netns_ns_reservation_cache, purge);
1213 	}
1214 	if (netns_ns_flow_info_cache != NULL) {
1215 		skmem_cache_reap_now(netns_ns_flow_info_cache, purge);
1216 	}
1217 }
1218 
1219 boolean_t
netns_is_enabled(void)1220 netns_is_enabled(void)
1221 {
1222 	return __netns_inited == 1;
1223 }
1224 
1225 int
netns_reserve(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t port,uint32_t flags,struct ns_flow_info * nfi)1226 netns_reserve(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1227     uint8_t addr_len, uint8_t proto, in_port_t port, uint32_t flags,
1228     struct ns_flow_info *nfi)
1229 {
1230 	int err = 0;
1231 #if SK_LOG
1232 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1233 #endif /* SK_LOG */
1234 
1235 	if (__netns_inited == 0) {
1236 		*token = NULL;
1237 		return err;
1238 	}
1239 
1240 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1241 		NS_PORT_ERR("netns doesn't support non TCP/UDP protocol");
1242 		return ENOTSUP;
1243 	}
1244 
1245 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1246 	    "%s:%s:%d // flags 0x%x", sk_ntop(LEN_TO_AF(addr_len), addr,
1247 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1248 	    flags);
1249 
1250 	/*
1251 	 * Check wether the process is allowed to bind to a restricted port
1252 	 */
1253 	if (!current_task_can_use_restricted_in_port(port,
1254 	    proto, flags)) {
1255 		*token = NULL;
1256 		return EADDRINUSE;
1257 	}
1258 
1259 	NETNS_LOCK_SPIN();
1260 	err = _netns_reserve_kpi_common(NULL, token, addr, addr_len,
1261 	    proto, &port, flags, nfi);
1262 	NETNS_UNLOCK();
1263 
1264 	return err;
1265 }
1266 
1267 /* Import net.inet.{tcp,udp}.randomize_ports sysctls */
1268 extern int      udp_use_randomport;
1269 extern int      tcp_use_randomport;
1270 
1271 int
netns_reserve_ephemeral(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto,in_port_t * pport,uint32_t flags,struct ns_flow_info * nfi)1272 netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1273     uint8_t addr_len, uint8_t proto, in_port_t *pport, uint32_t flags,
1274     struct ns_flow_info *nfi)
1275 {
1276 	int err = 0;
1277 	SK_LOG_VAR(in_port_t port = *pport);
1278 	in_port_t first = (in_port_t)ipport_firstauto;
1279 	in_port_t last  = (in_port_t)ipport_lastauto;
1280 	in_port_t rand_port;
1281 	in_port_t last_port;
1282 	in_port_t n_last_port;
1283 	struct ns *namespace;
1284 	boolean_t count_up = true;
1285 	boolean_t use_randomport = (proto == IPPROTO_TCP) ?
1286 	    tcp_use_randomport : udp_use_randomport;
1287 #if SK_LOG
1288 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1289 #endif /* SK_LOG */
1290 
1291 	if (__netns_inited == 0) {
1292 		*token = NULL;
1293 		return err;
1294 	}
1295 
1296 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1297 		NS_PORT_ERR("netns doesn't support non TCP/UDP protocol");
1298 		return ENOTSUP;
1299 	}
1300 
1301 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1302 	    "%s:%s:%d // flags 0x%x", sk_ntop(LEN_TO_AF(addr_len), addr,
1303 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port),
1304 	    flags);
1305 
1306 	NETNS_LOCK_SPIN();
1307 
1308 	namespace = _netns_get_ns(addr, addr_len, proto, true);
1309 	if (namespace == NULL) {
1310 		err = ENOMEM;
1311 		NETNS_UNLOCK();
1312 		return err;
1313 	}
1314 
1315 	if (proto == IPPROTO_UDP) {
1316 		if (UINT16_MAX - namespace->ns_n_reservations <
1317 		    NETNS_NS_UDP_EPHEMERAL_RESERVE) {
1318 			NS_PORT_ERR("UDP ephemeral port not available"
1319 			    "(less than 4096 UDP ports left)");
1320 			err = EADDRNOTAVAIL;
1321 			NETNS_UNLOCK();
1322 			return err;
1323 		}
1324 	}
1325 
1326 	if (first == last) {
1327 		rand_port = first;
1328 	} else {
1329 		if (use_randomport) {
1330 			NETNS_LOCK_CONVERT();
1331 			read_frandom(&rand_port, sizeof(rand_port));
1332 
1333 			if (first > last) {
1334 				rand_port = last + (rand_port %
1335 				    (first - last));
1336 				count_up = false;
1337 			} else {
1338 				rand_port = first + (rand_port %
1339 				    (last - first));
1340 			}
1341 		} else {
1342 			if (first > last) {
1343 				rand_port =
1344 				    namespace->ns_last_ephemeral_port_down - 1;
1345 				if (rand_port < last || rand_port > first) {
1346 					rand_port = last;
1347 				}
1348 				count_up = false;
1349 			} else {
1350 				rand_port =
1351 				    namespace->ns_last_ephemeral_port_up + 1;
1352 				if (rand_port < first || rand_port > last) {
1353 					rand_port = first;
1354 				}
1355 			}
1356 		}
1357 	}
1358 	last_port = rand_port;
1359 	n_last_port = htons(last_port);
1360 
1361 	while (true) {
1362 		if (n_last_port == 0) {
1363 			NS_PORT_ERR("ephemeral port search range includes 0");
1364 			err = EINVAL;
1365 			break;
1366 		}
1367 
1368 		/*
1369 		 * Skip if this is a restricted port as we do not want to
1370 		 * restricted ports as ephemeral
1371 		 */
1372 		if (!IS_RESTRICTED_IN_PORT(n_last_port)) {
1373 			err = _netns_reserve_kpi_common(namespace, token, addr,
1374 			    addr_len, proto, &n_last_port, flags, nfi);
1375 			if (err == 0 || err != EADDRINUSE) {
1376 				break;
1377 			}
1378 		}
1379 		if (count_up) {
1380 			last_port++;
1381 			if (last_port < first || last_port > last) {
1382 				last_port = first;
1383 			}
1384 		} else {
1385 			last_port--;
1386 			if (last_port < last || last_port > first) {
1387 				last_port = last;
1388 			}
1389 		}
1390 		n_last_port = htons(last_port);
1391 
1392 		if (last_port == rand_port || first == last) {
1393 			NS_PORT_ERR("couldn't find free ephemeral port");
1394 			err = EADDRNOTAVAIL;
1395 			break;
1396 		}
1397 	}
1398 
1399 	if (err == 0) {
1400 		*pport = n_last_port;
1401 		if (count_up) {
1402 			namespace->ns_last_ephemeral_port_up = last_port;
1403 		} else {
1404 			namespace->ns_last_ephemeral_port_down = last_port;
1405 		}
1406 	} else {
1407 		netns_ns_cleanup(namespace);
1408 	}
1409 
1410 	NETNS_UNLOCK();
1411 
1412 	return err;
1413 }
1414 
1415 void
netns_release(netns_token * token)1416 netns_release(netns_token *token)
1417 {
1418 	struct ns *ns;
1419 	struct ns_token *nt;
1420 	uint8_t proto, addr_len;
1421 #if SK_LOG
1422 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1423 #endif /* SK_LOG */
1424 
1425 	if (!NETNS_TOKEN_VALID(token)) {
1426 		return;
1427 	}
1428 
1429 	if (__netns_inited == 0) {
1430 		*token = NULL;
1431 		return;
1432 	}
1433 
1434 	NETNS_LOCK_SPIN();
1435 
1436 	nt = *token;
1437 	*token = NULL;
1438 
1439 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) <= NETNS_OWNER_MAX);
1440 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1441 	    nt->nt_addr_len == sizeof(struct in6_addr));
1442 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1443 
1444 	addr_len = nt->nt_addr_len;
1445 	proto = nt->nt_proto;
1446 
1447 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1448 	    "releasing %s:%s:%d",
1449 	    sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1450 	    tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto),
1451 	    nt->nt_port);
1452 
1453 	if (!_netns_is_wildcard_addr(nt->nt_addr, addr_len)) {
1454 		/* Remove from global non-wild namespace */
1455 
1456 		ns = netns_global_non_wild[NETNS_NS_GLOBAL_IDX(proto,
1457 		    addr_len)];
1458 		VERIFY(ns != NULL);
1459 
1460 		_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1461 	}
1462 
1463 	ns = _netns_get_ns(nt->nt_addr, addr_len, proto, false);
1464 	VERIFY(ns != NULL);
1465 	_netns_release_common(ns, nt->nt_port, nt->nt_flags);
1466 
1467 	netns_clear_ifnet(nt);
1468 	netns_ns_token_free(nt);
1469 
1470 	NETNS_UNLOCK();
1471 }
1472 
1473 int
netns_change_addr(netns_token * token,uint32_t * __sized_by (addr_len)addr,uint8_t addr_len)1474 netns_change_addr(netns_token *token, uint32_t *__sized_by(addr_len)addr,
1475     uint8_t addr_len)
1476 {
1477 	int err = 0;
1478 	struct ns *old_namespace;
1479 	struct ns *new_namespace;
1480 	struct ns *global_namespace;
1481 	struct ns_token *nt;
1482 	uint8_t proto;
1483 #if SK_LOG
1484 	char tmp_ip_str_1[MAX_IPv6_STR_LEN];
1485 	char tmp_ip_str_2[MAX_IPv6_STR_LEN];
1486 #endif /* SK_LOG */
1487 
1488 	if (__netns_inited == 0) {
1489 		return 0;
1490 	}
1491 
1492 	NETNS_LOCK();
1493 
1494 	VERIFY(NETNS_TOKEN_VALID(token));
1495 
1496 	nt = *token;
1497 
1498 	VERIFY((nt->nt_flags & NETNS_OWNER_MASK) == NETNS_BSD);
1499 	VERIFY(nt->nt_addr_len == sizeof(struct in_addr) ||
1500 	    nt->nt_addr_len == sizeof(struct in6_addr));
1501 	VERIFY(nt->nt_proto == IPPROTO_TCP || nt->nt_proto == IPPROTO_UDP);
1502 
1503 	proto = nt->nt_proto;
1504 
1505 #if SK_LOG
1506 	sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1507 	    tmp_ip_str_1, sizeof(tmp_ip_str_1));
1508 	sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2,
1509 	    sizeof(tmp_ip_str_2));
1510 #endif /* SK_LOG */
1511 	SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1512 	    "changing address for %s:%d from %s to %s",
1513 	    PROTO_STR(proto), nt->nt_port, tmp_ip_str_1,
1514 	    tmp_ip_str_2);
1515 
1516 	if (nt->nt_addr_len == addr_len &&
1517 	    memcmp(nt->nt_addr, addr, nt->nt_addr_len) == 0) {
1518 		SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto),
1519 		    "address didn't change, exiting early");
1520 		goto done;
1521 	}
1522 
1523 	old_namespace = _netns_get_ns(nt->nt_addr, nt->nt_addr_len, proto,
1524 	    false);
1525 	VERIFY(old_namespace != NULL);
1526 
1527 	new_namespace = _netns_get_ns(addr, addr_len, proto, true);
1528 	if (new_namespace == NULL) {
1529 		err = ENOMEM;
1530 		goto done;
1531 	}
1532 
1533 	/* Acquire reservation in new namespace */
1534 	if ((err = _netns_reserve_common(new_namespace, nt->nt_port,
1535 	    nt->nt_flags))) {
1536 		NETNS_LOCK_CONVERT();
1537 		netns_ns_cleanup(new_namespace);
1538 		SK_ERR("port %u reservation collision under new namespace",
1539 		    nt->nt_port);
1540 		goto done;
1541 	}
1542 
1543 	/* Release from old namespace */
1544 	_netns_release_common(old_namespace, nt->nt_port, nt->nt_flags);
1545 
1546 	if (!_netns_is_wildcard_addr(nt->nt_addr, nt->nt_addr_len)) {
1547 		/*
1548 		 * Old address is non-wildcard.
1549 		 * Remove old reservation from global non-wild namespace
1550 		 */
1551 		global_namespace = netns_global_non_wild[
1552 			NETNS_NS_GLOBAL_IDX(proto, nt->nt_addr_len)];
1553 		VERIFY(global_namespace != NULL);
1554 
1555 		_netns_release_common(global_namespace, nt->nt_port,
1556 		    nt->nt_flags);
1557 	}
1558 
1559 	if (!_netns_is_wildcard_addr(addr, addr_len)) {
1560 		/*
1561 		 * New address is non-wildcard.
1562 		 * Record new reservation in global non-wild namespace
1563 		 */
1564 		global_namespace = netns_global_non_wild[
1565 			NETNS_NS_GLOBAL_IDX(proto, addr_len)];
1566 		VERIFY(global_namespace != NULL);
1567 
1568 		if ((err = _netns_reserve_common(global_namespace,
1569 		    nt->nt_port, nt->nt_flags)) != 0) {
1570 			SK_ERR("port %u - reservation collision under new global namespace",
1571 			    nt->nt_port);
1572 			/* XXX: Should not fail. Maybe assert instead */
1573 			goto done;
1574 		}
1575 	}
1576 
1577 	memcpy(nt->nt_addr, addr, addr_len);
1578 	nt->nt_addr_len = addr_len;
1579 
1580 done:
1581 	NETNS_UNLOCK();
1582 	return err;
1583 }
1584 
1585 static void
_netns_set_ifnet_internal(struct ns_token * nt,struct ifnet * ifp)1586 _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp)
1587 {
1588 #if SK_LOG
1589 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1590 #endif /* SK_LOG */
1591 
1592 	NETNS_LOCK_ASSERT_HELD();
1593 
1594 	if (ifp != NULL && ifnet_get_ioref(ifp)) {
1595 		nt->nt_ifp = ifp;
1596 		LIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link);
1597 
1598 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1599 		    "%s:%s:%d // added to ifnet %d",
1600 		    sk_ntop(LEN_TO_AF(nt->nt_addr_len),
1601 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1602 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1603 		    ifp->if_index);
1604 	} else {
1605 		LIST_INSERT_HEAD(&netns_unbound_tokens, nt, nt_ifp_link);
1606 	}
1607 }
1608 
1609 void
netns_set_ifnet(netns_token * token,ifnet_t ifp)1610 netns_set_ifnet(netns_token *token, ifnet_t ifp)
1611 {
1612 	struct ns_token *nt;
1613 #if SK_LOG
1614 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1615 #endif /* SK_LOG */
1616 
1617 	if (__netns_inited == 0) {
1618 		return;
1619 	}
1620 
1621 	NETNS_LOCK();
1622 
1623 	VERIFY(NETNS_TOKEN_VALID(token));
1624 
1625 	nt = *token;
1626 
1627 	if (nt->nt_ifp == ifp) {
1628 		SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1629 		    "%s:%s:%d // ifnet already %d, exiting early",
1630 		    sk_ntop(LEN_TO_AF(nt->nt_addr_len),
1631 		    nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)),
1632 		    PROTO_STR(nt->nt_proto), nt->nt_port,
1633 		    ifp ? ifp->if_index : -1);
1634 		NETNS_UNLOCK();
1635 		return;
1636 	}
1637 
1638 	netns_clear_ifnet(nt);
1639 
1640 	_netns_set_ifnet_internal(nt, ifp);
1641 
1642 	NETNS_UNLOCK();
1643 }
1644 
1645 void
netns_ifnet_detach(ifnet_t ifp)1646 netns_ifnet_detach(ifnet_t ifp)
1647 {
1648 	struct ns_token *token, *tmp_token;
1649 
1650 	if (__netns_inited == 0) {
1651 		return;
1652 	}
1653 
1654 	NETNS_LOCK();
1655 
1656 	LIST_FOREACH_SAFE(token, &ifp->if_netns_tokens, nt_ifp_link,
1657 	    tmp_token) {
1658 		netns_clear_ifnet(token);
1659 		LIST_INSERT_HEAD(&netns_unbound_tokens, token, nt_ifp_link);
1660 	}
1661 
1662 	NETNS_UNLOCK();
1663 }
1664 
1665 static void
_netns_set_state(netns_token * token,uint32_t state)1666 _netns_set_state(netns_token *token, uint32_t state)
1667 {
1668 	struct ns_token *nt;
1669 #if SK_LOG
1670 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1671 #endif /* SK_LOG */
1672 
1673 	if (__netns_inited == 0) {
1674 		return;
1675 	}
1676 
1677 	NETNS_LOCK();
1678 	VERIFY(NETNS_TOKEN_VALID(token));
1679 
1680 	nt = *token;
1681 	nt->nt_state |= state;
1682 
1683 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1684 	    "%s:%s:%d // state 0x%x",
1685 	    sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1686 	    tmp_ip_str, sizeof(tmp_ip_str)),
1687 	    PROTO_STR(nt->nt_proto), nt->nt_port, state);
1688 
1689 	NETNS_UNLOCK();
1690 }
1691 
1692 void
netns_half_close(netns_token * token)1693 netns_half_close(netns_token *token)
1694 {
1695 	_netns_set_state(token, NETNS_STATE_HALFCLOSED);
1696 }
1697 
1698 void
netns_withdraw(netns_token * token)1699 netns_withdraw(netns_token *token)
1700 {
1701 	_netns_set_state(token, NETNS_STATE_WITHDRAWN);
1702 }
1703 
1704 int
netns_get_flow_info(netns_token * token,struct ns_flow_info * nfi)1705 netns_get_flow_info(netns_token *token,
1706     struct ns_flow_info *nfi)
1707 {
1708 	if (__netns_inited == 0) {
1709 		return ENOTSUP;
1710 	}
1711 
1712 	NETNS_LOCK();
1713 	if (!NETNS_TOKEN_VALID(token) ||
1714 	    nfi == NULL) {
1715 		NETNS_UNLOCK();
1716 		return EINVAL;
1717 	}
1718 
1719 	struct ns_token *nt = *token;
1720 	if (nt->nt_flow_info == NULL) {
1721 		NETNS_UNLOCK();
1722 		return ENOENT;
1723 	}
1724 
1725 	memcpy(nfi, nt->nt_flow_info, sizeof(struct ns_flow_info));
1726 	NETNS_UNLOCK();
1727 
1728 	return 0;
1729 }
1730 
1731 void
netns_change_flags(netns_token * token,uint32_t set_flags,uint32_t clear_flags)1732 netns_change_flags(netns_token *token, uint32_t set_flags,
1733     uint32_t clear_flags)
1734 {
1735 	struct ns_token *nt;
1736 #if SK_LOG
1737 	char tmp_ip_str[MAX_IPv6_STR_LEN];
1738 #endif /* SK_LOG */
1739 
1740 	if (__netns_inited == 0) {
1741 		return;
1742 	}
1743 
1744 	NETNS_LOCK();
1745 
1746 	VERIFY(NETNS_TOKEN_VALID(token));
1747 
1748 	nt = *token;
1749 
1750 	VERIFY(!((set_flags | clear_flags) & NETNS_RESERVATION_FLAGS));
1751 	/* TODO: verify set and clear flags don't overlap? */
1752 
1753 	SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto),
1754 	    "%s:%s:%d // flags 0x%x -> 0x%x",
1755 	    sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr,
1756 	    tmp_ip_str, sizeof(tmp_ip_str)),
1757 	    PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags,
1758 	    nt->nt_flags | set_flags & ~clear_flags);
1759 
1760 	nt->nt_flags |= set_flags;
1761 	nt->nt_flags &= ~clear_flags;
1762 
1763 	NETNS_UNLOCK();
1764 }
1765 
1766 /*
1767  * Port offloading KPI
1768  */
1769 static inline void
netns_local_port_scan_flow_entry(struct flow_entry * fe,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1770 netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protocol,
1771     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1772 {
1773 	struct ns_token *token;
1774 	boolean_t iswildcard = false;
1775 
1776 	if (fe == NULL) {
1777 		return;
1778 	}
1779 
1780 	if (fe->fe_flags & (FLOWENTF_EXTRL_PORT | FLOWENTF_AOP_OFFLOAD)) {
1781 		return;
1782 	}
1783 
1784 	token = fe->fe_port_reservation;
1785 	if (token == NULL) {
1786 		return;
1787 	}
1788 
1789 	/*
1790 	 * We are only interested in active flows over skywalk channels
1791 	 */
1792 	if ((token->nt_flags & NETNS_OWNER_MASK) != NETNS_SKYWALK) {
1793 		return;
1794 	}
1795 
1796 	if (token->nt_state & NETNS_STATE_WITHDRAWN) {
1797 		return;
1798 	}
1799 
1800 	if (!(flags & IFNET_GET_LOCAL_PORTS_ANYTCPSTATEOK) &&
1801 	    (flags & IFNET_GET_LOCAL_PORTS_ACTIVEONLY) &&
1802 	    (token->nt_state & NETNS_STATE_HALFCLOSED)) {
1803 		return;
1804 	}
1805 
1806 	VERIFY(token->nt_addr_len == sizeof(struct in_addr) ||
1807 	    token->nt_addr_len == sizeof(struct in6_addr));
1808 
1809 	if (token->nt_addr_len == sizeof(struct in_addr)) {
1810 		if (protocol == PF_INET6) {
1811 			return;
1812 		}
1813 
1814 		iswildcard = token->nt_inaddr.s_addr == INADDR_ANY;
1815 	} else if (token->nt_addr_len == sizeof(struct in6_addr)) {
1816 		if (protocol == PF_INET) {
1817 			return;
1818 		}
1819 
1820 		iswildcard = IN6_IS_ADDR_UNSPECIFIED(
1821 			&token->nt_in6addr);
1822 	}
1823 	if (!(flags & IFNET_GET_LOCAL_PORTS_WILDCARDOK) && iswildcard) {
1824 		return;
1825 	}
1826 
1827 	if ((flags & IFNET_GET_LOCAL_PORTS_TCPONLY) &&
1828 	    token->nt_proto == IPPROTO_UDP) {
1829 		return;
1830 	}
1831 	if ((flags & IFNET_GET_LOCAL_PORTS_UDPONLY) &&
1832 	    token->nt_proto == IPPROTO_TCP) {
1833 		return;
1834 	}
1835 
1836 	if ((flags & IFNET_GET_LOCAL_PORTS_RECVANYIFONLY) &&
1837 	    !(token->nt_flags & NETNS_RECVANYIF)) {
1838 		return;
1839 	}
1840 
1841 	if ((flags & IFNET_GET_LOCAL_PORTS_EXTBGIDLEONLY) &&
1842 	    !(token->nt_flags & NETNS_EXTBGIDLE)) {
1843 		return;
1844 	}
1845 
1846 	if (token->nt_ifp != NULL && (token->nt_ifp->if_eflags & IFEF_AWDL) != 0) {
1847 		struct flow_route *fr = fe->fe_route;
1848 
1849 		if (fr == NULL || fr->fr_rt_dst == NULL ||
1850 		    (fr->fr_rt_dst->rt_flags & (RTF_UP | RTF_CONDEMNED)) != RTF_UP) {
1851 #if DEBUG || DEVELOPMENT
1852 			char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1853 			char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1854 			in_port_t lport;
1855 			in_port_t fport;
1856 			char pname[MAXCOMLEN + 1];
1857 			const struct ns_flow_info *nfi = token->nt_flow_info;
1858 
1859 			proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1860 
1861 			if (protocol == PF_INET) {
1862 				sk_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1863 				    lbuf, sizeof(lbuf));
1864 				sk_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1865 				    fbuf, sizeof(fbuf));
1866 				lport = nfi->nfi_laddr.sin.sin_port;
1867 				fport = nfi->nfi_faddr.sin.sin_port;
1868 			} else {
1869 				sk_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1870 				    lbuf, sizeof(lbuf));
1871 				sk_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1872 				    fbuf, sizeof(fbuf));
1873 				lport = nfi->nfi_laddr.sin6.sin6_port;
1874 				fport = nfi->nfi_faddr.sin6.sin6_port;
1875 			}
1876 
1877 			os_log(wake_packet_log_handle,
1878 			    "netns_local_port_scan_flow_entry: route is down %s %s:%u %s:%u ifp %s proc %s:%d",
1879 			    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1880 			    lbuf, ntohs(lport), fbuf, ntohs(fport),
1881 			    token->nt_ifp->if_xname, pname, nfi->nfi_owner_pid);
1882 #endif /* DEBUG || DEVELOPMENT */
1883 
1884 			return;
1885 		}
1886 	}
1887 
1888 #if DEBUG || DEVELOPMENT
1889 	if (!(flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) &&
1890 	    (token->nt_flags & NETNS_NOWAKEFROMSLEEP)) {
1891 		char lbuf[MAX_IPv6_STR_LEN + 6] = {};
1892 		char fbuf[MAX_IPv6_STR_LEN + 6] = {};
1893 		in_port_t lport;
1894 		in_port_t fport;
1895 		char pname[MAXCOMLEN + 1];
1896 		const struct ns_flow_info *nfi = token->nt_flow_info;
1897 
1898 		proc_name(nfi->nfi_owner_pid, pname, sizeof(pname));
1899 
1900 		if (protocol == PF_INET) {
1901 			sk_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr,
1902 			    lbuf, sizeof(lbuf));
1903 			sk_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr,
1904 			    fbuf, sizeof(fbuf));
1905 			lport = nfi->nfi_laddr.sin.sin_port;
1906 			fport = nfi->nfi_faddr.sin.sin_port;
1907 		} else {
1908 			sk_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr,
1909 			    lbuf, sizeof(lbuf));
1910 			sk_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr,
1911 			    fbuf, sizeof(fbuf));
1912 			lport = nfi->nfi_laddr.sin6.sin6_port;
1913 			fport = nfi->nfi_faddr.sin6.sin6_port;
1914 		}
1915 
1916 		os_log(wake_packet_log_handle,
1917 		    "netns_local_port_scan_flow_entry: no wake from sleep %s %s:%u %s:%u ifp %s proc %s:%d",
1918 		    token->nt_proto == IPPROTO_TCP ? "tcp" : "udp",
1919 		    lbuf, ntohs(lport), fbuf, ntohs(fport),
1920 		    token->nt_ifp != NULL ? token->nt_ifp->if_xname : "",
1921 		    pname, nfi->nfi_owner_pid);
1922 	}
1923 #endif /* DEBUG || DEVELOPMENT */
1924 
1925 	if (token->nt_ifp != NULL && token->nt_flow_info != NULL) {
1926 		/*
1927 		 * When the flow has "no wake from sleep" option, do not set the port in the bitmap
1928 		 * except if explicetely requested by the driver.
1929 		 * We always add the flow to the list of port in order to report spurious wakes
1930 		 */
1931 		if ((flags & IFNET_GET_LOCAL_PORTS_NOWAKEUPOK) ||
1932 		    (token->nt_flags & NETNS_NOWAKEFROMSLEEP) == 0) {
1933 			bitstr_set(bitfield, token->nt_port);
1934 		}
1935 		(void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index,
1936 		    token->nt_flow_info, token->nt_flags);
1937 	} else {
1938 		SK_ERR("unknown owner port %u"
1939 		    " nt_flags 0x%x ifindex %u nt_flow_info %p\n",
1940 		    token->nt_port, token->nt_flags,
1941 		    token->nt_ifp != NULL ? token->nt_ifp->if_index : 0,
1942 		    token->nt_flow_info);
1943 	}
1944 }
1945 
1946 static void
netns_get_if_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1947 netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol,
1948     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1949 {
1950 	struct nx_flowswitch *fsw = NULL;
1951 
1952 	if (ifp == NULL || ifp->if_na == NULL) {
1953 		return;
1954 	}
1955 	/* Ensure that the interface is attached and won't detach */
1956 	if (!ifnet_get_ioref(ifp)) {
1957 		return;
1958 	}
1959 	fsw = fsw_ifp_to_fsw(ifp);
1960 	if (fsw == NULL) {
1961 		goto done;
1962 	}
1963 	FSW_RLOCK(fsw);
1964 	NETNS_LOCK();
1965 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1966 		netns_local_port_scan_flow_entry(_fe, protocol, flags,
1967 		bitfield);
1968 	});
1969 	NETNS_UNLOCK();
1970 	FSW_UNLOCK(fsw);
1971 done:
1972 	ifnet_decr_iorefcnt(ifp);
1973 }
1974 
1975 errno_t
netns_get_local_ports(ifnet_t ifp,protocol_family_t protocol,u_int32_t flags,u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])1976 netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol,
1977     u_int32_t flags, u_int8_t bitfield[IP_PORTRANGE_BITFIELD_LEN])
1978 {
1979 	if (__netns_inited == 0) {
1980 		return 0;
1981 	}
1982 	if (ifp != NULL) {
1983 		netns_get_if_local_ports(ifp, protocol, flags, bitfield);
1984 	} else {
1985 		errno_t error;
1986 		uint32_t count, i;
1987 		ifnet_t *__counted_by(count) ifp_list;
1988 
1989 		error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count);
1990 		if (error != 0) {
1991 			os_log_error(wake_packet_log_handle,
1992 			    "%s: ifnet_list_get_all() failed %d",
1993 			    __func__, error);
1994 			return error;
1995 		}
1996 		for (i = 0; i < count; i++) {
1997 			if (TAILQ_EMPTY(&ifp_list[i]->if_addrhead)) {
1998 				continue;
1999 			}
2000 			netns_get_if_local_ports(ifp_list[i], protocol, flags,
2001 			    bitfield);
2002 		}
2003 		ifnet_list_free_counted_by(ifp_list, count);
2004 	}
2005 
2006 	return 0;
2007 }
2008 
2009 uint32_t
netns_find_anyres_byaddr(struct ifaddr * ifa,uint8_t proto)2010 netns_find_anyres_byaddr(struct ifaddr *ifa, uint8_t proto)
2011 {
2012 	int result = 0;
2013 	int ifa_addr_len;
2014 	struct ns_token *token;
2015 	struct ifnet *ifp = ifa->ifa_ifp;
2016 	struct sockaddr *ifa_addr = ifa->ifa_addr;
2017 
2018 	if (__netns_inited == 0) {
2019 		return ENOTSUP;
2020 	}
2021 
2022 	if ((ifa_addr->sa_family != AF_INET) &&
2023 	    (ifa_addr->sa_family != AF_INET6)) {
2024 		return 0;
2025 	}
2026 
2027 	ifa_addr_len = (ifa_addr->sa_family == AF_INET) ?
2028 	    sizeof(struct in_addr) : sizeof(struct in6_addr);
2029 
2030 	NETNS_LOCK();
2031 
2032 	LIST_FOREACH(token, &ifp->if_netns_tokens, nt_ifp_link) {
2033 		if ((token->nt_flags & NETNS_OWNER_MASK) == NETNS_PF) {
2034 			continue;
2035 		}
2036 		if (token->nt_addr_len != ifa_addr_len) {
2037 			continue;
2038 		}
2039 		if (token->nt_proto != proto) {
2040 			continue;
2041 		}
2042 		if (ifa_addr->sa_family == AF_INET) {
2043 			if (token->nt_inaddr.s_addr ==
2044 			    (satosin(ifa->ifa_addr))->sin_addr.s_addr) {
2045 				result = 1;
2046 				break;
2047 			}
2048 		} else if (ifa_addr->sa_family == AF_INET6) {
2049 			if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa),
2050 			    &token->nt_in6addr)) {
2051 				result = 1;
2052 				break;
2053 			}
2054 		}
2055 	}
2056 
2057 	NETNS_UNLOCK();
2058 	return result;
2059 }
2060 
2061 static uint32_t
_netns_lookup_ns_n_reservations(uint32_t * __sized_by (addr_len)addr,uint8_t addr_len,uint8_t proto)2062 _netns_lookup_ns_n_reservations(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t proto)
2063 {
2064 	uint32_t ns_n_reservations = 0;
2065 	NETNS_LOCK_SPIN();
2066 	struct ns *namespace = _netns_get_ns(addr, addr_len, proto, true);
2067 	if (namespace != NULL) {
2068 		ns_n_reservations = namespace->ns_n_reservations;
2069 	}
2070 	NETNS_UNLOCK();
2071 	return ns_n_reservations;
2072 }
2073 
2074 uint32_t
netns_lookup_reservations_count_in(struct in_addr addr,uint8_t proto)2075 netns_lookup_reservations_count_in(struct in_addr addr, uint8_t proto)
2076 {
2077 	return _netns_lookup_ns_n_reservations(&addr.s_addr, sizeof(struct in_addr), proto);
2078 }
2079 
2080 uint32_t
netns_lookup_reservations_count_in6(struct in6_addr addr,uint8_t proto)2081 netns_lookup_reservations_count_in6(struct in6_addr addr, uint8_t proto)
2082 {
2083 	if (IN6_IS_SCOPE_EMBED(&addr)) {
2084 		addr.s6_addr16[1] = 0;
2085 	}
2086 	return _netns_lookup_ns_n_reservations(&addr.s6_addr32[0], sizeof(struct in6_addr), proto);
2087 }
2088 
2089 /*
2090  * Sysctl interface
2091  */
2092 
2093 static int netns_ctl_dump_all SYSCTL_HANDLER_ARGS;
2094 
2095 SYSCTL_NODE(_kern_skywalk, OID_AUTO, netns, CTLFLAG_RW | CTLFLAG_LOCKED,
2096     0, "Netns interface");
2097 
2098 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netns,
2099     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
2100     0, 0, netns_ctl_dump_all, "-",
2101     "Namespace contents (struct netns_ctl_dump_header, "
2102     "skywalk/os_stats_private.h)");
2103 
2104 static int
netns_ctl_write_ns(struct sysctl_req * req,struct ns * namespace,boolean_t is_global)2105 netns_ctl_write_ns(struct sysctl_req *req, struct ns *namespace,
2106     boolean_t is_global)
2107 {
2108 	struct ns_reservation *res;
2109 	struct netns_ctl_dump_header response_header;
2110 	struct netns_ctl_dump_record response_record;
2111 	int err;
2112 
2113 	/* Fill out header */
2114 	memset(&response_header, 0, sizeof(response_header));
2115 	response_header.ncdh_n_records = namespace->ns_n_reservations;
2116 	response_header.ncdh_proto = namespace->ns_proto;
2117 
2118 	if (is_global) {
2119 		response_header.ncdh_addr_len = 0;
2120 	} else {
2121 		response_header.ncdh_addr_len = namespace->ns_addr_len;
2122 	}
2123 	memcpy(response_header.ncdh_addr, namespace->ns_addr,
2124 	    namespace->ns_addr_len);
2125 
2126 	err = SYSCTL_OUT(req, &response_header, sizeof(response_header));
2127 	if (err) {
2128 		return err;
2129 	}
2130 
2131 	/* Fill out records */
2132 	RB_FOREACH(res, ns_reservation_tree, &namespace->ns_reservations) {
2133 		memset(&response_record, 0, sizeof(response_record));
2134 		response_record.ncdr_port = res->nsr_port;
2135 		response_record.ncdr_port_end = 0;
2136 		response_record.ncdr_listener_refs =
2137 		    NETNS_REF_COUNT(res, NETNS_LISTENER);
2138 		response_record.ncdr_skywalk_refs =
2139 		    NETNS_REF_COUNT(res, NETNS_SKYWALK);
2140 		response_record.ncdr_bsd_refs =
2141 		    NETNS_REF_COUNT(res, NETNS_BSD);
2142 		response_record.ncdr_pf_refs =
2143 		    NETNS_REF_COUNT(res, NETNS_PF);
2144 		err = SYSCTL_OUT(req, &response_record,
2145 		    sizeof(response_record));
2146 		if (err) {
2147 			return err;
2148 		}
2149 	}
2150 
2151 	return 0;
2152 }
2153 
2154 static int
2155 netns_ctl_dump_all SYSCTL_HANDLER_ARGS
2156 {
2157 #pragma unused(oidp, arg1, arg2)
2158 	struct ns *namespace;
2159 	int i, err = 0;
2160 
2161 	if (!kauth_cred_issuser(kauth_cred_get())) {
2162 		return EPERM;
2163 	}
2164 
2165 	if (__netns_inited == 0) {
2166 		return ENOTSUP;
2167 	}
2168 
2169 	NETNS_LOCK();
2170 
2171 	for (i = 0; i < NETNS_N_GLOBAL; i++) {
2172 		err = netns_ctl_write_ns(req, netns_global_non_wild[i], true);
2173 		if (err) {
2174 			goto done;
2175 		}
2176 	}
2177 
2178 	RB_FOREACH(namespace, netns_namespaces_tree, &netns_namespaces) {
2179 		err = netns_ctl_write_ns(req, namespace, false);
2180 		if (err) {
2181 			goto done;
2182 		}
2183 	}
2184 
2185 	/*
2186 	 * If this is just a request for length, add slop because
2187 	 * this is dynamically changing data
2188 	 */
2189 	if (req->oldptr == USER_ADDR_NULL) {
2190 		req->oldidx += 20 * sizeof(struct netns_ctl_dump_record);
2191 	}
2192 
2193 done:
2194 	NETNS_UNLOCK();
2195 	return err;
2196 }
2197 /* CSTYLED */
2198