xref: /xnu-10002.1.13/bsd/net/if_ipsec.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2012-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <sys/systm.h>
31 #include <sys/kern_control.h>
32 #include <net/kpi_protocol.h>
33 #include <net/kpi_interface.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <net/if.h>
37 #include <net/if_types.h>
38 #include <net/bpf.h>
39 #include <net/if_ipsec.h>
40 #include <sys/mbuf.h>
41 #include <sys/sockio.h>
42 #include <netinet/in.h>
43 #include <netinet/ip6.h>
44 #include <netinet6/in6_var.h>
45 #include <netinet6/ip6_var.h>
46 #include <sys/kauth.h>
47 #include <netinet6/ipsec.h>
48 #include <netinet6/ipsec6.h>
49 #include <netinet6/esp.h>
50 #include <netinet6/esp6.h>
51 #include <netinet/ip.h>
52 #include <net/flowadv.h>
53 #include <net/necp.h>
54 #include <netkey/key.h>
55 #include <net/pktap.h>
56 #include <kern/zalloc.h>
57 #include <os/log.h>
58 
59 #if SKYWALK
60 #include <skywalk/os_skywalk_private.h>
61 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
62 #include <skywalk/nexus/netif/nx_netif.h>
63 #define IPSEC_NEXUS 1
64 #else // SKYWALK
65 #define IPSEC_NEXUS 0
66 #endif // SKYWALK
67 
68 extern int net_qos_policy_restricted;
69 extern int net_qos_policy_restrict_avapps;
70 
71 /* Kernel Control functions */
72 static errno_t  ipsec_ctl_setup(u_int32_t *unit, void **unitinfo);
73 static errno_t  ipsec_ctl_bind(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
74     void **unitinfo);
75 static errno_t  ipsec_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
76     void **unitinfo);
77 static errno_t  ipsec_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t unit,
78     void *unitinfo);
79 static errno_t  ipsec_ctl_send(kern_ctl_ref kctlref, u_int32_t unit,
80     void *unitinfo, mbuf_t m, int flags);
81 static errno_t  ipsec_ctl_getopt(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo,
82     int opt, void *data, size_t *len);
83 static errno_t  ipsec_ctl_setopt(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo,
84     int opt, void *data, size_t len);
85 
86 /* Network Interface functions */
87 static void     ipsec_start(ifnet_t     interface);
88 static errno_t  ipsec_output(ifnet_t interface, mbuf_t data);
89 static errno_t  ipsec_demux(ifnet_t interface, mbuf_t data, char *frame_header,
90     protocol_family_t *protocol);
91 static errno_t  ipsec_add_proto(ifnet_t interface, protocol_family_t protocol,
92     const struct ifnet_demux_desc *demux_array,
93     u_int32_t demux_count);
94 static errno_t  ipsec_del_proto(ifnet_t interface, protocol_family_t protocol);
95 static errno_t  ipsec_ioctl(ifnet_t interface, u_long cmd, void *data);
96 static void             ipsec_detached(ifnet_t interface);
97 
98 /* Protocol handlers */
99 static errno_t  ipsec_attach_proto(ifnet_t interface, protocol_family_t proto);
100 static errno_t  ipsec_proto_input(ifnet_t interface, protocol_family_t protocol,
101     mbuf_t m, char *frame_header);
102 static errno_t ipsec_proto_pre_output(ifnet_t interface, protocol_family_t protocol,
103     mbuf_t *packet, const struct sockaddr *dest, void *route,
104     char *frame_type, char *link_layer_dest);
105 
106 static kern_ctl_ref     ipsec_kctlref;
107 static LCK_ATTR_DECLARE(ipsec_lck_attr, 0, 0);
108 static LCK_GRP_DECLARE(ipsec_lck_grp, "ipsec");
109 static LCK_MTX_DECLARE_ATTR(ipsec_lock, &ipsec_lck_grp, &ipsec_lck_attr);
110 
111 #if IPSEC_NEXUS
112 
113 SYSCTL_DECL(_net_ipsec);
114 SYSCTL_NODE(_net, OID_AUTO, ipsec, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IPsec");
115 static int if_ipsec_verify_interface_creation = 0;
116 SYSCTL_INT(_net_ipsec, OID_AUTO, verify_interface_creation, CTLFLAG_RW | CTLFLAG_LOCKED, &if_ipsec_verify_interface_creation, 0, "");
117 
118 #define IPSEC_IF_VERIFY(_e)             if (__improbable(if_ipsec_verify_interface_creation)) { VERIFY(_e); }
119 
120 #define IPSEC_IF_DEFAULT_SLOT_SIZE 2048
121 #define IPSEC_IF_DEFAULT_RING_SIZE 64
122 #define IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE 64
123 #define IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE 128
124 #define IPSEC_IF_DEFAULT_BUF_SEG_SIZE   skmem_usr_buf_seg_size
125 
126 #define IPSEC_IF_WMM_RING_COUNT NEXUS_NUM_WMM_QUEUES
127 #define IPSEC_IF_MAX_RING_COUNT IPSEC_IF_WMM_RING_COUNT
128 #define IPSEC_NETIF_WMM_TX_RING_COUNT IPSEC_IF_WMM_RING_COUNT
129 #define IPSEC_NETIF_WMM_RX_RING_COUNT 1
130 #define IPSEC_NETIF_MAX_TX_RING_COUNT IPSEC_NETIF_WMM_TX_RING_COUNT
131 #define IPSEC_NETIF_MAX_RX_RING_COUNT IPSEC_NETIF_WMM_RX_RING_COUNT
132 
133 #define IPSEC_IF_MIN_RING_SIZE 8
134 #define IPSEC_IF_MAX_RING_SIZE 1024
135 
136 #define IPSEC_IF_MIN_SLOT_SIZE 1024
137 #define IPSEC_IF_MAX_SLOT_SIZE (16 * 1024)
138 
139 #define IPSEC_DEFAULT_MAX_PENDING_INPUT_COUNT 512
140 
141 #define IPSEC_KPIPE_FLAG_WAKE_PKT 0x01
142 
143 static uint32_t ipsec_kpipe_mbuf;
144 
145 static int if_ipsec_max_pending_input = IPSEC_DEFAULT_MAX_PENDING_INPUT_COUNT;
146 
147 static int sysctl_if_ipsec_ring_size SYSCTL_HANDLER_ARGS;
148 static int sysctl_if_ipsec_tx_fsw_ring_size SYSCTL_HANDLER_ARGS;
149 static int sysctl_if_ipsec_rx_fsw_ring_size SYSCTL_HANDLER_ARGS;
150 
151 static int if_ipsec_ring_size = IPSEC_IF_DEFAULT_RING_SIZE;
152 static int if_ipsec_tx_fsw_ring_size = IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE;
153 static int if_ipsec_rx_fsw_ring_size = IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE;
154 
155 SYSCTL_INT(_net_ipsec, OID_AUTO, max_pending_input, CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_max_pending_input, 0, "");
156 SYSCTL_PROC(_net_ipsec, OID_AUTO, ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
157     &if_ipsec_ring_size, IPSEC_IF_DEFAULT_RING_SIZE, &sysctl_if_ipsec_ring_size, "I", "");
158 SYSCTL_PROC(_net_ipsec, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
159     &if_ipsec_tx_fsw_ring_size, IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE, &sysctl_if_ipsec_tx_fsw_ring_size, "I", "");
160 SYSCTL_PROC(_net_ipsec, OID_AUTO, rx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
161     &if_ipsec_rx_fsw_ring_size, IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE, &sysctl_if_ipsec_rx_fsw_ring_size, "I", "");
162 
163 static int if_ipsec_debug = 0;
164 SYSCTL_INT(_net_ipsec, OID_AUTO, debug, CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_debug, 0, "");
165 
166 static errno_t
167 ipsec_register_nexus(void);
168 
169 typedef struct ipsec_nx {
170 	uuid_t if_provider;
171 	uuid_t if_instance;
172 	uuid_t fsw_provider;
173 	uuid_t fsw_instance;
174 	uuid_t fsw_device;
175 	uuid_t fsw_agent;
176 } *ipsec_nx_t;
177 
178 static nexus_controller_t ipsec_ncd;
179 static int ipsec_ncd_refcount;
180 static uuid_t ipsec_kpipe_uuid;
181 
182 #endif // IPSEC_NEXUS
183 
184 /* Control block allocated for each kernel control connection */
struct ipsec_pcb {
	TAILQ_ENTRY(ipsec_pcb)  ipsec_chain;       // linkage on the global ipsec_head list
	kern_ctl_ref            ipsec_ctlref;      // kernel control this PCB belongs to
	ifnet_t                 ipsec_ifp;         // attached network interface, if any
	u_int32_t               ipsec_unit;        // control unit number; 0 after ctl disconnect
	u_int32_t               ipsec_unique_id;
	// These external flags can be set with IPSEC_OPT_FLAGS
	u_int32_t               ipsec_external_flags;
	// These internal flags are only used within this driver
	u_int32_t               ipsec_internal_flags;
	u_int32_t               ipsec_input_frag_size;
	bool                    ipsec_frag_size_set;
	int                     ipsec_ext_ifdata_stats;
	mbuf_svc_class_t        ipsec_output_service_class;
	char                    ipsec_if_xname[IFXNAMSIZ];
	char                    ipsec_unique_name[IFXNAMSIZ];
	// PCB lock protects state fields, like ipsec_kpipe_count
	decl_lck_rw_data(, ipsec_pcb_lock);
	// lock to protect ipsec_pcb_data_move & ipsec_pcb_drainers
	decl_lck_mtx_data(, ipsec_pcb_data_move_lock);
	u_int32_t               ipsec_pcb_data_move; /* number of data moving contexts */
	u_int32_t               ipsec_pcb_drainers; /* number of threads waiting to drain */
	u_int32_t               ipsec_pcb_data_path_state; /* internal state of interface data path */
	ipsec_dscp_mapping_t    ipsec_output_dscp_mapping;

#if IPSEC_NEXUS
	lck_mtx_t               ipsec_input_chain_lock;
	lck_mtx_t               ipsec_kpipe_encrypt_lock;  // serializes kpipe encrypt path
	lck_mtx_t               ipsec_kpipe_decrypt_lock;  // serializes kpipe decrypt path
	struct mbuf *           ipsec_input_chain;         // head of queued input mbufs
	struct mbuf *           ipsec_input_chain_last;    // tail, for O(1) append
	u_int32_t               ipsec_input_chain_count;
	// Input chain lock protects the list of input mbufs
	// The input chain lock must be taken AFTER the PCB lock if both are held
	struct ipsec_nx         ipsec_nx;
	u_int32_t               ipsec_kpipe_count;         // number of kpipe rings (see ipsec_in_wmm_mode)
	pid_t                   ipsec_kpipe_pid;
	uuid_t                  ipsec_kpipe_proc_uuid;
	uuid_t                  ipsec_kpipe_uuid[IPSEC_IF_MAX_RING_COUNT];
	void *                  ipsec_kpipe_rxring[IPSEC_IF_MAX_RING_COUNT];
	void *                  ipsec_kpipe_txring[IPSEC_IF_MAX_RING_COUNT];
	kern_pbufpool_t         ipsec_kpipe_pp;
	u_int32_t               ipsec_kpipe_tx_ring_size;
	u_int32_t               ipsec_kpipe_rx_ring_size;

	kern_nexus_t            ipsec_netif_nexus;         // set in ipsec_netif_prepare, cleared on disconnect
	kern_pbufpool_t         ipsec_netif_pp;
	void *                  ipsec_netif_rxring[IPSEC_NETIF_MAX_RX_RING_COUNT];
	void *                  ipsec_netif_txring[IPSEC_NETIF_MAX_TX_RING_COUNT];
	uint64_t                ipsec_netif_txring_size;

	u_int32_t               ipsec_slot_size;
	u_int32_t               ipsec_netif_ring_size;
	u_int32_t               ipsec_tx_fsw_ring_size;
	u_int32_t               ipsec_rx_fsw_ring_size;
	bool                    ipsec_use_netif;
	bool                    ipsec_needs_netagent;
#endif // IPSEC_NEXUS
};
244 
245 /* These are internal flags not exposed outside this file */
246 #define IPSEC_FLAGS_KPIPE_ALLOCATED 1
247 
248 /* data movement refcounting functions */
249 static boolean_t ipsec_data_move_begin(struct ipsec_pcb *pcb);
250 static void ipsec_data_move_end(struct ipsec_pcb *pcb);
251 static void ipsec_wait_data_move_drain(struct ipsec_pcb *pcb);
252 
253 /* Data path states */
254 #define IPSEC_PCB_DATA_PATH_READY    0x1
255 
256 /* Macros to set/clear/test data path states */
257 #define IPSEC_SET_DATA_PATH_READY(_pcb) ((_pcb)->ipsec_pcb_data_path_state |= IPSEC_PCB_DATA_PATH_READY)
258 #define IPSEC_CLR_DATA_PATH_READY(_pcb) ((_pcb)->ipsec_pcb_data_path_state &= ~IPSEC_PCB_DATA_PATH_READY)
259 #define IPSEC_IS_DATA_PATH_READY(_pcb) (((_pcb)->ipsec_pcb_data_path_state & IPSEC_PCB_DATA_PATH_READY) != 0)
260 
261 #if IPSEC_NEXUS
262 /* Macros to clear/set/test flags. */
263 static inline void
ipsec_flag_set(struct ipsec_pcb * pcb,uint32_t flag)264 ipsec_flag_set(struct ipsec_pcb *pcb, uint32_t flag)
265 {
266 	pcb->ipsec_internal_flags |= flag;
267 }
268 static inline void
ipsec_flag_clr(struct ipsec_pcb * pcb,uint32_t flag)269 ipsec_flag_clr(struct ipsec_pcb *pcb, uint32_t flag)
270 {
271 	pcb->ipsec_internal_flags &= ~flag;
272 }
273 
274 static inline bool
ipsec_flag_isset(struct ipsec_pcb * pcb,uint32_t flag)275 ipsec_flag_isset(struct ipsec_pcb *pcb, uint32_t flag)
276 {
277 	return !!(pcb->ipsec_internal_flags & flag);
278 }
279 #endif // IPSEC_NEXUS
280 
281 TAILQ_HEAD(ipsec_list, ipsec_pcb) ipsec_head;
282 
283 static KALLOC_TYPE_DEFINE(ipsec_pcb_zone, struct ipsec_pcb, NET_KT_DEFAULT);
284 
285 #define IPSECQ_MAXLEN 256
286 
287 #if IPSEC_NEXUS
288 static int
289 sysctl_if_ipsec_ring_size SYSCTL_HANDLER_ARGS
290 {
291 #pragma unused(arg1, arg2)
292 	int value = if_ipsec_ring_size;
293 
294 	int error = sysctl_handle_int(oidp, &value, 0, req);
295 	if (error || !req->newptr) {
296 		return error;
297 	}
298 
299 	if (value < IPSEC_IF_MIN_RING_SIZE ||
300 	    value > IPSEC_IF_MAX_RING_SIZE) {
301 		return EINVAL;
302 	}
303 
304 	if_ipsec_ring_size = value;
305 
306 	return 0;
307 }
308 
309 static int
310 sysctl_if_ipsec_tx_fsw_ring_size SYSCTL_HANDLER_ARGS
311 {
312 #pragma unused(arg1, arg2)
313 	int value = if_ipsec_tx_fsw_ring_size;
314 
315 	int error = sysctl_handle_int(oidp, &value, 0, req);
316 	if (error || !req->newptr) {
317 		return error;
318 	}
319 
320 	if (value < IPSEC_IF_MIN_RING_SIZE ||
321 	    value > IPSEC_IF_MAX_RING_SIZE) {
322 		return EINVAL;
323 	}
324 
325 	if_ipsec_tx_fsw_ring_size = value;
326 
327 	return 0;
328 }
329 
330 static int
331 sysctl_if_ipsec_rx_fsw_ring_size SYSCTL_HANDLER_ARGS
332 {
333 #pragma unused(arg1, arg2)
334 	int value = if_ipsec_rx_fsw_ring_size;
335 
336 	int error = sysctl_handle_int(oidp, &value, 0, req);
337 	if (error || !req->newptr) {
338 		return error;
339 	}
340 
341 	if (value < IPSEC_IF_MIN_RING_SIZE ||
342 	    value > IPSEC_IF_MAX_RING_SIZE) {
343 		return EINVAL;
344 	}
345 
346 	if_ipsec_rx_fsw_ring_size = value;
347 
348 	return 0;
349 }
350 
351 
352 static inline bool
ipsec_in_wmm_mode(struct ipsec_pcb * pcb)353 ipsec_in_wmm_mode(struct ipsec_pcb *pcb)
354 {
355 	return pcb->ipsec_kpipe_count == IPSEC_IF_WMM_RING_COUNT;
356 }
357 
358 #endif // IPSEC_NEXUS
359 
360 errno_t
ipsec_register_control(void)361 ipsec_register_control(void)
362 {
363 	struct kern_ctl_reg     kern_ctl;
364 	errno_t                 result = 0;
365 
366 #if (DEVELOPMENT || DEBUG)
367 	(void)PE_parse_boot_argn("ipsec_kpipe_mbuf", &ipsec_kpipe_mbuf,
368 	    sizeof(ipsec_kpipe_mbuf));
369 #endif /* DEVELOPMENT || DEBUG */
370 
371 #if IPSEC_NEXUS
372 	ipsec_register_nexus();
373 #endif // IPSEC_NEXUS
374 
375 	TAILQ_INIT(&ipsec_head);
376 
377 	bzero(&kern_ctl, sizeof(kern_ctl));
378 	strlcpy(kern_ctl.ctl_name, IPSEC_CONTROL_NAME, sizeof(kern_ctl.ctl_name));
379 	kern_ctl.ctl_name[sizeof(kern_ctl.ctl_name) - 1] = 0;
380 	kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED | CTL_FLAG_REG_SETUP; /* Require root */
381 	kern_ctl.ctl_sendsize = 64 * 1024;
382 	kern_ctl.ctl_recvsize = 64 * 1024;
383 	kern_ctl.ctl_setup = ipsec_ctl_setup;
384 	kern_ctl.ctl_bind = ipsec_ctl_bind;
385 	kern_ctl.ctl_connect = ipsec_ctl_connect;
386 	kern_ctl.ctl_disconnect = ipsec_ctl_disconnect;
387 	kern_ctl.ctl_send = ipsec_ctl_send;
388 	kern_ctl.ctl_setopt = ipsec_ctl_setopt;
389 	kern_ctl.ctl_getopt = ipsec_ctl_getopt;
390 
391 	result = ctl_register(&kern_ctl, &ipsec_kctlref);
392 	if (result != 0) {
393 		os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - ctl_register failed: %d\n", result);
394 		return result;
395 	}
396 
397 	/* Register the protocol plumbers */
398 	if ((result = proto_register_plumber(PF_INET, IFNET_FAMILY_IPSEC,
399 	    ipsec_attach_proto, NULL)) != 0) {
400 		os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - proto_register_plumber(PF_INET, IFNET_FAMILY_IPSEC) failed: %d\n",
401 		    result);
402 		ctl_deregister(ipsec_kctlref);
403 		return result;
404 	}
405 
406 	/* Register the protocol plumbers */
407 	if ((result = proto_register_plumber(PF_INET6, IFNET_FAMILY_IPSEC,
408 	    ipsec_attach_proto, NULL)) != 0) {
409 		proto_unregister_plumber(PF_INET, IFNET_FAMILY_IPSEC);
410 		ctl_deregister(ipsec_kctlref);
411 		os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - proto_register_plumber(PF_INET6, IFNET_FAMILY_IPSEC) failed: %d\n",
412 		    result);
413 		return result;
414 	}
415 
416 	return 0;
417 }
418 
419 /* Helpers */
420 int
ipsec_interface_isvalid(ifnet_t interface)421 ipsec_interface_isvalid(ifnet_t interface)
422 {
423 	struct ipsec_pcb *pcb = NULL;
424 
425 	if (interface == NULL) {
426 		return 0;
427 	}
428 
429 	pcb = ifnet_softc(interface);
430 
431 	if (pcb == NULL) {
432 		return 0;
433 	}
434 
435 	/* When ctl disconnects, ipsec_unit is set to 0 */
436 	if (pcb->ipsec_unit == 0) {
437 		return 0;
438 	}
439 
440 	return 1;
441 }
442 
443 #if IPSEC_NEXUS
444 boolean_t
ipsec_interface_needs_netagent(ifnet_t interface)445 ipsec_interface_needs_netagent(ifnet_t interface)
446 {
447 	struct ipsec_pcb *pcb = NULL;
448 
449 	if (interface == NULL) {
450 		return FALSE;
451 	}
452 
453 	pcb = ifnet_softc(interface);
454 
455 	if (pcb == NULL) {
456 		return FALSE;
457 	}
458 
459 	return pcb->ipsec_needs_netagent == true;
460 }
461 #endif // IPSEC_NEXUS
462 
463 static errno_t
ipsec_ifnet_set_attrs(ifnet_t ifp)464 ipsec_ifnet_set_attrs(ifnet_t ifp)
465 {
466 	/* Set flags and additional information. */
467 	ifnet_set_mtu(ifp, 1500);
468 	ifnet_set_flags(ifp, IFF_UP | IFF_MULTICAST | IFF_POINTOPOINT, 0xffff);
469 
470 	/* The interface must generate its own IPv6 LinkLocal address,
471 	 * if possible following the recommendation of RFC2472 to the 64bit interface ID
472 	 */
473 	ifnet_set_eflags(ifp, IFEF_NOAUTOIPV6LL, IFEF_NOAUTOIPV6LL);
474 
475 #if !IPSEC_NEXUS
476 	/* Reset the stats in case as the interface may have been recycled */
477 	struct ifnet_stats_param stats;
478 	bzero(&stats, sizeof(struct ifnet_stats_param));
479 	ifnet_set_stat(ifp, &stats);
480 #endif // !IPSEC_NEXUS
481 
482 	return 0;
483 }
484 
485 #if IPSEC_NEXUS
486 
487 static uuid_t ipsec_nx_dom_prov;
488 
/* Domain-provider init callback; no per-provider setup is needed. */
static errno_t
ipsec_nxdp_init(__unused kern_nexus_domain_provider_t domprov)
{
	return 0;
}
494 
/* Domain-provider teardown callback; nothing to release. */
static void
ipsec_nxdp_fini(__unused kern_nexus_domain_provider_t domprov)
{
	// Ignore
}
500 
501 static errno_t
ipsec_register_nexus(void)502 ipsec_register_nexus(void)
503 {
504 	const struct kern_nexus_domain_provider_init dp_init = {
505 		.nxdpi_version = KERN_NEXUS_DOMAIN_PROVIDER_CURRENT_VERSION,
506 		.nxdpi_flags = 0,
507 		.nxdpi_init = ipsec_nxdp_init,
508 		.nxdpi_fini = ipsec_nxdp_fini
509 	};
510 	errno_t err = 0;
511 
512 	/* ipsec_nxdp_init() is called before this function returns */
513 	err = kern_nexus_register_domain_provider(NEXUS_TYPE_NET_IF,
514 	    (const uint8_t *) "com.apple.ipsec",
515 	    &dp_init, sizeof(dp_init),
516 	    &ipsec_nx_dom_prov);
517 	if (err != 0) {
518 		os_log_error(OS_LOG_DEFAULT, "%s: failed to register domain provider\n", __func__);
519 		return err;
520 	}
521 	return 0;
522 }
523 
524 static errno_t
ipsec_netif_prepare(kern_nexus_t nexus,ifnet_t ifp)525 ipsec_netif_prepare(kern_nexus_t nexus, ifnet_t ifp)
526 {
527 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
528 	pcb->ipsec_netif_nexus = nexus;
529 	return ipsec_ifnet_set_attrs(ifp);
530 }
531 
/* Channel pre-connect callback; always admits the connection. */
static errno_t
ipsec_nexus_pre_connect(kern_nexus_provider_t nxprov,
    proc_t p, kern_nexus_t nexus,
    nexus_port_t nexus_port, kern_channel_t channel, void **ch_ctx)
{
#pragma unused(nxprov, p)
#pragma unused(nexus, nexus_port, channel, ch_ctx)
	return 0;
}
541 
542 static errno_t
ipsec_nexus_connected(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_t channel)543 ipsec_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
544     kern_channel_t channel)
545 {
546 #pragma unused(nxprov, channel)
547 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
548 	boolean_t ok = ifnet_is_attached(pcb->ipsec_ifp, 1);
549 	/* Mark the data path as ready */
550 	if (ok) {
551 		lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
552 		IPSEC_SET_DATA_PATH_READY(pcb);
553 		lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
554 	}
555 	return ok ? 0 : ENXIO;
556 }
557 
/*
 * kpipe channel pre-disconnect: quiesce the data path before the
 * channel state is torn down.
 */
static void
ipsec_nexus_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
    kern_channel_t channel)
{
#pragma unused(nxprov, channel)
	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);

	// A kpipe must still be allocated while its channel disconnects.
	VERIFY(pcb->ipsec_kpipe_count != 0);

	/* Wait until all threads in the data paths are done. */
	ipsec_wait_data_move_drain(pcb);
}
570 
/*
 * netif channel pre-disconnect: quiesce the data path before the
 * channel state is torn down.
 */
static void
ipsec_netif_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
    kern_channel_t channel)
{
#pragma unused(nxprov, channel)
	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);

	/* Wait until all threads in the data paths are done. */
	ipsec_wait_data_move_drain(pcb);
}
581 
582 static void
ipsec_nexus_disconnected(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_t channel)583 ipsec_nexus_disconnected(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
584     kern_channel_t channel)
585 {
586 #pragma unused(nxprov, channel)
587 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
588 	if (pcb->ipsec_netif_nexus == nexus) {
589 		pcb->ipsec_netif_nexus = NULL;
590 	}
591 	ifnet_decr_iorefcnt(pcb->ipsec_ifp);
592 }
593 
594 static errno_t
ipsec_kpipe_ring_init(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_t channel,kern_channel_ring_t ring,boolean_t is_tx_ring,void ** ring_ctx)595 ipsec_kpipe_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
596     kern_channel_t channel, kern_channel_ring_t ring, boolean_t is_tx_ring,
597     void **ring_ctx)
598 {
599 #pragma unused(nxprov)
600 #pragma unused(channel)
601 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
602 	uint8_t ring_idx;
603 
604 	for (ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) {
605 		if (!uuid_compare(channel->ch_info->cinfo_nx_uuid, pcb->ipsec_kpipe_uuid[ring_idx])) {
606 			break;
607 		}
608 	}
609 
610 	if (ring_idx == pcb->ipsec_kpipe_count) {
611 		uuid_string_t uuidstr;
612 		uuid_unparse(channel->ch_info->cinfo_nx_uuid, uuidstr);
613 		os_log_error(OS_LOG_DEFAULT, "%s: %s cannot find channel %s\n", __func__, pcb->ipsec_if_xname, uuidstr);
614 		return ENOENT;
615 	}
616 
617 	*ring_ctx = (void *)(uintptr_t)ring_idx;
618 
619 	if (!is_tx_ring) {
620 		VERIFY(pcb->ipsec_kpipe_rxring[ring_idx] == NULL);
621 		pcb->ipsec_kpipe_rxring[ring_idx] = ring;
622 	} else {
623 		VERIFY(pcb->ipsec_kpipe_txring[ring_idx] == NULL);
624 		pcb->ipsec_kpipe_txring[ring_idx] = ring;
625 	}
626 	return 0;
627 }
628 
629 static void
ipsec_kpipe_ring_fini(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t ring)630 ipsec_kpipe_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
631     kern_channel_ring_t ring)
632 {
633 #pragma unused(nxprov)
634 	bool found = false;
635 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
636 
637 	for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) {
638 		if (pcb->ipsec_kpipe_rxring[i] == ring) {
639 			pcb->ipsec_kpipe_rxring[i] = NULL;
640 			found = true;
641 		} else if (pcb->ipsec_kpipe_txring[i] == ring) {
642 			pcb->ipsec_kpipe_txring[i] = NULL;
643 			found = true;
644 		}
645 	}
646 	VERIFY(found);
647 }
648 
/*
 * kpipe TX sync: the user-space side has queued packets on its TX ring.
 * This driver does not move the data here; it only wakes the netif RX
 * ring, which pulls from the kpipe TX ring during its own sync.
 * Locking: data-move refcount is held across the body; the PCB rwlock is
 * taken shared and released before notifying the ring.
 */
static errno_t
ipsec_kpipe_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
    kern_channel_ring_t tx_ring, uint32_t flags)
{
#pragma unused(nxprov)
#pragma unused(flags)
	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);

	// Bail out quietly if the interface data path is being drained.
	if (!ipsec_data_move_begin(pcb)) {
		os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
		return 0;
	}

	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);

	// No kpipe allocated: nothing to forward.
	if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		ipsec_data_move_end(pcb);
		return 0;
	}

	VERIFY(pcb->ipsec_kpipe_count);

	kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
	if (tx_slot == NULL) {
		// Nothing to write, bail
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		ipsec_data_move_end(pcb);
		return 0;
	}

	// Signal the netif ring to read
	kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring[0];
	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);

	if (rx_ring != NULL) {
		kern_channel_notify(rx_ring, 0);
	}

	ipsec_data_move_end(pcb);
	return 0;
}
691 
/*
 * Encrypt an outbound plaintext mbuf via the IPsec stack for the given
 * interface. Returns the encrypted mbuf (ownership transfers to the
 * caller), or NULL if the packet was consumed, tunneled into the other
 * address family (currently dropped — see TODOs), or freed on error.
 * The caller must not touch 'data' after this call.
 */
static mbuf_t
ipsec_encrypt_mbuf(ifnet_t interface,
    mbuf_t data)
{
	struct ipsec_output_state ipsec_state;
	int error = 0;
	uint32_t af;  // set per address family; not otherwise consumed below

	// Make sure this packet isn't looping through the interface
	if (necp_get_last_interface_index_from_packet(data) == interface->if_index) {
		error = -1;
		goto ipsec_output_err;
	}

	// Mark the interface so NECP can evaluate tunnel policy
	necp_mark_packet_from_interface(data, interface);

	// Dispatch on the IP version nibble of the leading header.
	struct ip *ip = mtod(data, struct ip *);
	u_int ip_version = ip->ip_v;

	switch (ip_version) {
	case 4: {
		af = AF_INET;

		memset(&ipsec_state, 0, sizeof(ipsec_state));
		ipsec_state.m = data;
		ipsec_state.dst = (struct sockaddr *)&ip->ip_dst;
		memset(&ipsec_state.ro, 0, sizeof(ipsec_state.ro));

		error = ipsec4_interface_output(&ipsec_state, interface);
		if (error == 0 && ipsec_state.tunneled == 6) {
			// Tunneled in IPv6 - packet is gone
			// TODO: Don't lose mbuf
			data = NULL;
			goto done;
		}

		// The output path may have replaced the mbuf; track the new one.
		data = ipsec_state.m;
		if (error || data == NULL) {
			if (error) {
				os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec4_output error %d\n", error);
			}
			goto ipsec_output_err;
		}
		goto done;
	}
	case 6: {
		af = AF_INET6;

		// Split the IPv6 header into its own mbuf before ESP processing.
		data = ipsec6_splithdr(data);
		if (data == NULL) {
			os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec6_splithdr returned NULL\n");
			goto ipsec_output_err;
		}

		struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);

		memset(&ipsec_state, 0, sizeof(ipsec_state));
		ipsec_state.m = data;
		ipsec_state.dst = (struct sockaddr *)&ip6->ip6_dst;
		memset(&ipsec_state.ro, 0, sizeof(ipsec_state.ro));

		error = ipsec6_interface_output(&ipsec_state, interface, &ip6->ip6_nxt, ipsec_state.m);
		if (error == 0 && ipsec_state.tunneled == 4) {
			// Tunneled in IPv4 - packet is gone
			// TODO: Don't lose mbuf
			data = NULL;
			goto done;
		}
		// The output path may have replaced the mbuf; track the new one.
		data = ipsec_state.m;
		if (error || data == NULL) {
			if (error) {
				os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec6_output error %d\n", error);
			}
			goto ipsec_output_err;
		}
		goto done;
	}
	default: {
		os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: Received unknown packet version %d\n", ip_version);
		error = -1;
		goto ipsec_output_err;
	}
	}

done:
	return data;

ipsec_output_err:
	// On any failure path, free whatever mbuf we still own.
	if (data) {
		mbuf_freem(data);
	}
	return NULL;
}
786 
787 static errno_t
ipsec_kpipe_sync_rx_mbuf(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t rx_ring,uint32_t flags)788 ipsec_kpipe_sync_rx_mbuf(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
789     kern_channel_ring_t rx_ring, uint32_t flags)
790 {
791 #pragma unused(nxprov)
792 #pragma unused(flags)
793 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
794 	struct kern_channel_ring_stat_increment rx_ring_stats;
795 	uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(rx_ring);
796 
797 	if (!ipsec_data_move_begin(pcb)) {
798 		os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
799 		return 0;
800 	}
801 
802 	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
803 
804 	if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
805 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
806 		ipsec_data_move_end(pcb);
807 		return 0;
808 	}
809 
810 	VERIFY(pcb->ipsec_kpipe_count);
811 	VERIFY(ring_idx <= pcb->ipsec_kpipe_count);
812 
813 	// Reclaim user-released slots
814 	(void) kern_channel_reclaim(rx_ring);
815 
816 	uint32_t avail = kern_channel_available_slot_count(rx_ring);
817 	if (avail == 0) {
818 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
819 		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d no room in rx_ring\n", __func__,
820 		    pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
821 		ipsec_data_move_end(pcb);
822 		return 0;
823 	}
824 
825 	kern_channel_ring_t tx_ring = pcb->ipsec_netif_txring[ring_idx];
826 	if (tx_ring == NULL) {
827 		// Net-If TX ring not set up yet, nothing to read
828 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
829 		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 1\n", __func__,
830 		    pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
831 		ipsec_data_move_end(pcb);
832 		return 0;
833 	}
834 
835 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(pcb->ipsec_netif_nexus)->nif_stats;
836 
837 	// Unlock ipsec before entering ring
838 	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
839 
840 	(void)kr_enter(tx_ring, TRUE);
841 
842 	// Lock again after entering and validate
843 	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
844 	if (tx_ring != pcb->ipsec_netif_txring[ring_idx]) {
845 		// Ring no longer valid
846 		// Unlock first, then exit ring
847 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
848 		kr_exit(tx_ring);
849 		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 2\n", __func__,
850 		    pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
851 		ipsec_data_move_end(pcb);
852 		return 0;
853 	}
854 
855 	struct kern_channel_ring_stat_increment tx_ring_stats;
856 	bzero(&tx_ring_stats, sizeof(tx_ring_stats));
857 	kern_channel_slot_t tx_pslot = NULL;
858 	kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
859 	if (tx_slot == NULL) {
860 		// Nothing to read, don't bother signalling
861 		// Unlock first, then exit ring
862 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
863 		kr_exit(tx_ring);
864 		ipsec_data_move_end(pcb);
865 		return 0;
866 	}
867 
868 	struct kern_pbufpool *rx_pp = rx_ring->ckr_pp;
869 	VERIFY(rx_pp != NULL);
870 	struct kern_pbufpool *tx_pp = tx_ring->ckr_pp;
871 	VERIFY(tx_pp != NULL);
872 	bzero(&rx_ring_stats, sizeof(rx_ring_stats));
873 	kern_channel_slot_t rx_pslot = NULL;
874 	kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL);
875 	kern_packet_t tx_chain_ph = 0;
876 
877 	while (rx_slot != NULL && tx_slot != NULL) {
878 		size_t length = 0;
879 		mbuf_t data = NULL;
880 		errno_t error = 0;
881 
882 		// Allocate rx packet
883 		kern_packet_t rx_ph = 0;
884 		error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
885 		if (__improbable(error != 0)) {
886 			os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: failed to allocate packet\n",
887 			    pcb->ipsec_ifp->if_xname);
888 			break;
889 		}
890 
891 		kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);
892 
893 		if (tx_ph == 0) {
894 			// Advance TX ring
895 			tx_pslot = tx_slot;
896 			tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
897 			kern_pbufpool_free(rx_pp, rx_ph);
898 			continue;
899 		}
900 		(void) kern_channel_slot_detach_packet(tx_ring, tx_slot, tx_ph);
901 		if (tx_chain_ph != 0) {
902 			kern_packet_append(tx_ph, tx_chain_ph);
903 		}
904 		tx_chain_ph = tx_ph;
905 
906 		// Advance TX ring
907 		tx_pslot = tx_slot;
908 		tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
909 
910 		kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL);
911 		VERIFY(tx_buf != NULL);
912 		uint8_t *tx_baddr = kern_buflet_get_data_address(tx_buf);
913 		VERIFY(tx_baddr != NULL);
914 		tx_baddr += kern_buflet_get_data_offset(tx_buf);
915 
916 		bpf_tap_packet_out(pcb->ipsec_ifp, DLT_RAW, tx_ph, NULL, 0);
917 
918 		length = MIN(kern_packet_get_data_length(tx_ph),
919 		    pcb->ipsec_slot_size);
920 
921 		// Increment TX stats
922 		tx_ring_stats.kcrsi_slots_transferred++;
923 		tx_ring_stats.kcrsi_bytes_transferred += length;
924 
925 		if (length > 0) {
926 			error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data);
927 			if (error == 0) {
928 				error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT);
929 				if (error == 0) {
930 					// Encrypt and send packet
931 					lck_mtx_lock(&pcb->ipsec_kpipe_encrypt_lock);
932 					data = ipsec_encrypt_mbuf(pcb->ipsec_ifp, data);
933 					lck_mtx_unlock(&pcb->ipsec_kpipe_encrypt_lock);
934 				} else {
935 					os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
936 					STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
937 					STATS_INC(nifs, NETIF_STATS_DROP);
938 					mbuf_freem(data);
939 					data = NULL;
940 				}
941 			} else {
942 				os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
943 				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
944 				STATS_INC(nifs, NETIF_STATS_DROP);
945 			}
946 		} else {
947 			os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname);
948 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
949 			STATS_INC(nifs, NETIF_STATS_DROP);
950 		}
951 
952 		if (data == NULL) {
953 			os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname);
954 			kern_pbufpool_free(rx_pp, rx_ph);
955 			break;
956 		}
957 
958 		length = mbuf_pkthdr_len(data);
959 		if (length > PP_BUF_SIZE_DEF(rx_pp)) {
960 			// Flush data
961 			mbuf_freem(data);
962 			kern_pbufpool_free(rx_pp, rx_ph);
963 			os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: encrypted packet length %zu > %u\n",
964 			    pcb->ipsec_ifp->if_xname, length, PP_BUF_SIZE_DEF(rx_pp));
965 			continue;
966 		}
967 
968 		// Fillout rx packet
969 		kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL);
970 		VERIFY(rx_buf != NULL);
971 		void *rx_baddr = kern_buflet_get_data_address(rx_buf);
972 		VERIFY(rx_baddr != NULL);
973 
974 		// Copy-in data from mbuf to buflet
975 		mbuf_copydata(data, 0, length, (void *)rx_baddr);
976 		kern_packet_clear_flow_uuid(rx_ph);     // Zero flow id
977 
978 		// Finalize and attach the packet
979 		error = kern_buflet_set_data_offset(rx_buf, 0);
980 		VERIFY(error == 0);
981 		error = kern_buflet_set_data_length(rx_buf, (uint16_t)length);
982 		VERIFY(error == 0);
983 		error = kern_packet_finalize(rx_ph);
984 		VERIFY(error == 0);
985 		error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
986 		VERIFY(error == 0);
987 
988 		STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
989 		STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT);
990 
991 		rx_ring_stats.kcrsi_slots_transferred++;
992 		rx_ring_stats.kcrsi_bytes_transferred += length;
993 
994 		if (!pcb->ipsec_ext_ifdata_stats) {
995 			ifnet_stat_increment_out(pcb->ipsec_ifp, 1, (uint16_t)length, 0);
996 		}
997 
998 		mbuf_freem(data);
999 
1000 		rx_pslot = rx_slot;
1001 		rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
1002 	}
1003 
1004 	if (rx_pslot) {
1005 		kern_channel_advance_slot(rx_ring, rx_pslot);
1006 		kern_channel_increment_ring_net_stats(rx_ring, pcb->ipsec_ifp, &rx_ring_stats);
1007 	}
1008 
1009 	if (tx_chain_ph != 0) {
1010 		kern_pbufpool_free_chain(tx_pp, tx_chain_ph);
1011 	}
1012 
1013 	if (tx_pslot) {
1014 		kern_channel_advance_slot(tx_ring, tx_pslot);
1015 		kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
1016 		(void)kern_channel_reclaim(tx_ring);
1017 	}
1018 
1019 	/* always reenable output */
1020 	errno_t error = ifnet_enable_output(pcb->ipsec_ifp);
1021 	if (error != 0) {
1022 		os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error);
1023 	}
1024 
1025 	// Unlock first, then exit ring
1026 	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
1027 
1028 	if (tx_pslot != NULL) {
1029 		kern_channel_notify(tx_ring, 0);
1030 	}
1031 	kr_exit(tx_ring);
1032 
1033 	ipsec_data_move_end(pcb);
1034 	return 0;
1035 }
1036 
1037 static errno_t
ipsec_encrypt_kpipe_pkt(ifnet_t interface,kern_packet_t sph,kern_packet_t dph)1038 ipsec_encrypt_kpipe_pkt(ifnet_t interface, kern_packet_t sph,
1039     kern_packet_t dph)
1040 {
1041 	uint8_t *sbaddr = NULL;
1042 	int err = 0;
1043 	uint32_t slen = 0;
1044 
1045 	VERIFY(interface != NULL);
1046 	VERIFY(sph != 0);
1047 	VERIFY(dph != 0);
1048 
1049 	kern_buflet_t sbuf = __packet_get_next_buflet(sph, NULL);
1050 	VERIFY(sbuf != NULL);
1051 	slen = __buflet_get_data_length(sbuf);
1052 
1053 	if (__improbable(slen < sizeof(struct ip))) {
1054 		os_log_error(OS_LOG_DEFAULT, "ipsec encrypt kpipe pkt: source "
1055 		    "buffer shorter than ip header, %u\n", slen);
1056 		return EINVAL;
1057 	}
1058 
1059 	MD_BUFLET_ADDR(SK_PTR_ADDR_KPKT(sph), sbaddr);
1060 	struct ip *ip = (struct ip *)(void *)sbaddr;
1061 	ASSERT(IP_HDR_ALIGNED_P(ip));
1062 
1063 	u_int ip_vers = ip->ip_v;
1064 	switch (ip_vers) {
1065 	case IPVERSION: {
1066 		err = ipsec4_interface_kpipe_output(interface, sph, dph);
1067 		if (__improbable(err != 0)) {
1068 			os_log_error(OS_LOG_DEFAULT, "ipsec4 interface kpipe "
1069 			    "output error %d\n", err);
1070 			return err;
1071 		}
1072 		break;
1073 	}
1074 	case 6: {
1075 		err = ipsec6_interface_kpipe_output(interface, sph, dph);
1076 		if (__improbable(err != 0)) {
1077 			os_log_error(OS_LOG_DEFAULT, "ipsec6 interface kpipe "
1078 			    "output error %d\n", err);
1079 			return err;
1080 		}
1081 		break;
1082 	}
1083 	default: {
1084 		os_log_error(OS_LOG_DEFAULT, "received unknown packet version: %d\n",
1085 		    ip_vers);
1086 		return EINVAL;
1087 	}
1088 	}
1089 
1090 	return err;
1091 }
1092 
/*
 * Kernel-pipe RX sync callback (packet path).
 *
 * Drains outbound packets from the matching net-if TX ring, encrypts each
 * one into a freshly allocated packet from the RX pool, and attaches the
 * encrypted packet to the kpipe RX ring for the user-space client to read.
 * Always returns 0; per-packet failures are logged and counted in the
 * netif stats.
 *
 * Locking: takes pcb->ipsec_pcb_lock shared, but drops it around
 * kr_enter() (which can block) and re-validates the TX ring pointer after
 * re-acquiring — the statement ordering here is load-bearing.
 */
static errno_t
ipsec_kpipe_sync_rx_packet(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
    kern_channel_ring_t rx_ring, uint32_t flags)
{
#pragma unused(nxprov)
#pragma unused(flags)
	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
	struct kern_channel_ring_stat_increment rx_ring_stats;
	// Ring context holds the ring index assigned at ring-init time.
	uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(rx_ring);

	// Bail out if the data path has been stopped.
	if (!ipsec_data_move_begin(pcb)) {
		os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
		return 0;
	}

	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);

	if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		ipsec_data_move_end(pcb);
		return 0;
	}

	VERIFY(pcb->ipsec_kpipe_count);
	// NOTE(review): bound check is <=, not < — TODO confirm this is intended.
	VERIFY(ring_idx <= pcb->ipsec_kpipe_count);

	// Reclaim user-released slots
	(void) kern_channel_reclaim(rx_ring);

	uint32_t avail = kern_channel_available_slot_count(rx_ring);
	if (avail == 0) {
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d no room in rx_ring\n", __func__,
		    pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
		ipsec_data_move_end(pcb);
		return 0;
	}

	kern_channel_ring_t tx_ring = pcb->ipsec_netif_txring[ring_idx];
	if (tx_ring == NULL) {
		// Net-If TX ring not set up yet, nothing to read
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 1\n", __func__,
		    pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
		ipsec_data_move_end(pcb);
		return 0;
	}

	struct netif_stats *nifs = &NX_NETIF_PRIVATE(pcb->ipsec_netif_nexus)->nif_stats;

	// Unlock ipsec before entering ring
	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);

	(void)kr_enter(tx_ring, TRUE);

	// Lock again after entering and validate
	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
	if (tx_ring != pcb->ipsec_netif_txring[ring_idx]) {
		// Ring no longer valid
		// Unlock first, then exit ring
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		kr_exit(tx_ring);
		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 2\n", __func__,
		    pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
		ipsec_data_move_end(pcb);
		return 0;
	}

	struct kern_channel_ring_stat_increment tx_ring_stats;
	bzero(&tx_ring_stats, sizeof(tx_ring_stats));
	kern_channel_slot_t tx_pslot = NULL;
	kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
	if (tx_slot == NULL) {
		// Nothing to read, don't bother signalling
		// Unlock first, then exit ring
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		kr_exit(tx_ring);
		ipsec_data_move_end(pcb);
		return 0;
	}

	struct kern_pbufpool *rx_pp = rx_ring->ckr_pp;
	VERIFY(rx_pp != NULL);
	struct kern_pbufpool *tx_pp = tx_ring->ckr_pp;
	VERIFY(tx_pp != NULL);
	bzero(&rx_ring_stats, sizeof(rx_ring_stats));
	kern_channel_slot_t rx_pslot = NULL;
	kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL);
	// Detached TX packets are chained so they can be freed in one call.
	kern_packet_t tx_chain_ph = 0;

	// Process packets while both a TX packet and a free RX slot remain.
	while (rx_slot != NULL && tx_slot != NULL) {
		size_t tx_pkt_length = 0;
		errno_t error = 0;

		// Allocate rx packet
		kern_packet_t rx_ph = 0;
		error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
		if (__improbable(error != 0)) {
			os_log_info(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: "
			    "failed to allocate packet\n", pcb->ipsec_ifp->if_xname);
			STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
			STATS_INC(nifs, NETIF_STATS_DROP);
			break;
		}

		kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);
		if (__improbable(tx_ph == 0)) {
			// Advance TX ring
			tx_pslot = tx_slot;
			tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
			// Empty slot: return the unused RX packet to its pool.
			kern_pbufpool_free(rx_pp, rx_ph);
			continue;
		}

		// Detach from the slot and prepend to the free chain.
		(void) kern_channel_slot_detach_packet(tx_ring, tx_slot, tx_ph);
		if (tx_chain_ph != 0) {
			kern_packet_append(tx_ph, tx_chain_ph);
		}
		tx_chain_ph = tx_ph;

		// Advance TX ring
		tx_pslot = tx_slot;
		tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);

		// Tap outbound (cleartext) packet for BPF listeners.
		bpf_tap_packet_out(pcb->ipsec_ifp, DLT_RAW, tx_ph, NULL, 0);

		tx_pkt_length = kern_packet_get_data_length(tx_ph);
		if (tx_pkt_length == 0 || tx_pkt_length > pcb->ipsec_slot_size) {
			os_log_info(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: "
			    "packet length %zu", pcb->ipsec_ifp->if_xname,
			    tx_pkt_length);
			kern_pbufpool_free(rx_pp, rx_ph);
			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
			STATS_INC(nifs, NETIF_STATS_DROP);
			continue;
		}

		// Increment TX stats
		tx_ring_stats.kcrsi_slots_transferred++;
		tx_ring_stats.kcrsi_bytes_transferred += tx_pkt_length;

		// Encrypt packet
		lck_mtx_lock(&pcb->ipsec_kpipe_encrypt_lock);
		error = ipsec_encrypt_kpipe_pkt(pcb->ipsec_ifp, tx_ph, rx_ph);
		lck_mtx_unlock(&pcb->ipsec_kpipe_encrypt_lock);
		if (__improbable(error != 0)) {
			os_log_info(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: "
			    "failed to encrypt packet", pcb->ipsec_ifp->if_xname);
			kern_pbufpool_free(rx_pp, rx_ph);
			STATS_INC(nifs, NETIF_STATS_DROP);
			continue;
		}

		kern_packet_clear_flow_uuid(rx_ph);         // Zero flow id
		// Finalize and attach the packet
		kern_buflet_t rx_buf = __packet_get_next_buflet(rx_ph, NULL);
		error = kern_buflet_set_data_offset(rx_buf, 0);
		VERIFY(error == 0);
		error = kern_packet_finalize(rx_ph);
		VERIFY(error == 0);
		error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
		VERIFY(error == 0);

		STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
		STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT);

		rx_ring_stats.kcrsi_slots_transferred++;
		rx_ring_stats.kcrsi_bytes_transferred += kern_packet_get_data_length(rx_ph);

		// Update interface-level counters unless stats are external.
		if (!pcb->ipsec_ext_ifdata_stats) {
			ifnet_stat_increment_out(pcb->ipsec_ifp, 1,
			    kern_packet_get_data_length(rx_ph), 0);
		}

		rx_pslot = rx_slot;
		rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
	}

	// Publish consumed RX slots and their stats.
	if (rx_pslot) {
		kern_channel_advance_slot(rx_ring, rx_pslot);
		kern_channel_increment_ring_net_stats(rx_ring, pcb->ipsec_ifp, &rx_ring_stats);
	}

	// Free all detached TX packets in one pass.
	if (tx_chain_ph != 0) {
		kern_pbufpool_free_chain(tx_pp, tx_chain_ph);
	}

	// Publish consumed TX slots and reclaim them.
	if (tx_pslot) {
		kern_channel_advance_slot(tx_ring, tx_pslot);
		kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
		(void)kern_channel_reclaim(tx_ring);
	}

	/* always reenable output */
	errno_t error = ifnet_enable_output(pcb->ipsec_ifp);
	if (error != 0) {
		os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error);
	}

	// Unlock first, then exit ring
	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);

	if (tx_pslot != NULL) {
		kern_channel_notify(tx_ring, 0);
	}
	kr_exit(tx_ring);

	ipsec_data_move_end(pcb);
	return 0;
}
1303 
1304 static errno_t
ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t rx_ring,uint32_t flags)1305 ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
1306     kern_channel_ring_t rx_ring, uint32_t flags)
1307 {
1308 	if (__improbable(ipsec_kpipe_mbuf == 1)) {
1309 		return ipsec_kpipe_sync_rx_mbuf(nxprov, nexus, rx_ring, flags);
1310 	} else {
1311 		return ipsec_kpipe_sync_rx_packet(nxprov, nexus, rx_ring, flags);
1312 	}
1313 }
1314 
1315 static uint8_t
ipsec_find_tx_ring_by_svc(kern_packet_svc_class_t svc_class)1316 ipsec_find_tx_ring_by_svc(kern_packet_svc_class_t svc_class)
1317 {
1318 	switch (svc_class) {
1319 	case KPKT_SC_VO: {
1320 		return 0;
1321 	}
1322 	case KPKT_SC_VI: {
1323 		return 1;
1324 	}
1325 	case KPKT_SC_BE: {
1326 		return 2;
1327 	}
1328 	case KPKT_SC_BK: {
1329 		return 3;
1330 	}
1331 	default: {
1332 		VERIFY(0);
1333 		return 0;
1334 	}
1335 	}
1336 }
1337 
1338 static errno_t
ipsec_netif_ring_init(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_t channel,kern_channel_ring_t ring,boolean_t is_tx_ring,void ** ring_ctx)1339 ipsec_netif_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
1340     kern_channel_t channel, kern_channel_ring_t ring, boolean_t is_tx_ring,
1341     void **ring_ctx)
1342 {
1343 #pragma unused(nxprov)
1344 #pragma unused(channel)
1345 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
1346 
1347 	if (!is_tx_ring) {
1348 		VERIFY(pcb->ipsec_netif_rxring[0] == NULL);
1349 		pcb->ipsec_netif_rxring[0] = ring;
1350 	} else {
1351 		uint8_t ring_idx = 0;
1352 		if (ipsec_in_wmm_mode(pcb)) {
1353 			int err;
1354 			kern_packet_svc_class_t svc_class;
1355 			err = kern_channel_get_service_class(ring, &svc_class);
1356 			VERIFY(err == 0);
1357 			ring_idx = ipsec_find_tx_ring_by_svc(svc_class);
1358 			VERIFY(ring_idx < IPSEC_IF_WMM_RING_COUNT);
1359 		}
1360 
1361 		*ring_ctx = (void *)(uintptr_t)ring_idx;
1362 
1363 		VERIFY(pcb->ipsec_netif_txring[ring_idx] == NULL);
1364 		pcb->ipsec_netif_txring[ring_idx] = ring;
1365 	}
1366 	return 0;
1367 }
1368 
1369 static void
ipsec_netif_ring_fini(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t ring)1370 ipsec_netif_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
1371     kern_channel_ring_t ring)
1372 {
1373 #pragma unused(nxprov)
1374 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
1375 	bool found = false;
1376 
1377 	for (int i = 0; i < IPSEC_NETIF_MAX_RX_RING_COUNT; i++) {
1378 		if (pcb->ipsec_netif_rxring[i] == ring) {
1379 			pcb->ipsec_netif_rxring[i] = NULL;
1380 			VERIFY(!found);
1381 			found = true;
1382 		}
1383 	}
1384 	for (int i = 0; i < IPSEC_NETIF_MAX_TX_RING_COUNT; i++) {
1385 		if (pcb->ipsec_netif_txring[i] == ring) {
1386 			pcb->ipsec_netif_txring[i] = NULL;
1387 			VERIFY(!found);
1388 			found = true;
1389 		}
1390 	}
1391 	VERIFY(found);
1392 }
1393 
1394 static bool
ipsec_netif_check_policy(ifnet_t interface,mbuf_t data)1395 ipsec_netif_check_policy(ifnet_t interface, mbuf_t data)
1396 {
1397 	necp_kernel_policy_result necp_result = 0;
1398 	necp_kernel_policy_result_parameter necp_result_parameter = {};
1399 	uint32_t necp_matched_policy_id = 0;
1400 	struct ip_out_args args4 = { };
1401 	struct ip6_out_args args6 = { };
1402 
1403 	// This packet has been marked with IP level policy, do not mark again.
1404 	if (data && data->m_pkthdr.necp_mtag.necp_policy_id >= NECP_KERNEL_POLICY_ID_FIRST_VALID_IP) {
1405 		return true;
1406 	}
1407 
1408 	size_t length = mbuf_pkthdr_len(data);
1409 	if (length < sizeof(struct ip)) {
1410 		return false;
1411 	}
1412 
1413 	struct ip *ip = mtod(data, struct ip *);
1414 	u_int ip_version = ip->ip_v;
1415 	switch (ip_version) {
1416 	case 4: {
1417 		if (interface != NULL) {
1418 			args4.ipoa_flags |= IPOAF_BOUND_IF;
1419 			args4.ipoa_boundif = interface->if_index;
1420 		}
1421 		necp_matched_policy_id = necp_ip_output_find_policy_match(data, IP_OUTARGS, &args4, NULL,
1422 		    &necp_result, &necp_result_parameter);
1423 		break;
1424 	}
1425 	case 6: {
1426 		if (interface != NULL) {
1427 			args6.ip6oa_flags |= IP6OAF_BOUND_IF;
1428 			args6.ip6oa_boundif = interface->if_index;
1429 		}
1430 		necp_matched_policy_id = necp_ip6_output_find_policy_match(data, IPV6_OUTARGS, &args6, NULL,
1431 		    &necp_result, &necp_result_parameter);
1432 		break;
1433 	}
1434 	default: {
1435 		return false;
1436 	}
1437 	}
1438 
1439 	if (necp_result == NECP_KERNEL_POLICY_RESULT_DROP ||
1440 	    necp_result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT) {
1441 		/* Drop and flow divert packets should be blocked at the IP layer */
1442 		return false;
1443 	}
1444 
1445 	necp_mark_packet_from_ip(data, necp_matched_policy_id);
1446 	return true;
1447 }
1448 
/*
 * Net-if TX sync callback.
 *
 * If a kernel pipe is attached, the packets stay in the TX ring and the
 * corresponding kpipe RX ring is merely notified so the user-space client
 * drains them.  Otherwise each packet is copied into an mbuf, checked
 * against NECP policy, and injected into the BSD stack via ipsec_output()
 * for in-kernel encryption.  Always returns 0; per-packet failures are
 * logged and counted.
 */
static errno_t
ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
    kern_channel_ring_t tx_ring, uint32_t flags)
{
#pragma unused(nxprov)
#pragma unused(flags)
	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);

	struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats;

	// Bail out if the data path has been stopped.
	if (!ipsec_data_move_begin(pcb)) {
		os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
		return 0;
	}

	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);

	struct kern_channel_ring_stat_increment tx_ring_stats;
	bzero(&tx_ring_stats, sizeof(tx_ring_stats));
	kern_channel_slot_t tx_pslot = NULL;
	kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
	// Detached TX packets are chained so they can be freed in one call.
	kern_packet_t tx_chain_ph = 0;

	STATS_INC(nifs, NETIF_STATS_TX_SYNC);

	if (tx_slot == NULL) {
		// Nothing to write, don't bother signalling
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		ipsec_data_move_end(pcb);
		return 0;
	}

	// Kpipe attached: leave packets in place, just wake the kpipe reader.
	if (pcb->ipsec_kpipe_count &&
	    ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
		// Select the corresponding kpipe rx ring
		uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(tx_ring);
		VERIFY(ring_idx < IPSEC_IF_MAX_RING_COUNT);
		kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring[ring_idx];

		// Unlock while calling notify
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);

		// Signal the kernel pipe ring to read
		if (rx_ring != NULL) {
			kern_channel_notify(rx_ring, 0);
		}

		ipsec_data_move_end(pcb);
		return 0;
	}

	// If we're here, we're injecting into the BSD stack
	while (tx_slot != NULL) {
		size_t length = 0;
		mbuf_t data = NULL;

		kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);

		if (tx_ph == 0) {
			// Advance TX ring
			tx_pslot = tx_slot;
			tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
			continue;
		}
		// Detach from the slot and prepend to the free chain.
		(void) kern_channel_slot_detach_packet(tx_ring, tx_slot, tx_ph);
		if (tx_chain_ph != 0) {
			kern_packet_append(tx_ph, tx_chain_ph);
		}
		tx_chain_ph = tx_ph;

		// Advance TX ring
		tx_pslot = tx_slot;
		tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);

		kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL);
		VERIFY(tx_buf != NULL);
		uint8_t *tx_baddr = kern_buflet_get_data_address(tx_buf);
		VERIFY(tx_baddr != 0);
		tx_baddr += kern_buflet_get_data_offset(tx_buf);

		// Tap outbound (cleartext) packet for BPF listeners.
		bpf_tap_packet_out(pcb->ipsec_ifp, DLT_RAW, tx_ph, NULL, 0);

		// Cap the copy at the configured slot size.
		length = MIN(kern_packet_get_data_length(tx_ph),
		    pcb->ipsec_slot_size);

		if (length > 0) {
			errno_t error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data);
			if (error == 0) {
				// Copy the packet payload into the fresh mbuf.
				error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT);
				if (error == 0) {
					// Mark packet from policy
					uint32_t policy_id = kern_packet_get_policy_id(tx_ph);
					necp_mark_packet_from_ip(data, policy_id);

					// Check policy with NECP
					if (!ipsec_netif_check_policy(pcb->ipsec_ifp, data)) {
						os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - failed policy check\n", pcb->ipsec_ifp->if_xname);
						STATS_INC(nifs, NETIF_STATS_DROP);
						mbuf_freem(data);
						data = NULL;
					} else {
						// Send through encryption
						// ipsec_output consumes the mbuf in all cases.
						error = ipsec_output(pcb->ipsec_ifp, data);
						if (error != 0) {
							os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - ipsec_output error %d\n", pcb->ipsec_ifp->if_xname, error);
						}
					}
				} else {
					os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
					STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
					STATS_INC(nifs, NETIF_STATS_DROP);
					mbuf_freem(data);
					data = NULL;
				}
			} else {
				os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
				STATS_INC(nifs, NETIF_STATS_DROP);
			}
		} else {
			os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname);
			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
			STATS_INC(nifs, NETIF_STATS_DROP);
		}

		// A NULL mbuf at this point means the packet was dropped above.
		if (data == NULL) {
			os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname);
			break;
		}

		STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
		STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);

		tx_ring_stats.kcrsi_slots_transferred++;
		tx_ring_stats.kcrsi_bytes_transferred += length;
	}

	// Free all detached TX packets in one pass.
	if (tx_chain_ph != 0) {
		kern_pbufpool_free_chain(tx_ring->ckr_pp, tx_chain_ph);
	}

	// Publish consumed TX slots, their stats, and reclaim them.
	if (tx_pslot) {
		kern_channel_advance_slot(tx_ring, tx_pslot);
		kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
		(void)kern_channel_reclaim(tx_ring);
	}

	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
	ipsec_data_move_end(pcb);

	return 0;
}
1601 
/*
 * Service a TX doorbell for a single net-if TX ring.
 *
 * Refills/syncs the ring, then (when a kernel pipe is attached) applies
 * flow control — disabling interface output when the ring is effectively
 * full — and notifies the matching kpipe RX ring so the user-space client
 * drains it.  Returns ENXIO if the ring has been torn down underneath us,
 * 0 otherwise.
 *
 * Locking: kr_enter() is taken before ipsec_pcb_lock, and the ring pointer
 * is re-validated after both are held — the ordering is load-bearing.
 */
static errno_t
ipsec_netif_tx_doorbell_one(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
    kern_channel_ring_t ring, uint32_t flags, uint8_t ring_idx)
{
#pragma unused(nxprov)
	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
	boolean_t more = false;
	errno_t rc = 0;

	VERIFY((flags & KERN_NEXUS_TXDOORBELLF_ASYNC_REFILL) == 0);

	/*
	 * Refill and sync the ring; we may be racing against another thread doing
	 * an RX sync that also wants to do kr_enter(), and so use the blocking
	 * variant here.
	 */
	rc = kern_channel_tx_refill_canblock(ring, UINT32_MAX, UINT32_MAX, true, &more);
	if (rc != 0 && rc != EAGAIN && rc != EBUSY) {
		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s tx refill failed %d\n", __func__,
		    pcb->ipsec_if_xname, ring->ckr_name, rc);
	}

	(void) kr_enter(ring, TRUE);
	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
	if (ring != pcb->ipsec_netif_txring[ring_idx]) {
		// ring no longer valid
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		kr_exit(ring);
		os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 3\n", __func__,
		    pcb->ipsec_if_xname, ring->ckr_name, ring_idx);
		return ENXIO;
	}

	// Flow control: stop output when the TX ring has (almost) no free room.
	if (pcb->ipsec_kpipe_count) {
		uint32_t tx_available = kern_channel_available_slot_count(ring);
		if (pcb->ipsec_netif_txring_size > 0 &&
		    tx_available >= pcb->ipsec_netif_txring_size - 1) {
			// No room left in tx ring, disable output for now
			errno_t error = ifnet_disable_output(pcb->ipsec_ifp);
			if (error != 0) {
				os_log_error(OS_LOG_DEFAULT, "ipsec_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error);
			}
		}
	}

	if (pcb->ipsec_kpipe_count) {
		kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring[ring_idx];

		// Unlock while calling notify
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		// Signal the kernel pipe ring to read
		if (rx_ring != NULL) {
			kern_channel_notify(rx_ring, 0);
		}
	} else {
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
	}

	kr_exit(ring);

	return 0;
}
1664 
1665 static errno_t
ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t ring,__unused uint32_t flags)1666 ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
1667     kern_channel_ring_t ring, __unused uint32_t flags)
1668 {
1669 	errno_t ret = 0;
1670 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
1671 
1672 	if (!ipsec_data_move_begin(pcb)) {
1673 		os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
1674 		return 0;
1675 	}
1676 
1677 	if (ipsec_in_wmm_mode(pcb)) {
1678 		for (uint8_t i = 0; i < IPSEC_IF_WMM_RING_COUNT; i++) {
1679 			kern_channel_ring_t nring = pcb->ipsec_netif_txring[i];
1680 			ret = ipsec_netif_tx_doorbell_one(nxprov, nexus, nring, flags, i);
1681 			if (ret) {
1682 				break;
1683 			}
1684 		}
1685 	} else {
1686 		ret = ipsec_netif_tx_doorbell_one(nxprov, nexus, ring, flags, 0);
1687 	}
1688 
1689 	ipsec_data_move_end(pcb);
1690 	return ret;
1691 }
1692 
1693 static errno_t
ipsec_netif_sync_rx_mbuf(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t rx_ring,uint32_t flags)1694 ipsec_netif_sync_rx_mbuf(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
1695     kern_channel_ring_t rx_ring, uint32_t flags)
1696 {
1697 #pragma unused(nxprov)
1698 #pragma unused(flags)
1699 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
1700 	struct kern_channel_ring_stat_increment rx_ring_stats;
1701 
1702 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats;
1703 
1704 	if (!ipsec_data_move_begin(pcb)) {
1705 		os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
1706 		return 0;
1707 	}
1708 
1709 	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
1710 
1711 	// Reclaim user-released slots
1712 	(void) kern_channel_reclaim(rx_ring);
1713 
1714 	STATS_INC(nifs, NETIF_STATS_RX_SYNC);
1715 
1716 	uint32_t avail = kern_channel_available_slot_count(rx_ring);
1717 	if (avail == 0) {
1718 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
1719 		ipsec_data_move_end(pcb);
1720 		return 0;
1721 	}
1722 
1723 	struct kern_pbufpool *rx_pp = rx_ring->ckr_pp;
1724 	VERIFY(rx_pp != NULL);
1725 	bzero(&rx_ring_stats, sizeof(rx_ring_stats));
1726 	kern_channel_slot_t rx_pslot = NULL;
1727 	kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL);
1728 
1729 	while (rx_slot != NULL) {
1730 		// Check for a waiting packet
1731 		lck_mtx_lock(&pcb->ipsec_input_chain_lock);
1732 		mbuf_t data = pcb->ipsec_input_chain;
1733 		if (data == NULL) {
1734 			lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
1735 			break;
1736 		}
1737 
1738 		// Allocate rx packet
1739 		kern_packet_t rx_ph = 0;
1740 		errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
1741 		if (__improbable(error != 0)) {
1742 			STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
1743 			STATS_INC(nifs, NETIF_STATS_DROP);
1744 			lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
1745 			break;
1746 		}
1747 
1748 		// Advance waiting packets
1749 		if (pcb->ipsec_input_chain_count > 0) {
1750 			pcb->ipsec_input_chain_count--;
1751 		}
1752 		pcb->ipsec_input_chain = data->m_nextpkt;
1753 		data->m_nextpkt = NULL;
1754 		if (pcb->ipsec_input_chain == NULL) {
1755 			pcb->ipsec_input_chain_last = NULL;
1756 		}
1757 		lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
1758 
1759 		size_t length = mbuf_pkthdr_len(data);
1760 
1761 		if (length < sizeof(struct ip)) {
1762 			// Flush data
1763 			mbuf_freem(data);
1764 			kern_pbufpool_free(rx_pp, rx_ph);
1765 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1766 			STATS_INC(nifs, NETIF_STATS_DROP);
1767 			os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy decrypted packet length cannot hold IP %zu < %zu\n",
1768 			    pcb->ipsec_ifp->if_xname, length, sizeof(struct ip));
1769 			continue;
1770 		}
1771 
1772 		uint32_t af = 0;
1773 		struct ip *ip = mtod(data, struct ip *);
1774 		u_int ip_version = ip->ip_v;
1775 		switch (ip_version) {
1776 		case 4: {
1777 			af = AF_INET;
1778 			break;
1779 		}
1780 		case 6: {
1781 			af = AF_INET6;
1782 			break;
1783 		}
1784 		default: {
1785 			os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy unknown ip version %u\n",
1786 			    pcb->ipsec_ifp->if_xname, ip_version);
1787 			break;
1788 		}
1789 		}
1790 
1791 		if (length > PP_BUF_SIZE_DEF(rx_pp) ||
1792 		    (pcb->ipsec_frag_size_set && length > pcb->ipsec_input_frag_size)) {
1793 			// We need to fragment to send up into the netif
1794 
1795 			u_int32_t fragment_mtu = PP_BUF_SIZE_DEF(rx_pp);
1796 			if (pcb->ipsec_frag_size_set &&
1797 			    pcb->ipsec_input_frag_size < PP_BUF_SIZE_DEF(rx_pp)) {
1798 				fragment_mtu = pcb->ipsec_input_frag_size;
1799 			}
1800 
1801 			mbuf_t fragment_chain = NULL;
1802 			switch (af) {
1803 			case AF_INET: {
1804 				// ip_fragment expects the length in host order
1805 				ip->ip_len = ntohs(ip->ip_len);
1806 
1807 				// ip_fragment will modify the original data, don't free
1808 				int fragment_error = ip_fragment(data, pcb->ipsec_ifp, fragment_mtu, TRUE);
1809 				if (fragment_error == 0 && data != NULL) {
1810 					fragment_chain = data;
1811 				} else {
1812 					STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1813 					STATS_INC(nifs, NETIF_STATS_DROP);
1814 					os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv4 packet of length %zu (%d)\n",
1815 					    pcb->ipsec_ifp->if_xname, length, fragment_error);
1816 				}
1817 				break;
1818 			}
1819 			case AF_INET6: {
1820 				if (length < sizeof(struct ip6_hdr)) {
1821 					mbuf_freem(data);
1822 					STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1823 					STATS_INC(nifs, NETIF_STATS_DROP);
1824 					os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu < %zu\n",
1825 					    pcb->ipsec_ifp->if_xname, length, sizeof(struct ip6_hdr));
1826 				} else {
1827 					// ip6_do_fragmentation will free the original data on success only
1828 					struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);
1829 
1830 					int fragment_error = ip6_do_fragmentation(&data, 0, pcb->ipsec_ifp, sizeof(struct ip6_hdr),
1831 					    ip6, NULL, fragment_mtu, ip6->ip6_nxt, htonl(ip6_randomid()));
1832 					if (fragment_error == 0 && data != NULL) {
1833 						fragment_chain = data;
1834 					} else {
1835 						mbuf_freem(data);
1836 						STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1837 						STATS_INC(nifs, NETIF_STATS_DROP);
1838 						os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu (%d)\n",
1839 						    pcb->ipsec_ifp->if_xname, length, fragment_error);
1840 					}
1841 				}
1842 				break;
1843 			}
1844 			default: {
1845 				// Cannot fragment unknown families
1846 				mbuf_freem(data);
1847 				STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1848 				STATS_INC(nifs, NETIF_STATS_DROP);
1849 				os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: uknown legacy decrypted packet length %zu > %u\n",
1850 				    pcb->ipsec_ifp->if_xname, length, PP_BUF_SIZE_DEF(rx_pp));
1851 				break;
1852 			}
1853 			}
1854 
1855 			if (fragment_chain != NULL) {
1856 				// Add fragments to chain before continuing
1857 				lck_mtx_lock(&pcb->ipsec_input_chain_lock);
1858 				if (pcb->ipsec_input_chain != NULL) {
1859 					pcb->ipsec_input_chain_last->m_nextpkt = fragment_chain;
1860 				} else {
1861 					pcb->ipsec_input_chain = fragment_chain;
1862 				}
1863 				pcb->ipsec_input_chain_count++;
1864 				while (fragment_chain->m_nextpkt) {
1865 					VERIFY(fragment_chain != fragment_chain->m_nextpkt);
1866 					fragment_chain = fragment_chain->m_nextpkt;
1867 					pcb->ipsec_input_chain_count++;
1868 				}
1869 				pcb->ipsec_input_chain_last = fragment_chain;
1870 				lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
1871 			}
1872 
1873 			// Make sure to free unused rx packet
1874 			kern_pbufpool_free(rx_pp, rx_ph);
1875 
1876 			continue;
1877 		}
1878 
1879 		mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp);
1880 
1881 		// Fillout rx packet
1882 		kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL);
1883 		VERIFY(rx_buf != NULL);
1884 		void *rx_baddr = kern_buflet_get_data_address(rx_buf);
1885 		VERIFY(rx_baddr != NULL);
1886 
1887 		// Copy-in data from mbuf to buflet
1888 		mbuf_copydata(data, 0, length, (void *)rx_baddr);
1889 		kern_packet_clear_flow_uuid(rx_ph);     // Zero flow id
1890 
1891 		// Finalize and attach the packet
1892 		error = kern_buflet_set_data_offset(rx_buf, 0);
1893 		VERIFY(error == 0);
1894 		error = kern_buflet_set_data_length(rx_buf, (uint16_t)length);
1895 		VERIFY(error == 0);
1896 		error = kern_packet_set_headroom(rx_ph, 0);
1897 		VERIFY(error == 0);
1898 		error = kern_packet_finalize(rx_ph);
1899 		VERIFY(error == 0);
1900 		error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
1901 		VERIFY(error == 0);
1902 
1903 		STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
1904 		STATS_INC(nifs, NETIF_STATS_RX_COPY_MBUF);
1905 		bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
1906 
1907 		rx_ring_stats.kcrsi_slots_transferred++;
1908 		rx_ring_stats.kcrsi_bytes_transferred += length;
1909 
1910 		if (!pcb->ipsec_ext_ifdata_stats) {
1911 			ifnet_stat_increment_in(pcb->ipsec_ifp, 1, (uint16_t)length, 0);
1912 		}
1913 
1914 		mbuf_freem(data);
1915 
1916 		// Advance ring
1917 		rx_pslot = rx_slot;
1918 		rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
1919 	}
1920 
1921 	for (uint8_t ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) {
1922 		struct kern_channel_ring_stat_increment tx_ring_stats;
1923 		bzero(&tx_ring_stats, sizeof(tx_ring_stats));
1924 		kern_channel_ring_t tx_ring = pcb->ipsec_kpipe_txring[ring_idx];
1925 		kern_channel_slot_t tx_pslot = NULL;
1926 		kern_channel_slot_t tx_slot = NULL;
1927 		if (tx_ring == NULL) {
1928 			// Net-If TX ring not set up yet, nothing to read
1929 			goto done;
1930 		}
1931 		// Unlock ipsec before entering ring
1932 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
1933 
1934 		(void)kr_enter(tx_ring, TRUE);
1935 
1936 		// Lock again after entering and validate
1937 		lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
1938 
1939 		if (tx_ring != pcb->ipsec_kpipe_txring[ring_idx]) {
1940 			goto done;
1941 		}
1942 
1943 		tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
1944 		if (tx_slot == NULL) {
1945 			// Nothing to read, don't bother signalling
1946 			goto done;
1947 		}
1948 
1949 		while (rx_slot != NULL && tx_slot != NULL) {
1950 			size_t length = 0;
1951 			mbuf_t data = NULL;
1952 			errno_t error = 0;
1953 			uint32_t af;
1954 
1955 			// Allocate rx packet
1956 			kern_packet_t rx_ph = 0;
1957 			error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
1958 			if (__improbable(error != 0)) {
1959 				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
1960 				STATS_INC(nifs, NETIF_STATS_DROP);
1961 				break;
1962 			}
1963 
1964 			kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);
1965 
1966 			// Advance TX ring
1967 			tx_pslot = tx_slot;
1968 			tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
1969 
1970 			if (tx_ph == 0) {
1971 				kern_pbufpool_free(rx_pp, rx_ph);
1972 				continue;
1973 			}
1974 
1975 			kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL);
1976 			VERIFY(tx_buf != NULL);
1977 			uint8_t *tx_baddr = kern_buflet_get_data_address(tx_buf);
1978 			VERIFY(tx_baddr != 0);
1979 			tx_baddr += kern_buflet_get_data_offset(tx_buf);
1980 
1981 			length = MIN(kern_packet_get_data_length(tx_ph),
1982 			    pcb->ipsec_slot_size);
1983 
1984 			// Increment TX stats
1985 			tx_ring_stats.kcrsi_slots_transferred++;
1986 			tx_ring_stats.kcrsi_bytes_transferred += length;
1987 
1988 			if (length >= sizeof(struct ip)) {
1989 				error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data);
1990 				if (error == 0) {
1991 					error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT);
1992 					if (error == 0) {
1993 						// Check for wake packet flag
1994 						uuid_t flow_uuid;
1995 						kern_packet_get_flow_uuid(tx_ph, &flow_uuid);
1996 						u_int8_t *id_8 = (u_int8_t *)(uintptr_t)flow_uuid;
1997 						if ((id_8[0] & IPSEC_KPIPE_FLAG_WAKE_PKT) == IPSEC_KPIPE_FLAG_WAKE_PKT) {
1998 							os_log_info(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: wake packet flag is set\n",
1999 							    pcb->ipsec_ifp->if_xname);
2000 							data->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
2001 						}
2002 
2003 						lck_mtx_lock(&pcb->ipsec_kpipe_decrypt_lock);
2004 						struct ip *ip = mtod(data, struct ip *);
2005 						u_int ip_version = ip->ip_v;
2006 						switch (ip_version) {
2007 						case 4: {
2008 							af = AF_INET;
2009 							ip->ip_len = ntohs(ip->ip_len) - sizeof(struct ip);
2010 							ip->ip_off = ntohs(ip->ip_off);
2011 
2012 							if (length < ip->ip_len) {
2013 								os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv4 packet length too short (%zu < %u)\n",
2014 								    pcb->ipsec_ifp->if_xname, length, ip->ip_len);
2015 								STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2016 								STATS_INC(nifs, NETIF_STATS_DROP);
2017 								mbuf_freem(data);
2018 								data = NULL;
2019 							} else {
2020 								data = esp4_input_extended(data, sizeof(struct ip), pcb->ipsec_ifp);
2021 							}
2022 							break;
2023 						}
2024 						case 6: {
2025 							if (length < sizeof(struct ip6_hdr)) {
2026 								os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv6 packet length too short for header %zu\n",
2027 								    pcb->ipsec_ifp->if_xname, length);
2028 								STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2029 								STATS_INC(nifs, NETIF_STATS_DROP);
2030 								mbuf_freem(data);
2031 								data = NULL;
2032 							} else {
2033 								af = AF_INET6;
2034 								struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);
2035 								const size_t ip6_len = sizeof(*ip6) + ntohs(ip6->ip6_plen);
2036 								if (length < ip6_len) {
2037 									os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv6 packet length too short (%zu < %zu)\n",
2038 									    pcb->ipsec_ifp->if_xname, length, ip6_len);
2039 									STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2040 									STATS_INC(nifs, NETIF_STATS_DROP);
2041 									mbuf_freem(data);
2042 									data = NULL;
2043 								} else {
2044 									int offset = sizeof(struct ip6_hdr);
2045 									esp6_input_extended(&data, &offset, ip6->ip6_nxt, pcb->ipsec_ifp);
2046 								}
2047 							}
2048 							break;
2049 						}
2050 						default: {
2051 							os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: unknown ip version %u\n",
2052 							    pcb->ipsec_ifp->if_xname, ip_version);
2053 							STATS_INC(nifs, NETIF_STATS_DROP);
2054 							mbuf_freem(data);
2055 							data = NULL;
2056 							break;
2057 						}
2058 						}
2059 						lck_mtx_unlock(&pcb->ipsec_kpipe_decrypt_lock);
2060 					} else {
2061 						os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
2062 						STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
2063 						STATS_INC(nifs, NETIF_STATS_DROP);
2064 						mbuf_freem(data);
2065 						data = NULL;
2066 					}
2067 				} else {
2068 					os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
2069 					STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
2070 					STATS_INC(nifs, NETIF_STATS_DROP);
2071 				}
2072 			} else {
2073 				os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - bad packet length %zu\n", pcb->ipsec_ifp->if_xname, length);
2074 				STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2075 				STATS_INC(nifs, NETIF_STATS_DROP);
2076 			}
2077 
2078 			if (data == NULL) {
2079 				// Failed to get decrypted data data
2080 				kern_pbufpool_free(rx_pp, rx_ph);
2081 				continue;
2082 			}
2083 
2084 			length = mbuf_pkthdr_len(data);
2085 			if (length > PP_BUF_SIZE_DEF(rx_pp)) {
2086 				// Flush data
2087 				mbuf_freem(data);
2088 				kern_pbufpool_free(rx_pp, rx_ph);
2089 				STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2090 				STATS_INC(nifs, NETIF_STATS_DROP);
2091 				os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: decrypted packet length %zu > %u\n",
2092 				    pcb->ipsec_ifp->if_xname, length, PP_BUF_SIZE_DEF(rx_pp));
2093 				continue;
2094 			}
2095 
2096 			mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp);
2097 
2098 			// Fillout rx packet
2099 			kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL);
2100 			VERIFY(rx_buf != NULL);
2101 			void *rx_baddr = kern_buflet_get_data_address(rx_buf);
2102 			VERIFY(rx_baddr != NULL);
2103 
2104 			// Copy-in data from mbuf to buflet
2105 			mbuf_copydata(data, 0, length, (void *)rx_baddr);
2106 			kern_packet_clear_flow_uuid(rx_ph);     // Zero flow id
2107 
2108 			// Finalize and attach the packet
2109 			error = kern_buflet_set_data_offset(rx_buf, 0);
2110 			VERIFY(error == 0);
2111 			error = kern_buflet_set_data_length(rx_buf, (uint16_t)length);
2112 			VERIFY(error == 0);
2113 			error = kern_packet_set_link_header_offset(rx_ph, 0);
2114 			VERIFY(error == 0);
2115 			error = kern_packet_set_network_header_offset(rx_ph, 0);
2116 			VERIFY(error == 0);
2117 			error = kern_packet_finalize(rx_ph);
2118 			VERIFY(error == 0);
2119 			error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
2120 			VERIFY(error == 0);
2121 
2122 			STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
2123 			STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
2124 			bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
2125 
2126 			rx_ring_stats.kcrsi_slots_transferred++;
2127 			rx_ring_stats.kcrsi_bytes_transferred += length;
2128 
2129 			if (!pcb->ipsec_ext_ifdata_stats) {
2130 				ifnet_stat_increment_in(pcb->ipsec_ifp, 1, (uint16_t)length, 0);
2131 			}
2132 
2133 			mbuf_freem(data);
2134 
2135 			rx_pslot = rx_slot;
2136 			rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
2137 		}
2138 
2139 done:
2140 		if (tx_pslot) {
2141 			kern_channel_advance_slot(tx_ring, tx_pslot);
2142 			kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
2143 			(void)kern_channel_reclaim(tx_ring);
2144 		}
2145 
2146 		// Unlock first, then exit ring
2147 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
2148 		if (tx_ring != NULL) {
2149 			if (tx_pslot != NULL) {
2150 				kern_channel_notify(tx_ring, 0);
2151 			}
2152 			kr_exit(tx_ring);
2153 		}
2154 
2155 		lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
2156 	}
2157 
2158 	if (rx_pslot) {
2159 		kern_channel_advance_slot(rx_ring, rx_pslot);
2160 		kern_channel_increment_ring_net_stats(rx_ring, pcb->ipsec_ifp, &rx_ring_stats);
2161 	}
2162 
2163 
2164 	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
2165 
2166 	ipsec_data_move_end(pcb);
2167 	return 0;
2168 }
2169 
static errno_t
ipsec_transform_kpipe_pkt_to_netif_pkt(struct ipsec_pcb *pcb,
    struct kern_channel_ring_stat_increment *tx_ring_stats,
    struct netif_stats *nifs, kern_packet_t kpipe_ph, kern_packet_t netif_ph)
{
	/*
	 * Decrypt one ESP packet taken from a kpipe TX slot (kpipe_ph) into a
	 * freshly allocated netif RX packet (netif_ph):
	 *   1. validate the encrypted buffer length against the pcb slot size
	 *      and the minimum IP header size,
	 *   2. copy the (v4 or v6) IP header into the netif buflet,
	 *   3. run esp_kpipe_input() under the decrypt mutex to fill in the
	 *      decrypted payload,
	 *   4. propagate the wake-packet flag and finalize the netif packet.
	 *
	 * Returns 0 on success; on any failure bumps NETIF_STATS_DROP (plus a
	 * more specific drop counter where applicable) and returns an errno.
	 * The caller owns both packets and frees netif_ph on error.
	 */
	kern_buflet_t kpipe_buf = NULL, netif_buf = NULL;
	uint8_t *kpipe_baddr = NULL, *netif_baddr = NULL;
	uuid_t flow_uuid;
	size_t iphlen = 0;
	uint32_t kpipe_buf_len = 0, netif_buf_lim = 0;
	int err = 0;

	VERIFY(kpipe_ph != 0);
	VERIFY(netif_ph != 0);
	VERIFY(pcb != NULL);
	VERIFY(tx_ring_stats != NULL);
	VERIFY(nifs != NULL);

	// Locate the encrypted source data within the kpipe buflet
	kpipe_buf = kern_packet_get_next_buflet(kpipe_ph, NULL);
	VERIFY(kpipe_buf != NULL);
	kpipe_baddr = kern_buflet_get_data_address(kpipe_buf);
	VERIFY(kpipe_baddr != NULL);
	kpipe_baddr += kern_buflet_get_data_offset(kpipe_buf);
	kpipe_buf_len = kern_buflet_get_data_length(kpipe_buf);

	// Locate the destination area and compute the writable limit of the
	// netif buflet (limit minus current data offset)
	netif_buf = kern_packet_get_next_buflet(netif_ph, NULL);
	VERIFY(netif_buf != NULL);
	netif_baddr = kern_buflet_get_data_address(netif_buf);
	VERIFY(netif_baddr != NULL);
	netif_baddr += kern_buflet_get_data_offset(netif_buf);
	netif_buf_lim = __buflet_get_data_limit(netif_buf);
	netif_buf_lim -= __buflet_get_data_offset(netif_buf);

	// Reject packets larger than the negotiated slot size
	if (kpipe_buf_len > pcb->ipsec_slot_size) {
		os_log_info(OS_LOG_DEFAULT,
		    "ipsec_transform_kpipe_pkt_to_netif_pkt %s: kpipe buffer length "
		    "%u > pcb ipsec slot size %u", pcb->ipsec_ifp->if_xname,
		    kpipe_buf_len, pcb->ipsec_slot_size);
		STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
		err = EMSGSIZE;
		goto bad;
	}

	// Account the slot/bytes as consumed from the TX ring even if the
	// packet is later dropped for other reasons
	tx_ring_stats->kcrsi_slots_transferred++;
	tx_ring_stats->kcrsi_bytes_transferred += kpipe_buf_len;

	// Must at least hold an IPv4 header to inspect the version nibble
	if (__improbable(kpipe_buf_len < sizeof(struct ip))) {
		os_log_info(OS_LOG_DEFAULT, "ipsec_transform_kpipe_pkt_to_netif_pkt %s - bad "
		    "packet length %u\n", pcb->ipsec_ifp->if_xname, kpipe_buf_len);
		STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
		err = EBADMSG;
		goto bad;
	}

	struct ip *ip = (struct ip *)(void *)kpipe_baddr;
	ASSERT(IP_HDR_ALIGNED_P(ip));

	// Derive the IP header length from the version field
	u_int ip_vers = ip->ip_v;
	switch (ip_vers) {
	case IPVERSION: {
#ifdef _IP_VHL
		iphlen = IP_VHL_HL(ip->ip_vhl) << 2;
#else /* _IP_VHL */
		iphlen = ip->ip_hl << 2;
#endif /* _IP_VHL */
		break;
	}
	case 6: {
		// IPv6: fixed header only; extension headers are left to ESP input
		iphlen = sizeof(struct ip6_hdr);
		break;
	}
	default: {
		os_log_info(OS_LOG_DEFAULT, "ipsec_transform_kpipe_pkt_to_netif_pkt %s - bad "
		    "ip version %u\n", pcb->ipsec_ifp->if_xname, ip_vers);
		err = EBADMSG;
		goto bad;
	}
	}

	// The source must contain the full IP header...
	if (__improbable(kpipe_buf_len < iphlen)) {
		os_log_info(OS_LOG_DEFAULT, "ipsec_transform_kpipe_pkt_to_netif_pkt %s - bad "
		    "packet length %u\n", pcb->ipsec_ifp->if_xname, kpipe_buf_len);
		STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
		err = EBADMSG;
		goto bad;
	}

	// ...and the destination must have room for it
	if (__improbable(netif_buf_lim < iphlen)) {
		os_log_info(OS_LOG_DEFAULT, "ipsec_transform_kpipe_pkt_to_netif_pkt %s - netif "
		    "buffer length %u too short\n", pcb->ipsec_ifp->if_xname, netif_buf_lim);
		STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
		err = EBADMSG;
		goto bad;
	}

	// Seed the netif packet with the IP header; esp_kpipe_input() appends
	// the decrypted payload after it
	memcpy(netif_baddr, kpipe_baddr, iphlen);
	__buflet_set_data_length(netif_buf, (uint16_t)iphlen);

	// Serialize decryption across kpipe rings
	lck_mtx_lock(&pcb->ipsec_kpipe_decrypt_lock);
	err = esp_kpipe_input(pcb->ipsec_ifp, kpipe_ph, netif_ph);
	lck_mtx_unlock(&pcb->ipsec_kpipe_decrypt_lock);

	if (__improbable((err != 0))) {
		goto bad;
	}

	// The wake-packet indication rides in the first byte of the flow uuid
	kern_packet_get_flow_uuid(kpipe_ph, &flow_uuid);
	uint8_t *id_8 = (uint8_t *)(uintptr_t)flow_uuid;
	if (__improbable((id_8[0] & IPSEC_KPIPE_FLAG_WAKE_PKT) == IPSEC_KPIPE_FLAG_WAKE_PKT)) {
		os_log_info(OS_LOG_DEFAULT, "ipsec_transform_kpipe_pkt_to_netif_pkt %s: wake packet "
		    "flag is set\n", pcb->ipsec_ifp->if_xname);
		__packet_set_wake_flag(netif_ph);
	}

	// Finalize: zero the flow id and normalize all offsets to 0
	kern_packet_clear_flow_uuid(netif_ph);
	err = kern_buflet_set_data_offset(netif_buf, 0);
	VERIFY(err == 0);
	err = kern_packet_set_link_header_offset(netif_ph, 0);
	VERIFY(err == 0);
	err = kern_packet_set_network_header_offset(netif_ph, 0);
	VERIFY(err == 0);
	err = kern_packet_finalize(netif_ph);
	VERIFY(err == 0);

	return 0;
bad:
	STATS_INC(nifs, NETIF_STATS_DROP);
	return err;
}
2299 
2300 
2301 static errno_t
ipsec_netif_sync_rx_packet(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t rx_ring,uint32_t flags)2302 ipsec_netif_sync_rx_packet(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
2303     kern_channel_ring_t rx_ring, uint32_t flags)
2304 {
2305 #pragma unused(nxprov)
2306 #pragma unused(flags)
2307 	struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
2308 	struct kern_channel_ring_stat_increment rx_ring_stats;
2309 
2310 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats;
2311 
2312 	if (!ipsec_data_move_begin(pcb)) {
2313 		os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
2314 		return 0;
2315 	}
2316 
2317 	lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
2318 
2319 	// Reclaim user-released slots
2320 	(void) kern_channel_reclaim(rx_ring);
2321 
2322 	STATS_INC(nifs, NETIF_STATS_RX_SYNC);
2323 
2324 	uint32_t avail = kern_channel_available_slot_count(rx_ring);
2325 	if (avail == 0) {
2326 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
2327 		ipsec_data_move_end(pcb);
2328 		return 0;
2329 	}
2330 
2331 	struct kern_pbufpool *rx_pp = rx_ring->ckr_pp;
2332 	VERIFY(rx_pp != NULL);
2333 	bzero(&rx_ring_stats, sizeof(rx_ring_stats));
2334 	kern_channel_slot_t rx_pslot = NULL;
2335 	kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL);
2336 
2337 	while (rx_slot != NULL) {
2338 		// Check for a waiting packet
2339 		lck_mtx_lock(&pcb->ipsec_input_chain_lock);
2340 		mbuf_t data = pcb->ipsec_input_chain;
2341 		if (data == NULL) {
2342 			lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
2343 			break;
2344 		}
2345 
2346 		// Allocate rx packet
2347 		kern_packet_t rx_ph = 0;
2348 		errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
2349 		if (__improbable(error != 0)) {
2350 			STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
2351 			STATS_INC(nifs, NETIF_STATS_DROP);
2352 			lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
2353 			break;
2354 		}
2355 
2356 		// Advance waiting packets
2357 		if (pcb->ipsec_input_chain_count > 0) {
2358 			pcb->ipsec_input_chain_count--;
2359 		}
2360 		pcb->ipsec_input_chain = data->m_nextpkt;
2361 		data->m_nextpkt = NULL;
2362 		if (pcb->ipsec_input_chain == NULL) {
2363 			pcb->ipsec_input_chain_last = NULL;
2364 		}
2365 		lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
2366 
2367 		size_t length = mbuf_pkthdr_len(data);
2368 
2369 		if (length < sizeof(struct ip)) {
2370 			// Flush data
2371 			mbuf_freem(data);
2372 			kern_pbufpool_free(rx_pp, rx_ph);
2373 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2374 			STATS_INC(nifs, NETIF_STATS_DROP);
2375 			os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy decrypted packet length cannot hold IP %zu < %zu\n",
2376 			    pcb->ipsec_ifp->if_xname, length, sizeof(struct ip));
2377 			continue;
2378 		}
2379 
2380 		uint32_t af = 0;
2381 		struct ip *ip = mtod(data, struct ip *);
2382 		u_int ip_version = ip->ip_v;
2383 		switch (ip_version) {
2384 		case 4: {
2385 			af = AF_INET;
2386 			break;
2387 		}
2388 		case 6: {
2389 			af = AF_INET6;
2390 			break;
2391 		}
2392 		default: {
2393 			os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy unknown ip version %u\n",
2394 			    pcb->ipsec_ifp->if_xname, ip_version);
2395 			break;
2396 		}
2397 		}
2398 
2399 		if (length > PP_BUF_SIZE_DEF(rx_pp) ||
2400 		    (pcb->ipsec_frag_size_set && length > pcb->ipsec_input_frag_size)) {
2401 			// We need to fragment to send up into the netif
2402 
2403 			u_int32_t fragment_mtu = PP_BUF_SIZE_DEF(rx_pp);
2404 			if (pcb->ipsec_frag_size_set &&
2405 			    pcb->ipsec_input_frag_size < PP_BUF_SIZE_DEF(rx_pp)) {
2406 				fragment_mtu = pcb->ipsec_input_frag_size;
2407 			}
2408 
2409 			mbuf_t fragment_chain = NULL;
2410 			switch (af) {
2411 			case AF_INET: {
2412 				// ip_fragment expects the length in host order
2413 				ip->ip_len = ntohs(ip->ip_len);
2414 
2415 				// ip_fragment will modify the original data, don't free
2416 				int fragment_error = ip_fragment(data, pcb->ipsec_ifp, fragment_mtu, TRUE);
2417 				if (fragment_error == 0 && data != NULL) {
2418 					fragment_chain = data;
2419 				} else {
2420 					STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2421 					STATS_INC(nifs, NETIF_STATS_DROP);
2422 					os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv4 packet of length %zu (%d)\n",
2423 					    pcb->ipsec_ifp->if_xname, length, fragment_error);
2424 				}
2425 				break;
2426 			}
2427 			case AF_INET6: {
2428 				if (length < sizeof(struct ip6_hdr)) {
2429 					mbuf_freem(data);
2430 					STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2431 					STATS_INC(nifs, NETIF_STATS_DROP);
2432 					os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu < %zu\n",
2433 					    pcb->ipsec_ifp->if_xname, length, sizeof(struct ip6_hdr));
2434 				} else {
2435 					// ip6_do_fragmentation will free the original data on success only
2436 					struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);
2437 
2438 					int fragment_error = ip6_do_fragmentation(&data, 0, pcb->ipsec_ifp, sizeof(struct ip6_hdr),
2439 					    ip6, NULL, fragment_mtu, ip6->ip6_nxt, htonl(ip6_randomid()));
2440 					if (fragment_error == 0 && data != NULL) {
2441 						fragment_chain = data;
2442 					} else {
2443 						mbuf_freem(data);
2444 						STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2445 						STATS_INC(nifs, NETIF_STATS_DROP);
2446 						os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu (%d)\n",
2447 						    pcb->ipsec_ifp->if_xname, length, fragment_error);
2448 					}
2449 				}
2450 				break;
2451 			}
2452 			default: {
2453 				// Cannot fragment unknown families
2454 				mbuf_freem(data);
2455 				STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
2456 				STATS_INC(nifs, NETIF_STATS_DROP);
2457 				os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: uknown legacy decrypted packet length %zu > %u\n",
2458 				    pcb->ipsec_ifp->if_xname, length, PP_BUF_SIZE_DEF(rx_pp));
2459 				break;
2460 			}
2461 			}
2462 
2463 			if (fragment_chain != NULL) {
2464 				// Add fragments to chain before continuing
2465 				lck_mtx_lock(&pcb->ipsec_input_chain_lock);
2466 				if (pcb->ipsec_input_chain != NULL) {
2467 					pcb->ipsec_input_chain_last->m_nextpkt = fragment_chain;
2468 				} else {
2469 					pcb->ipsec_input_chain = fragment_chain;
2470 				}
2471 				pcb->ipsec_input_chain_count++;
2472 				while (fragment_chain->m_nextpkt) {
2473 					VERIFY(fragment_chain != fragment_chain->m_nextpkt);
2474 					fragment_chain = fragment_chain->m_nextpkt;
2475 					pcb->ipsec_input_chain_count++;
2476 				}
2477 				pcb->ipsec_input_chain_last = fragment_chain;
2478 				lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
2479 			}
2480 
2481 			// Make sure to free unused rx packet
2482 			kern_pbufpool_free(rx_pp, rx_ph);
2483 
2484 			continue;
2485 		}
2486 
2487 		mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp);
2488 
2489 		// Fillout rx packet
2490 		kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL);
2491 		VERIFY(rx_buf != NULL);
2492 		void *rx_baddr = kern_buflet_get_data_address(rx_buf);
2493 		VERIFY(rx_baddr != NULL);
2494 
2495 		// Copy-in data from mbuf to buflet
2496 		mbuf_copydata(data, 0, length, (void *)rx_baddr);
2497 		kern_packet_clear_flow_uuid(rx_ph);         // Zero flow id
2498 
2499 		// Finalize and attach the packet
2500 		error = kern_buflet_set_data_offset(rx_buf, 0);
2501 		VERIFY(error == 0);
2502 		error = kern_buflet_set_data_length(rx_buf, (uint16_t)length);
2503 		VERIFY(error == 0);
2504 		error = kern_packet_set_headroom(rx_ph, 0);
2505 		VERIFY(error == 0);
2506 		error = kern_packet_finalize(rx_ph);
2507 		VERIFY(error == 0);
2508 		error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
2509 		VERIFY(error == 0);
2510 
2511 		STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
2512 		STATS_INC(nifs, NETIF_STATS_RX_COPY_MBUF);
2513 		bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
2514 
2515 		rx_ring_stats.kcrsi_slots_transferred++;
2516 		rx_ring_stats.kcrsi_bytes_transferred += length;
2517 
2518 		if (!pcb->ipsec_ext_ifdata_stats) {
2519 			ifnet_stat_increment_in(pcb->ipsec_ifp, 1, (uint16_t)length, 0);
2520 		}
2521 
2522 		mbuf_freem(data);
2523 
2524 		// Advance ring
2525 		rx_pslot = rx_slot;
2526 		rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
2527 	}
2528 
2529 	for (uint8_t ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) {
2530 		struct kern_channel_ring_stat_increment tx_ring_stats = {};
2531 		kern_channel_slot_t tx_pslot = NULL;
2532 		kern_channel_slot_t tx_slot = NULL;
2533 
2534 		kern_channel_ring_t tx_ring = pcb->ipsec_kpipe_txring[ring_idx];
2535 		if (tx_ring == NULL) {
2536 			// Net-If TX ring not set up yet, nothing to read
2537 			goto done;
2538 		}
2539 
2540 		// Unlock ipsec before entering ring
2541 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
2542 
2543 		(void)kr_enter(tx_ring, TRUE);
2544 
2545 		// Lock again after entering and validate
2546 		lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
2547 
2548 		if (tx_ring != pcb->ipsec_kpipe_txring[ring_idx]) {
2549 			goto done;
2550 		}
2551 
2552 		tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
2553 		if (tx_slot == NULL) {
2554 			// Nothing to read, don't bother signalling
2555 			goto done;
2556 		}
2557 
2558 		while (rx_slot != NULL && tx_slot != NULL) {
2559 			errno_t error = 0;
2560 
2561 			// Allocate rx packet
2562 			kern_packet_t rx_ph = 0;
2563 			error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
2564 			if (__improbable(error != 0)) {
2565 				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
2566 				STATS_INC(nifs, NETIF_STATS_DROP);
2567 				break;
2568 			}
2569 
2570 			kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);
2571 			tx_pslot = tx_slot;
2572 			tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
2573 			if (tx_ph == 0) {
2574 				kern_pbufpool_free(rx_pp, rx_ph);
2575 				continue;
2576 			}
2577 
2578 			error = ipsec_transform_kpipe_pkt_to_netif_pkt(pcb,
2579 			    &tx_ring_stats, nifs, tx_ph, rx_ph);
2580 			if (error != 0) {
2581 				// Failed to get decrypted packet
2582 				kern_pbufpool_free(rx_pp, rx_ph);
2583 				continue;
2584 			}
2585 
2586 			error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
2587 			VERIFY(error == 0);
2588 
2589 			STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
2590 			STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
2591 
2592 			bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
2593 
2594 			rx_ring_stats.kcrsi_slots_transferred++;
2595 			rx_ring_stats.kcrsi_bytes_transferred += kern_packet_get_data_length(rx_ph);
2596 
2597 			if (!pcb->ipsec_ext_ifdata_stats) {
2598 				ifnet_stat_increment_in(pcb->ipsec_ifp, 1,
2599 				    kern_packet_get_data_length(rx_ph), 0);
2600 			}
2601 
2602 			rx_pslot = rx_slot;
2603 			rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
2604 		}
2605 
2606 done:
2607 		if (tx_pslot) {
2608 			kern_channel_advance_slot(tx_ring, tx_pslot);
2609 			kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
2610 			(void)kern_channel_reclaim(tx_ring);
2611 		}
2612 
2613 		// Unlock first, then exit ring
2614 		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
2615 		if (tx_ring != NULL) {
2616 			if (tx_pslot != NULL) {
2617 				kern_channel_notify(tx_ring, 0);
2618 			}
2619 			kr_exit(tx_ring);
2620 		}
2621 
2622 		lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
2623 	}
2624 
2625 	if (rx_pslot) {
2626 		kern_channel_advance_slot(rx_ring, rx_pslot);
2627 		kern_channel_increment_ring_net_stats(rx_ring, pcb->ipsec_ifp, &rx_ring_stats);
2628 	}
2629 
2630 
2631 	lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
2632 
2633 	ipsec_data_move_end(pcb);
2634 	return 0;
2635 }
2636 
2637 static errno_t
ipsec_netif_sync_rx(kern_nexus_provider_t nxprov,kern_nexus_t nexus,kern_channel_ring_t rx_ring,uint32_t flags)2638 ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
2639     kern_channel_ring_t rx_ring, uint32_t flags)
2640 {
2641 	if (__improbable(ipsec_kpipe_mbuf == 1)) {
2642 		return ipsec_netif_sync_rx_mbuf(nxprov, nexus, rx_ring, flags);
2643 	} else {
2644 		return ipsec_netif_sync_rx_packet(nxprov, nexus, rx_ring, flags);
2645 	}
2646 }
2647 
/*
 * ipsec_nexus_ifattach - create the netif nexus provider and instance that
 * back a Skywalk-native ipsec interface, and attach the ifnet.
 *
 * On success, *ifp is the newly attached interface and
 * pcb->ipsec_nx.if_provider / if_instance hold the nexus UUIDs.
 * On failure, any partially created state (nexus attributes, packet pool,
 * provider registration) is torn down and a non-zero errno is returned.
 */
static errno_t
ipsec_nexus_ifattach(struct ipsec_pcb *pcb,
    struct ifnet_init_eparams *init_params,
    struct ifnet **ifp)
{
	errno_t err;
	nexus_controller_t controller = kern_nexus_shared_controller();
	struct kern_nexus_net_init net_init;
	struct kern_pbufpool_init pp_init;

	/* Provider name is derived from the interface name, e.g. "com.apple.netif.ipsec0" */
	nexus_name_t provider_name;
	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.netif.%s", pcb->ipsec_if_xname);

	/* Callbacks the nexus invokes for channel lifecycle and data-path sync */
	struct kern_nexus_provider_init prov_init = {
		.nxpi_version = KERN_NEXUS_DOMAIN_PROVIDER_CURRENT_VERSION,
		.nxpi_flags = NXPIF_VIRTUAL_DEVICE,
		.nxpi_pre_connect = ipsec_nexus_pre_connect,
		.nxpi_connected = ipsec_nexus_connected,
		.nxpi_pre_disconnect = ipsec_netif_pre_disconnect,
		.nxpi_disconnected = ipsec_nexus_disconnected,
		.nxpi_ring_init = ipsec_netif_ring_init,
		.nxpi_ring_fini = ipsec_netif_ring_fini,
		.nxpi_slot_init = NULL,
		.nxpi_slot_fini = NULL,
		.nxpi_sync_tx = ipsec_netif_sync_tx,
		.nxpi_sync_rx = ipsec_netif_sync_rx,
		.nxpi_tx_doorbell = ipsec_netif_tx_doorbell,
	};

	nexus_attr_t nxa = NULL;
	err = kern_nexus_attr_create(&nxa);
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
		    __func__, err);
		goto failed;
	}

	uint64_t slot_buffer_size = pcb->ipsec_slot_size;
	err = kern_nexus_attr_set(nxa, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size);
	VERIFY(err == 0);

	// Reset ring size for netif nexus to limit memory usage
	uint64_t ring_size = pcb->ipsec_netif_ring_size;
	err = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_SLOTS, ring_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size);
	VERIFY(err == 0);

	assert(err == 0);

	if (ipsec_in_wmm_mode(pcb)) {
		os_log(OS_LOG_DEFAULT, "%s: %s enabling wmm mode\n",
		    __func__, pcb->ipsec_if_xname);

		/* WMM mode uses multiple rings with driver-managed scheduling */
		init_params->output_sched_model = IFNET_SCHED_MODEL_DRIVER_MANAGED;

		err = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_RINGS,
		    IPSEC_NETIF_WMM_TX_RING_COUNT);
		VERIFY(err == 0);
		err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_RINGS,
		    IPSEC_NETIF_WMM_RX_RING_COUNT);
		VERIFY(err == 0);

		err = kern_nexus_attr_set(nxa, NEXUS_ATTR_QMAP, NEXUS_QMAP_TYPE_WMM);
		VERIFY(err == 0);
	}

	pcb->ipsec_netif_txring_size = ring_size;

	bzero(&pp_init, sizeof(pp_init));
	pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
	pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;
	// Note: we need more packets than can be held in the tx and rx rings because
	// packets can also be in the AQM queue(s)
	pp_init.kbi_packets = pcb->ipsec_netif_ring_size * (2 * pcb->ipsec_kpipe_count + 1);
	pp_init.kbi_bufsize = pcb->ipsec_slot_size;
	pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE;
	pp_init.kbi_max_frags = 1;
	(void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name),
	    "%s", provider_name);
	pp_init.kbi_ctx = NULL;
	pp_init.kbi_ctx_retain = NULL;
	pp_init.kbi_ctx_release = NULL;

	err = kern_pbufpool_create(&pp_init, &pcb->ipsec_netif_pp, NULL);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s pbufbool create failed, error %d\n", __func__, err);
		goto failed;
	}

	err = kern_nexus_controller_register_provider(controller,
	    ipsec_nx_dom_prov,
	    provider_name,
	    &prov_init,
	    sizeof(prov_init),
	    nxa,
	    &pcb->ipsec_nx.if_provider);
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s register provider failed, error %d\n",
		    __func__, err);
		goto failed;
	}

	/* The same packet pool backs both RX and TX of the netif instance */
	bzero(&net_init, sizeof(net_init));
	net_init.nxneti_version = KERN_NEXUS_NET_CURRENT_VERSION;
	net_init.nxneti_flags = 0;
	net_init.nxneti_eparams = init_params;
	net_init.nxneti_lladdr = NULL;
	net_init.nxneti_prepare = ipsec_netif_prepare;
	net_init.nxneti_rx_pbufpool = pcb->ipsec_netif_pp;
	net_init.nxneti_tx_pbufpool = pcb->ipsec_netif_pp;
	err = kern_nexus_controller_alloc_net_provider_instance(controller,
	    pcb->ipsec_nx.if_provider,
	    pcb,
	    NULL,
	    &pcb->ipsec_nx.if_instance,
	    &net_init,
	    ifp);
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s alloc_net_provider_instance failed, %d\n",
		    __func__, err);
		/* Undo the provider registration made above */
		kern_nexus_controller_deregister_provider(controller,
		    pcb->ipsec_nx.if_provider);
		uuid_clear(pcb->ipsec_nx.if_provider);
		goto failed;
	}

failed:
	/* Common exit: attributes are always freed; the pool only on error */
	if (nxa) {
		kern_nexus_attr_destroy(nxa);
	}
	if (err && pcb->ipsec_netif_pp != NULL) {
		kern_pbufpool_destroy(pcb->ipsec_netif_pp);
		pcb->ipsec_netif_pp = NULL;
	}
	return err;
}
2789 
2790 static void
ipsec_detach_provider_and_instance(uuid_t provider,uuid_t instance)2791 ipsec_detach_provider_and_instance(uuid_t provider, uuid_t instance)
2792 {
2793 	nexus_controller_t controller = kern_nexus_shared_controller();
2794 	errno_t err;
2795 
2796 	if (!uuid_is_null(instance)) {
2797 		err = kern_nexus_controller_free_provider_instance(controller,
2798 		    instance);
2799 		if (err != 0) {
2800 			os_log_error(OS_LOG_DEFAULT, "%s free_provider_instance failed %d\n",
2801 			    __func__, err);
2802 		}
2803 		uuid_clear(instance);
2804 	}
2805 	if (!uuid_is_null(provider)) {
2806 		err = kern_nexus_controller_deregister_provider(controller,
2807 		    provider);
2808 		if (err != 0) {
2809 			os_log_error(OS_LOG_DEFAULT, "%s deregister_provider %d\n", __func__, err);
2810 		}
2811 		uuid_clear(provider);
2812 	}
2813 	return;
2814 }
2815 
/*
 * Tear down all nexus state for an ipsec interface: detach the flowswitch
 * from the netif device port, free/deregister both the flowswitch and netif
 * providers and instances, destroy the netif packet pool, and clear the
 * stored UUIDs.
 */
static void
ipsec_nexus_detach(struct ipsec_pcb *pcb)
{
	ipsec_nx_t nx = &pcb->ipsec_nx;
	nexus_controller_t controller = kern_nexus_shared_controller();
	errno_t err;

	/* Detach the flowswitch from the netif device port first */
	if (!uuid_is_null(nx->fsw_device)) {
		err = kern_nexus_ifdetach(controller,
		    nx->fsw_instance,
		    nx->fsw_device);
		if (err != 0) {
			os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms device failed %d\n",
			    __func__, err);
		}
	}

	/* Free each instance before deregistering its provider */
	ipsec_detach_provider_and_instance(nx->fsw_provider,
	    nx->fsw_instance);
	ipsec_detach_provider_and_instance(nx->if_provider,
	    nx->if_instance);

	if (pcb->ipsec_netif_pp != NULL) {
		kern_pbufpool_destroy(pcb->ipsec_netif_pp);
		pcb->ipsec_netif_pp = NULL;
	}
	/* Wipe all remaining nexus UUIDs/state in the pcb */
	memset(nx, 0, sizeof(*nx));
}
2844 
/*
 * Register a flowswitch nexus provider named "com.apple.<type_name>.<ifname>"
 * and allocate one instance of it.  On success *provider and *instance hold
 * the new UUIDs; on failure both are left null (the provider is deregistered
 * if instance allocation fails) and a non-zero errno is returned.
 */
static errno_t
ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb,
    const char *type_name,
    const char *ifname,
    uuid_t *provider, uuid_t *instance)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller = kern_nexus_shared_controller();
	uuid_t dom_prov;
	errno_t err;
	struct kern_nexus_init init;
	nexus_name_t    provider_name;

	err = kern_nexus_get_default_domain_provider(NEXUS_TYPE_FLOW_SWITCH,
	    &dom_prov);
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
		    __func__, err);
		goto failed;
	}

	uint64_t slot_buffer_size = pcb->ipsec_slot_size;
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size);
	VERIFY(err == 0);

	// Reset ring size for flowswitch nexus to limit memory usage. Larger RX than netif.
	uint64_t tx_ring_size = pcb->ipsec_tx_fsw_ring_size;
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS, tx_ring_size);
	VERIFY(err == 0);
	uint64_t rx_ring_size = pcb->ipsec_rx_fsw_ring_size;
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS, rx_ring_size);
	VERIFY(err == 0);
	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 * This allows flowswitch to perform intra-stack packet aggregation.
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    NX_FSW_TCP_RX_AGG_ENABLED() ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, ifname);
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	/* Attributes are consumed by registration; free our copy either way */
	kern_nexus_attr_destroy(attr);
	attr = NULL;
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	IPSEC_IF_VERIFY(err == 0);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* Roll back the registration so no half-created state leaks */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		uuid_clear(*provider);
	}
failed:
	return err;
}
2928 
2929 static errno_t
ipsec_flowswitch_attach(struct ipsec_pcb * pcb)2930 ipsec_flowswitch_attach(struct ipsec_pcb *pcb)
2931 {
2932 	nexus_controller_t controller = kern_nexus_shared_controller();
2933 	errno_t err = 0;
2934 	ipsec_nx_t nx = &pcb->ipsec_nx;
2935 
2936 	// Allocate flowswitch
2937 	err = ipsec_create_fs_provider_and_instance(pcb,
2938 	    "flowswitch",
2939 	    pcb->ipsec_ifp->if_xname,
2940 	    &nx->fsw_provider,
2941 	    &nx->fsw_instance);
2942 	if (err != 0) {
2943 		os_log_error(OS_LOG_DEFAULT, "%s: failed to create bridge provider and instance\n",
2944 		    __func__);
2945 		goto failed;
2946 	}
2947 
2948 	// Attach flowswitch to device port
2949 	err = kern_nexus_ifattach(controller, nx->fsw_instance,
2950 	    NULL, nx->if_instance,
2951 	    FALSE, &nx->fsw_device);
2952 	if (err != 0) {
2953 		os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms device %d\n", __func__, err);
2954 		goto failed;
2955 	}
2956 
2957 	// Extract the agent UUID and save for later
2958 	struct kern_nexus *flowswitch_nx = nx_find(nx->fsw_instance, false);
2959 	if (flowswitch_nx != NULL) {
2960 		struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(flowswitch_nx);
2961 		if (flowswitch != NULL) {
2962 			FSW_RLOCK(flowswitch);
2963 			uuid_copy(nx->fsw_agent, flowswitch->fsw_agent_uuid);
2964 			FSW_UNLOCK(flowswitch);
2965 		} else {
2966 			os_log_error(OS_LOG_DEFAULT, "ipsec_flowswitch_attach - flowswitch is NULL\n");
2967 		}
2968 		nx_release(flowswitch_nx);
2969 	} else {
2970 		os_log_error(OS_LOG_DEFAULT, "ipsec_flowswitch_attach - unable to find flowswitch nexus\n");
2971 	}
2972 
2973 	return 0;
2974 
2975 failed:
2976 	ipsec_nexus_detach(pcb);
2977 
2978 	errno_t detach_error = 0;
2979 	if ((detach_error = ifnet_detach(pcb->ipsec_ifp)) != 0) {
2980 		panic("ipsec_flowswitch_attach - ifnet_detach failed: %d", detach_error);
2981 		/* NOT REACHED */
2982 	}
2983 
2984 	return err;
2985 }
2986 
2987 #pragma mark Kernel Pipe Nexus
2988 
/*
 * Register the shared kernel-pipe nexus provider, creating the shared nexus
 * controller on first use.  Reference-counted: subsequent callers just bump
 * ipsec_ncd_refcount.  On any failure the controller is destroyed and the
 * refcount reset so a later caller can retry from scratch.
 */
static errno_t
ipsec_register_kernel_pipe_nexus(struct ipsec_pcb *pcb)
{
	nexus_attr_t nxa = NULL;
	errno_t result;

	lck_mtx_lock(&ipsec_lock);
	if (ipsec_ncd_refcount++) {
		/* Already registered by a previous caller */
		lck_mtx_unlock(&ipsec_lock);
		return 0;
	}

	result = kern_nexus_controller_create(&ipsec_ncd);
	if (result) {
		os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_create failed: %d\n",
		    __FUNCTION__, result);
		goto done;
	}

	uuid_t dom_prov;
	result = kern_nexus_get_default_domain_provider(
		NEXUS_TYPE_KERNEL_PIPE, &dom_prov);
	if (result) {
		os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_get_default_domain_provider failed: %d\n",
		    __FUNCTION__, result);
		goto done;
	}

	/* Kpipe channel lifecycle and sync callbacks (no TX doorbell) */
	struct kern_nexus_provider_init prov_init = {
		.nxpi_version = KERN_NEXUS_DOMAIN_PROVIDER_CURRENT_VERSION,
		.nxpi_flags = NXPIF_VIRTUAL_DEVICE,
		.nxpi_pre_connect = ipsec_nexus_pre_connect,
		.nxpi_connected = ipsec_nexus_connected,
		.nxpi_pre_disconnect = ipsec_nexus_pre_disconnect,
		.nxpi_disconnected = ipsec_nexus_disconnected,
		.nxpi_ring_init = ipsec_kpipe_ring_init,
		.nxpi_ring_fini = ipsec_kpipe_ring_fini,
		.nxpi_slot_init = NULL,
		.nxpi_slot_fini = NULL,
		.nxpi_sync_tx = ipsec_kpipe_sync_tx,
		.nxpi_sync_rx = ipsec_kpipe_sync_rx,
		.nxpi_tx_doorbell = NULL,
	};

	result = kern_nexus_attr_create(&nxa);
	if (result) {
		os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
		    __FUNCTION__, result);
		goto done;
	}

	uint64_t slot_buffer_size = IPSEC_IF_DEFAULT_SLOT_SIZE;
	result = kern_nexus_attr_set(nxa, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size);
	VERIFY(result == 0);

	// Reset ring size for kernel pipe nexus to limit memory usage
	// Note: It's better to have less on slots on the kpipe TX ring than the netif
	// so back pressure is applied at the AQM layer
	uint64_t ring_size =
	    pcb->ipsec_kpipe_tx_ring_size != 0 ? pcb->ipsec_kpipe_tx_ring_size :
	    pcb->ipsec_netif_ring_size != 0 ? pcb->ipsec_netif_ring_size :
	    if_ipsec_ring_size;
	result = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_SLOTS, ring_size);
	VERIFY(result == 0);

	ring_size =
	    pcb->ipsec_kpipe_rx_ring_size != 0 ? pcb->ipsec_kpipe_rx_ring_size :
	    pcb->ipsec_netif_ring_size != 0 ? pcb->ipsec_netif_ring_size :
	    if_ipsec_ring_size;
	result = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size);
	VERIFY(result == 0);

	result = kern_nexus_controller_register_provider(ipsec_ncd,
	    dom_prov,
	    (const uint8_t *)"com.apple.nexus.ipsec.kpipe",
	    &prov_init,
	    sizeof(prov_init),
	    nxa,
	    &ipsec_kpipe_uuid);
	if (result) {
		os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_register_provider failed: %d\n",
		    __FUNCTION__, result);
		goto done;
	}

done:
	if (nxa) {
		kern_nexus_attr_destroy(nxa);
	}

	/* On failure, tear the controller back down and reset the refcount */
	if (result) {
		if (ipsec_ncd) {
			kern_nexus_controller_destroy(ipsec_ncd);
			ipsec_ncd = NULL;
		}
		ipsec_ncd_refcount = 0;
	}

	lck_mtx_unlock(&ipsec_lock);

	return result;
}
3091 
3092 static void
ipsec_unregister_kernel_pipe_nexus(void)3093 ipsec_unregister_kernel_pipe_nexus(void)
3094 {
3095 	lck_mtx_lock(&ipsec_lock);
3096 
3097 	VERIFY(ipsec_ncd_refcount > 0);
3098 
3099 	if (--ipsec_ncd_refcount == 0) {
3100 		kern_nexus_controller_destroy(ipsec_ncd);
3101 		ipsec_ncd = NULL;
3102 	}
3103 
3104 	lck_mtx_unlock(&ipsec_lock);
3105 }
3106 
/* This structure only holds onto kpipe channels that need to be
 * freed in the future, but are cleared from the pcb under lock
 */
struct ipsec_detached_channels {
	int count;                              /* number of valid entries in uuids[]; 0 means nothing detached */
	kern_pbufpool_t pp;                     /* kpipe packet pool to destroy with the channels */
	uuid_t uuids[IPSEC_IF_MAX_RING_COUNT];  /* UUIDs of detached kpipe provider instances */
};
3115 
/*
 * Move kpipe channel state out of the pcb (held exclusively locked) into
 * *dc so the channels can be freed later, outside the pcb lock, by
 * ipsec_free_channels().  Clears the KPIPE_ALLOCATED flag and the pcb's
 * kpipe UUIDs/pool.  If no kpipes were allocated, dc->count is set to 0.
 */
static void
ipsec_detach_channels(struct ipsec_pcb *pcb, struct ipsec_detached_channels *dc)
{
	LCK_RW_ASSERT(&pcb->ipsec_pcb_lock, LCK_RW_TYPE_EXCLUSIVE);

	if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
		/* Nothing allocated: all UUID slots must already be null */
		for (int i = 0; i < IPSEC_IF_MAX_RING_COUNT; i++) {
			VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
		}
		dc->count = 0;
		return;
	}

	dc->count = pcb->ipsec_kpipe_count;

	VERIFY(dc->count >= 0);
	VERIFY(dc->count <= IPSEC_IF_MAX_RING_COUNT);

	/* Transfer ownership of each active channel UUID to dc */
	for (int i = 0; i < dc->count; i++) {
		VERIFY(!uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
		uuid_copy(dc->uuids[i], pcb->ipsec_kpipe_uuid[i]);
		uuid_clear(pcb->ipsec_kpipe_uuid[i]);
	}
	/* Slots beyond the active count must be null */
	for (int i = dc->count; i < IPSEC_IF_MAX_RING_COUNT; i++) {
		VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
	}

	/* The packet pool exists iff at least one channel was allocated */
	if (dc->count) {
		VERIFY(pcb->ipsec_kpipe_pp);
	} else {
		VERIFY(!pcb->ipsec_kpipe_pp);
	}

	dc->pp = pcb->ipsec_kpipe_pp;

	pcb->ipsec_kpipe_pp = NULL;

	ipsec_flag_clr(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED);
}
3155 
3156 static void
ipsec_free_channels(struct ipsec_detached_channels * dc)3157 ipsec_free_channels(struct ipsec_detached_channels *dc)
3158 {
3159 	if (!dc->count) {
3160 		return;
3161 	}
3162 
3163 	for (int i = 0; i < dc->count; i++) {
3164 		errno_t result;
3165 		result = kern_nexus_controller_free_provider_instance(ipsec_ncd, dc->uuids[i]);
3166 		VERIFY(!result);
3167 	}
3168 
3169 	VERIFY(dc->pp);
3170 	kern_pbufpool_destroy(dc->pp);
3171 
3172 	ipsec_unregister_kernel_pipe_nexus();
3173 
3174 	memset(dc, 0, sizeof(*dc));
3175 }
3176 
/*
 * Create and bind the kernel-pipe channels for this pcb: register the shared
 * kpipe nexus, create a packet pool, allocate one provider instance per
 * configured kpipe, and bind each instance to the configured pid or process
 * UUID (defaulting to the calling process).  Requires the
 * PRIV_SKYWALK_REGISTER_KERNEL_PIPE privilege.  On success the
 * IPSEC_FLAGS_KPIPE_ALLOCATED flag is set; on failure all partially created
 * channels and the pool are torn down.
 */
static errno_t
ipsec_enable_channel(struct ipsec_pcb *pcb, struct proc *proc)
{
	struct kern_nexus_init init;
	struct kern_pbufpool_init pp_init;
	errno_t result;

	kauth_cred_t cred = kauth_cred_get();
	result = priv_check_cred(cred, PRIV_SKYWALK_REGISTER_KERNEL_PIPE, 0);
	if (result) {
		return result;
	}

	VERIFY(pcb->ipsec_kpipe_count);
	VERIFY(!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED));

	result = ipsec_register_kernel_pipe_nexus(pcb);

	lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);

	if (result) {
		os_log_error(OS_LOG_DEFAULT, "%s: %s failed to register kernel pipe nexus\n",
		    __func__, pcb->ipsec_if_xname);
		goto done;
	}

	VERIFY(ipsec_ncd);

	bzero(&pp_init, sizeof(pp_init));
	pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
	pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;
	// Note: we only need as many packets as can be held in the tx and rx rings
	pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2 * pcb->ipsec_kpipe_count;
	pp_init.kbi_bufsize = pcb->ipsec_slot_size;
	pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE;
	pp_init.kbi_max_frags = 1;
	pp_init.kbi_flags |= KBIF_QUANTUM;
	(void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name),
	    "com.apple.kpipe.%s", pcb->ipsec_if_xname);
	pp_init.kbi_ctx = NULL;
	pp_init.kbi_ctx_retain = NULL;
	pp_init.kbi_ctx_release = NULL;

	result = kern_pbufpool_create(&pp_init, &pcb->ipsec_kpipe_pp,
	    NULL);
	if (result != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: %s pbufbool create failed, error %d\n",
		    __func__, pcb->ipsec_if_xname, result);
		goto done;
	}

	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	init.nxi_tx_pbufpool = pcb->ipsec_kpipe_pp;

	/* Allocate and bind one kpipe channel instance per configured ring */
	for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) {
		VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
		result = kern_nexus_controller_alloc_provider_instance(ipsec_ncd,
		    ipsec_kpipe_uuid, pcb, NULL, &pcb->ipsec_kpipe_uuid[i], &init);

		if (result == 0) {
			nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT;
			const bool has_proc_uuid = !uuid_is_null(pcb->ipsec_kpipe_proc_uuid);
			pid_t pid = pcb->ipsec_kpipe_pid;
			/* Default to the caller when neither a pid nor proc UUID was configured */
			if (!pid && !has_proc_uuid) {
				pid = proc_pid(proc);
			}
			result = kern_nexus_controller_bind_provider_instance(ipsec_ncd,
			    pcb->ipsec_kpipe_uuid[i], &port,
			    pid, has_proc_uuid ? pcb->ipsec_kpipe_proc_uuid : NULL, NULL,
			    0, has_proc_uuid ? NEXUS_BIND_EXEC_UUID:NEXUS_BIND_PID);
		}

		if (result) {
			/* Unwind all of them on error */
			for (int j = 0; j < IPSEC_IF_MAX_RING_COUNT; j++) {
				if (!uuid_is_null(pcb->ipsec_kpipe_uuid[j])) {
					kern_nexus_controller_free_provider_instance(ipsec_ncd,
					    pcb->ipsec_kpipe_uuid[j]);
					uuid_clear(pcb->ipsec_kpipe_uuid[j]);
				}
			}
			goto done;
		}
	}

done:
	lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);

	if (result) {
		/* Failure: destroy the pool and drop the nexus reference */
		if (pcb->ipsec_kpipe_pp != NULL) {
			kern_pbufpool_destroy(pcb->ipsec_kpipe_pp);
			pcb->ipsec_kpipe_pp = NULL;
		}
		ipsec_unregister_kernel_pipe_nexus();
	} else {
		ipsec_flag_set(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED);
	}

	return result;
}
3278 
3279 #endif // IPSEC_NEXUS
3280 
3281 
3282 /* Kernel control functions */
3283 
3284 static inline int
ipsec_find_by_unit(u_int32_t unit)3285 ipsec_find_by_unit(u_int32_t unit)
3286 {
3287 	struct ipsec_pcb *next_pcb = NULL;
3288 	int found = 0;
3289 
3290 	TAILQ_FOREACH(next_pcb, &ipsec_head, ipsec_chain) {
3291 		if (next_pcb->ipsec_unit == unit) {
3292 			found = 1;
3293 			break;
3294 		}
3295 	}
3296 
3297 	return found;
3298 }
3299 
/*
 * Free an ipsec pcb: release any queued input mbufs, destroy its locks,
 * unlink it from the global pcb list, and return it to the zone.
 * 'locked' indicates the caller already holds ipsec_lock; otherwise it is
 * taken here just around the list removal.
 */
static inline void
ipsec_free_pcb(struct ipsec_pcb *pcb, bool locked)
{
#if IPSEC_NEXUS
	mbuf_freem_list(pcb->ipsec_input_chain);
	pcb->ipsec_input_chain_count = 0;
	lck_mtx_destroy(&pcb->ipsec_input_chain_lock, &ipsec_lck_grp);
	lck_mtx_destroy(&pcb->ipsec_kpipe_encrypt_lock, &ipsec_lck_grp);
	lck_mtx_destroy(&pcb->ipsec_kpipe_decrypt_lock, &ipsec_lck_grp);
#endif // IPSEC_NEXUS
	lck_mtx_destroy(&pcb->ipsec_pcb_data_move_lock, &ipsec_lck_grp);
	lck_rw_destroy(&pcb->ipsec_pcb_lock, &ipsec_lck_grp);
	if (!locked) {
		lck_mtx_lock(&ipsec_lock);
	}
	TAILQ_REMOVE(&ipsec_head, pcb, ipsec_chain);
	if (!locked) {
		lck_mtx_unlock(&ipsec_lock);
	}
	zfree(ipsec_pcb_zone, pcb);
}
3321 
/*
 * Allocate and register a new ipsec pcb.  With *unit == 0, auto-selects the
 * first free kernel-control unit; otherwise claims the requested unit.
 * Also assigns a unique interface id, handling wrap-around of the id counter
 * by scanning the (id-sorted) list for a gap.  Returns EBUSY when no unit is
 * available or the requested one is taken.
 */
static errno_t
ipsec_ctl_setup(u_int32_t *unit, void **unitinfo)
{
	if (unit == NULL || unitinfo == NULL) {
		return EINVAL;
	}

	lck_mtx_lock(&ipsec_lock);

	/* Find next available unit */
	if (*unit == 0) {
		*unit = 1;
		while (*unit != ctl_maxunit) {
			if (ipsec_find_by_unit(*unit)) {
				(*unit)++;
			} else {
				break;
			}
		}
		if (*unit == ctl_maxunit) {
			lck_mtx_unlock(&ipsec_lock);
			return EBUSY;
		}
	} else if (ipsec_find_by_unit(*unit)) {
		lck_mtx_unlock(&ipsec_lock);
		return EBUSY;
	}

	/* Find some open interface id */
	u_int32_t chosen_unique_id = 1;
	struct ipsec_pcb *next_pcb = TAILQ_LAST(&ipsec_head, ipsec_list);
	if (next_pcb != NULL) {
		/* List was not empty, add one to the last item */
		chosen_unique_id = next_pcb->ipsec_unique_id + 1;
		next_pcb = NULL;

		/*
		 * If this wrapped the id number, start looking at
		 * the front of the list for an unused id.
		 */
		if (chosen_unique_id == 0) {
			/* Find the next unused ID */
			chosen_unique_id = 1;
			TAILQ_FOREACH(next_pcb, &ipsec_head, ipsec_chain) {
				if (next_pcb->ipsec_unique_id > chosen_unique_id) {
					/* We found a gap */
					break;
				}

				chosen_unique_id = next_pcb->ipsec_unique_id + 1;
			}
		}
	}

	struct ipsec_pcb *pcb = zalloc_flags(ipsec_pcb_zone, Z_WAITOK | Z_ZERO);

	*unitinfo = pcb;
	pcb->ipsec_unit = *unit;
	pcb->ipsec_unique_id = chosen_unique_id;

	/*
	 * Keep the list sorted by unique id: insert into the gap found above,
	 * or at the tail when the id counter did not wrap.
	 */
	if (next_pcb != NULL) {
		TAILQ_INSERT_BEFORE(next_pcb, pcb, ipsec_chain);
	} else {
		TAILQ_INSERT_TAIL(&ipsec_head, pcb, ipsec_chain);
	}

	lck_mtx_unlock(&ipsec_lock);

	return 0;
}
3392 
/*
 * Kernel-control bind handler.  Creates the pcb via ipsec_ctl_setup() if it
 * does not exist yet, then records the control reference/unit and initializes
 * default data-path configuration and all pcb locks.
 */
static errno_t
ipsec_ctl_bind(kern_ctl_ref kctlref,
    struct sockaddr_ctl *sac,
    void **unitinfo)
{
	if (*unitinfo == NULL) {
		u_int32_t unit = 0;
		/* Setup failure leaves *unitinfo NULL; caught by the check below */
		(void)ipsec_ctl_setup(&unit, unitinfo);
	}

	struct ipsec_pcb *pcb = (struct ipsec_pcb *)*unitinfo;
	if (pcb == NULL) {
		return EINVAL;
	}

	/* Setup the protocol control block */
	pcb->ipsec_ctlref = kctlref;
	pcb->ipsec_unit = sac->sc_unit;
	pcb->ipsec_output_service_class = MBUF_SC_OAM;

#if IPSEC_NEXUS
	/* Default Skywalk data-path configuration */
	pcb->ipsec_use_netif = false;
	pcb->ipsec_slot_size = IPSEC_IF_DEFAULT_SLOT_SIZE;
	pcb->ipsec_netif_ring_size = if_ipsec_ring_size;
	pcb->ipsec_tx_fsw_ring_size = if_ipsec_tx_fsw_ring_size;
	pcb->ipsec_rx_fsw_ring_size = if_ipsec_rx_fsw_ring_size;
#endif // IPSEC_NEXUS

	lck_rw_init(&pcb->ipsec_pcb_lock, &ipsec_lck_grp, &ipsec_lck_attr);
	lck_mtx_init(&pcb->ipsec_pcb_data_move_lock, &ipsec_lck_grp, &ipsec_lck_attr);
#if IPSEC_NEXUS
	pcb->ipsec_input_chain_count = 0;
	lck_mtx_init(&pcb->ipsec_input_chain_lock, &ipsec_lck_grp, &ipsec_lck_attr);
	lck_mtx_init(&pcb->ipsec_kpipe_encrypt_lock, &ipsec_lck_grp, &ipsec_lck_attr);
	lck_mtx_init(&pcb->ipsec_kpipe_decrypt_lock, &ipsec_lck_grp, &ipsec_lck_attr);
#endif // IPSEC_NEXUS

	return 0;
}
3432 
/*
 * Kernel-control connect handler: creates and attaches the ipsec ifnet.
 * Ensures the pcb exists and is bound, then either builds the full Skywalk
 * stack (optional kpipe channels, netif nexus, flowswitch) when
 * ipsec_use_netif is set, or falls back to a classic ifnet with an
 * ipsec_start output path.  Finally marks the data path ready (unless kpipes
 * defer that to channel connect) and sets IFF_RUNNING.
 */
static errno_t
ipsec_ctl_connect(kern_ctl_ref kctlref,
    struct sockaddr_ctl *sac,
    void **unitinfo)
{
	struct ifnet_init_eparams ipsec_init = {};
	errno_t result = 0;

	if (*unitinfo == NULL) {
		(void)ipsec_ctl_bind(kctlref, sac, unitinfo);
	}

	struct ipsec_pcb *pcb = *unitinfo;
	if (pcb == NULL) {
		return EINVAL;
	}

	/* Handle case where ipsec_ctl_setup() was called, but ipsec_ctl_bind() was not */
	if (pcb->ipsec_ctlref == NULL) {
		(void)ipsec_ctl_bind(kctlref, sac, unitinfo);
	}

	/* Control units are 1-based; interface names are 0-based */
	snprintf(pcb->ipsec_if_xname, sizeof(pcb->ipsec_if_xname), "ipsec%d", pcb->ipsec_unit - 1);
	snprintf(pcb->ipsec_unique_name, sizeof(pcb->ipsec_unique_name), "ipsecid%d", pcb->ipsec_unique_id - 1);
	os_log(OS_LOG_DEFAULT, "ipsec_ctl_connect: creating interface %s (id %s)\n", pcb->ipsec_if_xname, pcb->ipsec_unique_name);

	/* Create the interface */
	bzero(&ipsec_init, sizeof(ipsec_init));
	ipsec_init.ver = IFNET_INIT_CURRENT_VERSION;
	ipsec_init.len = sizeof(ipsec_init);

#if IPSEC_NEXUS
	if (pcb->ipsec_use_netif) {
		ipsec_init.flags = (IFNET_INIT_SKYWALK_NATIVE | IFNET_INIT_NX_NOAUTO);
	} else
#endif // IPSEC_NEXUS
	{
		/* Legacy path: use the classic start callback for output */
		ipsec_init.flags = IFNET_INIT_NX_NOAUTO;
		ipsec_init.start = ipsec_start;
	}
	ipsec_init.name = "ipsec";
	ipsec_init.unit = pcb->ipsec_unit - 1;
	ipsec_init.uniqueid = pcb->ipsec_unique_name;
	ipsec_init.uniqueid_len = (uint32_t)strlen(pcb->ipsec_unique_name);
	ipsec_init.family = IFNET_FAMILY_IPSEC;
	ipsec_init.type = IFT_OTHER;
	ipsec_init.demux = ipsec_demux;
	ipsec_init.add_proto = ipsec_add_proto;
	ipsec_init.del_proto = ipsec_del_proto;
	ipsec_init.softc = pcb;
	ipsec_init.ioctl = ipsec_ioctl;
	ipsec_init.free = ipsec_detached;

#if IPSEC_NEXUS
	/* We don't support kpipes without a netif */
	if (pcb->ipsec_kpipe_count && !pcb->ipsec_use_netif) {
		result = ENOTSUP;
		os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - kpipe requires netif: failed %d\n", result);
		ipsec_free_pcb(pcb, false);
		*unitinfo = NULL;
		return result;
	}

	if (if_ipsec_debug != 0) {
		printf("%s: %s%d use_netif %d kpipe_count %d slot_size %u ring_size %u "
		    "kpipe_tx_ring_size %u kpipe_rx_ring_size %u\n",
		    __func__,
		    ipsec_init.name, ipsec_init.unit,
		    pcb->ipsec_use_netif,
		    pcb->ipsec_kpipe_count,
		    pcb->ipsec_slot_size,
		    pcb->ipsec_netif_ring_size,
		    pcb->ipsec_kpipe_tx_ring_size,
		    pcb->ipsec_kpipe_rx_ring_size);
	}
	if (pcb->ipsec_use_netif) {
		/* Skywalk-native: kpipe channels (optional), then netif + flowswitch */
		if (pcb->ipsec_kpipe_count) {
			result = ipsec_enable_channel(pcb, current_proc());
			if (result) {
				os_log_error(OS_LOG_DEFAULT, "%s: %s failed to enable channels\n",
				    __func__, pcb->ipsec_if_xname);
				ipsec_free_pcb(pcb, false);
				*unitinfo = NULL;
				return result;
			}
		}

		result = ipsec_nexus_ifattach(pcb, &ipsec_init, &pcb->ipsec_ifp);
		if (result != 0) {
			os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ipsec_nexus_ifattach failed: %d\n", result);
			ipsec_free_pcb(pcb, false);
			*unitinfo = NULL;
			return result;
		}

		result = ipsec_flowswitch_attach(pcb);
		if (result != 0) {
			os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ipsec_flowswitch_attach failed: %d\n", result);
			// Do not call ipsec_free_pcb(). We will be attached already, and will be freed later
			// in ipsec_detached().
			*unitinfo = NULL;
			return result;
		}

		/* Attach to bpf */
		bpfattach(pcb->ipsec_ifp, DLT_RAW, 0);
	} else
#endif // IPSEC_NEXUS
	{
		/* Legacy ifnet path: allocate, configure and attach the interface */
		result = ifnet_allocate_extended(&ipsec_init, &pcb->ipsec_ifp);
		if (result != 0) {
			os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ifnet_allocate failed: %d\n", result);
			ipsec_free_pcb(pcb, false);
			*unitinfo = NULL;
			return result;
		}
		ipsec_ifnet_set_attrs(pcb->ipsec_ifp);

		/* Attach the interface */
		result = ifnet_attach(pcb->ipsec_ifp, NULL);
		if (result != 0) {
			os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ifnet_attach failed: %d\n", result);
			ifnet_release(pcb->ipsec_ifp);
			ipsec_free_pcb(pcb, false);
			*unitinfo = NULL;
			return result;
		}

		/* Attach to bpf */
		bpfattach(pcb->ipsec_ifp, DLT_NULL, 0);
	}

#if IPSEC_NEXUS
	/*
	 * Mark the data path as ready.
	 * If kpipe nexus is being used then the data path is marked ready only when a kpipe channel is connected.
	 */
	if (pcb->ipsec_kpipe_count == 0) {
		lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
		IPSEC_SET_DATA_PATH_READY(pcb);
		lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
	}
#endif

	/* The interface's resources are allocated; mark it as running */
	ifnet_set_flags(pcb->ipsec_ifp, IFF_RUNNING, IFF_RUNNING);

	return 0;
}
3582 
3583 static errno_t
ipsec_detach_ip(ifnet_t interface,protocol_family_t protocol,socket_t pf_socket)3584 ipsec_detach_ip(ifnet_t                         interface,
3585     protocol_family_t       protocol,
3586     socket_t                        pf_socket)
3587 {
3588 	errno_t result = EPROTONOSUPPORT;
3589 
3590 	/* Attempt a detach */
3591 	if (protocol == PF_INET) {
3592 		struct ifreq    ifr;
3593 
3594 		bzero(&ifr, sizeof(ifr));
3595 		snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d",
3596 		    ifnet_name(interface), ifnet_unit(interface));
3597 
3598 		result = sock_ioctl(pf_socket, SIOCPROTODETACH, &ifr);
3599 	} else if (protocol == PF_INET6) {
3600 		struct in6_ifreq        ifr6;
3601 
3602 		bzero(&ifr6, sizeof(ifr6));
3603 		snprintf(ifr6.ifr_name, sizeof(ifr6.ifr_name), "%s%d",
3604 		    ifnet_name(interface), ifnet_unit(interface));
3605 
3606 		result = sock_ioctl(pf_socket, SIOCPROTODETACH_IN6, &ifr6);
3607 	}
3608 
3609 	return result;
3610 }
3611 
3612 static void
ipsec_remove_address(ifnet_t interface,protocol_family_t protocol,ifaddr_t address,socket_t pf_socket)3613 ipsec_remove_address(ifnet_t                            interface,
3614     protocol_family_t      protocol,
3615     ifaddr_t                       address,
3616     socket_t                       pf_socket)
3617 {
3618 	errno_t result = 0;
3619 
3620 	/* Attempt a detach */
3621 	if (protocol == PF_INET) {
3622 		struct ifreq    ifr;
3623 
3624 		bzero(&ifr, sizeof(ifr));
3625 		snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d",
3626 		    ifnet_name(interface), ifnet_unit(interface));
3627 		result = ifaddr_address(address, &ifr.ifr_addr, sizeof(ifr.ifr_addr));
3628 		if (result != 0) {
3629 			os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - ifaddr_address failed: %d", result);
3630 		} else {
3631 			result = sock_ioctl(pf_socket, SIOCDIFADDR, &ifr);
3632 			if (result != 0) {
3633 				os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - SIOCDIFADDR failed: %d", result);
3634 			}
3635 		}
3636 	} else if (protocol == PF_INET6) {
3637 		struct in6_ifreq        ifr6;
3638 
3639 		bzero(&ifr6, sizeof(ifr6));
3640 		snprintf(ifr6.ifr_name, sizeof(ifr6.ifr_name), "%s%d",
3641 		    ifnet_name(interface), ifnet_unit(interface));
3642 		result = ifaddr_address(address, (struct sockaddr*)&ifr6.ifr_addr,
3643 		    sizeof(ifr6.ifr_addr));
3644 		if (result != 0) {
3645 			os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - ifaddr_address failed (v6): %d",
3646 			    result);
3647 		} else {
3648 			result = sock_ioctl(pf_socket, SIOCDIFADDR_IN6, &ifr6);
3649 			if (result != 0) {
3650 				os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - SIOCDIFADDR_IN6 failed: %d",
3651 				    result);
3652 			}
3653 		}
3654 	}
3655 }
3656 
3657 static void
ipsec_cleanup_family(ifnet_t interface,protocol_family_t protocol)3658 ipsec_cleanup_family(ifnet_t                            interface,
3659     protocol_family_t      protocol)
3660 {
3661 	errno_t         result = 0;
3662 	socket_t        pf_socket = NULL;
3663 	ifaddr_t        *addresses = NULL;
3664 	int                     i;
3665 
3666 	if (protocol != PF_INET && protocol != PF_INET6) {
3667 		os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - invalid protocol family %d\n", protocol);
3668 		return;
3669 	}
3670 
3671 	/* Create a socket for removing addresses and detaching the protocol */
3672 	result = sock_socket(protocol, SOCK_DGRAM, 0, NULL, NULL, &pf_socket);
3673 	if (result != 0) {
3674 		if (result != EAFNOSUPPORT) {
3675 			os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - failed to create %s socket: %d\n",
3676 			    protocol == PF_INET ? "IP" : "IPv6", result);
3677 		}
3678 		goto cleanup;
3679 	}
3680 
3681 	/* always set SS_PRIV, we want to close and detach regardless */
3682 	sock_setpriv(pf_socket, 1);
3683 
3684 	result = ipsec_detach_ip(interface, protocol, pf_socket);
3685 	if (result == 0 || result == ENXIO) {
3686 		/* We are done! We either detached or weren't attached. */
3687 		goto cleanup;
3688 	} else if (result != EBUSY) {
3689 		/* Uh, not really sure what happened here... */
3690 		os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result);
3691 		goto cleanup;
3692 	}
3693 
3694 	/*
3695 	 * At this point, we received an EBUSY error. This means there are
3696 	 * addresses attached. We should detach them and then try again.
3697 	 */
3698 	result = ifnet_get_address_list_family(interface, &addresses, (sa_family_t)protocol);
3699 	if (result != 0) {
3700 		os_log_error(OS_LOG_DEFAULT, "fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n",
3701 		    ifnet_name(interface), ifnet_unit(interface),
3702 		    protocol == PF_INET ? "PF_INET" : "PF_INET6", result);
3703 		goto cleanup;
3704 	}
3705 
3706 	for (i = 0; addresses[i] != 0; i++) {
3707 		ipsec_remove_address(interface, protocol, addresses[i], pf_socket);
3708 	}
3709 	ifnet_free_address_list(addresses);
3710 	addresses = NULL;
3711 
3712 	/*
3713 	 * The addresses should be gone, we should try the remove again.
3714 	 */
3715 	result = ipsec_detach_ip(interface, protocol, pf_socket);
3716 	if (result != 0 && result != ENXIO) {
3717 		os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result);
3718 	}
3719 
3720 cleanup:
3721 	if (pf_socket != NULL) {
3722 		sock_close(pf_socket);
3723 	}
3724 
3725 	if (addresses != NULL) {
3726 		ifnet_free_address_list(addresses);
3727 	}
3728 }
3729 
/*
 * Kernel-control disconnect handler: tears down the ipsec interface
 * associated with this control instance.
 *
 * Ordering matters throughout: data-path threads are drained first,
 * nexus rings are stopped before the pcb lock is taken, and the pcb
 * lock is dropped before the blocking channel/nexus teardown calls.
 * Returns 0, or EINVAL if the instance was never set up.
 */
static errno_t
ipsec_ctl_disconnect(__unused kern_ctl_ref      kctlref,
    __unused u_int32_t             unit,
    void                                   *unitinfo)
{
	struct ipsec_pcb *pcb = unitinfo;
	ifnet_t ifp = NULL;
	errno_t result = 0;

	if (pcb == NULL) {
		return EINVAL;
	}

	/* Wait until all threads in the data paths are done. */
	ipsec_wait_data_move_drain(pcb);

#if IPSEC_NEXUS
	// Tell the nexus to stop all rings
	if (pcb->ipsec_netif_nexus != NULL) {
		kern_nexus_stop(pcb->ipsec_netif_nexus);
	}
#endif // IPSEC_NEXUS

	lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);

#if IPSEC_NEXUS
	if (if_ipsec_debug != 0) {
		printf("ipsec_ctl_disconnect: detaching interface %s (id %s)\n",
		    pcb->ipsec_if_xname, pcb->ipsec_unique_name);
	}

	/* Detach kpipe channels under the lock; freed later, after unlock. */
	struct ipsec_detached_channels dc;
	ipsec_detach_channels(pcb, &dc);
#endif // IPSEC_NEXUS

	/* Sever the link back to the kernel control instance. */
	pcb->ipsec_ctlref = NULL;

	ifp = pcb->ipsec_ifp;
	if (ifp != NULL) {
#if IPSEC_NEXUS
		if (pcb->ipsec_netif_nexus != NULL) {
			/*
			 * Quiesce the interface and flush any pending outbound packets.
			 */
			if_down(ifp);

			/*
			 * Suspend data movement and wait for IO threads to exit.
			 * We can't rely on the logic in dlil_quiesce_and_detach_nexuses() to
			 * do this because ipsec nexuses are attached/detached separately.
			 */
			ifnet_datamov_suspend_and_drain(ifp);
			if ((result = ifnet_detach(ifp)) != 0) {
				panic("ipsec_ctl_disconnect - ifnet_detach failed: %d", result);
				/* NOT REACHED */
			}

			/*
			 * We want to do everything in our power to ensure that the interface
			 * really goes away when the socket is closed. We must remove IP/IPv6
			 * addresses and detach the protocols. Finally, we can remove and
			 * release the interface.
			 */
			key_delsp_for_ipsec_if(ifp);

			ipsec_cleanup_family(ifp, AF_INET);
			ipsec_cleanup_family(ifp, AF_INET6);

			/* Drop the pcb lock before the blocking teardown calls below. */
			lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);

			ipsec_free_channels(&dc);

			ipsec_nexus_detach(pcb);

			/* Decrement refcnt added by ifnet_datamov_suspend_and_drain(). */
			ifnet_datamov_resume(ifp);
		} else
#endif // IPSEC_NEXUS
		{
			lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);

#if IPSEC_NEXUS
			ipsec_free_channels(&dc);
#endif // IPSEC_NEXUS

			/*
			 * We want to do everything in our power to ensure that the interface
			 * really goes away when the socket is closed. We must remove IP/IPv6
			 * addresses and detach the protocols. Finally, we can remove and
			 * release the interface.
			 */
			key_delsp_for_ipsec_if(ifp);

			ipsec_cleanup_family(ifp, AF_INET);
			ipsec_cleanup_family(ifp, AF_INET6);

			/*
			 * Detach now; ipsec_detach() will be called asynchronously once
			 * the I/O reference count drops to 0.  There we will invoke
			 * ifnet_release().
			 */
			if ((result = ifnet_detach(ifp)) != 0) {
				os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_disconnect - ifnet_detach failed: %d\n", result);
			}
		}
	} else {
		// Bound, but not connected
		lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
		ipsec_free_pcb(pcb, false);
	}

	return 0;
}
3843 
3844 static errno_t
ipsec_ctl_send(__unused kern_ctl_ref kctlref,__unused u_int32_t unit,__unused void * unitinfo,mbuf_t m,__unused int flags)3845 ipsec_ctl_send(__unused kern_ctl_ref    kctlref,
3846     __unused u_int32_t           unit,
3847     __unused void                        *unitinfo,
3848     mbuf_t                  m,
3849     __unused int                 flags)
3850 {
3851 	/* Receive messages from the control socket. Currently unused. */
3852 	mbuf_freem(m);
3853 	return 0;
3854 }
3855 
3856 static errno_t
ipsec_ctl_setopt(__unused kern_ctl_ref kctlref,__unused u_int32_t unit,void * unitinfo,int opt,void * data,size_t len)3857 ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
3858     __unused u_int32_t             unit,
3859     void                                   *unitinfo,
3860     int                                            opt,
3861     void                                   *data,
3862     size_t                                 len)
3863 {
3864 	errno_t                                 result = 0;
3865 	struct ipsec_pcb                        *pcb = unitinfo;
3866 	if (pcb == NULL) {
3867 		return EINVAL;
3868 	}
3869 
3870 	/* check for privileges for privileged options */
3871 	switch (opt) {
3872 	case IPSEC_OPT_FLAGS:
3873 	case IPSEC_OPT_EXT_IFDATA_STATS:
3874 	case IPSEC_OPT_SET_DELEGATE_INTERFACE:
3875 	case IPSEC_OPT_OUTPUT_TRAFFIC_CLASS:
3876 	case IPSEC_OPT_OUTPUT_DSCP_MAPPING:
3877 		if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3878 			return EPERM;
3879 		}
3880 		break;
3881 	}
3882 
3883 	switch (opt) {
3884 	case IPSEC_OPT_FLAGS: {
3885 		if (len != sizeof(u_int32_t)) {
3886 			result = EMSGSIZE;
3887 		} else {
3888 			pcb->ipsec_external_flags = *(u_int32_t *)data;
3889 		}
3890 		break;
3891 	}
3892 
3893 	case IPSEC_OPT_EXT_IFDATA_STATS: {
3894 		if (len != sizeof(int)) {
3895 			result = EMSGSIZE;
3896 			break;
3897 		}
3898 		if (pcb->ipsec_ifp == NULL) {
3899 			// Only can set after connecting
3900 			result = EINVAL;
3901 			break;
3902 		}
3903 		pcb->ipsec_ext_ifdata_stats = (*(int *)data) ? 1 : 0;
3904 		break;
3905 	}
3906 
3907 	case IPSEC_OPT_INC_IFDATA_STATS_IN:
3908 	case IPSEC_OPT_INC_IFDATA_STATS_OUT: {
3909 		struct ipsec_stats_param *utsp = (struct ipsec_stats_param *)data;
3910 
3911 		if (utsp == NULL || len < sizeof(struct ipsec_stats_param)) {
3912 			result = EINVAL;
3913 			break;
3914 		}
3915 		if (pcb->ipsec_ifp == NULL) {
3916 			// Only can set after connecting
3917 			result = EINVAL;
3918 			break;
3919 		}
3920 		if (!pcb->ipsec_ext_ifdata_stats) {
3921 			result = EINVAL;
3922 			break;
3923 		}
3924 		if (opt == IPSEC_OPT_INC_IFDATA_STATS_IN) {
3925 			ifnet_stat_increment_in(pcb->ipsec_ifp, (uint32_t)utsp->utsp_packets,
3926 			    (uint32_t)utsp->utsp_bytes, (uint32_t)utsp->utsp_errors);
3927 		} else {
3928 			ifnet_stat_increment_out(pcb->ipsec_ifp, (uint32_t)utsp->utsp_packets,
3929 			    (uint32_t)utsp->utsp_bytes, (uint32_t)utsp->utsp_errors);
3930 		}
3931 		break;
3932 	}
3933 
3934 	case IPSEC_OPT_SET_DELEGATE_INTERFACE: {
3935 		ifnet_t del_ifp = NULL;
3936 		char name[IFNAMSIZ];
3937 
3938 		if (len > IFNAMSIZ - 1) {
3939 			result = EMSGSIZE;
3940 			break;
3941 		}
3942 		if (pcb->ipsec_ifp == NULL) {
3943 			// Only can set after connecting
3944 			result = EINVAL;
3945 			break;
3946 		}
3947 		if (len != 0) {                   /* if len==0, del_ifp will be NULL causing the delegate to be removed */
3948 			bcopy(data, name, len);
3949 			name[len] = 0;
3950 			result = ifnet_find_by_name(name, &del_ifp);
3951 		}
3952 		if (result == 0) {
3953 			os_log_error(OS_LOG_DEFAULT, "%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n",
3954 			    __func__, pcb->ipsec_ifp->if_xname,
3955 			    del_ifp ? del_ifp->if_xname : "NULL");
3956 
3957 			result = ifnet_set_delegate(pcb->ipsec_ifp, del_ifp);
3958 			if (del_ifp) {
3959 				ifnet_release(del_ifp);
3960 			}
3961 		}
3962 		break;
3963 	}
3964 
3965 	case IPSEC_OPT_OUTPUT_TRAFFIC_CLASS: {
3966 		if (len != sizeof(int)) {
3967 			result = EMSGSIZE;
3968 			break;
3969 		}
3970 		if (pcb->ipsec_ifp == NULL) {
3971 			// Only can set after connecting
3972 			result = EINVAL;
3973 			break;
3974 		}
3975 		mbuf_svc_class_t output_service_class = so_tc2msc(*(int *)data);
3976 		if (output_service_class == MBUF_SC_UNSPEC) {
3977 			pcb->ipsec_output_service_class = MBUF_SC_OAM;
3978 		} else {
3979 			pcb->ipsec_output_service_class = output_service_class;
3980 		}
3981 		os_log_error(OS_LOG_DEFAULT, "%s IPSEC_OPT_OUTPUT_TRAFFIC_CLASS %s svc %d\n",
3982 		    __func__, pcb->ipsec_ifp->if_xname,
3983 		    pcb->ipsec_output_service_class);
3984 		break;
3985 	}
3986 
3987 #if IPSEC_NEXUS
3988 	case IPSEC_OPT_ENABLE_CHANNEL: {
3989 		if (len != sizeof(int)) {
3990 			result = EMSGSIZE;
3991 			break;
3992 		}
3993 		if (pcb->ipsec_ifp != NULL) {
3994 			// Only can set before connecting
3995 			result = EINVAL;
3996 			break;
3997 		}
3998 		if ((*(int *)data) != 0 &&
3999 		    (*(int *)data) != 1 &&
4000 		    (*(int *)data) != IPSEC_IF_WMM_RING_COUNT) {
4001 			result = EINVAL;
4002 			break;
4003 		}
4004 		lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
4005 		pcb->ipsec_kpipe_count = *(int *)data;
4006 		lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
4007 		break;
4008 	}
4009 
4010 	case IPSEC_OPT_CHANNEL_BIND_PID: {
4011 		if (len != sizeof(pid_t)) {
4012 			result = EMSGSIZE;
4013 			break;
4014 		}
4015 		if (pcb->ipsec_ifp != NULL) {
4016 			// Only can set before connecting
4017 			result = EINVAL;
4018 			break;
4019 		}
4020 		lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
4021 		pcb->ipsec_kpipe_pid = *(pid_t *)data;
4022 		lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
4023 		break;
4024 	}
4025 
4026 	case IPSEC_OPT_CHANNEL_BIND_UUID: {
4027 		if (len != sizeof(uuid_t)) {
4028 			result = EMSGSIZE;
4029 			break;
4030 		}
4031 		if (pcb->ipsec_ifp != NULL) {
4032 			// Only can set before connecting
4033 			result = EINVAL;
4034 			break;
4035 		}
4036 		lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
4037 		uuid_copy(pcb->ipsec_kpipe_proc_uuid, *((uuid_t *)data));
4038 		lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
4039 		break;
4040 	}
4041 
4042 	case IPSEC_OPT_ENABLE_FLOWSWITCH: {
4043 		if (len != sizeof(int)) {
4044 			result = EMSGSIZE;
4045 			break;
4046 		}
4047 		if (pcb->ipsec_ifp == NULL) {
4048 			// Only can set after connecting
4049 			result = EINVAL;
4050 			break;
4051 		}
4052 		if (!if_is_fsw_transport_netagent_enabled()) {
4053 			result = ENOTSUP;
4054 			break;
4055 		}
4056 		if (uuid_is_null(pcb->ipsec_nx.fsw_agent)) {
4057 			result = ENOENT;
4058 			break;
4059 		}
4060 
4061 		uint32_t flags = netagent_get_flags(pcb->ipsec_nx.fsw_agent);
4062 
4063 		if (*(int *)data) {
4064 			flags |= (NETAGENT_FLAG_NEXUS_PROVIDER |
4065 			    NETAGENT_FLAG_NEXUS_LISTENER);
4066 			result = netagent_set_flags(pcb->ipsec_nx.fsw_agent, flags);
4067 			pcb->ipsec_needs_netagent = true;
4068 		} else {
4069 			pcb->ipsec_needs_netagent = false;
4070 			flags &= ~(NETAGENT_FLAG_NEXUS_PROVIDER |
4071 			    NETAGENT_FLAG_NEXUS_LISTENER);
4072 			result = netagent_set_flags(pcb->ipsec_nx.fsw_agent, flags);
4073 		}
4074 		break;
4075 	}
4076 
4077 	case IPSEC_OPT_INPUT_FRAG_SIZE: {
4078 		if (len != sizeof(u_int32_t)) {
4079 			result = EMSGSIZE;
4080 			break;
4081 		}
4082 		u_int32_t input_frag_size = *(u_int32_t *)data;
4083 		if (input_frag_size <= sizeof(struct ip6_hdr)) {
4084 			pcb->ipsec_frag_size_set = FALSE;
4085 			pcb->ipsec_input_frag_size = 0;
4086 		} else {
4087 			pcb->ipsec_frag_size_set = TRUE;
4088 			pcb->ipsec_input_frag_size = input_frag_size;
4089 		}
4090 		break;
4091 	}
4092 	case IPSEC_OPT_ENABLE_NETIF: {
4093 		if (len != sizeof(int)) {
4094 			result = EMSGSIZE;
4095 			break;
4096 		}
4097 		if (pcb->ipsec_ifp != NULL) {
4098 			// Only can set before connecting
4099 			result = EINVAL;
4100 			break;
4101 		}
4102 		lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
4103 		pcb->ipsec_use_netif = !!(*(int *)data);
4104 		lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
4105 		break;
4106 	}
4107 	case IPSEC_OPT_SLOT_SIZE: {
4108 		if (len != sizeof(u_int32_t)) {
4109 			result = EMSGSIZE;
4110 			break;
4111 		}
4112 		if (pcb->ipsec_ifp != NULL) {
4113 			// Only can set before connecting
4114 			result = EINVAL;
4115 			break;
4116 		}
4117 		u_int32_t slot_size = *(u_int32_t *)data;
4118 		if (slot_size < IPSEC_IF_MIN_SLOT_SIZE ||
4119 		    slot_size > IPSEC_IF_MAX_SLOT_SIZE) {
4120 			return EINVAL;
4121 		}
4122 		pcb->ipsec_slot_size = slot_size;
4123 		if (if_ipsec_debug != 0) {
4124 			printf("%s: IPSEC_OPT_SLOT_SIZE %u\n", __func__, slot_size);
4125 		}
4126 		break;
4127 	}
4128 	case IPSEC_OPT_NETIF_RING_SIZE: {
4129 		if (len != sizeof(u_int32_t)) {
4130 			result = EMSGSIZE;
4131 			break;
4132 		}
4133 		if (pcb->ipsec_ifp != NULL) {
4134 			// Only can set before connecting
4135 			result = EINVAL;
4136 			break;
4137 		}
4138 		u_int32_t ring_size = *(u_int32_t *)data;
4139 		if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
4140 		    ring_size > IPSEC_IF_MAX_RING_SIZE) {
4141 			return EINVAL;
4142 		}
4143 		pcb->ipsec_netif_ring_size = ring_size;
4144 		if (if_ipsec_debug != 0) {
4145 			printf("%s: IPSEC_OPT_NETIF_RING_SIZE %u\n", __func__, ring_size);
4146 		}
4147 		break;
4148 	}
4149 	case IPSEC_OPT_TX_FSW_RING_SIZE: {
4150 		if (len != sizeof(u_int32_t)) {
4151 			result = EMSGSIZE;
4152 			break;
4153 		}
4154 		if (pcb->ipsec_ifp != NULL) {
4155 			// Only can set before connecting
4156 			result = EINVAL;
4157 			break;
4158 		}
4159 		u_int32_t ring_size = *(u_int32_t *)data;
4160 		if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
4161 		    ring_size > IPSEC_IF_MAX_RING_SIZE) {
4162 			return EINVAL;
4163 		}
4164 		pcb->ipsec_tx_fsw_ring_size = ring_size;
4165 		if (if_ipsec_debug != 0) {
4166 			printf("%s: IPSEC_OPT_TX_FSW_RING_SIZE %u\n", __func__, ring_size);
4167 		}
4168 		break;
4169 	}
4170 	case IPSEC_OPT_RX_FSW_RING_SIZE: {
4171 		if (len != sizeof(u_int32_t)) {
4172 			result = EMSGSIZE;
4173 			break;
4174 		}
4175 		if (pcb->ipsec_ifp != NULL) {
4176 			// Only can set before connecting
4177 			result = EINVAL;
4178 			break;
4179 		}
4180 		u_int32_t ring_size = *(u_int32_t *)data;
4181 		if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
4182 		    ring_size > IPSEC_IF_MAX_RING_SIZE) {
4183 			return EINVAL;
4184 		}
4185 		pcb->ipsec_rx_fsw_ring_size = ring_size;
4186 		if (if_ipsec_debug != 0) {
4187 			printf("%s: IPSEC_OPT_TX_FSW_RING_SIZE %u\n", __func__, ring_size);
4188 		}
4189 		break;
4190 	}
4191 	case IPSEC_OPT_KPIPE_TX_RING_SIZE: {
4192 		if (len != sizeof(u_int32_t)) {
4193 			result = EMSGSIZE;
4194 			break;
4195 		}
4196 		if (pcb->ipsec_ifp != NULL) {
4197 			// Only can set before connecting
4198 			result = EINVAL;
4199 			break;
4200 		}
4201 		u_int32_t ring_size = *(u_int32_t *)data;
4202 		if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
4203 		    ring_size > IPSEC_IF_MAX_RING_SIZE) {
4204 			return EINVAL;
4205 		}
4206 		pcb->ipsec_kpipe_tx_ring_size = ring_size;
4207 		if (if_ipsec_debug != 0) {
4208 			printf("%s: IPSEC_OPT_KPIPE_TX_RING_SIZE %u\n", __func__, ring_size);
4209 		}
4210 		break;
4211 	}
4212 	case IPSEC_OPT_KPIPE_RX_RING_SIZE: {
4213 		if (len != sizeof(u_int32_t)) {
4214 			result = EMSGSIZE;
4215 			break;
4216 		}
4217 		if (pcb->ipsec_ifp != NULL) {
4218 			// Only can set before connecting
4219 			result = EINVAL;
4220 			break;
4221 		}
4222 		u_int32_t ring_size = *(u_int32_t *)data;
4223 		if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
4224 		    ring_size > IPSEC_IF_MAX_RING_SIZE) {
4225 			return EINVAL;
4226 		}
4227 		pcb->ipsec_kpipe_rx_ring_size = ring_size;
4228 		if (if_ipsec_debug != 0) {
4229 			printf("%s: IPSEC_OPT_KPIPE_RX_RING_SIZE %u\n", __func__, ring_size);
4230 		}
4231 		break;
4232 	}
4233 	case IPSEC_OPT_OUTPUT_DSCP_MAPPING: {
4234 		if (len != sizeof(int)) {
4235 			result = EMSGSIZE;
4236 			break;
4237 		}
4238 		if (pcb->ipsec_ifp == NULL) {
4239 			// Only can set after connecting
4240 			result = EINVAL;
4241 			break;
4242 		}
4243 
4244 		ipsec_dscp_mapping_t output_dscp_mapping = (ipsec_dscp_mapping_t)(*(int *)data);
4245 		if (output_dscp_mapping > IPSEC_DSCP_MAPPING_LEGACY) {
4246 			return EINVAL;
4247 		}
4248 
4249 		pcb->ipsec_output_dscp_mapping = output_dscp_mapping;
4250 
4251 		os_log(OS_LOG_DEFAULT, "%s IPSEC_OPT_OUTPUT_DSCP_MAPPING %s DSCP %d\n",
4252 		    __func__, pcb->ipsec_ifp->if_xname,
4253 		    pcb->ipsec_output_dscp_mapping);
4254 		break;
4255 	}
4256 
4257 #endif // IPSEC_NEXUS
4258 
4259 	default: {
4260 		result = ENOPROTOOPT;
4261 		break;
4262 	}
4263 	}
4264 
4265 	return result;
4266 }
4267 
/*
 * Kernel-control getsockopt handler for the ipsec interface.
 *
 * Copies the requested pcb field into 'data'.  For every option except
 * IPSEC_OPT_IFNAME and IPSEC_OPT_GET_CHANNEL_UUID, *len must equal the
 * option's exact size or EMSGSIZE is returned.  Fields written by other
 * threads are read under the pcb lock where the setters also lock.
 * Returns 0 on success or an errno.
 */
static errno_t
ipsec_ctl_getopt(__unused kern_ctl_ref kctlref,
    __unused u_int32_t unit,
    void *unitinfo,
    int opt,
    void *data,
    size_t *len)
{
	errno_t result = 0;
	struct ipsec_pcb *pcb = unitinfo;
	if (pcb == NULL) {
		return EINVAL;
	}

	switch (opt) {
	case IPSEC_OPT_FLAGS: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_external_flags;
		}
		break;
	}

	case IPSEC_OPT_EXT_IFDATA_STATS: {
		if (*len != sizeof(int)) {
			result = EMSGSIZE;
		} else {
			*(int *)data = (pcb->ipsec_ext_ifdata_stats) ? 1 : 0;
		}
		break;
	}

	case IPSEC_OPT_IFNAME: {
		if (*len < MIN(strlen(pcb->ipsec_if_xname) + 1, sizeof(pcb->ipsec_if_xname))) {
			result = EMSGSIZE;
		} else {
			if (pcb->ipsec_ifp == NULL) {
				// Only can get after connecting
				result = EINVAL;
				break;
			}
			/* Report the number of bytes written, including the NUL. */
			*len = scnprintf(data, *len, "%s", pcb->ipsec_if_xname) + 1;
		}
		break;
	}

	case IPSEC_OPT_OUTPUT_TRAFFIC_CLASS: {
		if (*len != sizeof(int)) {
			result = EMSGSIZE;
		} else {
			/* Translate the stored mbuf service class back to a traffic class. */
			*(int *)data = so_svc2tc(pcb->ipsec_output_service_class);
		}
		break;
	}

#if IPSEC_NEXUS

	case IPSEC_OPT_ENABLE_CHANNEL: {
		if (*len != sizeof(int)) {
			result = EMSGSIZE;
		} else {
			lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
			*(int *)data = pcb->ipsec_kpipe_count;
			lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		}
		break;
	}

	case IPSEC_OPT_CHANNEL_BIND_PID: {
		if (*len != sizeof(pid_t)) {
			result = EMSGSIZE;
		} else {
			lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
			*(pid_t *)data = pcb->ipsec_kpipe_pid;
			lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		}
		break;
	}

	case IPSEC_OPT_CHANNEL_BIND_UUID: {
		if (*len != sizeof(uuid_t)) {
			result = EMSGSIZE;
		} else {
			lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
			uuid_copy(*((uuid_t *)data), pcb->ipsec_kpipe_proc_uuid);
			lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		}
		break;
	}

	case IPSEC_OPT_ENABLE_FLOWSWITCH: {
		if (*len != sizeof(int)) {
			result = EMSGSIZE;
		} else {
			*(int *)data = if_check_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.fsw_agent);
		}
		break;
	}

	case IPSEC_OPT_ENABLE_NETIF: {
		if (*len != sizeof(int)) {
			result = EMSGSIZE;
		} else {
			lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
			*(int *)data = !!pcb->ipsec_use_netif;
			lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		}
		break;
	}

	case IPSEC_OPT_GET_CHANNEL_UUID: {
		/* Caller must size the buffer for one uuid_t per kpipe. */
		lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
		if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
			result = ENXIO;
		} else if (*len != sizeof(uuid_t) * pcb->ipsec_kpipe_count) {
			result = EMSGSIZE;
		} else {
			for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) {
				uuid_copy(((uuid_t *)data)[i], pcb->ipsec_kpipe_uuid[i]);
			}
		}
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
		break;
	}

	case IPSEC_OPT_INPUT_FRAG_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_input_frag_size;
		}
		break;
	}
	case IPSEC_OPT_SLOT_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_slot_size;
		}
		break;
	}
	case IPSEC_OPT_NETIF_RING_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_netif_ring_size;
		}
		break;
	}
	case IPSEC_OPT_TX_FSW_RING_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_tx_fsw_ring_size;
		}
		break;
	}
	case IPSEC_OPT_RX_FSW_RING_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_rx_fsw_ring_size;
		}
		break;
	}
	case IPSEC_OPT_KPIPE_TX_RING_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_kpipe_tx_ring_size;
		}
		break;
	}
	case IPSEC_OPT_KPIPE_RX_RING_SIZE: {
		if (*len != sizeof(u_int32_t)) {
			result = EMSGSIZE;
		} else {
			*(u_int32_t *)data = pcb->ipsec_kpipe_rx_ring_size;
		}
		break;
	}

#endif // IPSEC_NEXUS

	default: {
		result = ENOPROTOOPT;
		break;
	}
	}

	return result;
}
4461 
4462 /* Network Interface functions */
4463 static errno_t
ipsec_output(ifnet_t interface,mbuf_t data)4464 ipsec_output(ifnet_t interface,
4465     mbuf_t data)
4466 {
4467 	struct ipsec_pcb *pcb = ifnet_softc(interface);
4468 	struct ipsec_output_state ipsec_state;
4469 	struct route ro;
4470 	struct route_in6 ro6;
4471 	size_t length;
4472 	struct ip *ip = NULL;
4473 	struct ip6_hdr *ip6 = NULL;
4474 	struct ip_out_args ipoa;
4475 	struct ip6_out_args ip6oa;
4476 	int error = 0;
4477 	u_int ip_version = 0;
4478 	int flags = 0;
4479 	struct flowadv *adv = NULL;
4480 
4481 	// Make sure this packet isn't looping through the interface
4482 	if (necp_get_last_interface_index_from_packet(data) == interface->if_index) {
4483 		error = EINVAL;
4484 		goto ipsec_output_err;
4485 	}
4486 
4487 	// Mark the interface so NECP can evaluate tunnel policy
4488 	necp_mark_packet_from_interface(data, interface);
4489 
4490 	if (data->m_len < sizeof(*ip)) {
4491 		os_log_error(OS_LOG_DEFAULT, "ipsec_output: first mbuf length shorter than IP header length: %d.\n", data->m_len);
4492 		IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
4493 		error = EINVAL;
4494 		goto ipsec_output_err;
4495 	}
4496 
4497 	ip = mtod(data, struct ip *);
4498 	ip_version = ip->ip_v;
4499 
4500 	switch (ip_version) {
4501 	case 4: {
4502 		u_int8_t ip_hlen = 0;
4503 #ifdef _IP_VHL
4504 		ip_hlen = _IP_VHL_HL(ip->ip_vhl) << 2;
4505 #else
4506 		ip_hlen = (uint8_t)(ip->ip_hl << 2);
4507 #endif
4508 		if (ip_hlen < sizeof(*ip)) {
4509 			os_log_error(OS_LOG_DEFAULT, "ipsec_output: Bad ip header length %d.\n", ip_hlen);
4510 			IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
4511 			error = EINVAL;
4512 			goto ipsec_output_err;
4513 		}
4514 #if IPSEC_NEXUS
4515 		if (!pcb->ipsec_use_netif)
4516 #endif // IPSEC_NEXUS
4517 		{
4518 			int af = AF_INET;
4519 			bpf_tap_out(pcb->ipsec_ifp, DLT_NULL, data, &af, sizeof(af));
4520 		}
4521 
4522 		/* Apply encryption */
4523 		memset(&ipsec_state, 0, sizeof(ipsec_state));
4524 		ipsec_state.m = data;
4525 		ipsec_state.dst = (struct sockaddr *)&ip->ip_dst;
4526 		memset(&ipsec_state.ro, 0, sizeof(ipsec_state.ro));
4527 		ipsec_state.dscp_mapping = pcb->ipsec_output_dscp_mapping;
4528 
4529 		error = ipsec4_interface_output(&ipsec_state, interface);
4530 		/* Tunneled in IPv6 - packet is gone */
4531 		if (error == 0 && ipsec_state.tunneled == 6) {
4532 			goto done;
4533 		}
4534 
4535 		data = ipsec_state.m;
4536 		if (error || data == NULL) {
4537 			if (error) {
4538 				os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec4_output error %d.\n", error);
4539 			}
4540 			goto ipsec_output_err;
4541 		}
4542 
4543 		/* Set traffic class, set flow */
4544 		m_set_service_class(data, pcb->ipsec_output_service_class);
4545 		data->m_pkthdr.pkt_flowsrc = FLOWSRC_IFNET;
4546 #if SKYWALK
4547 		data->m_pkthdr.pkt_mpriv_srcid = interface->if_flowhash;
4548 #else /* !SKYWALK */
4549 		data->m_pkthdr.pkt_flowid = interface->if_flowhash;
4550 #endif /* !SKYWALK */
4551 		data->m_pkthdr.pkt_proto = ip->ip_p;
4552 		data->m_pkthdr.pkt_flags = (PKTF_FLOW_ID | PKTF_FLOW_ADV | PKTF_FLOW_LOCALSRC);
4553 
4554 		/* Flip endian-ness for ip_output */
4555 		ip = mtod(data, struct ip *);
4556 		NTOHS(ip->ip_len);
4557 		NTOHS(ip->ip_off);
4558 
4559 		/* Increment statistics */
4560 		length = mbuf_pkthdr_len(data);
4561 		ifnet_stat_increment_out(interface, 1, (uint16_t)length, 0);
4562 
4563 		/* Send to ip_output */
4564 		memset(&ro, 0, sizeof(ro));
4565 
4566 		flags = (IP_OUTARGS |   /* Passing out args to specify interface */
4567 		    IP_NOIPSEC);                        /* To ensure the packet doesn't go through ipsec twice */
4568 
4569 		memset(&ipoa, 0, sizeof(ipoa));
4570 		ipoa.ipoa_flowadv.code = 0;
4571 		ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
4572 		if (ipsec_state.outgoing_if) {
4573 			ipoa.ipoa_boundif = ipsec_state.outgoing_if;
4574 			ipoa.ipoa_flags |= IPOAF_BOUND_IF;
4575 		}
4576 		ipsec_set_ipoa_for_interface(pcb->ipsec_ifp, &ipoa);
4577 
4578 		adv = &ipoa.ipoa_flowadv;
4579 
4580 		(void)ip_output(data, NULL, &ro, flags, NULL, &ipoa);
4581 		data = NULL;
4582 
4583 		if (adv->code == FADV_FLOW_CONTROLLED || adv->code == FADV_SUSPENDED) {
4584 			error = ENOBUFS;
4585 			ifnet_disable_output(interface);
4586 		}
4587 
4588 		goto done;
4589 	}
4590 	case 6: {
4591 		if (data->m_len < sizeof(*ip6)) {
4592 			os_log_error(OS_LOG_DEFAULT, "ipsec_output: first mbuf length shorter than IPv6 header length: %d.\n", data->m_len);
4593 			IPSEC_STAT_INCREMENT(ipsec6stat.out_inval);
4594 			error = EINVAL;
4595 			goto ipsec_output_err;
4596 		}
4597 #if IPSEC_NEXUS
4598 		if (!pcb->ipsec_use_netif)
4599 #endif // IPSEC_NEXUS
4600 		{
4601 			int af = AF_INET6;
4602 			bpf_tap_out(pcb->ipsec_ifp, DLT_NULL, data, &af, sizeof(af));
4603 		}
4604 
4605 		data = ipsec6_splithdr(data);
4606 		if (data == NULL) {
4607 			os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec6_splithdr returned NULL\n");
4608 			goto ipsec_output_err;
4609 		}
4610 
4611 		ip6 = mtod(data, struct ip6_hdr *);
4612 
4613 		memset(&ipsec_state, 0, sizeof(ipsec_state));
4614 		ipsec_state.m = data;
4615 		ipsec_state.dst = (struct sockaddr *)&ip6->ip6_dst;
4616 		memset(&ipsec_state.ro, 0, sizeof(ipsec_state.ro));
4617 		ipsec_state.dscp_mapping = pcb->ipsec_output_dscp_mapping;
4618 
4619 		error = ipsec6_interface_output(&ipsec_state, interface, &ip6->ip6_nxt, ipsec_state.m);
4620 		if (error == 0 && ipsec_state.tunneled == 4) {          /* tunneled in IPv4 - packet is gone */
4621 			goto done;
4622 		}
4623 		data = ipsec_state.m;
4624 		if (error || data == NULL) {
4625 			if (error) {
4626 				os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec6_output error %d\n", error);
4627 			}
4628 			goto ipsec_output_err;
4629 		}
4630 
4631 		/* Set traffic class, set flow */
4632 		m_set_service_class(data, pcb->ipsec_output_service_class);
4633 		data->m_pkthdr.pkt_flowsrc = FLOWSRC_IFNET;
4634 #if SKYWALK
4635 		data->m_pkthdr.pkt_mpriv_srcid = interface->if_flowhash;
4636 #else /* !SKYWALK */
4637 		data->m_pkthdr.pkt_flowid = interface->if_flowhash;
4638 #endif /* !SKYWALK */
4639 		data->m_pkthdr.pkt_proto = ip6->ip6_nxt;
4640 		data->m_pkthdr.pkt_flags = (PKTF_FLOW_ID | PKTF_FLOW_ADV | PKTF_FLOW_LOCALSRC);
4641 
4642 		/* Increment statistics */
4643 		length = mbuf_pkthdr_len(data);
4644 		ifnet_stat_increment_out(interface, 1, (uint16_t)length, 0);
4645 
4646 		/* Send to ip6_output */
4647 		memset(&ro6, 0, sizeof(ro6));
4648 
4649 		flags = IPV6_OUTARGS;
4650 
4651 		memset(&ip6oa, 0, sizeof(ip6oa));
4652 		ip6oa.ip6oa_flowadv.code = 0;
4653 		ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
4654 		if (ipsec_state.outgoing_if) {
4655 			ip6oa.ip6oa_boundif = ipsec_state.outgoing_if;
4656 			ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
4657 			ip6_output_setsrcifscope(data, ipsec_state.outgoing_if, NULL);
4658 			ip6_output_setdstifscope(data, ipsec_state.outgoing_if, NULL);
4659 		} else {
4660 			ip6_output_setsrcifscope(data, IFSCOPE_UNKNOWN, NULL);
4661 			ip6_output_setdstifscope(data, IFSCOPE_UNKNOWN, NULL);
4662 		}
4663 		ipsec_set_ip6oa_for_interface(pcb->ipsec_ifp, &ip6oa);
4664 
4665 		adv = &ip6oa.ip6oa_flowadv;
4666 
4667 		(void) ip6_output(data, NULL, &ro6, flags, NULL, NULL, &ip6oa);
4668 		data = NULL;
4669 
4670 		if (adv->code == FADV_FLOW_CONTROLLED || adv->code == FADV_SUSPENDED) {
4671 			error = ENOBUFS;
4672 			ifnet_disable_output(interface);
4673 		}
4674 
4675 		goto done;
4676 	}
4677 	default: {
4678 		os_log_error(OS_LOG_DEFAULT, "ipsec_output: Received unknown packet version %d.\n", ip_version);
4679 		error = EINVAL;
4680 		goto ipsec_output_err;
4681 	}
4682 	}
4683 
4684 done:
4685 	return error;
4686 
4687 ipsec_output_err:
4688 	if (data) {
4689 		mbuf_freem(data);
4690 	}
4691 	goto done;
4692 }
4693 
4694 static void
ipsec_start(ifnet_t interface)4695 ipsec_start(ifnet_t     interface)
4696 {
4697 	mbuf_t data;
4698 	struct ipsec_pcb *pcb = ifnet_softc(interface);
4699 
4700 	VERIFY(pcb != NULL);
4701 	for (;;) {
4702 		if (ifnet_dequeue(interface, &data) != 0) {
4703 			break;
4704 		}
4705 		if (ipsec_output(interface, data) != 0) {
4706 			break;
4707 		}
4708 	}
4709 }
4710 
4711 /* Network Interface functions */
4712 static errno_t
ipsec_demux(__unused ifnet_t interface,mbuf_t data,__unused char * frame_header,protocol_family_t * protocol)4713 ipsec_demux(__unused ifnet_t    interface,
4714     mbuf_t                          data,
4715     __unused char           *frame_header,
4716     protocol_family_t       *protocol)
4717 {
4718 	struct ip *ip;
4719 	u_int ip_version;
4720 
4721 	while (data != NULL && mbuf_len(data) < 1) {
4722 		data = mbuf_next(data);
4723 	}
4724 
4725 	if (data == NULL) {
4726 		return ENOENT;
4727 	}
4728 
4729 	ip = mtod(data, struct ip *);
4730 	ip_version = ip->ip_v;
4731 
4732 	switch (ip_version) {
4733 	case 4:
4734 		*protocol = PF_INET;
4735 		return 0;
4736 	case 6:
4737 		*protocol = PF_INET6;
4738 		return 0;
4739 	default:
4740 		*protocol = PF_UNSPEC;
4741 		break;
4742 	}
4743 
4744 	return 0;
4745 }
4746 
4747 static errno_t
ipsec_add_proto(__unused ifnet_t interface,protocol_family_t protocol,__unused const struct ifnet_demux_desc * demux_array,__unused u_int32_t demux_count)4748 ipsec_add_proto(__unused ifnet_t                                                interface,
4749     protocol_family_t                                               protocol,
4750     __unused const struct ifnet_demux_desc  *demux_array,
4751     __unused u_int32_t                                              demux_count)
4752 {
4753 	switch (protocol) {
4754 	case PF_INET:
4755 		return 0;
4756 	case PF_INET6:
4757 		return 0;
4758 	default:
4759 		break;
4760 	}
4761 
4762 	return ENOPROTOOPT;
4763 }
4764 
4765 static errno_t
ipsec_del_proto(__unused ifnet_t interface,__unused protocol_family_t protocol)4766 ipsec_del_proto(__unused ifnet_t                        interface,
4767     __unused protocol_family_t      protocol)
4768 {
4769 	return 0;
4770 }
4771 
4772 static errno_t
ipsec_ioctl(ifnet_t interface,u_long command,void * data)4773 ipsec_ioctl(ifnet_t interface,
4774     u_long command,
4775     void *data)
4776 {
4777 #if IPSEC_NEXUS
4778 	struct ipsec_pcb *pcb = ifnet_softc(interface);
4779 #endif
4780 	errno_t result = 0;
4781 
4782 	switch (command) {
4783 	case SIOCSIFMTU: {
4784 #if IPSEC_NEXUS
4785 		if (pcb->ipsec_use_netif) {
4786 			// Make sure we can fit packets in the channel buffers
4787 			if (((uint64_t)((struct ifreq*)data)->ifr_mtu) > pcb->ipsec_slot_size) {
4788 				result = EINVAL;
4789 			} else {
4790 				ifnet_set_mtu(interface, (uint32_t)((struct ifreq*)data)->ifr_mtu);
4791 			}
4792 		} else
4793 #endif // IPSEC_NEXUS
4794 		{
4795 			ifnet_set_mtu(interface, ((struct ifreq*)data)->ifr_mtu);
4796 		}
4797 		break;
4798 	}
4799 
4800 	case SIOCSIFFLAGS:
4801 		/* ifioctl() takes care of it */
4802 		break;
4803 
4804 	case SIOCSIFSUBFAMILY: {
4805 		uint32_t subfamily;
4806 
4807 		subfamily = ((struct ifreq*)data)->ifr_type.ift_subfamily;
4808 		switch (subfamily) {
4809 		case IFRTYPE_SUBFAMILY_BLUETOOTH:
4810 			interface->if_subfamily = IFNET_SUBFAMILY_BLUETOOTH;
4811 			break;
4812 		case IFRTYPE_SUBFAMILY_WIFI:
4813 			interface->if_subfamily = IFNET_SUBFAMILY_WIFI;
4814 			break;
4815 		case IFRTYPE_SUBFAMILY_QUICKRELAY:
4816 			interface->if_subfamily = IFNET_SUBFAMILY_QUICKRELAY;
4817 			break;
4818 		case IFRTYPE_SUBFAMILY_DEFAULT:
4819 			interface->if_subfamily = IFNET_SUBFAMILY_DEFAULT;
4820 			break;
4821 		default:
4822 			result = EINVAL;
4823 			break;
4824 		}
4825 		break;
4826 	}
4827 
4828 	default:
4829 		result = EOPNOTSUPP;
4830 	}
4831 
4832 	return result;
4833 }
4834 
/*
 * ifnet detach-complete callback: release the interface reference,
 * free the pcb, and dispose of the ifnet.
 */
static void
ipsec_detached(ifnet_t interface)
{
	struct ipsec_pcb *pcb = ifnet_softc(interface);

	/* Drop the reference taken when the interface was created. */
	(void)ifnet_release(interface);
	/*
	 * Free the pcb and dispose of the ifnet under ipsec_lock so the
	 * teardown is atomic with respect to the global pcb list.
	 * NOTE(review): ifnet_dispose() is deliberately called while
	 * still holding ipsec_lock — preserve this ordering.
	 */
	lck_mtx_lock(&ipsec_lock);
	ipsec_free_pcb(pcb, true);
	(void)ifnet_dispose(interface);
	lck_mtx_unlock(&ipsec_lock);
}
4846 
4847 /* Protocol Handlers */
4848 
4849 static errno_t
ipsec_proto_input(ifnet_t interface,protocol_family_t protocol,mbuf_t m,__unused char * frame_header)4850 ipsec_proto_input(ifnet_t interface,
4851     protocol_family_t     protocol,
4852     mbuf_t m,
4853     __unused char *frame_header)
4854 {
4855 	mbuf_pkthdr_setrcvif(m, interface);
4856 
4857 #if IPSEC_NEXUS
4858 	struct ipsec_pcb *pcb = ifnet_softc(interface);
4859 	if (!pcb->ipsec_use_netif)
4860 #endif // IPSEC_NEXUS
4861 	{
4862 		uint32_t af = 0;
4863 		struct ip *ip = mtod(m, struct ip *);
4864 		if (ip->ip_v == 4) {
4865 			af = AF_INET;
4866 		} else if (ip->ip_v == 6) {
4867 			af = AF_INET6;
4868 		}
4869 		bpf_tap_in(interface, DLT_NULL, m, &af, sizeof(af));
4870 		pktap_input(interface, protocol, m, NULL);
4871 	}
4872 
4873 	int32_t pktlen = m->m_pkthdr.len;
4874 	if (proto_input(protocol, m) != 0) {
4875 		ifnet_stat_increment_in(interface, 0, 0, 1);
4876 		m_freem(m);
4877 	} else {
4878 		ifnet_stat_increment_in(interface, 1, pktlen, 0);
4879 	}
4880 
4881 	return 0;
4882 }
4883 
4884 static errno_t
ipsec_proto_pre_output(__unused ifnet_t interface,protocol_family_t protocol,__unused mbuf_t * packet,__unused const struct sockaddr * dest,__unused void * route,__unused char * frame_type,__unused char * link_layer_dest)4885 ipsec_proto_pre_output(__unused ifnet_t interface,
4886     protocol_family_t    protocol,
4887     __unused mbuf_t              *packet,
4888     __unused const struct sockaddr *dest,
4889     __unused void *route,
4890     __unused char *frame_type,
4891     __unused char *link_layer_dest)
4892 {
4893 	*(protocol_family_t *)(void *)frame_type = protocol;
4894 	return 0;
4895 }
4896 
4897 static errno_t
ipsec_attach_proto(ifnet_t interface,protocol_family_t protocol)4898 ipsec_attach_proto(ifnet_t                              interface,
4899     protocol_family_t    protocol)
4900 {
4901 	struct ifnet_attach_proto_param proto;
4902 	errno_t                                                 result;
4903 
4904 	bzero(&proto, sizeof(proto));
4905 	proto.input = ipsec_proto_input;
4906 	proto.pre_output = ipsec_proto_pre_output;
4907 
4908 	result = ifnet_attach_protocol(interface, protocol, &proto);
4909 	if (result != 0 && result != EEXIST) {
4910 		os_log_error(OS_LOG_DEFAULT, "ipsec_attach_inet - ifnet_attach_protocol %d failed: %d\n",
4911 		    protocol, result);
4912 	}
4913 
4914 	return result;
4915 }
4916 
/*
 * Inject a decrypted packet (or chain of packets linked via m_nextpkt)
 * into the interface's inbound path.
 *
 * Netif path: the chain is appended to the pcb input chain and the rx
 * ring is notified.  Returns ENXIO when the data path is stopped and
 * ENOSPC when the pending-input limit is exceeded; in both cases the
 * caller keeps ownership of the packet.  Non-netif path: the packet is
 * classified with ipsec_demux() and delivered via ipsec_proto_input().
 */
errno_t
ipsec_inject_inbound_packet(ifnet_t     interface,
    mbuf_t      packet)
{
#if IPSEC_NEXUS
	struct ipsec_pcb *pcb = ifnet_softc(interface);

	if (pcb->ipsec_use_netif) {
		/* Refuse new work while the data path is stopped/draining. */
		if (!ipsec_data_move_begin(pcb)) {
			os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__,
			    if_name(pcb->ipsec_ifp));
			return ENXIO;
		}

		lck_rw_lock_shared(&pcb->ipsec_pcb_lock);

		lck_mtx_lock(&pcb->ipsec_input_chain_lock);

		/* Bound the backlog of packets waiting for the rx ring. */
		if (pcb->ipsec_input_chain_count > (u_int32_t)if_ipsec_max_pending_input) {
			lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
			lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
			ipsec_data_move_end(pcb);
			return ENOSPC;
		}

		/* Append the chain to the pending input list. */
		if (pcb->ipsec_input_chain != NULL) {
			pcb->ipsec_input_chain_last->m_nextpkt = packet;
		} else {
			pcb->ipsec_input_chain = packet;
		}
		pcb->ipsec_input_chain_count++;
		/* Walk to the tail, counting each packet in the chain. */
		while (packet->m_nextpkt) {
			VERIFY(packet != packet->m_nextpkt);
			packet = packet->m_nextpkt;
			pcb->ipsec_input_chain_count++;
		}
		pcb->ipsec_input_chain_last = packet;
		lck_mtx_unlock(&pcb->ipsec_input_chain_lock);

		/* Snapshot the ring pointer while still holding the pcb lock. */
		kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring[0];
		lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);

		if (rx_ring != NULL) {
			kern_channel_notify(rx_ring, 0);
		}

		ipsec_data_move_end(pcb);
		return 0;
	} else
#endif // IPSEC_NEXUS
	{
		errno_t error;
		protocol_family_t protocol;
		if ((error = ipsec_demux(interface, packet, NULL, &protocol)) != 0) {
			return error;
		}

		return ipsec_proto_input(interface, protocol, packet, NULL);
	}
}
4977 
4978 void
ipsec_set_pkthdr_for_interface(ifnet_t interface,mbuf_t packet,int family,uint32_t flowid)4979 ipsec_set_pkthdr_for_interface(ifnet_t interface, mbuf_t packet, int family,
4980     uint32_t flowid)
4981 {
4982 #pragma unused (flowid)
4983 	if (packet != NULL && interface != NULL) {
4984 		struct ipsec_pcb *pcb = ifnet_softc(interface);
4985 		if (pcb != NULL) {
4986 			/* Set traffic class, set flow */
4987 			m_set_service_class(packet, pcb->ipsec_output_service_class);
4988 			packet->m_pkthdr.pkt_flowsrc = FLOWSRC_IFNET;
4989 #if SKYWALK
4990 			packet->m_pkthdr.pkt_mpriv_srcid = interface->if_flowhash;
4991 			packet->m_pkthdr.pkt_flowid = flowid;
4992 #else /* !SKYWALK */
4993 			packet->m_pkthdr.pkt_flowid = interface->if_flowhash;
4994 #endif /* !SKYWALK */
4995 			if (family == AF_INET) {
4996 				struct ip *ip = mtod(packet, struct ip *);
4997 				packet->m_pkthdr.pkt_proto = ip->ip_p;
4998 			} else if (family == AF_INET6) {
4999 				struct ip6_hdr *ip6 = mtod(packet, struct ip6_hdr *);
5000 				packet->m_pkthdr.pkt_proto = ip6->ip6_nxt;
5001 			}
5002 			packet->m_pkthdr.pkt_flags = (PKTF_FLOW_ID | PKTF_FLOW_ADV | PKTF_FLOW_LOCALSRC);
5003 		}
5004 	}
5005 }
5006 
5007 void
ipsec_set_ipoa_for_interface(ifnet_t interface,struct ip_out_args * ipoa)5008 ipsec_set_ipoa_for_interface(ifnet_t interface, struct ip_out_args *ipoa)
5009 {
5010 	struct ipsec_pcb *pcb;
5011 
5012 	if (interface == NULL || ipoa == NULL) {
5013 		return;
5014 	}
5015 	pcb = ifnet_softc(interface);
5016 
5017 	if (net_qos_policy_restricted == 0) {
5018 		ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
5019 		ipoa->ipoa_sotc = so_svc2tc(pcb->ipsec_output_service_class);
5020 	} else if (pcb->ipsec_output_service_class != MBUF_SC_VO ||
5021 	    net_qos_policy_restrict_avapps != 0) {
5022 		ipoa->ipoa_flags &= ~IPOAF_QOSMARKING_ALLOWED;
5023 	} else {
5024 		ipoa->ipoa_flags |= IP6OAF_QOSMARKING_ALLOWED;
5025 		ipoa->ipoa_sotc = SO_TC_VO;
5026 	}
5027 }
5028 
5029 void
ipsec_set_ip6oa_for_interface(ifnet_t interface,struct ip6_out_args * ip6oa)5030 ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa)
5031 {
5032 	struct ipsec_pcb *pcb;
5033 
5034 	if (interface == NULL || ip6oa == NULL) {
5035 		return;
5036 	}
5037 	pcb = ifnet_softc(interface);
5038 
5039 	if (net_qos_policy_restricted == 0) {
5040 		ip6oa->ip6oa_flags |= IPOAF_QOSMARKING_ALLOWED;
5041 		ip6oa->ip6oa_sotc = so_svc2tc(pcb->ipsec_output_service_class);
5042 	} else if (pcb->ipsec_output_service_class != MBUF_SC_VO ||
5043 	    net_qos_policy_restrict_avapps != 0) {
5044 		ip6oa->ip6oa_flags &= ~IPOAF_QOSMARKING_ALLOWED;
5045 	} else {
5046 		ip6oa->ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
5047 		ip6oa->ip6oa_sotc = SO_TC_VO;
5048 	}
5049 }
5050 
5051 static boolean_t
ipsec_data_move_begin(struct ipsec_pcb * pcb)5052 ipsec_data_move_begin(struct ipsec_pcb *pcb)
5053 {
5054 	boolean_t ret = 0;
5055 
5056 	lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock);
5057 	if ((ret = IPSEC_IS_DATA_PATH_READY(pcb))) {
5058 		pcb->ipsec_pcb_data_move++;
5059 	}
5060 	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
5061 
5062 	return ret;
5063 }
5064 
5065 static void
ipsec_data_move_end(struct ipsec_pcb * pcb)5066 ipsec_data_move_end(struct ipsec_pcb *pcb)
5067 {
5068 	lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock);
5069 	VERIFY(pcb->ipsec_pcb_data_move > 0);
5070 	/*
5071 	 * if there's no more thread moving data, wakeup any
5072 	 * drainers that's blocked waiting for this.
5073 	 */
5074 	if (--pcb->ipsec_pcb_data_move == 0 && pcb->ipsec_pcb_drainers > 0) {
5075 		wakeup(&(pcb->ipsec_pcb_data_move));
5076 	}
5077 	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
5078 }
5079 
/*
 * Block until no threads remain in the data path.  The caller must
 * have already cleared the data-path-ready flag (see
 * ipsec_wait_data_move_drain) so no new movers can enter while we
 * wait.
 */
static void
ipsec_data_move_drain(struct ipsec_pcb *pcb)
{
	lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
	/* data path must already be marked as not ready */
	VERIFY(!IPSEC_IS_DATA_PATH_READY(pcb));
	pcb->ipsec_pcb_drainers++;
	/* Sleep until every in-flight mover calls ipsec_data_move_end(). */
	while (pcb->ipsec_pcb_data_move != 0) {
		(void)msleep(&(pcb->ipsec_pcb_data_move), &pcb->ipsec_pcb_data_move_lock,
		    (PZERO - 1), __func__, NULL);
	}
	VERIFY(!IPSEC_IS_DATA_PATH_READY(pcb));
	VERIFY(pcb->ipsec_pcb_drainers > 0);
	pcb->ipsec_pcb_drainers--;
	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
}
5096 
5097 static void
ipsec_wait_data_move_drain(struct ipsec_pcb * pcb)5098 ipsec_wait_data_move_drain(struct ipsec_pcb *pcb)
5099 {
5100 	/*
5101 	 * Mark the data path as not usable.
5102 	 */
5103 	lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
5104 	IPSEC_CLR_DATA_PATH_READY(pcb);
5105 	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
5106 
5107 	/* Wait until all threads in the data paths are done. */
5108 	ipsec_data_move_drain(pcb);
5109 }
5110