xref: /xnu-11215.41.3/bsd/net/content_filter.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2013-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. Please obtain a copy of the License at
10  * http://www.opensource.apple.com/apsl/ and read it before using this
11  * file.
12  *
13  * The Original Code and all software distributed under the License are
14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18  * Please see the License for the specific language governing rights and
19  * limitations under the License.
20  *
21  * @APPLE_LICENSE_HEADER_END@
22  */
23 
24 /*
25  * THEORY OF OPERATION
26  *
27  * The socket content filter subsystem provides a way for user space agents to
28  * make filtering decisions based on the content of the data being sent and
29  * received by INET/INET6 sockets.
30  *
31  * A content filter user space agents gets a copy of the data and the data is
32  * also kept in kernel buffer until the user space agents makes a pass or drop
33  * decision. This unidirectional flow of content avoids unnecessary data copies
34  * back to the kernel.
35  *
36  * A user space filter agent opens a kernel control socket with the name
37  * CONTENT_FILTER_CONTROL_NAME to attach to the socket content filter subsystem.
38  * When connected, a "struct content_filter" is created and set as the
39  * "unitinfo" of the corresponding kernel control socket instance.
40  *
41  * The socket content filter subsystem exchanges messages with the user space
42  * filter agent until an ultimate pass or drop decision is made by the
43  * user space filter agent.
44  *
45  * It should be noted that messages about many INET/INET6 sockets can be multiplexed
46  * over a single kernel control socket.
47  *
48  * Notes:
49  * - The current implementation supports all INET/INET6 sockets (i.e. TCP,
50  *   UDP, ICMP, etc).
51  * - The current implementation supports up to two simultaneous content filters
52  *   for iOS devices and eight simultaneous content filters for OSX.
53  *
54  *
55  * NECP FILTER CONTROL UNIT
56  *
57  * A user space filter agent uses the Network Extension Control Policy (NECP)
58  * database to specify which INET/INET6 sockets need to be filtered. The NECP
59  * criteria may be based on a variety of properties like user ID or proc UUID.
60  *
61  * The NECP "filter control unit" is used by the socket content filter subsystem
62  * to deliver the relevant INET/INET6 content information to the appropriate
63  * user space filter agent via its kernel control socket instance.
64  * This works as follows:
65  *
66  * 1) The user space filter agent specifies an NECP filter control unit when
67  *    in adds its filtering rules to the NECP database.
68  *
69  * 2) The user space filter agent also sets its NECP filter control unit on the
70  *    content filter kernel control socket via the socket option
71  *    CFIL_OPT_NECP_CONTROL_UNIT.
72  *
73  * 3) The NECP database is consulted to find out if a given INET/INET6 socket
74  *    needs to be subjected to content filtering and returns the corresponding
75  *    NECP filter control unit  -- the NECP filter control unit is actually
76  *    stored in the INET/INET6 socket structure so the NECP lookup is really simple.
77  *
78  * 4) The NECP filter control unit is then used to find the corresponding
79  *    kernel control socket instance.
80  *
81  * Note: NECP currently supports a single filter control unit per INET/INET6 socket
82  *       but this restriction may be soon lifted.
83  *
84  *
85  * THE MESSAGING PROTOCOL
86  *
87  * The socket content filter subsystem and a user space filter agent
88  * communicate over the kernel control socket via an asynchronous
89  * messaging protocol (this is not a request-response protocol).
90  * The socket content filter subsystem sends event messages to the user
91  * space filter agent about the INET/INET6 sockets it is interested to filter.
92  * The user space filter agent sends action messages to either allow
93  * data to pass or to disallow the data flow (and drop the connection).
94  *
95  * All messages over a content filter kernel control socket share the same
96  * common header of type "struct cfil_msg_hdr". The message type tells if
97  * it's a event message "CFM_TYPE_EVENT" or a action message "CFM_TYPE_ACTION".
98  * The message header field "cfm_sock_id" identifies a given INET/INET6 flow.
99  * For TCP, flows are per-socket.  For UDP and other datagrame protocols, there
100  * could be multiple flows per socket.
101  *
102  * Note the message header length field may be padded for alignment and can
103  * be larger than the actual content of the message.
104  * The field "cfm_op" describe the kind of event or action.
105  *
106  * Here are the kinds of content filter events:
107  * - CFM_OP_SOCKET_ATTACHED: a new INET/INET6 socket is being filtered
108  * - CFM_OP_SOCKET_CLOSED: A INET/INET6 socket is closed
109  * - CFM_OP_DATA_OUT: A span of data is being sent on a INET/INET6 socket
110  * - CFM_OP_DATA_IN: A span of data is being or received on a INET/INET6 socket
111  *
112  *
113  * EVENT MESSAGES
114  *
115  * The CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages contains a span of
116  * data that is being sent or received. The position of this span of data
117  * in the data flow is described by a set of start and end offsets. These
118  * are absolute 64 bits offsets. The first byte sent (or received) starts
119  * at offset 0 and ends at offset 1. The length of the content data
120  * is given by the difference between the end offset and the start offset.
121  *
122  * After a CFM_OP_SOCKET_ATTACHED is delivered, CFM_OP_DATA_OUT and
123  * CFM_OP_DATA_OUT events are not delivered until a CFM_OP_DATA_UPDATE
124  * action message is sent by the user space filter agent.
125  *
126  * Note: absolute 64 bits offsets should be large enough for the foreseeable
127  * future.  A 64-bits counter will wrap after 468 years at 10 Gbit/sec:
128  *   2E64 / ((10E9 / 8) * 60 * 60 * 24 * 365.25) = 467.63
129  *
130  * They are two kinds of primary content filter actions:
131  * - CFM_OP_DATA_UPDATE: to update pass or peek offsets for each direction.
132  * - CFM_OP_DROP: to shutdown socket and disallow further data flow
133  *
134  * There is also an action to mark a given client flow as already filtered
135  * at a higher level, CFM_OP_BLESS_CLIENT.
136  *
137  *
138  * ACTION MESSAGES
139  *
140  * The CFM_OP_DATA_UPDATE action messages let the user space filter
141  * agent allow data to flow up to the specified pass offset -- there
142  * is a pass offset for outgoing data and a pass offset for incoming data.
143  * When a new INET/INET6 socket is attached to the content filter and a flow is
144  * created, each pass offset is initially set to 0 so no data is allowed to pass by
145  * default.  When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
146  * then the data flow becomes unrestricted.
147  *
148  * Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
149  * with a pass offset smaller than the pass offset of a previous
150  * CFM_OP_DATA_UPDATE message is silently ignored.
151  *
152  * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
153  * to tell the kernel how much data it wants to see by using the peek offsets.
154  * Just like pass offsets, there is a peek offset for each direction.
155  * When a new INET/INET6 flow is created, each peek offset is initially set to 0
156  * so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages are dispatched by default
157  * until a CFM_OP_DATA_UPDATE action message with a greater than 0 peek offset is sent
158  * by the user space filter agent.  When the peek offset is set to CFM_MAX_OFFSET via
159  * a CFM_OP_DATA_UPDATE then the flow of update data events becomes unrestricted.
160  *
161  * Note that peek offsets cannot be smaller than the corresponding pass offset.
162  * Also a peek offsets cannot be smaller than the corresponding end offset
163  * of the last CFM_OP_DATA_OUT/CFM_OP_DATA_IN message dispatched. Trying
164  * to set a too small peek value is silently ignored.
165  *
166  *
167  * PER FLOW "struct cfil_info"
168  *
169  * As soon as a INET/INET6 socket gets attached to a content filter, a
170  * "struct cfil_info" is created to hold the content filtering state for this
171  * socket.  For UDP and other datagram protocols, as soon as traffic is seen for
172  * each new flow identified by its 4-tuple of source address/port and destination
173  * address/port, a "struct cfil_info" is created.  Each datagram socket may
174  * have multiple flows maintained in a hash table of "struct cfil_info" entries.
175  *
176  * The content filtering state is made of the following information
177  * for each direction:
178  * - The current pass offset;
179  * - The first and last offsets of the data pending, waiting for a filtering
180  *   decision;
181  * - The inject queue for data that passed the filters and that needs
182  *   to be re-injected;
183  * - A content filter specific state in a set of  "struct cfil_entry"
184  *
185  *
186  * CONTENT FILTER STATE "struct cfil_entry"
187  *
188  * The "struct cfil_entry" maintains the information most relevant to the
189  * message handling over a kernel control socket with a user space filter agent.
190  *
191  * The "struct cfil_entry" holds the NECP filter control unit that corresponds
192  * to the kernel control socket unit it corresponds to and also has a pointer
193  * to the corresponding "struct content_filter".
194  *
195  * For each direction, "struct cfil_entry" maintains the following information:
196  * - The pass offset
197  * - The peek offset
198  * - The offset of the last data peeked at by the filter
199  * - A queue of data that's waiting to be delivered to the  user space filter
200  *   agent on the kernel control socket
201  * - A queue of data for which event messages have been sent on the kernel
202  *   control socket and are pending for a filtering decision.
203  *
204  *
205  * CONTENT FILTER QUEUES
206  *
207  * Data that is being filtered is steered away from the INET/INET6 socket buffer
208  * and instead will sit in one of three content filter queues until the data
209  * can be re-injected into the INET/INET6 socket buffer.
210  *
211  * A content filter queue is represented by "struct cfil_queue" that contains
212  * a list of mbufs and the start and end offset of the data span of
213  * the list of mbufs.
214  *
215  * The data moves into the three content filter queues according to this
216  * sequence:
217  * a) The "cfe_ctl_q" of "struct cfil_entry"
218  * b) The "cfe_pending_q" of "struct cfil_entry"
219  * c) The "cfi_inject_q" of "struct cfil_info"
220  *
221  * Note: The sequence (a),(b) may be repeated several times if there is more
222  * than one content filter attached to the INET/INET6 socket.
223  *
224  * The "cfe_ctl_q" queue holds data than cannot be delivered to the
225  * kernel conntrol socket for two reasons:
226  * - The peek offset is less that the end offset of the mbuf data
227  * - The kernel control socket is flow controlled
228  *
229  * The "cfe_pending_q" queue holds data for which CFM_OP_DATA_OUT or
230  * CFM_OP_DATA_IN have been successfully dispatched to the kernel control
231  * socket and are waiting for a pass action message fromn the user space
232  * filter agent. An mbuf length must be fully allowed to pass to be removed
233  * from the cfe_pending_q.
234  *
235  * The "cfi_inject_q" queue holds data that has been fully allowed to pass
236  * by the user space filter agent and that needs to be re-injected into the
237  * INET/INET6 socket.
238  *
239  *
240  * IMPACT ON FLOW CONTROL
241  *
242  * An essential aspect of the content filer subsystem is to minimize the
243  * impact on flow control of the INET/INET6 sockets being filtered.
244  *
245  * The processing overhead of the content filtering may have an effect on
246  * flow control by adding noticeable delays and cannot be eliminated --
247  * care must be taken by the user space filter agent to minimize the
248  * processing delays.
249  *
250  * The amount of data being filtered is kept in buffers while waiting for
251  * a decision by the user space filter agent. This amount of data pending
252  * needs to be subtracted from the amount of data available in the
253  * corresponding INET/INET6 socket buffer. This is done by modifying
254  * sbspace() and tcp_sbspace() to account for amount of data pending
255  * in the content filter.
256  *
257  *
258  * LOCKING STRATEGY
259  *
260  * The global state of content filter subsystem is protected by a single
261  * read-write lock "cfil_lck_rw". The data flow can be done with the
262  * cfil read-write lock held as shared so it can be re-entered from multiple
263  * threads.
264  *
265  * The per INET/INET6 socket content filterstate -- "struct cfil_info" -- is
266  * protected by the socket lock.
267  *
268  * A INET/INET6 socket lock cannot be taken while the cfil read-write lock
269  * is held. That's why we have some sequences where we drop the cfil read-write
270  * lock before taking the INET/INET6 lock.
271  *
272  * It is also important to lock the INET/INET6 socket buffer while the content
273  * filter is modifying the amount of pending data. Otherwise the calculations
274  * in sbspace() and tcp_sbspace()  could be wrong.
275  *
276  * The "cfil_lck_rw" protects "struct content_filter" and also the fields
277  * "cfe_link" and "cfe_filter" of "struct cfil_entry".
278  *
279  * Actually "cfe_link" and "cfe_filter" are protected by both by
280  * "cfil_lck_rw" and the socket lock: they may be modified only when
281  * "cfil_lck_rw" is exclusive and the socket is locked.
282  *
283  * To read the other fields of "struct content_filter" we have to take
284  * "cfil_lck_rw" in shared mode.
285  *
286  * DATAGRAM SPECIFICS:
287  *
288  * The socket content filter supports all INET/INET6 protocols.  However
289  * the treatments for TCP sockets and for datagram (UDP, ICMP, etc) sockets
290  * are slightly different.
291  *
292  * Each datagram socket may have multiple flows.  Each flow is identified
293  * by the flow's source address/port and destination address/port tuple
294  * and is represented as a "struct cfil_info" entry.  For each socket,
295  * a hash table is used to maintain the collection of flows under that socket.
296  *
297  * Each datagram flow is uniquely identified by it's "struct cfil_info" cfi_sock_id.
298  * The highest 32-bits of the cfi_sock_id contains the socket's so_gencnt.  This portion
299  * of the cfi_sock_id is used locate the socket during socket lookup.  The lowest 32-bits
300  * of the cfi_sock_id contains a hash of the flow's 4-tuple.  This portion of the cfi_sock_id
301  * is used as the hash value for the flow hash table lookup within the parent socket.
302  *
303  * Since datagram sockets may not be connected, flow states may not be maintained in the
304  * socket structures and thus have to be saved for each packet.  These saved states will be
305  * used for both outgoing and incoming reinjections.  For outgoing packets, destination
306  * address/port as well as the current socket states will be saved.  During reinjection,
307  * these saved states will be used instead.  For incoming packets, control and address
308  * mbufs will be chained to the data.  During reinjection, the whole chain will be queued
309  * onto the incoming socket buffer.
310  *
311  * LIMITATIONS
312  *
313  * - Support all INET/INET6 sockets, such as TCP, UDP, ICMP, etc
314  *
315  * - Does not support TCP unordered messages
316  */
317 
318 /*
319  *	TO DO LIST
320  *
321  *	Deal with OOB
322  *
323  */
324 
325 #include <sys/types.h>
326 #include <sys/kern_control.h>
327 #include <sys/queue.h>
328 #include <sys/domain.h>
329 #include <sys/protosw.h>
330 #include <sys/syslog.h>
331 #include <sys/systm.h>
332 #include <sys/param.h>
333 #include <sys/mbuf.h>
334 
335 #include <kern/locks.h>
336 #include <kern/zalloc.h>
337 #include <kern/debug.h>
338 
339 #include <net/ntstat.h>
340 #include <net/content_filter.h>
341 #include <net/content_filter_crypto.h>
342 
343 #define _IP_VHL
344 #include <netinet/ip.h>
345 #include <netinet/in_pcb.h>
346 #include <netinet/tcp.h>
347 #include <netinet/tcp_var.h>
348 #include <netinet/udp.h>
349 #include <netinet/udp_var.h>
350 #include <kern/socket_flows.h>
351 
352 #include <string.h>
353 #include <libkern/libkern.h>
354 #include <kern/sched_prim.h>
355 #include <kern/task.h>
356 #include <mach/task_info.h>
357 
358 #include <net/sockaddr_utils.h>
359 
360 #define MAX_CONTENT_FILTER 8
361 
362 extern int tcp_msl;
363 extern struct inpcbinfo ripcbinfo;
364 struct cfil_entry;
365 
366 /*
367  * The structure content_filter represents a user space content filter
368  * It's created and associated with a kernel control socket instance
369  */
370 struct content_filter {
371 	kern_ctl_ref            cf_kcref;
372 	u_int32_t               cf_kcunit;
373 	u_int32_t               cf_flags;
374 
375 	uint32_t                cf_necp_control_unit;
376 
377 	uint32_t                cf_sock_count;
378 	TAILQ_HEAD(, cfil_entry) cf_sock_entries;
379 
380 	cfil_crypto_state_t cf_crypto_state;
381 };
382 
383 #define CFF_ACTIVE              0x01
384 #define CFF_DETACHING           0x02
385 #define CFF_FLOW_CONTROLLED     0x04
386 #define CFF_PRESERVE_CONNECTIONS 0x08
387 
388 struct content_filter *content_filters[MAX_CONTENT_FILTER];
389 uint32_t cfil_active_count = 0; /* Number of active content filters */
390 uint32_t cfil_sock_attached_count = 0;  /* Number of sockets attachements */
391 uint32_t cfil_sock_attached_stats_count = 0;    /* Number of sockets requested periodic stats report */
392 uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
393 
394 static kern_ctl_ref cfil_kctlref = NULL;
395 
396 static LCK_GRP_DECLARE(cfil_lck_grp, "content filter");
397 static LCK_RW_DECLARE(cfil_lck_rw, &cfil_lck_grp);
398 
399 #define CFIL_RW_LCK_MAX 8
400 
401 int cfil_rw_nxt_lck = 0;
402 void* cfil_rw_lock_history[CFIL_RW_LCK_MAX];
403 
404 int cfil_rw_nxt_unlck = 0;
405 void* cfil_rw_unlock_history[CFIL_RW_LCK_MAX];
406 
407 static KALLOC_TYPE_DEFINE(content_filter_zone, struct content_filter, NET_KT_DEFAULT);
408 
409 MBUFQ_HEAD(cfil_mqhead);
410 
411 struct cfil_queue {
412 	uint64_t                q_start; /* offset of first byte in queue */
413 	uint64_t                q_end; /* offset of last byte in queue */
414 	struct cfil_mqhead      q_mq;
415 };
416 
417 /*
418  * struct cfil_entry
419  *
420  * The is one entry per content filter
421  */
422 struct cfil_entry {
423 	TAILQ_ENTRY(cfil_entry) cfe_link;
424 	SLIST_ENTRY(cfil_entry) cfe_order_link;
425 	struct content_filter   *cfe_filter;
426 
427 	struct cfil_info        *cfe_cfil_info;
428 	uint32_t                cfe_flags;
429 	uint32_t                cfe_necp_control_unit;
430 	struct timeval          cfe_last_event; /* To user space */
431 	struct timeval          cfe_last_action; /* From user space */
432 	uint64_t                cfe_byte_inbound_count_reported; /* stats already been reported */
433 	uint64_t                cfe_byte_outbound_count_reported; /* stats already been reported */
434 	struct timeval          cfe_stats_report_ts; /* Timestamp for last stats report */
435 	uint32_t                cfe_stats_report_frequency; /* Interval for stats report in msecs */
436 	boolean_t               cfe_laddr_sent;
437 
438 	struct cfe_buf {
439 		/*
440 		 * cfe_pending_q holds data that has been delivered to
441 		 * the filter and for which we are waiting for an action
442 		 */
443 		struct cfil_queue       cfe_pending_q;
444 		/*
445 		 * This queue is for data that has not be delivered to
446 		 * the content filter (new data, pass peek or flow control)
447 		 */
448 		struct cfil_queue       cfe_ctl_q;
449 
450 		uint64_t                cfe_pass_offset;
451 		uint64_t                cfe_peek_offset;
452 		uint64_t                cfe_peeked;
453 	} cfe_snd, cfe_rcv;
454 };
455 
456 #define CFEF_CFIL_ATTACHED              0x0001  /* was attached to filter */
457 #define CFEF_SENT_SOCK_ATTACHED         0x0002  /* sock attach event was sent */
458 #define CFEF_DATA_START                 0x0004  /* can send data event */
459 #define CFEF_FLOW_CONTROLLED            0x0008  /* wait for flow control lift */
460 #define CFEF_SENT_DISCONNECT_IN         0x0010  /* event was sent */
461 #define CFEF_SENT_DISCONNECT_OUT        0x0020  /* event was sent */
462 #define CFEF_SENT_SOCK_CLOSED           0x0040  /* closed event was sent */
463 #define CFEF_CFIL_DETACHED              0x0080  /* filter was detached */
464 
465 
466 #define CFI_ADD_TIME_LOG(cfil, t1, t0, op)                                                                                      \
467 	        struct timeval64 _tdiff;                                                                                          \
468 	        size_t offset = (cfil)->cfi_op_list_ctr;                                                                        \
469 	        if (offset < CFI_MAX_TIME_LOG_ENTRY) {                                                                          \
470 	                timersub(t1, t0, &_tdiff);                                                                              \
471 	                (cfil)->cfi_op_time[offset] = (uint32_t)(_tdiff.tv_sec * 1000 + _tdiff.tv_usec / 1000);                 \
472 	                (cfil)->cfi_op_list[offset] = (unsigned char)op;                                                        \
473 	                (cfil)->cfi_op_list_ctr ++;                                                                             \
474 	        }
475 
476 /*
477  * struct cfil_info
478  *
479  * There is a struct cfil_info per socket
480  */
481 struct cfil_info {
482 	TAILQ_ENTRY(cfil_info)  cfi_link;
483 	TAILQ_ENTRY(cfil_info)  cfi_link_stats;
484 	struct socket           *cfi_so;
485 	uint64_t                cfi_flags;
486 	uint64_t                cfi_sock_id;
487 	struct timeval64        cfi_first_event;
488 	uint32_t                cfi_op_list_ctr;
489 	uint32_t                cfi_op_time[CFI_MAX_TIME_LOG_ENTRY];    /* time interval in microseconds since first event */
490 	unsigned char           cfi_op_list[CFI_MAX_TIME_LOG_ENTRY];
491 	union sockaddr_in_4_6   cfi_so_attach_faddr;                    /* faddr at the time of attach */
492 	union sockaddr_in_4_6   cfi_so_attach_laddr;                    /* laddr at the time of attach */
493 
494 	int                     cfi_dir;
495 	uint64_t                cfi_byte_inbound_count;
496 	uint64_t                cfi_byte_outbound_count;
497 
498 	boolean_t               cfi_isSignatureLatest;                  /* Indicates if signature covers latest flow attributes */
499 	u_int32_t               cfi_filter_control_unit;
500 	u_int32_t               cfi_filter_policy_gencount;
501 	u_int32_t               cfi_debug;
502 	struct cfi_buf {
503 		/*
504 		 * cfi_pending_first and cfi_pending_last describe the total
505 		 * amount of data outstanding for all the filters on
506 		 * this socket and data in the flow queue
507 		 * cfi_pending_mbcnt counts in sballoc() "chars of mbufs used"
508 		 */
509 		uint64_t                cfi_pending_first;
510 		uint64_t                cfi_pending_last;
511 		uint32_t                cfi_pending_mbcnt;
512 		uint32_t                cfi_pending_mbnum;
513 		uint32_t                cfi_tail_drop_cnt;
514 		/*
515 		 * cfi_pass_offset is the minimum of all the filters
516 		 */
517 		uint64_t                cfi_pass_offset;
518 		/*
519 		 * cfi_inject_q holds data that needs to be re-injected
520 		 * into the socket after filtering and that can
521 		 * be queued because of flow control
522 		 */
523 		struct cfil_queue       cfi_inject_q;
524 	} cfi_snd, cfi_rcv;
525 
526 	struct cfil_entry       cfi_entries[MAX_CONTENT_FILTER];
527 	struct soflow_hash_entry *cfi_hash_entry;
528 	SLIST_HEAD(, cfil_entry) cfi_ordered_entries;
529 	os_refcnt_t             cfi_ref_count;
530 } __attribute__((aligned(8)));
531 
532 #define CFIF_DROP                       0x0001  /* drop action applied */
533 #define CFIF_CLOSE_WAIT                 0x0002  /* waiting for filter to close */
534 #define CFIF_SOCK_CLOSED                0x0004  /* socket is closed */
535 #define CFIF_RETRY_INJECT_IN            0x0010  /* inject in failed */
536 #define CFIF_RETRY_INJECT_OUT           0x0020  /* inject out failed */
537 #define CFIF_SHUT_WR                    0x0040  /* shutdown write */
538 #define CFIF_SHUT_RD                    0x0080  /* shutdown read */
539 #define CFIF_SOCKET_CONNECTED           0x0100  /* socket is connected */
540 #define CFIF_INITIAL_VERDICT            0x0200  /* received initial verdict */
541 #define CFIF_NO_CLOSE_WAIT              0x0400  /* do not wait to close */
542 #define CFIF_SO_DELAYED_DEAD            0x0800  /* Delayed socket DEAD marking */
543 #define CFIF_SO_DELAYED_TCP_TIME_WAIT   0x1000  /* Delayed TCP FIN TIME WAIT */
544 
545 #define CFI_MASK_GENCNT         0xFFFFFFFF00000000      /* upper 32 bits */
546 #define CFI_SHIFT_GENCNT        32
547 #define CFI_MASK_FLOWHASH       0x00000000FFFFFFFF      /* lower 32 bits */
548 #define CFI_SHIFT_FLOWHASH      0
549 
550 #define CFI_ENTRY_KCUNIT(i, e) ((uint32_t)(((e) - &((i)->cfi_entries[0])) + 1))
551 
552 static KALLOC_TYPE_DEFINE(cfil_info_zone, struct cfil_info, NET_KT_DEFAULT);
553 
554 TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
555 TAILQ_HEAD(cfil_sock_head_stats, cfil_info) cfil_sock_head_stats;
556 
557 #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
558 #define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
559 
560 /*
561  * UDP Socket Support
562  */
563 #define IS_ICMP(so) (so && (SOCK_CHECK_TYPE(so, SOCK_RAW) || SOCK_CHECK_TYPE(so, SOCK_DGRAM)) && \
564 	                                   (SOCK_CHECK_PROTO(so, IPPROTO_ICMP) || SOCK_CHECK_PROTO(so, IPPROTO_ICMPV6)))
565 #define IS_RAW(so)  (so && SOCK_CHECK_TYPE(so, SOCK_RAW) && SOCK_CHECK_PROTO(so, IPPROTO_RAW))
566 
567 #define OPTIONAL_IP_HEADER(so) (!IS_TCP(so) && !IS_UDP(so))
568 #define GET_SO_PROTOCOL(so) (so ? SOCK_PROTO(so) : IPPROTO_IP)
569 #define GET_SO_INP_PROTOCOL(so) ((so && sotoinpcb(so)) ? sotoinpcb(so)->inp_ip_p : IPPROTO_IP)
570 #define GET_SO_PROTO(so) ((GET_SO_PROTOCOL(so) != IPPROTO_IP) ? GET_SO_PROTOCOL(so) : GET_SO_INP_PROTOCOL(so))
571 #define IS_INP_V6(inp) (inp && (inp->inp_vflag & INP_IPV6))
572 
573 #define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \
574 	                                                          ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))))
575 #define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \
576 	                                                                                  cfil_info->cfi_entries[kcunit - 1].cfe_filter != NULL)
577 #define IS_DNS(local, remote) (check_port(local, 53) || check_port(remote, 53) || check_port(local, 5353) || check_port(remote, 5353))
578 #define IS_INITIAL_TFO_DATA(so) (so && (so->so_flags1 & SOF1_PRECONNECT_DATA) && (so->so_state & SS_ISCONNECTING))
579 #define NULLADDRESS(addr) ((addr.sa.sa_len == 0) || \
580 	                   (addr.sa.sa_family == AF_INET && addr.sin.sin_addr.s_addr == 0) || \
581 	                   (addr.sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&addr.sin6.sin6_addr)))
582 
583 #define SKIP_FILTER_FOR_TCP_SOCKET(so) \
584     (so == NULL || \
585      (!SOCK_CHECK_DOM(so, PF_INET) && !SOCK_CHECK_DOM(so, PF_INET6)) || \
586       !SOCK_CHECK_TYPE(so, SOCK_STREAM) || \
587       !SOCK_CHECK_PROTO(so, IPPROTO_TCP) || \
588       (so->so_flags & SOF_MP_SUBFLOW) != 0 || \
589       (so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) != 0)
590 
591 /*
592  * Special handling for 0.0.0.0-faddr TCP flows.  This flows will be changed to loopback addr by TCP and
593  * may result in an immediate TCP RESET and socket close.  This leads to CFIL blocking the owner thread for
594  * 1 sec waiting for ack from user-space provider (ack recevied by CFIL but socket already removed from
595  * global socket list).  To avoid this, identify these flows and do not perform the close-wait blocking.
596  * These flows are identified as destined to Loopback address and were disconnected shortly after connect
597  * (before initial-verdict received).
598  */
599 #define IS_LOOPBACK_FADDR(inp) \
600     (inp && ((IS_INP_V6(inp) && IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr)) || (ntohl(inp->inp_faddr.s_addr) == INADDR_LOOPBACK)))
601 
602 #define SET_NO_CLOSE_WAIT(inp, cfil_info) \
603     if (inp && cfil_info && !(cfil_info->cfi_flags & CFIF_INITIAL_VERDICT) && IS_LOOPBACK_FADDR(inp)) { \
604 	cfil_info->cfi_flags |= CFIF_NO_CLOSE_WAIT; \
605     }
606 
607 #define IS_NO_CLOSE_WAIT(cfil_info) (cfil_info && (cfil_info->cfi_flags & CFIF_NO_CLOSE_WAIT))
608 
609 os_refgrp_decl(static, cfil_refgrp, "CFILRefGroup", NULL);
610 
611 #define CFIL_INFO_FREE(cfil_info) \
612     if (cfil_info && (os_ref_release(&cfil_info->cfi_ref_count) == 0)) { \
613 	cfil_info_free(cfil_info); \
614     }
615 
616 #define SOCKET_PID(so) ((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid)
617 #define MATCH_PID(so) (so && (cfil_log_pid == SOCKET_PID(so)))
618 #define MATCH_PORT(inp, local, remote) \
619     ((inp && ntohs(inp->inp_lport) == cfil_log_port) || (inp && ntohs(inp->inp_fport) == cfil_log_port) || \
620 	check_port(local, cfil_log_port) || check_port(remote, cfil_log_port))
621 #define MATCH_PROTO(so) (GET_SO_PROTO(so) == cfil_log_proto)
622 
623 #define DEBUG_FLOW(inp, so, local, remote) \
624     ((cfil_log_port && MATCH_PORT(inp, local, remote)) || (cfil_log_pid && MATCH_PID(so)) || (cfil_log_proto && MATCH_PROTO(so)))
625 
626 #define SO_DELAYED_DEAD_SET(so, set) \
627     if (so->so_cfil) { \
628 	if (set) { \
629 	    so->so_cfil->cfi_flags |= CFIF_SO_DELAYED_DEAD; \
630 	} else { \
631 	    so->so_cfil->cfi_flags &= ~CFIF_SO_DELAYED_DEAD; \
632 	} \
633     } else if (so->so_flow_db) { \
634 	if (set) { \
635 	    so->so_flow_db->soflow_db_flags |= SOFLOWF_SO_DELAYED_DEAD; \
636 	} else { \
637 	    so->so_flow_db->soflow_db_flags &= ~SOFLOWF_SO_DELAYED_DEAD; \
638 	} \
639     }
640 
641 #define SO_DELAYED_DEAD_GET(so) \
642     (so->so_cfil ? (so->so_cfil->cfi_flags & CFIF_SO_DELAYED_DEAD) : \
643 	            (so->so_flow_db) ? (so->so_flow_db->soflow_db_flags & SOFLOWF_SO_DELAYED_DEAD) : false)
644 
645 #define SO_DELAYED_TCP_TIME_WAIT_SET(so, set) \
646     if (so->so_cfil) { \
647     if (set) { \
648        so->so_cfil->cfi_flags |= CFIF_SO_DELAYED_TCP_TIME_WAIT; \
649     } else { \
650        so->so_cfil->cfi_flags &= ~CFIF_SO_DELAYED_TCP_TIME_WAIT; \
651     } \
652     }
653 
654 #define SO_DELAYED_TCP_TIME_WAIT_GET(so) \
655     (so->so_cfil ? (so->so_cfil->cfi_flags & CFIF_SO_DELAYED_TCP_TIME_WAIT) : false)
656 
657 /*
658  * Periodic Statistics Report:
659  */
660 static struct thread *cfil_stats_report_thread;
661 #define CFIL_STATS_REPORT_INTERVAL_MIN_MSEC  500   // Highest report frequency
662 #define CFIL_STATS_REPORT_RUN_INTERVAL_NSEC  (CFIL_STATS_REPORT_INTERVAL_MIN_MSEC * NSEC_PER_MSEC)
663 #define CFIL_STATS_REPORT_MAX_COUNT          50    // Max stats to be reported per run
664 
665 /* This buffer must have same layout as struct cfil_msg_stats_report */
666 struct cfil_stats_report_buffer {
667 	struct cfil_msg_hdr        msghdr;
668 	uint32_t                   count;
669 	struct cfil_msg_sock_stats stats[CFIL_STATS_REPORT_MAX_COUNT];
670 };
671 static struct cfil_stats_report_buffer *global_cfil_stats_report_buffers[MAX_CONTENT_FILTER];
672 static uint32_t global_cfil_stats_counts[MAX_CONTENT_FILTER];
673 
674 /*
675  * UDP Garbage Collection:
676  */
677 #define UDP_FLOW_GC_ACTION_TO        10  // Flow Action Timeout (no action from user space) in seconds
678 #define UDP_FLOW_GC_MAX_COUNT        100 // Max UDP flows to be handled per run
679 
680 /*
681  * UDP flow queue thresholds
682  */
683 #define UDP_FLOW_GC_MBUF_CNT_MAX  (2 << MBSHIFT) // Max mbuf byte count in flow queue (2MB)
684 #define UDP_FLOW_GC_MBUF_NUM_MAX  (UDP_FLOW_GC_MBUF_CNT_MAX >> MCLSHIFT) // Max mbuf count in flow queue (1K)
685 #define UDP_FLOW_GC_MBUF_SHIFT    5             // Shift to get 1/32 of platform limits
686 /*
687  * UDP flow queue threshold globals:
688  */
689 static unsigned int cfil_udp_gc_mbuf_num_max = UDP_FLOW_GC_MBUF_NUM_MAX;
690 static unsigned int cfil_udp_gc_mbuf_cnt_max = UDP_FLOW_GC_MBUF_CNT_MAX;
691 
692 /*
693  * CFIL specific mbuf tag:
694  * Save state of socket at the point of data entry into cfil.
695  * Use saved state for reinjection at protocol layer.
696  */
697 struct cfil_tag {
698 	union sockaddr_in_4_6 cfil_faddr;
699 	uint32_t cfil_so_state_change_cnt;
700 	uint32_t cfil_so_options;
701 	int cfil_inp_flags;
702 };
703 
704 /*
705  * Global behavior flags:
706  */
707 #define CFIL_BEHAVIOR_FLAG_PRESERVE_CONNECTIONS 0x00000001
708 static uint32_t cfil_behavior_flags = 0;
709 
710 #define DO_PRESERVE_CONNECTIONS (cfil_behavior_flags & CFIL_BEHAVIOR_FLAG_PRESERVE_CONNECTIONS)
711 
712 /*
713  * Statistics
714  */
715 
716 struct cfil_stats cfil_stats;
717 
718 /*
719  * For troubleshooting
720  */
721 int cfil_log_level = LOG_ERR;
722 int cfil_log_port = 0;
723 int cfil_log_pid = 0;
724 int cfil_log_proto = 0;
725 int cfil_log_data = 0;
726 int cfil_log_stats = 0;
727 int cfil_debug = 1;
728 
729 /*
730  * Sysctls for logs and statistics
731  */
732 static int sysctl_cfil_filter_list(struct sysctl_oid *, void *, int,
733     struct sysctl_req *);
734 static int sysctl_cfil_sock_list(struct sysctl_oid *, void *, int,
735     struct sysctl_req *);
736 
737 SYSCTL_NODE(_net, OID_AUTO, cfil, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "cfil");
738 
739 SYSCTL_INT(_net_cfil, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
740     &cfil_log_level, 0, "");
741 
742 SYSCTL_INT(_net_cfil, OID_AUTO, log_port, CTLFLAG_RW | CTLFLAG_LOCKED,
743     &cfil_log_port, 0, "");
744 
745 SYSCTL_INT(_net_cfil, OID_AUTO, log_pid, CTLFLAG_RW | CTLFLAG_LOCKED,
746     &cfil_log_pid, 0, "");
747 
748 SYSCTL_INT(_net_cfil, OID_AUTO, log_proto, CTLFLAG_RW | CTLFLAG_LOCKED,
749     &cfil_log_proto, 0, "");
750 
751 SYSCTL_INT(_net_cfil, OID_AUTO, log_data, CTLFLAG_RW | CTLFLAG_LOCKED,
752     &cfil_log_data, 0, "");
753 
754 SYSCTL_INT(_net_cfil, OID_AUTO, log_stats, CTLFLAG_RW | CTLFLAG_LOCKED,
755     &cfil_log_stats, 0, "");
756 
757 SYSCTL_INT(_net_cfil, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
758     &cfil_debug, 0, "");
759 
760 SYSCTL_UINT(_net_cfil, OID_AUTO, sock_attached_count, CTLFLAG_RD | CTLFLAG_LOCKED,
761     &cfil_sock_attached_count, 0, "");
762 
763 SYSCTL_UINT(_net_cfil, OID_AUTO, active_count, CTLFLAG_RD | CTLFLAG_LOCKED,
764     &cfil_active_count, 0, "");
765 
766 SYSCTL_UINT(_net_cfil, OID_AUTO, close_wait_timeout, CTLFLAG_RW | CTLFLAG_LOCKED,
767     &cfil_close_wait_timeout, 0, "");
768 
769 SYSCTL_UINT(_net_cfil, OID_AUTO, behavior_flags, CTLFLAG_RW | CTLFLAG_LOCKED,
770     &cfil_behavior_flags, 0, "");
771 
772 static int cfil_sbtrim = 1;
773 SYSCTL_UINT(_net_cfil, OID_AUTO, sbtrim, CTLFLAG_RW | CTLFLAG_LOCKED,
774     &cfil_sbtrim, 0, "");
775 
776 SYSCTL_PROC(_net_cfil, OID_AUTO, filter_list, CTLFLAG_RD | CTLFLAG_LOCKED,
777     0, 0, sysctl_cfil_filter_list, "S,cfil_filter_stat", "");
778 
779 SYSCTL_PROC(_net_cfil, OID_AUTO, sock_list, CTLFLAG_RD | CTLFLAG_LOCKED,
780     0, 0, sysctl_cfil_sock_list, "S,cfil_sock_stat", "");
781 
782 SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_LOCKED,
783     &cfil_stats, cfil_stats, "");
784 
785 /*
786  * Forward declaration to appease the compiler
787  */
788 static int cfil_action_data_pass(struct socket *, struct cfil_info *, uint32_t, int,
789     uint64_t, uint64_t);
790 static int cfil_action_drop(struct socket *, struct cfil_info *, uint32_t);
791 static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *);
792 static int cfil_action_set_crypto_key(uint32_t, struct cfil_msg_hdr *);
793 static int cfil_dispatch_closed_event(struct socket *, struct cfil_info *, int);
794 static int cfil_data_common(struct socket *, struct cfil_info *, int, struct sockaddr *,
795     struct mbuf *, struct mbuf *, uint32_t);
796 static int cfil_data_filter(struct socket *, struct cfil_info *, uint32_t, int,
797     struct mbuf *, uint32_t);
798 static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
799     struct in_addr, u_int16_t);
800 static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
801     struct in6_addr *, u_int16_t, uint32_t);
802 
803 static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t, int);
804 static void cfil_info_free(struct cfil_info *);
805 static struct cfil_info * cfil_info_alloc(struct socket *, struct soflow_hash_entry *);
806 static int cfil_info_attach_unit(struct socket *, uint32_t, struct cfil_info *);
807 static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t, bool);
808 static struct socket * cfil_socket_from_client_uuid(uuid_t, bool *);
809 static int cfil_service_pending_queue(struct socket *, struct cfil_info *, uint32_t, int);
810 static int cfil_data_service_ctl_q(struct socket *, struct cfil_info *, uint32_t, int);
811 static void cfil_info_verify(struct cfil_info *);
812 static int cfil_update_data_offsets(struct socket *, struct cfil_info *, uint32_t, int,
813     uint64_t, uint64_t);
814 static int cfil_acquire_sockbuf(struct socket *, struct cfil_info *, int);
815 static void cfil_release_sockbuf(struct socket *, int);
816 static int cfil_filters_attached(struct socket *);
817 
818 static void cfil_rw_lock_exclusive(lck_rw_t *);
819 static void cfil_rw_unlock_exclusive(lck_rw_t *);
820 static void cfil_rw_lock_shared(lck_rw_t *);
821 static void cfil_rw_unlock_shared(lck_rw_t *);
822 static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
823 static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
824 
825 static unsigned int cfil_data_length(struct mbuf *, int *, int *);
826 static struct cfil_info *cfil_sock_udp_get_info(struct socket *, uint32_t, bool, struct soflow_hash_entry *, struct sockaddr *, struct sockaddr *);
827 static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *,
828     struct mbuf *, struct mbuf *, uint32_t, struct soflow_hash_entry *);
829 static int32_t cfil_sock_udp_data_pending(struct sockbuf *, bool);
830 static void cfil_sock_udp_is_closed(struct socket *);
831 static int cfil_sock_udp_notify_shutdown(struct socket *, int, int, int);
832 static int cfil_sock_udp_shutdown(struct socket *, int *);
833 static void cfil_sock_udp_close_wait(struct socket *);
834 static void cfil_sock_udp_buf_update(struct sockbuf *);
835 static int cfil_filters_udp_attached(struct socket *, bool);
836 static void cfil_get_flow_address_v6(struct soflow_hash_entry *, struct inpcb *,
837     struct in6_addr **, struct in6_addr **,
838     u_int16_t *, u_int16_t *);
839 static void cfil_get_flow_address(struct soflow_hash_entry *, struct inpcb *,
840     struct in_addr *, struct in_addr *,
841     u_int16_t *, u_int16_t *);
842 static void cfil_info_log(int, struct cfil_info *, const char *);
843 void cfil_filter_show(u_int32_t);
844 void cfil_info_show(void);
845 bool cfil_info_action_timed_out(struct cfil_info *, int);
846 bool cfil_info_buffer_threshold_exceeded(struct cfil_info *);
847 struct m_tag *cfil_dgram_save_socket_state(struct cfil_info *, struct mbuf *);
848 boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags);
849 static void cfil_sock_received_verdict(struct socket *so);
850 static void cfil_fill_event_msg_addresses(struct soflow_hash_entry *, struct inpcb *,
851     union sockaddr_in_4_6 *, union sockaddr_in_4_6 *,
852     boolean_t, boolean_t);
853 static void cfil_stats_report_thread_func(void *, wait_result_t);
854 static void cfil_stats_report(void *v, wait_result_t w);
855 static bool cfil_dgram_gc_needed(struct socket *, struct soflow_hash_entry *, u_int64_t);
856 static bool cfil_dgram_gc_perform(struct socket *, struct soflow_hash_entry *);
857 static bool cfil_dgram_detach_entry(struct socket *, struct soflow_hash_entry *);
858 static bool cfil_dgram_detach_db(struct socket *, struct soflow_db *);
859 bool check_port(struct sockaddr *, u_short);
860 
861 /*
862  * Content filter global read write lock
863  */
864 
865 static void
cfil_rw_lock_exclusive(lck_rw_t * lck)866 cfil_rw_lock_exclusive(lck_rw_t *lck)
867 {
868 	void * __single lr_saved;
869 
870 	lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
871 
872 	lck_rw_lock_exclusive(lck);
873 
874 	cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
875 	cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
876 }
877 
878 static void
cfil_rw_unlock_exclusive(lck_rw_t * lck)879 cfil_rw_unlock_exclusive(lck_rw_t *lck)
880 {
881 	void * __single lr_saved;
882 
883 	lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
884 
885 	lck_rw_unlock_exclusive(lck);
886 
887 	cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
888 	cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
889 }
890 
891 static void
cfil_rw_lock_shared(lck_rw_t * lck)892 cfil_rw_lock_shared(lck_rw_t *lck)
893 {
894 	void * __single lr_saved;
895 
896 	lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
897 
898 	lck_rw_lock_shared(lck);
899 
900 	cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
901 	cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
902 }
903 
904 static void
cfil_rw_unlock_shared(lck_rw_t * lck)905 cfil_rw_unlock_shared(lck_rw_t *lck)
906 {
907 	void * __single lr_saved;
908 
909 	lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
910 
911 	lck_rw_unlock_shared(lck);
912 
913 	cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
914 	cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
915 }
916 
917 static boolean_t
cfil_rw_lock_shared_to_exclusive(lck_rw_t * lck)918 cfil_rw_lock_shared_to_exclusive(lck_rw_t *lck)
919 {
920 	boolean_t upgraded;
921 	void * __single lr_saved;
922 
923 	lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
924 
925 	upgraded = lck_rw_lock_shared_to_exclusive(lck);
926 	if (upgraded) {
927 		cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
928 		cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
929 	}
930 	return upgraded;
931 }
932 
933 static void
cfil_rw_lock_exclusive_to_shared(lck_rw_t * lck)934 cfil_rw_lock_exclusive_to_shared(lck_rw_t *lck)
935 {
936 	void * __single lr_saved;
937 
938 	lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
939 
940 	lck_rw_lock_exclusive_to_shared(lck);
941 
942 	cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
943 	cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
944 }
945 
946 static void
cfil_rw_lock_assert_held(lck_rw_t * lck,int exclusive)947 cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
948 {
949 #if !MACH_ASSERT
950 #pragma unused(lck, exclusive)
951 #endif
952 	LCK_RW_ASSERT(lck,
953 	    exclusive ? LCK_RW_ASSERT_EXCLUSIVE : LCK_RW_ASSERT_HELD);
954 }
955 
956 /*
957  * Return the number of bytes in the mbuf chain using the same
958  * method as m_length() or sballoc()
959  *
960  * Returns data len - starting from PKT start
961  * - retmbcnt - optional param to get total mbuf bytes in chain
962  * - retmbnum - optional param to get number of mbufs in chain
963  */
964 static unsigned int
cfil_data_length(struct mbuf * m,int * retmbcnt,int * retmbnum)965 cfil_data_length(struct mbuf *m, int *retmbcnt, int *retmbnum)
966 {
967 	struct mbuf *m0;
968 	unsigned int pktlen = 0;
969 	int mbcnt;
970 	int mbnum;
971 
972 	// Locate M_PKTHDR and mark as start of data if present
973 	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
974 		if (m0->m_flags & M_PKTHDR) {
975 			m = m0;
976 			break;
977 		}
978 	}
979 
980 	if (retmbcnt == NULL && retmbnum == NULL) {
981 		return m_length(m);
982 	}
983 
984 	pktlen = 0;
985 	mbcnt = 0;
986 	mbnum = 0;
987 	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
988 		pktlen += m0->m_len;
989 		mbnum++;
990 		mbcnt += _MSIZE;
991 		if (m0->m_flags & M_EXT) {
992 			mbcnt += m0->m_ext.ext_size;
993 		}
994 	}
995 	if (retmbcnt) {
996 		*retmbcnt = mbcnt;
997 	}
998 	if (retmbnum) {
999 		*retmbnum = mbnum;
1000 	}
1001 	return pktlen;
1002 }
1003 
1004 static struct mbuf *
cfil_data_start(struct mbuf * m)1005 cfil_data_start(struct mbuf *m)
1006 {
1007 	struct mbuf *m0;
1008 
1009 	// Locate M_PKTHDR and use it as start of data if present
1010 	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
1011 		if (m0->m_flags & M_PKTHDR) {
1012 			return m0;
1013 		}
1014 	}
1015 	return m;
1016 }
1017 
1018 /*
1019  * Common mbuf queue utilities
1020  */
1021 
1022 static inline void
cfil_queue_init(struct cfil_queue * cfq)1023 cfil_queue_init(struct cfil_queue *cfq)
1024 {
1025 	cfq->q_start = 0;
1026 	cfq->q_end = 0;
1027 	MBUFQ_INIT(&cfq->q_mq);
1028 }
1029 
1030 static inline uint64_t
cfil_queue_drain(struct cfil_queue * cfq)1031 cfil_queue_drain(struct cfil_queue *cfq)
1032 {
1033 	uint64_t drained = cfq->q_start - cfq->q_end;
1034 	cfq->q_start = 0;
1035 	cfq->q_end = 0;
1036 	MBUFQ_DRAIN(&cfq->q_mq);
1037 
1038 	return drained;
1039 }
1040 
1041 /* Return 1 when empty, 0 otherwise */
1042 static inline int
cfil_queue_empty(struct cfil_queue * cfq)1043 cfil_queue_empty(struct cfil_queue *cfq)
1044 {
1045 	return MBUFQ_EMPTY(&cfq->q_mq);
1046 }
1047 
1048 static inline uint64_t
cfil_queue_offset_first(struct cfil_queue * cfq)1049 cfil_queue_offset_first(struct cfil_queue *cfq)
1050 {
1051 	return cfq->q_start;
1052 }
1053 
1054 static inline uint64_t
cfil_queue_offset_last(struct cfil_queue * cfq)1055 cfil_queue_offset_last(struct cfil_queue *cfq)
1056 {
1057 	return cfq->q_end;
1058 }
1059 
1060 static inline uint64_t
cfil_queue_len(struct cfil_queue * cfq)1061 cfil_queue_len(struct cfil_queue *cfq)
1062 {
1063 	return cfq->q_end - cfq->q_start;
1064 }
1065 
1066 /*
1067  * Routines to verify some fundamental assumptions
1068  */
1069 
1070 static void
cfil_queue_verify(struct cfil_queue * cfq)1071 cfil_queue_verify(struct cfil_queue *cfq)
1072 {
1073 	mbuf_t chain;
1074 	mbuf_t m;
1075 	mbuf_t n;
1076 	uint64_t queuesize = 0;
1077 
1078 	/* Verify offset are ordered */
1079 	VERIFY(cfq->q_start <= cfq->q_end);
1080 
1081 	/*
1082 	 * When queue is empty, the offsets are equal otherwise the offsets
1083 	 * are different
1084 	 */
1085 	VERIFY((MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start == cfq->q_end) ||
1086 	    (!MBUFQ_EMPTY(&cfq->q_mq) &&
1087 	    cfq->q_start != cfq->q_end));
1088 
1089 	MBUFQ_FOREACH(chain, &cfq->q_mq) {
1090 		size_t chainsize = 0;
1091 		m = chain;
1092 		unsigned int mlen = cfil_data_length(m, NULL, NULL);
1093 		// skip the addr and control stuff if present
1094 		m = cfil_data_start(m);
1095 
1096 		if (m == NULL ||
1097 		    m == (void *)M_TAG_FREE_PATTERN ||
1098 		    m->m_next == (void *)M_TAG_FREE_PATTERN ||
1099 		    m->m_nextpkt == (void *)M_TAG_FREE_PATTERN) {
1100 			panic("%s - mq %p is free at %p", __func__,
1101 			    &cfq->q_mq, m);
1102 		}
1103 		for (n = m; n != NULL; n = n->m_next) {
1104 			if (!m_has_mtype(n, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
1105 				panic("%s - %p unsupported type %u", __func__,
1106 				    n, n->m_type);
1107 			}
1108 			chainsize += n->m_len;
1109 		}
1110 		if (mlen != chainsize) {
1111 			panic("%s - %p m_length() %u != chainsize %lu",
1112 			    __func__, m, mlen, chainsize);
1113 		}
1114 		queuesize += chainsize;
1115 	}
1116 	OS_ANALYZER_SUPPRESS("81031590") if (queuesize != cfq->q_end - cfq->q_start) {
1117 		panic("%s - %p queuesize %llu != offsetdiffs %llu", __func__,
1118 		    m, queuesize, cfq->q_end - cfq->q_start);
1119 	}
1120 }
1121 
1122 static void
cfil_queue_enqueue(struct cfil_queue * cfq,mbuf_t m,size_t len)1123 cfil_queue_enqueue(struct cfil_queue *cfq, mbuf_t m, size_t len)
1124 {
1125 	CFIL_QUEUE_VERIFY(cfq);
1126 
1127 	MBUFQ_ENQUEUE(&cfq->q_mq, m);
1128 	cfq->q_end += len;
1129 
1130 	CFIL_QUEUE_VERIFY(cfq);
1131 }
1132 
1133 static void
cfil_queue_remove(struct cfil_queue * cfq,mbuf_t m,size_t len)1134 cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
1135 {
1136 	CFIL_QUEUE_VERIFY(cfq);
1137 
1138 	VERIFY(cfil_data_length(m, NULL, NULL) == len);
1139 
1140 	MBUFQ_REMOVE(&cfq->q_mq, m);
1141 	MBUFQ_NEXT(m) = NULL;
1142 	cfq->q_start += len;
1143 
1144 	CFIL_QUEUE_VERIFY(cfq);
1145 }
1146 
1147 static mbuf_t
cfil_queue_first(struct cfil_queue * cfq)1148 cfil_queue_first(struct cfil_queue *cfq)
1149 {
1150 	return MBUFQ_FIRST(&cfq->q_mq);
1151 }
1152 
1153 static mbuf_t
cfil_queue_next(struct cfil_queue * cfq,mbuf_t m)1154 cfil_queue_next(struct cfil_queue *cfq, mbuf_t m)
1155 {
1156 #pragma unused(cfq)
1157 	return MBUFQ_NEXT(m);
1158 }
1159 
1160 static void
cfil_entry_buf_verify(struct cfe_buf * cfe_buf)1161 cfil_entry_buf_verify(struct cfe_buf *cfe_buf)
1162 {
1163 	CFIL_QUEUE_VERIFY(&cfe_buf->cfe_ctl_q);
1164 	CFIL_QUEUE_VERIFY(&cfe_buf->cfe_pending_q);
1165 
1166 	/* Verify the queues are ordered so that pending is before ctl */
1167 	VERIFY(cfe_buf->cfe_ctl_q.q_start >= cfe_buf->cfe_pending_q.q_end);
1168 
1169 	/* The peek offset cannot be less than the pass offset */
1170 	VERIFY(cfe_buf->cfe_peek_offset >= cfe_buf->cfe_pass_offset);
1171 
1172 	/* Make sure we've updated the offset we peeked at  */
1173 	VERIFY(cfe_buf->cfe_ctl_q.q_start <= cfe_buf->cfe_peeked);
1174 }
1175 
1176 static void
cfil_entry_verify(struct cfil_entry * entry)1177 cfil_entry_verify(struct cfil_entry *entry)
1178 {
1179 	cfil_entry_buf_verify(&entry->cfe_snd);
1180 	cfil_entry_buf_verify(&entry->cfe_rcv);
1181 }
1182 
1183 static void
cfil_info_buf_verify(struct cfi_buf * cfi_buf)1184 cfil_info_buf_verify(struct cfi_buf *cfi_buf)
1185 {
1186 	CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
1187 
1188 	VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
1189 }
1190 
1191 static void
cfil_info_verify(struct cfil_info * cfil_info)1192 cfil_info_verify(struct cfil_info *cfil_info)
1193 {
1194 	int i;
1195 
1196 	if (cfil_info == NULL) {
1197 		return;
1198 	}
1199 
1200 	cfil_info_buf_verify(&cfil_info->cfi_snd);
1201 	cfil_info_buf_verify(&cfil_info->cfi_rcv);
1202 
1203 	for (i = 0; i < MAX_CONTENT_FILTER; i++) {
1204 		cfil_entry_verify(&cfil_info->cfi_entries[i]);
1205 	}
1206 }
1207 
1208 static void
verify_content_filter(struct content_filter * cfc)1209 verify_content_filter(struct content_filter *cfc)
1210 {
1211 	struct cfil_entry *entry;
1212 	uint32_t count = 0;
1213 
1214 	VERIFY(cfc->cf_sock_count >= 0);
1215 
1216 	TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
1217 		count++;
1218 		VERIFY(cfc == entry->cfe_filter);
1219 	}
1220 	VERIFY(count == cfc->cf_sock_count);
1221 }
1222 
1223 /*
1224  * Kernel control socket callbacks
1225  */
1226 static errno_t
cfil_ctl_connect(kern_ctl_ref kctlref,struct sockaddr_ctl * sac,void ** unitinfo)1227 cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
1228     void **unitinfo)
1229 {
1230 	errno_t error = 0;
1231 	struct content_filter * __single cfc = NULL;
1232 
1233 	CFIL_LOG(LOG_NOTICE, "");
1234 
1235 	cfc = zalloc_flags(content_filter_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
1236 
1237 	cfil_rw_lock_exclusive(&cfil_lck_rw);
1238 
1239 	if (sac->sc_unit == 0 || sac->sc_unit > MAX_CONTENT_FILTER) {
1240 		CFIL_LOG(LOG_ERR, "bad sc_unit %u", sac->sc_unit);
1241 		error = EINVAL;
1242 	} else if (content_filters[sac->sc_unit - 1] != NULL) {
1243 		CFIL_LOG(LOG_ERR, "sc_unit %u in use", sac->sc_unit);
1244 		error = EADDRINUSE;
1245 	} else {
1246 		/*
1247 		 * kernel control socket kcunit numbers start at 1
1248 		 */
1249 		content_filters[sac->sc_unit - 1] = cfc;
1250 
1251 		cfc->cf_kcref = kctlref;
1252 		cfc->cf_kcunit = sac->sc_unit;
1253 		TAILQ_INIT(&cfc->cf_sock_entries);
1254 
1255 		*unitinfo = cfc;
1256 		cfil_active_count++;
1257 
1258 		if (cfil_active_count == 1) {
1259 			soflow_feat_set_functions(cfil_dgram_gc_needed, cfil_dgram_gc_perform,
1260 			    cfil_dgram_detach_entry, cfil_dgram_detach_db);
1261 		}
1262 
1263 		// Allocate periodic stats buffer for this filter
1264 		if (global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] == NULL) {
1265 			cfil_rw_unlock_exclusive(&cfil_lck_rw);
1266 
1267 			struct cfil_stats_report_buffer * __single buf;
1268 
1269 			buf = kalloc_type(struct cfil_stats_report_buffer,
1270 			    Z_WAITOK | Z_ZERO | Z_NOFAIL);
1271 
1272 			cfil_rw_lock_exclusive(&cfil_lck_rw);
1273 
1274 			/* Another thread may have won the race */
1275 			if (global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] != NULL) {
1276 				kfree_type(struct cfil_stats_report_buffer, buf);
1277 			} else {
1278 				global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] = buf;
1279 			}
1280 		}
1281 	}
1282 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1283 
1284 	if (error != 0 && cfc != NULL) {
1285 		zfree(content_filter_zone, cfc);
1286 	}
1287 
1288 	if (error == 0) {
1289 		OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_ok);
1290 	} else {
1291 		OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_fail);
1292 	}
1293 
1294 	CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1295 	    error, cfil_active_count, sac->sc_unit);
1296 
1297 	return error;
1298 }
1299 
1300 static void
cfil_update_behavior_flags(void)1301 cfil_update_behavior_flags(void)
1302 {
1303 	struct content_filter *cfc = NULL;
1304 
1305 	// Update global flag
1306 	bool preserve_connections = false;
1307 	for (int i = 0; i < MAX_CONTENT_FILTER; i++) {
1308 		cfc = content_filters[i];
1309 		if (cfc != NULL) {
1310 			if (cfc->cf_flags & CFF_PRESERVE_CONNECTIONS) {
1311 				preserve_connections = true;
1312 			} else {
1313 				preserve_connections = false;
1314 				break;
1315 			}
1316 		}
1317 	}
1318 	if (preserve_connections == true) {
1319 		cfil_behavior_flags |= CFIL_BEHAVIOR_FLAG_PRESERVE_CONNECTIONS;
1320 	} else {
1321 		cfil_behavior_flags &= ~CFIL_BEHAVIOR_FLAG_PRESERVE_CONNECTIONS;
1322 	}
1323 	CFIL_LOG(LOG_INFO, "CFIL Preserve Connections - %s",
1324 	    (cfil_behavior_flags & CFIL_BEHAVIOR_FLAG_PRESERVE_CONNECTIONS) ? "On" : "Off");
1325 }
1326 
1327 static errno_t
cfil_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)1328 cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
1329 {
1330 #pragma unused(kctlref)
1331 	errno_t error = 0;
1332 	struct content_filter * __single cfc;
1333 	struct cfil_entry *entry;
1334 	uint64_t sock_flow_id = 0;
1335 
1336 	CFIL_LOG(LOG_NOTICE, "");
1337 
1338 	if (kcunit > MAX_CONTENT_FILTER) {
1339 		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1340 		    kcunit, MAX_CONTENT_FILTER);
1341 		error = EINVAL;
1342 		goto done;
1343 	}
1344 
1345 	cfc = (struct content_filter *)unitinfo;
1346 	if (cfc == NULL) {
1347 		goto done;
1348 	}
1349 
1350 	cfil_rw_lock_exclusive(&cfil_lck_rw);
1351 	if (content_filters[kcunit - 1] != cfc || cfc->cf_kcunit != kcunit) {
1352 		CFIL_LOG(LOG_ERR, "bad unit info %u)",
1353 		    kcunit);
1354 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
1355 		goto done;
1356 	}
1357 	cfc->cf_flags |= CFF_DETACHING;
1358 	/*
1359 	 * Remove all sockets from the filter
1360 	 */
1361 	while ((entry = TAILQ_FIRST(&cfc->cf_sock_entries)) != NULL) {
1362 		cfil_rw_lock_assert_held(&cfil_lck_rw, 1);
1363 
1364 		verify_content_filter(cfc);
1365 		/*
1366 		 * Accept all outstanding data by pushing to next filter
1367 		 * or back to socket
1368 		 *
1369 		 * TBD: Actually we should make sure all data has been pushed
1370 		 * back to socket
1371 		 */
1372 		if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
1373 			struct cfil_info *cfil_info = entry->cfe_cfil_info;
1374 			struct socket *so = cfil_info->cfi_so;
1375 			sock_flow_id = cfil_info->cfi_sock_id;
1376 
1377 			/* Need to let data flow immediately */
1378 			entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED |
1379 			    CFEF_DATA_START;
1380 
1381 			// Before we release global lock, retain the cfil_info -
1382 			// We attempt to retain a valid cfil_info to prevent any deallocation until
1383 			// we are done.  Abort retain if cfil_info has already entered the free code path.
1384 			if (cfil_info == NULL || os_ref_retain_try(&cfil_info->cfi_ref_count) == false) {
1385 				// Failing to retain cfil_info means detach is in progress already,
1386 				// remove entry from filter list and move on.
1387 				entry->cfe_filter = NULL;
1388 				entry->cfe_necp_control_unit = 0;
1389 				TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1390 				cfc->cf_sock_count--;
1391 				continue;
1392 			}
1393 
1394 			/*
1395 			 * Respect locking hierarchy
1396 			 */
1397 			cfil_rw_unlock_exclusive(&cfil_lck_rw);
1398 
1399 			// Search for socket from cfil_info sock_flow_id and lock so
1400 			so = cfil_socket_from_sock_id(sock_flow_id, false);
1401 			if (so == NULL || so != cfil_info->cfi_so) {
1402 				cfil_rw_lock_exclusive(&cfil_lck_rw);
1403 
1404 				// Socket has already been disconnected and removed from socket list.
1405 				// Remove entry from filter list and move on.
1406 				if (entry == TAILQ_FIRST(&cfc->cf_sock_entries)) {
1407 					entry->cfe_filter = NULL;
1408 					entry->cfe_necp_control_unit = 0;
1409 					TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1410 					cfc->cf_sock_count--;
1411 				}
1412 
1413 				goto release_cfil_info;
1414 			}
1415 
1416 			/*
1417 			 * When cfe_filter is NULL the filter is detached
1418 			 * and the entry has been removed from cf_sock_entries
1419 			 */
1420 			if ((so->so_cfil == NULL && so->so_flow_db == NULL) || entry->cfe_filter == NULL) {
1421 				cfil_rw_lock_exclusive(&cfil_lck_rw);
1422 				goto release;
1423 			}
1424 
1425 			(void) cfil_action_data_pass(so, cfil_info, kcunit, 1,
1426 			    CFM_MAX_OFFSET,
1427 			    CFM_MAX_OFFSET);
1428 
1429 			(void) cfil_action_data_pass(so, cfil_info, kcunit, 0,
1430 			    CFM_MAX_OFFSET,
1431 			    CFM_MAX_OFFSET);
1432 
1433 			cfil_rw_lock_exclusive(&cfil_lck_rw);
1434 
1435 			/*
1436 			 * Check again to make sure if the cfil_info is still valid
1437 			 * as the socket may have been unlocked when when calling
1438 			 * cfil_acquire_sockbuf()
1439 			 */
1440 			if (entry->cfe_filter == NULL ||
1441 			    (so->so_cfil == NULL && soflow_db_get_feature_context(so->so_flow_db, sock_flow_id) == NULL)) {
1442 				goto release;
1443 			}
1444 
1445 			/* The filter is now detached */
1446 			entry->cfe_flags |= CFEF_CFIL_DETACHED;
1447 
1448 			if (cfil_info->cfi_debug) {
1449 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER DISCONNECTED");
1450 			}
1451 
1452 			CFIL_LOG(LOG_NOTICE, "so %llx detached %u",
1453 			    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1454 			if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
1455 			    cfil_filters_attached(so) == 0) {
1456 				CFIL_LOG(LOG_NOTICE, "so %llx waking",
1457 				    (uint64_t)VM_KERNEL_ADDRPERM(so));
1458 				wakeup((caddr_t)cfil_info);
1459 			}
1460 
1461 			/*
1462 			 * Remove the filter entry from the content filter
1463 			 * but leave the rest of the state intact as the queues
1464 			 * may not be empty yet
1465 			 */
1466 			entry->cfe_filter = NULL;
1467 			entry->cfe_necp_control_unit = 0;
1468 
1469 			TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1470 			cfc->cf_sock_count--;
1471 
1472 			// This is the last filter disconnecting, clear the cfil_info
1473 			// saved policy state so we will be able to drop this flow if
1474 			// a new filter get installed.
1475 			if (cfil_active_count == 1) {
1476 				cfil_info->cfi_filter_control_unit = 0;
1477 				cfil_info->cfi_filter_policy_gencount = 0;
1478 			}
1479 release:
1480 			socket_unlock(so, 1);
1481 
1482 release_cfil_info:
1483 			/*
1484 			 * Release reference on cfil_info.  To avoid double locking,
1485 			 * temporarily unlock in case it has been detached and we
1486 			 * end up freeing it which will take the global lock again.
1487 			 */
1488 			cfil_rw_unlock_exclusive(&cfil_lck_rw);
1489 			CFIL_INFO_FREE(cfil_info);
1490 			cfil_rw_lock_exclusive(&cfil_lck_rw);
1491 		}
1492 	}
1493 	verify_content_filter(cfc);
1494 
1495 	/* Free the stats buffer for this filter */
1496 	if (global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] != NULL) {
1497 		kfree_type(struct cfil_stats_report_buffer,
1498 		    global_cfil_stats_report_buffers[cfc->cf_kcunit - 1]);
1499 		global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] = NULL;
1500 	}
1501 	VERIFY(cfc->cf_sock_count == 0);
1502 
1503 	/*
1504 	 * Make filter inactive
1505 	 */
1506 	content_filters[kcunit - 1] = NULL;
1507 	cfil_active_count--;
1508 	cfil_update_behavior_flags();
1509 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1510 
1511 	if (cfc->cf_crypto_state != NULL) {
1512 		cfil_crypto_cleanup_state(cfc->cf_crypto_state);
1513 		cfc->cf_crypto_state = NULL;
1514 	}
1515 
1516 	zfree(content_filter_zone, cfc);
1517 done:
1518 	if (error == 0) {
1519 		OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_ok);
1520 	} else {
1521 		OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_fail);
1522 	}
1523 
1524 	CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1525 	    error, cfil_active_count, kcunit);
1526 
1527 	return error;
1528 }
1529 
1530 /*
1531  * cfil_acquire_sockbuf()
1532  *
1533  * Prevent any other thread from acquiring the sockbuf
1534  * We use sb_cfil_thread as a semaphore to prevent other threads from
1535  * messing with the sockbuf -- see sblock()
1536  * Note: We do not set SB_LOCK here because the thread may check or modify
1537  * SB_LOCK several times until it calls cfil_release_sockbuf() -- currently
1538  * sblock(), sbunlock() or sodefunct()
1539  */
1540 static int
cfil_acquire_sockbuf(struct socket * so,struct cfil_info * cfil_info,int outgoing)1541 cfil_acquire_sockbuf(struct socket *so, struct cfil_info *cfil_info, int outgoing)
1542 {
1543 	thread_t __single tp = current_thread();
1544 	struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1545 	lck_mtx_t *mutex_held;
1546 	int error = 0;
1547 
1548 	/*
1549 	 * Wait until no thread is holding the sockbuf and other content
1550 	 * filter threads have released the sockbuf
1551 	 */
1552 	while ((sb->sb_flags & SB_LOCK) ||
1553 	    (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)) {
1554 		if (so->so_proto->pr_getlock != NULL) {
1555 			mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1556 		} else {
1557 			mutex_held = so->so_proto->pr_domain->dom_mtx;
1558 		}
1559 
1560 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1561 
1562 		sb->sb_wantlock++;
1563 		VERIFY(sb->sb_wantlock != 0);
1564 
1565 		msleep(&sb->sb_flags, mutex_held, PSOCK, "cfil_acquire_sockbuf",
1566 		    NULL);
1567 
1568 		VERIFY(sb->sb_wantlock != 0);
1569 		sb->sb_wantlock--;
1570 	}
1571 	/*
1572 	 * Use reference count for repetitive calls on same thread
1573 	 */
1574 	if (sb->sb_cfil_refs == 0) {
1575 		VERIFY(sb->sb_cfil_thread == NULL);
1576 		VERIFY((sb->sb_flags & SB_LOCK) == 0);
1577 
1578 		sb->sb_cfil_thread = tp;
1579 		sb->sb_flags |= SB_LOCK;
1580 	}
1581 	sb->sb_cfil_refs++;
1582 
1583 	/* We acquire the socket buffer when we need to cleanup */
1584 	if (cfil_info == NULL) {
1585 		CFIL_LOG(LOG_ERR, "so %llx cfil detached",
1586 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
1587 		error = 0;
1588 	} else if (cfil_info->cfi_flags & CFIF_DROP) {
1589 		CFIL_LOG(LOG_ERR, "so %llx drop set",
1590 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
1591 		error = EPIPE;
1592 	}
1593 
1594 	return error;
1595 }
1596 
1597 static void
cfil_release_sockbuf(struct socket * so,int outgoing)1598 cfil_release_sockbuf(struct socket *so, int outgoing)
1599 {
1600 	struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1601 	thread_t __single tp = current_thread();
1602 
1603 	socket_lock_assert_owned(so);
1604 
1605 	if (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp) {
1606 		panic("%s sb_cfil_thread %p not current %p", __func__,
1607 		    sb->sb_cfil_thread, tp);
1608 	}
1609 	/*
1610 	 * Don't panic if we are defunct because SB_LOCK has
1611 	 * been cleared by sodefunct()
1612 	 */
1613 	if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK)) {
1614 		panic("%s SB_LOCK not set on %p", __func__,
1615 		    sb);
1616 	}
1617 	/*
1618 	 * We can unlock when the thread unwinds to the last reference
1619 	 */
1620 	sb->sb_cfil_refs--;
1621 	if (sb->sb_cfil_refs == 0) {
1622 		sb->sb_cfil_thread = NULL;
1623 		sb->sb_flags &= ~SB_LOCK;
1624 
1625 		if (sb->sb_wantlock > 0) {
1626 			wakeup(&sb->sb_flags);
1627 		}
1628 	}
1629 }
1630 
1631 cfil_sock_id_t
cfil_sock_id_from_socket(struct socket * so)1632 cfil_sock_id_from_socket(struct socket *so)
1633 {
1634 	if ((so->so_flags & SOF_CONTENT_FILTER) && so->so_cfil) {
1635 		return so->so_cfil->cfi_sock_id;
1636 	} else {
1637 		return CFIL_SOCK_ID_NONE;
1638 	}
1639 }
1640 
1641 /*
1642  * cfil_socket_safe_lock -
1643  * This routine attempts to lock the socket safely.
1644  *
1645  * The passed in pcbinfo is assumed to be locked and must be unlocked once the
1646  * inp state is safeguarded and before we attempt to lock/unlock the socket.
1647  * This is to prevent getting blocked by socket_lock() while holding the pcbinfo
1648  * lock, avoiding potential deadlock with other processes contending for the same
1649  * resources.  This is also to avoid double locking the pcbinfo for rip sockets
1650  * since rip_unlock() will lock ripcbinfo if it needs to dispose inpcb when
1651  * so_usecount is 0.
1652  */
1653 static bool
cfil_socket_safe_lock(struct inpcb * inp,struct inpcbinfo * pcbinfo)1654 cfil_socket_safe_lock(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1655 {
1656 	struct socket *so = NULL;
1657 
1658 	VERIFY(pcbinfo != NULL);
1659 
1660 	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1661 		// Safeguarded the inp state, unlock pcbinfo before locking socket.
1662 		lck_rw_done(&pcbinfo->ipi_lock);
1663 
1664 		so = inp->inp_socket;
1665 		socket_lock(so, 1);
1666 		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) {
1667 			return true;
1668 		}
1669 	} else {
1670 		// Failed to safeguarded the inp state, unlock pcbinfo and abort.
1671 		lck_rw_done(&pcbinfo->ipi_lock);
1672 	}
1673 
1674 	if (so) {
1675 		socket_unlock(so, 1);
1676 	}
1677 	return false;
1678 }
1679 
1680 static struct socket *
cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id,bool udp_only)1681 cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id, bool udp_only)
1682 {
1683 	struct socket *so = NULL;
1684 	u_int64_t gencnt = cfil_sock_id >> 32;
1685 	u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff);
1686 	struct inpcb *inp = NULL;
1687 	struct inpcbinfo *pcbinfo = NULL;
1688 
1689 	if (udp_only) {
1690 		goto find_udp;
1691 	}
1692 
1693 	pcbinfo = &tcbinfo;
1694 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
1695 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1696 		if (inp->inp_state != INPCB_STATE_DEAD &&
1697 		    inp->inp_socket != NULL &&
1698 		    inp->inp_flowhash == flowhash &&
1699 		    (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
1700 		    inp->inp_socket->so_cfil != NULL) {
1701 			if (cfil_socket_safe_lock(inp, pcbinfo)) {
1702 				so = inp->inp_socket;
1703 			}
1704 			/* pcbinfo is already unlocked, we are done. */
1705 			goto done;
1706 		}
1707 	}
1708 	lck_rw_done(&pcbinfo->ipi_lock);
1709 	if (so != NULL) {
1710 		goto done;
1711 	}
1712 
1713 find_udp:
1714 
1715 	pcbinfo = &udbinfo;
1716 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
1717 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1718 		if (inp->inp_state != INPCB_STATE_DEAD &&
1719 		    inp->inp_socket != NULL &&
1720 		    inp->inp_socket->so_flow_db != NULL &&
1721 		    (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
1722 			if (cfil_socket_safe_lock(inp, pcbinfo)) {
1723 				so = inp->inp_socket;
1724 			}
1725 			/* pcbinfo is already unlocked, we are done. */
1726 			goto done;
1727 		}
1728 	}
1729 	lck_rw_done(&pcbinfo->ipi_lock);
1730 	if (so != NULL) {
1731 		goto done;
1732 	}
1733 
1734 	pcbinfo = &ripcbinfo;
1735 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
1736 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1737 		if (inp->inp_state != INPCB_STATE_DEAD &&
1738 		    inp->inp_socket != NULL &&
1739 		    inp->inp_socket->so_flow_db != NULL &&
1740 		    (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
1741 			if (cfil_socket_safe_lock(inp, pcbinfo)) {
1742 				so = inp->inp_socket;
1743 			}
1744 			/* pcbinfo is already unlocked, we are done. */
1745 			goto done;
1746 		}
1747 	}
1748 	lck_rw_done(&pcbinfo->ipi_lock);
1749 
1750 done:
1751 	if (so == NULL) {
1752 		OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
1753 		CFIL_LOG(LOG_DEBUG,
1754 		    "no socket for sock_id %llx gencnt %llx flowhash %x",
1755 		    cfil_sock_id, gencnt, flowhash);
1756 	}
1757 
1758 	return so;
1759 }
1760 
1761 static struct socket *
cfil_socket_from_client_uuid(uuid_t necp_client_uuid,bool * cfil_attached)1762 cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached)
1763 {
1764 	struct socket *so = NULL;
1765 	struct inpcb *inp = NULL;
1766 	struct inpcbinfo *pcbinfo = &tcbinfo;
1767 
1768 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
1769 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1770 		if (inp->inp_state != INPCB_STATE_DEAD &&
1771 		    inp->inp_socket != NULL &&
1772 		    uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
1773 			*cfil_attached = (inp->inp_socket->so_cfil != NULL);
1774 			if (cfil_socket_safe_lock(inp, pcbinfo)) {
1775 				so = inp->inp_socket;
1776 			}
1777 			/* pcbinfo is already unlocked, we are done. */
1778 			goto done;
1779 		}
1780 	}
1781 	lck_rw_done(&pcbinfo->ipi_lock);
1782 	if (so != NULL) {
1783 		goto done;
1784 	}
1785 
1786 	pcbinfo = &udbinfo;
1787 	lck_rw_lock_shared(&pcbinfo->ipi_lock);
1788 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1789 		if (inp->inp_state != INPCB_STATE_DEAD &&
1790 		    inp->inp_socket != NULL &&
1791 		    uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
1792 			*cfil_attached = (inp->inp_socket->so_flow_db != NULL);
1793 			if (cfil_socket_safe_lock(inp, pcbinfo)) {
1794 				so = inp->inp_socket;
1795 			}
1796 			/* pcbinfo is already unlocked, we are done. */
1797 			goto done;
1798 		}
1799 	}
1800 	lck_rw_done(&pcbinfo->ipi_lock);
1801 
1802 done:
1803 	return so;
1804 }
1805 
1806 static void
cfil_info_stats_toggle(struct cfil_info * cfil_info,struct cfil_entry * entry,uint32_t report_frequency)1807 cfil_info_stats_toggle(struct cfil_info *cfil_info, struct cfil_entry *entry, uint32_t report_frequency)
1808 {
1809 	struct cfil_info *cfil = NULL;
1810 	Boolean found = FALSE;
1811 	int kcunit;
1812 
1813 	if (cfil_info == NULL) {
1814 		return;
1815 	}
1816 
1817 	if (report_frequency) {
1818 		if (entry == NULL) {
1819 			return;
1820 		}
1821 
1822 		// Update stats reporting frequency.
1823 		if (entry->cfe_stats_report_frequency != report_frequency) {
1824 			entry->cfe_stats_report_frequency = report_frequency;
1825 			if (entry->cfe_stats_report_frequency < CFIL_STATS_REPORT_INTERVAL_MIN_MSEC) {
1826 				entry->cfe_stats_report_frequency = CFIL_STATS_REPORT_INTERVAL_MIN_MSEC;
1827 			}
1828 			microuptime(&entry->cfe_stats_report_ts);
1829 
1830 			// Insert cfil_info into list only if it is not in yet.
1831 			TAILQ_FOREACH(cfil, &cfil_sock_head_stats, cfi_link_stats) {
1832 				if (cfil == cfil_info) {
1833 					return;
1834 				}
1835 			}
1836 
1837 			TAILQ_INSERT_TAIL(&cfil_sock_head_stats, cfil_info, cfi_link_stats);
1838 
1839 			// Wake up stats thread if this is first flow added
1840 			if (cfil_sock_attached_stats_count == 0) {
1841 				thread_wakeup((caddr_t)&cfil_sock_attached_stats_count);
1842 			}
1843 			cfil_sock_attached_stats_count++;
1844 
1845 			if (cfil_info->cfi_debug && cfil_log_stats) {
1846 				CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED - STATS FLOW INSERTED: <so %llx sockID %llu <%llx>> stats frequency %d msecs",
1847 				    cfil_info->cfi_so ? (uint64_t)VM_KERNEL_ADDRPERM(cfil_info->cfi_so) : 0,
1848 				    cfil_info->cfi_sock_id, cfil_info->cfi_sock_id,
1849 				    entry->cfe_stats_report_frequency);
1850 			}
1851 		}
1852 	} else {
1853 		// Turn off stats reporting for this filter.
1854 		if (entry != NULL) {
1855 			// Already off, no change.
1856 			if (entry->cfe_stats_report_frequency == 0) {
1857 				return;
1858 			}
1859 
1860 			entry->cfe_stats_report_frequency = 0;
1861 			// If cfil_info still has filter(s) asking for stats, no need to remove from list.
1862 			for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
1863 				if (cfil_info->cfi_entries[kcunit - 1].cfe_stats_report_frequency > 0) {
1864 					return;
1865 				}
1866 			}
1867 		}
1868 
1869 		// No more filter asking for stats for this cfil_info, remove from list.
1870 		if (!TAILQ_EMPTY(&cfil_sock_head_stats)) {
1871 			found = FALSE;
1872 			TAILQ_FOREACH(cfil, &cfil_sock_head_stats, cfi_link_stats) {
1873 				if (cfil == cfil_info) {
1874 					found = TRUE;
1875 					break;
1876 				}
1877 			}
1878 			if (found) {
1879 				cfil_sock_attached_stats_count--;
1880 				TAILQ_REMOVE(&cfil_sock_head_stats, cfil_info, cfi_link_stats);
1881 				if (cfil_info->cfi_debug && cfil_log_stats) {
1882 					CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED - STATS FLOW DELETED: <so %llx sockID %llu <%llx>> stats frequency reset",
1883 					    cfil_info->cfi_so ? (uint64_t)VM_KERNEL_ADDRPERM(cfil_info->cfi_so) : 0,
1884 					    cfil_info->cfi_sock_id, cfil_info->cfi_sock_id);
1885 				}
1886 			}
1887 		}
1888 	}
1889 }
1890 
1891 static errno_t
cfil_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)1892 cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
1893     int flags)
1894 {
1895 #pragma unused(kctlref, flags)
1896 	errno_t error = 0;
1897 	struct cfil_msg_hdr *msghdr;
1898 	struct content_filter *cfc = (struct content_filter *)unitinfo;
1899 	struct socket *so;
1900 	struct cfil_msg_action * __single action_msg;
1901 	struct cfil_entry *entry;
1902 	struct cfil_info * __single cfil_info = NULL;
1903 	unsigned int data_len = 0;
1904 
1905 	CFIL_LOG(LOG_INFO, "");
1906 
1907 	if (cfc == NULL) {
1908 		CFIL_LOG(LOG_ERR, "no unitinfo");
1909 		error = EINVAL;
1910 		goto done;
1911 	}
1912 
1913 	if (kcunit > MAX_CONTENT_FILTER) {
1914 		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1915 		    kcunit, MAX_CONTENT_FILTER);
1916 		error = EINVAL;
1917 		goto done;
1918 	}
1919 	if (m == NULL) {
1920 		CFIL_LOG(LOG_ERR, "null mbuf");
1921 		error = EINVAL;
1922 		goto done;
1923 	}
1924 	data_len = m_length(m);
1925 
1926 	if (data_len < sizeof(struct cfil_msg_hdr)) {
1927 		CFIL_LOG(LOG_ERR, "too short %u", data_len);
1928 		error = EINVAL;
1929 		goto done;
1930 	}
1931 	msghdr = mtod(m, struct cfil_msg_hdr *);
1932 	if (msghdr->cfm_version != CFM_VERSION_CURRENT) {
1933 		CFIL_LOG(LOG_ERR, "bad version %u", msghdr->cfm_version);
1934 		error = EINVAL;
1935 		goto done;
1936 	}
1937 	if (msghdr->cfm_type != CFM_TYPE_ACTION) {
1938 		CFIL_LOG(LOG_ERR, "bad type %u", msghdr->cfm_type);
1939 		error = EINVAL;
1940 		goto done;
1941 	}
1942 	if (msghdr->cfm_len > data_len) {
1943 		CFIL_LOG(LOG_ERR, "bad length %u", msghdr->cfm_len);
1944 		error = EINVAL;
1945 		goto done;
1946 	}
1947 
1948 	/* Validate action operation */
1949 	switch (msghdr->cfm_op) {
1950 	case CFM_OP_DATA_UPDATE:
1951 		OSIncrementAtomic(
1952 			&cfil_stats.cfs_ctl_action_data_update);
1953 		break;
1954 	case CFM_OP_DROP:
1955 		OSIncrementAtomic(&cfil_stats.cfs_ctl_action_drop);
1956 		break;
1957 	case CFM_OP_BLESS_CLIENT:
1958 		if (msghdr->cfm_len != sizeof(struct cfil_msg_bless_client)) {
1959 			OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1960 			error = EINVAL;
1961 			CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1962 			    msghdr->cfm_len,
1963 			    msghdr->cfm_op);
1964 			goto done;
1965 		}
1966 		error = cfil_action_bless_client(kcunit, msghdr);
1967 		goto done;
1968 	case CFM_OP_SET_CRYPTO_KEY:
1969 		if (msghdr->cfm_len != sizeof(struct cfil_msg_set_crypto_key)) {
1970 			OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1971 			error = EINVAL;
1972 			CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1973 			    msghdr->cfm_len,
1974 			    msghdr->cfm_op);
1975 			goto done;
1976 		}
1977 		error = cfil_action_set_crypto_key(kcunit, msghdr);
1978 		goto done;
1979 	default:
1980 		OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op);
1981 		CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op);
1982 		error = EINVAL;
1983 		goto done;
1984 	}
1985 	if (msghdr->cfm_len != sizeof(struct cfil_msg_action)) {
1986 		OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1987 		error = EINVAL;
1988 		CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1989 		    msghdr->cfm_len,
1990 		    msghdr->cfm_op);
1991 		goto done;
1992 	}
1993 	cfil_rw_lock_shared(&cfil_lck_rw);
1994 	if (cfc != (void *)content_filters[kcunit - 1]) {
1995 		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1996 		    kcunit);
1997 		error = EINVAL;
1998 		cfil_rw_unlock_shared(&cfil_lck_rw);
1999 		goto done;
2000 	}
2001 	cfil_rw_unlock_shared(&cfil_lck_rw);
2002 
2003 	// Search for socket (TCP+UDP and lock so)
2004 	so = cfil_socket_from_sock_id(msghdr->cfm_sock_id, false);
2005 	if (so == NULL) {
2006 		CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
2007 		    msghdr->cfm_sock_id);
2008 		error = EINVAL;
2009 		goto done;
2010 	}
2011 
2012 	cfil_info = so->so_flow_db != NULL ?
2013 	    soflow_db_get_feature_context(so->so_flow_db, msghdr->cfm_sock_id) : so->so_cfil;
2014 
2015 	// We should not obtain global lock here in order to avoid deadlock down the path.
2016 	// But we attempt to retain a valid cfil_info to prevent any deallocation until
2017 	// we are done.  Abort retain if cfil_info has already entered the free code path.
2018 	if (cfil_info && os_ref_retain_try(&cfil_info->cfi_ref_count) == false) {
2019 		socket_unlock(so, 1);
2020 		goto done;
2021 	}
2022 
2023 	if (cfil_info == NULL) {
2024 		CFIL_LOG(LOG_NOTICE, "so %llx <id %llu> not attached",
2025 		    (uint64_t)VM_KERNEL_ADDRPERM(so), msghdr->cfm_sock_id);
2026 		error = EINVAL;
2027 		goto unlock;
2028 	} else if (cfil_info->cfi_flags & CFIF_DROP) {
2029 		CFIL_LOG(LOG_NOTICE, "so %llx drop set",
2030 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
2031 		error = EINVAL;
2032 		goto unlock;
2033 	}
2034 
2035 	if (cfil_info->cfi_debug) {
2036 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED MSG FROM FILTER");
2037 	}
2038 
2039 	entry = &cfil_info->cfi_entries[kcunit - 1];
2040 	if (entry->cfe_filter == NULL) {
2041 		CFIL_LOG(LOG_NOTICE, "so %llx no filter",
2042 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
2043 		error = EINVAL;
2044 		goto unlock;
2045 	}
2046 
2047 	if (entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) {
2048 		entry->cfe_flags |= CFEF_DATA_START;
2049 	} else {
2050 		CFIL_LOG(LOG_ERR,
2051 		    "so %llx attached not sent for %u",
2052 		    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
2053 		error = EINVAL;
2054 		goto unlock;
2055 	}
2056 
2057 	microuptime(&entry->cfe_last_action);
2058 	CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_action, &cfil_info->cfi_first_event, msghdr->cfm_op);
2059 
2060 	action_msg = (struct cfil_msg_action *)msghdr;
2061 
2062 	switch (msghdr->cfm_op) {
2063 	case CFM_OP_DATA_UPDATE:
2064 
2065 		if (cfil_info->cfi_debug) {
2066 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DATA_UPDATE");
2067 			CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu <%llx>> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
2068 			    (uint64_t)VM_KERNEL_ADDRPERM(so),
2069 			    cfil_info->cfi_sock_id, cfil_info->cfi_sock_id,
2070 			    action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
2071 			    action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
2072 		}
2073 
2074 		/*
2075 		 * Received verdict, at this point we know this
2076 		 * socket connection is allowed.  Unblock thread
2077 		 * immediately before proceeding to process the verdict.
2078 		 */
2079 		cfil_sock_received_verdict(so);
2080 
2081 		if (action_msg->cfa_out_peek_offset != 0 ||
2082 		    action_msg->cfa_out_pass_offset != 0) {
2083 			error = cfil_action_data_pass(so, cfil_info, kcunit, 1,
2084 			    action_msg->cfa_out_pass_offset,
2085 			    action_msg->cfa_out_peek_offset);
2086 		}
2087 		if (error == EJUSTRETURN) {
2088 			error = 0;
2089 		}
2090 		if (error != 0) {
2091 			break;
2092 		}
2093 		if (action_msg->cfa_in_peek_offset != 0 ||
2094 		    action_msg->cfa_in_pass_offset != 0) {
2095 			error = cfil_action_data_pass(so, cfil_info, kcunit, 0,
2096 			    action_msg->cfa_in_pass_offset,
2097 			    action_msg->cfa_in_peek_offset);
2098 		}
2099 		if (error == EJUSTRETURN) {
2100 			error = 0;
2101 		}
2102 
2103 		// Toggle stats reporting according to received verdict.
2104 		cfil_rw_lock_exclusive(&cfil_lck_rw);
2105 		cfil_info_stats_toggle(cfil_info, entry, action_msg->cfa_stats_frequency);
2106 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
2107 
2108 		break;
2109 
2110 	case CFM_OP_DROP:
2111 		if (cfil_info->cfi_debug) {
2112 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DROP");
2113 			CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: <so %llx sockID %llu <%llx>> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
2114 			    (uint64_t)VM_KERNEL_ADDRPERM(so),
2115 			    cfil_info->cfi_sock_id, cfil_info->cfi_sock_id,
2116 			    action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
2117 			    action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
2118 		}
2119 
2120 		error = cfil_action_drop(so, cfil_info, kcunit);
2121 		cfil_sock_received_verdict(so);
2122 		break;
2123 
2124 	default:
2125 		error = EINVAL;
2126 		break;
2127 	}
2128 unlock:
2129 	CFIL_INFO_FREE(cfil_info)
2130 	socket_unlock(so, 1);
2131 done:
2132 	mbuf_freem(m);
2133 
2134 	if (error == 0) {
2135 		OSIncrementAtomic(&cfil_stats.cfs_ctl_send_ok);
2136 	} else {
2137 		OSIncrementAtomic(&cfil_stats.cfs_ctl_send_bad);
2138 	}
2139 
2140 	return error;
2141 }
2142 
2143 static errno_t
cfil_ctl_getopt(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,int opt,void * data,size_t * len)2144 cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
2145     int opt, void *data, size_t *len)
2146 {
2147 #pragma unused(kctlref, opt)
2148 	struct cfil_info * __single cfil_info = NULL;
2149 	errno_t error = 0;
2150 	struct content_filter *cfc = (struct content_filter *)unitinfo;
2151 
2152 	CFIL_LOG(LOG_NOTICE, "");
2153 
2154 	if (cfc == NULL) {
2155 		CFIL_LOG(LOG_ERR, "no unitinfo");
2156 		return EINVAL;
2157 	}
2158 
2159 	cfil_rw_lock_shared(&cfil_lck_rw);
2160 
2161 	if (kcunit > MAX_CONTENT_FILTER) {
2162 		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
2163 		    kcunit, MAX_CONTENT_FILTER);
2164 		error = EINVAL;
2165 		goto done;
2166 	}
2167 	if (cfc != (void *)content_filters[kcunit - 1]) {
2168 		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
2169 		    kcunit);
2170 		error = EINVAL;
2171 		goto done;
2172 	}
2173 	switch (opt) {
2174 	case CFIL_OPT_NECP_CONTROL_UNIT:
2175 		if (*len < sizeof(uint32_t)) {
2176 			CFIL_LOG(LOG_ERR, "len too small %lu", *len);
2177 			error = EINVAL;
2178 			goto done;
2179 		}
2180 		if (data != NULL) {
2181 			*(uint32_t *)data = cfc->cf_necp_control_unit;
2182 		}
2183 		break;
2184 	case CFIL_OPT_PRESERVE_CONNECTIONS:
2185 		if (*len < sizeof(uint32_t)) {
2186 			CFIL_LOG(LOG_ERR, "CFIL_OPT_PRESERVE_CONNECTIONS len too small %lu", *len);
2187 			error = EINVAL;
2188 			goto done;
2189 		}
2190 		if (data != NULL) {
2191 			*(uint32_t *)data = (cfc->cf_flags & CFF_PRESERVE_CONNECTIONS) ? true : false;
2192 		}
2193 		break;
2194 	case CFIL_OPT_GET_SOCKET_INFO:
2195 		if (*len != sizeof(struct cfil_opt_sock_info)) {
2196 			CFIL_LOG(LOG_ERR, "len does not match %lu", *len);
2197 			error = EINVAL;
2198 			goto done;
2199 		}
2200 		if (data == NULL) {
2201 			CFIL_LOG(LOG_ERR, "data not passed");
2202 			error = EINVAL;
2203 			goto done;
2204 		}
2205 
2206 		struct cfil_opt_sock_info *sock_info =
2207 		    (struct cfil_opt_sock_info *) data;
2208 
2209 		// Unlock here so that we never hold both cfil_lck_rw and the
2210 		// socket_lock at the same time. Otherwise, this can deadlock
2211 		// because soclose() takes the socket_lock and then exclusive
2212 		// cfil_lck_rw and we require the opposite order.
2213 
2214 		// WARNING: Be sure to never use anything protected
2215 		//     by cfil_lck_rw beyond this point.
2216 		// WARNING: Be sure to avoid fallthrough and
2217 		//     goto return_already_unlocked from this branch.
2218 		cfil_rw_unlock_shared(&cfil_lck_rw);
2219 
2220 		// Search (TCP+UDP) and lock socket
2221 		struct socket *sock =
2222 		    cfil_socket_from_sock_id(sock_info->cfs_sock_id, false);
2223 		if (sock == NULL) {
2224 			CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: bad sock_id %llu",
2225 			    sock_info->cfs_sock_id);
2226 			error = ENOENT;
2227 			goto return_already_unlocked;
2228 		}
2229 
2230 		cfil_info = (sock->so_flow_db != NULL) ?
2231 		    soflow_db_get_feature_context(sock->so_flow_db, sock_info->cfs_sock_id) : sock->so_cfil;
2232 
2233 		if (cfil_info == NULL) {
2234 			CFIL_LOG(LOG_INFO, "CFIL: GET_SOCKET_INFO failed: so %llx not attached, cannot fetch info",
2235 			    (uint64_t)VM_KERNEL_ADDRPERM(sock));
2236 			error = EINVAL;
2237 			socket_unlock(sock, 1);
2238 			goto return_already_unlocked;
2239 		}
2240 
2241 		if (sock->so_proto == NULL || sock->so_proto->pr_domain == NULL) {
2242 			CFIL_LOG(LOG_INFO, "CFIL: GET_SOCKET_INFO failed: so %llx NULL so_proto / pr_domain",
2243 			    (uint64_t)VM_KERNEL_ADDRPERM(sock));
2244 			error = EINVAL;
2245 			socket_unlock(sock, 1);
2246 			goto return_already_unlocked;
2247 		}
2248 
2249 		// Fill out family, type, and protocol
2250 		sock_info->cfs_sock_family = SOCK_DOM(sock);
2251 		sock_info->cfs_sock_type = SOCK_TYPE(sock);
2252 		sock_info->cfs_sock_protocol = GET_SO_PROTO(sock);
2253 
2254 		// Source and destination addresses
2255 		struct inpcb *inp = sotoinpcb(sock);
2256 		if (inp->inp_vflag & INP_IPV6) {
2257 			struct in6_addr * __single laddr = NULL, * __single faddr = NULL;
2258 			u_int16_t lport = 0, fport = 0;
2259 
2260 			cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp,
2261 			    &laddr, &faddr, &lport, &fport);
2262 			fill_ip6_sockaddr_4_6(&sock_info->cfs_local, laddr, lport, inp->inp_lifscope);
2263 			fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport, inp->inp_fifscope);
2264 		} else if (inp->inp_vflag & INP_IPV4) {
2265 			struct in_addr laddr = {.s_addr = 0}, faddr = {.s_addr = 0};
2266 			u_int16_t lport = 0, fport = 0;
2267 
2268 			cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
2269 			    &laddr, &faddr, &lport, &fport);
2270 			fill_ip_sockaddr_4_6(&sock_info->cfs_local, laddr, lport);
2271 			fill_ip_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport);
2272 		}
2273 
2274 		// Set the pid info
2275 		sock_info->cfs_pid = sock->last_pid;
2276 		memcpy(sock_info->cfs_uuid, sock->last_uuid, sizeof(uuid_t));
2277 
2278 		if (sock->so_flags & SOF_DELEGATED) {
2279 			sock_info->cfs_e_pid = sock->e_pid;
2280 			memcpy(sock_info->cfs_e_uuid, sock->e_uuid, sizeof(uuid_t));
2281 		} else {
2282 			sock_info->cfs_e_pid = sock->last_pid;
2283 			memcpy(sock_info->cfs_e_uuid, sock->last_uuid, sizeof(uuid_t));
2284 		}
2285 #if defined(XNU_TARGET_OS_OSX)
2286 		if (!uuid_is_null(sock->so_ruuid)) {
2287 			sock_info->cfs_r_pid = sock->so_rpid;
2288 			memcpy(sock_info->cfs_r_uuid, sock->so_ruuid, sizeof(uuid_t));
2289 		}
2290 #endif
2291 		socket_unlock(sock, 1);
2292 
2293 		goto return_already_unlocked;
2294 	default:
2295 		error = ENOPROTOOPT;
2296 		break;
2297 	}
2298 done:
2299 	cfil_rw_unlock_shared(&cfil_lck_rw);
2300 
2301 	return error;
2302 
2303 return_already_unlocked:
2304 
2305 	return error;
2306 }
2307 
2308 static errno_t
cfil_ctl_setopt(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,int opt,void * data,size_t len)2309 cfil_ctl_setopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
2310     int opt, void *data, size_t len)
2311 {
2312 #pragma unused(kctlref, opt)
2313 	errno_t error = 0;
2314 	struct content_filter *cfc = (struct content_filter *)unitinfo;
2315 
2316 	CFIL_LOG(LOG_NOTICE, "");
2317 
2318 	if (cfc == NULL) {
2319 		CFIL_LOG(LOG_ERR, "no unitinfo");
2320 		return EINVAL;
2321 	}
2322 
2323 	cfil_rw_lock_exclusive(&cfil_lck_rw);
2324 
2325 	if (kcunit > MAX_CONTENT_FILTER) {
2326 		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
2327 		    kcunit, MAX_CONTENT_FILTER);
2328 		error = EINVAL;
2329 		goto done;
2330 	}
2331 	if (cfc != (void *)content_filters[kcunit - 1]) {
2332 		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
2333 		    kcunit);
2334 		error = EINVAL;
2335 		goto done;
2336 	}
2337 	switch (opt) {
2338 	case CFIL_OPT_NECP_CONTROL_UNIT:
2339 		if (len < sizeof(uint32_t)) {
2340 			CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
2341 			    "len too small %lu", len);
2342 			error = EINVAL;
2343 			goto done;
2344 		}
2345 		if (cfc->cf_necp_control_unit != 0) {
2346 			CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
2347 			    "already set %u",
2348 			    cfc->cf_necp_control_unit);
2349 			error = EINVAL;
2350 			goto done;
2351 		}
2352 		cfc->cf_necp_control_unit = *(uint32_t *)data;
2353 		break;
2354 	case CFIL_OPT_PRESERVE_CONNECTIONS:
2355 		if (len < sizeof(uint32_t)) {
2356 			CFIL_LOG(LOG_ERR, "CFIL_OPT_PRESERVE_CONNECTIONS "
2357 			    "len too small %lu", len);
2358 			error = EINVAL;
2359 			goto done;
2360 		}
2361 		uint32_t preserve_connections = *((uint32_t *)data);
2362 		CFIL_LOG(LOG_INFO, "CFIL_OPT_PRESERVE_CONNECTIONS got %d (kcunit %d)", preserve_connections, kcunit);
2363 		if (preserve_connections) {
2364 			cfc->cf_flags |= CFF_PRESERVE_CONNECTIONS;
2365 		} else {
2366 			cfc->cf_flags &= ~CFF_PRESERVE_CONNECTIONS;
2367 		}
2368 
2369 		cfil_update_behavior_flags();
2370 		break;
2371 	default:
2372 		error = ENOPROTOOPT;
2373 		break;
2374 	}
2375 done:
2376 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
2377 
2378 	return error;
2379 }
2380 
2381 
2382 static void
cfil_ctl_rcvd(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,int flags)2383 cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
2384 {
2385 #pragma unused(kctlref, flags)
2386 	struct content_filter *cfc = (struct content_filter *)unitinfo;
2387 	struct socket *so = NULL;
2388 	int error;
2389 	struct cfil_entry *entry;
2390 	struct cfil_info *cfil_info = NULL;
2391 
2392 	CFIL_LOG(LOG_INFO, "");
2393 
2394 	if (cfc == NULL) {
2395 		CFIL_LOG(LOG_ERR, "no unitinfo");
2396 		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
2397 		return;
2398 	}
2399 
2400 	if (kcunit > MAX_CONTENT_FILTER) {
2401 		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
2402 		    kcunit, MAX_CONTENT_FILTER);
2403 		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
2404 		return;
2405 	}
2406 	cfil_rw_lock_shared(&cfil_lck_rw);
2407 	if (cfc != (void *)content_filters[kcunit - 1]) {
2408 		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
2409 		    kcunit);
2410 		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
2411 		goto done;
2412 	}
2413 	/* Let's assume the flow control is lifted */
2414 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2415 		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
2416 			cfil_rw_lock_exclusive(&cfil_lck_rw);
2417 		}
2418 
2419 		cfc->cf_flags &= ~CFF_FLOW_CONTROLLED;
2420 
2421 		cfil_rw_lock_exclusive_to_shared(&cfil_lck_rw);
2422 		LCK_RW_ASSERT(&cfil_lck_rw, LCK_RW_ASSERT_SHARED);
2423 	}
2424 	/*
2425 	 * Flow control will be raised again as soon as an entry cannot enqueue
2426 	 * to the kernel control socket
2427 	 */
2428 	while ((cfc->cf_flags & CFF_FLOW_CONTROLLED) == 0) {
2429 		verify_content_filter(cfc);
2430 
2431 		cfil_rw_lock_assert_held(&cfil_lck_rw, 0);
2432 
2433 		/* Find an entry that is flow controlled */
2434 		TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
2435 			if (entry->cfe_cfil_info == NULL ||
2436 			    entry->cfe_cfil_info->cfi_so == NULL) {
2437 				continue;
2438 			}
2439 			if ((entry->cfe_flags & CFEF_FLOW_CONTROLLED) == 0) {
2440 				continue;
2441 			}
2442 		}
2443 		if (entry == NULL) {
2444 			break;
2445 		}
2446 
2447 		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift);
2448 
2449 		cfil_info = entry->cfe_cfil_info;
2450 		so = cfil_info->cfi_so;
2451 
2452 		if (cfil_info == NULL || os_ref_retain_try(&cfil_info->cfi_ref_count) == false) {
2453 			break;
2454 		}
2455 
2456 		cfil_rw_unlock_shared(&cfil_lck_rw);
2457 		socket_lock(so, 1);
2458 
2459 		do {
2460 			error = cfil_acquire_sockbuf(so, cfil_info, 1);
2461 			if (error == 0) {
2462 				error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 1);
2463 			}
2464 			cfil_release_sockbuf(so, 1);
2465 			if (error != 0) {
2466 				break;
2467 			}
2468 
2469 			error = cfil_acquire_sockbuf(so, cfil_info, 0);
2470 			if (error == 0) {
2471 				error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 0);
2472 			}
2473 			cfil_release_sockbuf(so, 0);
2474 		} while (0);
2475 
2476 		CFIL_INFO_FREE(cfil_info);
2477 		socket_lock_assert_owned(so);
2478 		socket_unlock(so, 1);
2479 
2480 		cfil_rw_lock_shared(&cfil_lck_rw);
2481 	}
2482 done:
2483 	cfil_rw_unlock_shared(&cfil_lck_rw);
2484 }
2485 
2486 struct cflil_tag_container {
2487 	struct m_tag    cfil_m_tag;
2488 	struct cfil_tag cfil_tag;
2489 };
2490 
2491 static struct m_tag *
m_tag_kalloc_cfil_udp(u_int32_t id,u_int16_t type,uint16_t len,int wait)2492 m_tag_kalloc_cfil_udp(u_int32_t id, u_int16_t type, uint16_t len, int wait)
2493 {
2494 	struct cflil_tag_container *tag_container;
2495 	struct m_tag *tag = NULL;
2496 
2497 	assert3u(id, ==, KERNEL_MODULE_TAG_ID);
2498 	assert3u(type, ==, KERNEL_TAG_TYPE_CFIL_UDP);
2499 	assert3u(len, ==, sizeof(struct cfil_tag));
2500 
2501 	if (len != sizeof(struct cfil_tag)) {
2502 		return NULL;
2503 	}
2504 
2505 	tag_container = kalloc_type(struct cflil_tag_container, wait | M_ZERO);
2506 	if (tag_container != NULL) {
2507 		tag =  &tag_container->cfil_m_tag;
2508 
2509 		assert3p(tag, ==, tag_container);
2510 
2511 		M_TAG_INIT(tag, id, type, len, &tag_container->cfil_tag, NULL);
2512 	}
2513 
2514 	return tag;
2515 }
2516 
2517 static void
m_tag_kfree_cfil_udp(struct m_tag * tag)2518 m_tag_kfree_cfil_udp(struct m_tag *tag)
2519 {
2520 	struct cflil_tag_container * __single tag_container = (struct cflil_tag_container *)tag;
2521 
2522 	kfree_type(struct cflil_tag_container, tag_container);
2523 }
2524 
2525 void
cfil_register_m_tag(void)2526 cfil_register_m_tag(void)
2527 {
2528 	errno_t error = 0;
2529 
2530 	error = m_register_internal_tag_type(KERNEL_TAG_TYPE_CFIL_UDP, sizeof(struct cfil_tag),
2531 	    m_tag_kalloc_cfil_udp, m_tag_kfree_cfil_udp);
2532 
2533 	assert3u(error, ==, 0);
2534 }
2535 
2536 void
cfil_init(void)2537 cfil_init(void)
2538 {
2539 	struct kern_ctl_reg kern_ctl;
2540 	errno_t error = 0;
2541 	unsigned int mbuf_limit = 0;
2542 
2543 	CFIL_LOG(LOG_NOTICE, "");
2544 
2545 	/*
2546 	 * Compile time verifications
2547 	 */
2548 	_CASSERT(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER);
2549 	_CASSERT(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0);
2550 	_CASSERT(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0);
2551 	_CASSERT(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0);
2552 
2553 	/*
2554 	 * Runtime time verifications
2555 	 */
2556 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_enqueued,
2557 	    sizeof(uint32_t)));
2558 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_enqueued,
2559 	    sizeof(uint32_t)));
2560 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_peeked,
2561 	    sizeof(uint32_t)));
2562 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_peeked,
2563 	    sizeof(uint32_t)));
2564 
2565 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_in_enqueued,
2566 	    sizeof(uint32_t)));
2567 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_out_enqueued,
2568 	    sizeof(uint32_t)));
2569 
2570 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_enqueued,
2571 	    sizeof(uint32_t)));
2572 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_enqueued,
2573 	    sizeof(uint32_t)));
2574 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_passed,
2575 	    sizeof(uint32_t)));
2576 	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_passed,
2577 	    sizeof(uint32_t)));
2578 
2579 	/*
2580 	 * Allocate locks
2581 	 */
2582 	TAILQ_INIT(&cfil_sock_head);
2583 	TAILQ_INIT(&cfil_sock_head_stats);
2584 
2585 	/*
2586 	 * Register kernel control
2587 	 */
2588 	bzero(&kern_ctl, sizeof(kern_ctl));
2589 	strlcpy(kern_ctl.ctl_name, CONTENT_FILTER_CONTROL_NAME,
2590 	    sizeof(kern_ctl.ctl_name));
2591 	kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED | CTL_FLAG_REG_EXTENDED;
2592 	kern_ctl.ctl_sendsize = 512 * 1024; /* enough? */
2593 	kern_ctl.ctl_recvsize = 512 * 1024; /* enough? */
2594 	kern_ctl.ctl_connect = cfil_ctl_connect;
2595 	kern_ctl.ctl_disconnect = cfil_ctl_disconnect;
2596 	kern_ctl.ctl_send = cfil_ctl_send;
2597 	kern_ctl.ctl_getopt = cfil_ctl_getopt;
2598 	kern_ctl.ctl_setopt = cfil_ctl_setopt;
2599 	kern_ctl.ctl_rcvd = cfil_ctl_rcvd;
2600 	error = ctl_register(&kern_ctl, &cfil_kctlref);
2601 	if (error != 0) {
2602 		CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error);
2603 		return;
2604 	}
2605 
2606 	// Spawn thread for statistics reporting
2607 	if (kernel_thread_start(cfil_stats_report_thread_func, NULL,
2608 	    &cfil_stats_report_thread) != KERN_SUCCESS) {
2609 		panic_plain("%s: Can't create statistics report thread", __func__);
2610 		/* NOTREACHED */
2611 	}
2612 	/* this must not fail */
2613 	VERIFY(cfil_stats_report_thread != NULL);
2614 
2615 	// Set UDP per-flow mbuf thresholds to 1/32 of platform max
2616 	mbuf_limit = MAX(UDP_FLOW_GC_MBUF_CNT_MAX, (nmbclusters << MCLSHIFT) >> UDP_FLOW_GC_MBUF_SHIFT);
2617 	cfil_udp_gc_mbuf_num_max = (mbuf_limit >> MCLSHIFT);
2618 	cfil_udp_gc_mbuf_cnt_max = mbuf_limit;
2619 
2620 	memset(&global_cfil_stats_report_buffers, 0, sizeof(global_cfil_stats_report_buffers));
2621 }
2622 
2623 struct cfil_info *
cfil_info_alloc(struct socket * so,struct soflow_hash_entry * hash_entry)2624 cfil_info_alloc(struct socket *so, struct soflow_hash_entry *hash_entry)
2625 {
2626 	int kcunit;
2627 	struct cfil_info *cfil_info = NULL;
2628 	struct inpcb *inp = sotoinpcb(so);
2629 
2630 	CFIL_LOG(LOG_INFO, "");
2631 
2632 	socket_lock_assert_owned(so);
2633 
2634 	cfil_info = zalloc_flags(cfil_info_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2635 	os_ref_init(&cfil_info->cfi_ref_count, &cfil_refgrp);
2636 
2637 	cfil_queue_init(&cfil_info->cfi_snd.cfi_inject_q);
2638 	cfil_queue_init(&cfil_info->cfi_rcv.cfi_inject_q);
2639 
2640 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2641 		struct cfil_entry *entry;
2642 
2643 		entry = &cfil_info->cfi_entries[kcunit - 1];
2644 		entry->cfe_cfil_info = cfil_info;
2645 
2646 		/* Initialize the filter entry */
2647 		entry->cfe_filter = NULL;
2648 		entry->cfe_flags = 0;
2649 		entry->cfe_necp_control_unit = 0;
2650 		entry->cfe_snd.cfe_pass_offset = 0;
2651 		entry->cfe_snd.cfe_peek_offset = 0;
2652 		entry->cfe_snd.cfe_peeked = 0;
2653 		entry->cfe_rcv.cfe_pass_offset = 0;
2654 		entry->cfe_rcv.cfe_peek_offset = 0;
2655 		entry->cfe_rcv.cfe_peeked = 0;
2656 		/*
2657 		 * Timestamp the last action to avoid pre-maturely
2658 		 * triggering garbage collection
2659 		 */
2660 		microuptime(&entry->cfe_last_action);
2661 
2662 		cfil_queue_init(&entry->cfe_snd.cfe_pending_q);
2663 		cfil_queue_init(&entry->cfe_rcv.cfe_pending_q);
2664 		cfil_queue_init(&entry->cfe_snd.cfe_ctl_q);
2665 		cfil_queue_init(&entry->cfe_rcv.cfe_ctl_q);
2666 	}
2667 
2668 	cfil_rw_lock_exclusive(&cfil_lck_rw);
2669 
2670 	/*
2671 	 * Create a cfi_sock_id that's not the socket pointer!
2672 	 */
2673 
2674 	if (hash_entry == NULL) {
2675 		// This is the TCP case, cfil_info is tracked per socket
2676 		if (inp->inp_flowhash == 0) {
2677 			inp_calc_flowhash(inp);
2678 			ASSERT(inp->inp_flowhash != 0);
2679 		}
2680 
2681 		so->so_cfil = cfil_info;
2682 		cfil_info->cfi_so = so;
2683 		cfil_info->cfi_sock_id =
2684 		    ((so->so_gencnt << 32) | inp->inp_flowhash);
2685 	} else {
2686 		// This is the UDP case, cfil_info is tracked in per-socket hash
2687 		cfil_info->cfi_so = so;
2688 		cfil_info->cfi_hash_entry = hash_entry;
2689 		cfil_info->cfi_sock_id = ((so->so_gencnt << 32) | (hash_entry->soflow_flowhash & 0xffffffff));
2690 	}
2691 
2692 	TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
2693 	SLIST_INIT(&cfil_info->cfi_ordered_entries);
2694 
2695 	cfil_sock_attached_count++;
2696 
2697 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
2698 
2699 	if (cfil_info != NULL) {
2700 		OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_ok);
2701 	} else {
2702 		OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_fail);
2703 	}
2704 
2705 	return cfil_info;
2706 }
2707 
2708 int
cfil_info_attach_unit(struct socket * so,uint32_t filter_control_unit,struct cfil_info * cfil_info)2709 cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit, struct cfil_info *cfil_info)
2710 {
2711 	int kcunit;
2712 	int attached = 0;
2713 
2714 	CFIL_LOG(LOG_INFO, "");
2715 
2716 	socket_lock_assert_owned(so);
2717 
2718 	cfil_rw_lock_exclusive(&cfil_lck_rw);
2719 
2720 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2721 		struct content_filter *cfc = content_filters[kcunit - 1];
2722 		struct cfil_entry *entry;
2723 		struct cfil_entry *iter_entry;
2724 		struct cfil_entry *iter_prev;
2725 
2726 		if (cfc == NULL) {
2727 			continue;
2728 		}
2729 		if (!(cfc->cf_necp_control_unit & filter_control_unit)) {
2730 			continue;
2731 		}
2732 
2733 		entry = &cfil_info->cfi_entries[kcunit - 1];
2734 
2735 		entry->cfe_filter = cfc;
2736 		entry->cfe_necp_control_unit = cfc->cf_necp_control_unit;
2737 		TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link);
2738 		cfc->cf_sock_count++;
2739 
2740 		/* Insert the entry into the list ordered by control unit */
2741 		iter_prev = NULL;
2742 		SLIST_FOREACH(iter_entry, &cfil_info->cfi_ordered_entries, cfe_order_link) {
2743 			if (entry->cfe_necp_control_unit < iter_entry->cfe_necp_control_unit) {
2744 				break;
2745 			}
2746 			iter_prev = iter_entry;
2747 		}
2748 
2749 		if (iter_prev == NULL) {
2750 			SLIST_INSERT_HEAD(&cfil_info->cfi_ordered_entries, entry, cfe_order_link);
2751 		} else {
2752 			SLIST_INSERT_AFTER(iter_prev, entry, cfe_order_link);
2753 		}
2754 
2755 		verify_content_filter(cfc);
2756 		attached = 1;
2757 		entry->cfe_flags |= CFEF_CFIL_ATTACHED;
2758 	}
2759 
2760 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
2761 
2762 	return attached;
2763 }
2764 
2765 static void
cfil_info_free(struct cfil_info * cfil_info)2766 cfil_info_free(struct cfil_info *cfil_info)
2767 {
2768 	int kcunit;
2769 	uint64_t in_drain = 0;
2770 	uint64_t out_drained = 0;
2771 
2772 	if (cfil_info == NULL) {
2773 		return;
2774 	}
2775 
2776 	CFIL_LOG(LOG_INFO, "");
2777 
2778 	cfil_rw_lock_exclusive(&cfil_lck_rw);
2779 
2780 	if (cfil_info->cfi_debug) {
2781 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: FREEING CFIL_INFO");
2782 	}
2783 
2784 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2785 		struct cfil_entry *entry;
2786 		struct content_filter *cfc;
2787 
2788 		entry = &cfil_info->cfi_entries[kcunit - 1];
2789 
2790 		/* Don't be silly and try to detach twice */
2791 		if (entry->cfe_filter == NULL) {
2792 			continue;
2793 		}
2794 
2795 		cfc = content_filters[kcunit - 1];
2796 
2797 		VERIFY(cfc == entry->cfe_filter);
2798 
2799 		entry->cfe_filter = NULL;
2800 		entry->cfe_necp_control_unit = 0;
2801 		TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
2802 		cfc->cf_sock_count--;
2803 
2804 		verify_content_filter(cfc);
2805 	}
2806 
2807 	cfil_sock_attached_count--;
2808 	TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link);
2809 
2810 	// Turn off stats reporting for cfil_info.
2811 	cfil_info_stats_toggle(cfil_info, NULL, 0);
2812 
2813 	out_drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
2814 	in_drain += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
2815 
2816 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2817 		struct cfil_entry *entry;
2818 
2819 		entry = &cfil_info->cfi_entries[kcunit - 1];
2820 		out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
2821 		in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_pending_q);
2822 		out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
2823 		in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_ctl_q);
2824 	}
2825 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
2826 
2827 	if (out_drained) {
2828 		OSIncrementAtomic(&cfil_stats.cfs_flush_out_free);
2829 	}
2830 	if (in_drain) {
2831 		OSIncrementAtomic(&cfil_stats.cfs_flush_in_free);
2832 	}
2833 
2834 	zfree(cfil_info_zone, cfil_info);
2835 }
2836 
2837 /*
2838  * Received a verdict from userspace for a socket.
2839  * Perform any delayed operation if needed.
2840  */
2841 static void
cfil_sock_received_verdict(struct socket * so)2842 cfil_sock_received_verdict(struct socket *so)
2843 {
2844 	if (so == NULL || so->so_cfil == NULL) {
2845 		return;
2846 	}
2847 
2848 	so->so_cfil->cfi_flags |= CFIF_INITIAL_VERDICT;
2849 
2850 	/*
2851 	 * If socket has already been connected, trigger
2852 	 * soisconnected now.
2853 	 */
2854 	if (so->so_cfil->cfi_flags & CFIF_SOCKET_CONNECTED) {
2855 		so->so_cfil->cfi_flags &= ~CFIF_SOCKET_CONNECTED;
2856 		soisconnected(so);
2857 		return;
2858 	}
2859 }
2860 
2861 /*
2862  * Entry point from Sockets layer
2863  * The socket is locked.
2864  *
2865  * Checks if a connected socket is subject to filter and
2866  * pending the initial verdict.
2867  */
2868 boolean_t
cfil_sock_connected_pending_verdict(struct socket * so)2869 cfil_sock_connected_pending_verdict(struct socket *so)
2870 {
2871 	if (so == NULL || so->so_cfil == NULL) {
2872 		return false;
2873 	}
2874 
2875 	if (so->so_cfil->cfi_flags & CFIF_INITIAL_VERDICT) {
2876 		return false;
2877 	} else {
2878 		/*
2879 		 * Remember that this protocol is already connected, so
2880 		 * we will trigger soisconnected() upon receipt of
2881 		 * initial verdict later.
2882 		 */
2883 		so->so_cfil->cfi_flags |= CFIF_SOCKET_CONNECTED;
2884 		return true;
2885 	}
2886 }
2887 
2888 /*
2889  * Entry point from Flow Divert
2890  * The socket is locked.
2891  *
2892  * Mark socket as DEAD if all CFIL data has been processed by filter(s).
2893  * Otherwise, delay the marking until all data has been processed.
2894  */
2895 boolean_t
cfil_sock_is_dead(struct socket * so)2896 cfil_sock_is_dead(struct socket *so)
2897 {
2898 	struct inpcb *inp = NULL;
2899 
2900 	if (so == NULL) {
2901 		return false;
2902 	}
2903 
2904 	socket_lock_assert_owned(so);
2905 
2906 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
2907 		int32_t pending_snd = cfil_sock_data_pending(&so->so_snd);
2908 		int32_t pending_rcv = cfil_sock_data_pending(&so->so_rcv);
2909 		if (pending_snd || pending_rcv) {
2910 			SO_DELAYED_DEAD_SET(so, true)
2911 			return false;
2912 		}
2913 	}
2914 
2915 	inp = sotoinpcb(so);
2916 	if (inp != NULL) {
2917 		inp->inp_state = INPCB_STATE_DEAD;
2918 		inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST);
2919 		SO_DELAYED_DEAD_SET(so, false)
2920 		return true;
2921 	}
2922 	return false;
2923 }
2924 
2925 /*
2926  * Entry point from tcp_timer.c
2927  * The socket is locked.
2928  *
2929  * Perform TCP FIN time wait handling if all CFIL data has been processed by filter(s).
2930  * Otherwise, delay until all data has been processed.
2931  */
2932 boolean_t
cfil_sock_tcp_add_time_wait(struct socket * so)2933 cfil_sock_tcp_add_time_wait(struct socket *so)
2934 {
2935 	struct inpcb *inp = NULL;
2936 	struct tcpcb *tp = NULL;
2937 
2938 	// Only handle TCP sockets
2939 	if (so == NULL || !IS_TCP(so)) {
2940 		return false;
2941 	}
2942 
2943 	socket_lock_assert_owned(so);
2944 
2945 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
2946 		int32_t pending_snd = cfil_sock_data_pending(&so->so_snd);
2947 		int32_t pending_rcv = cfil_sock_data_pending(&so->so_rcv);
2948 		if (pending_snd || pending_rcv) {
2949 			SO_DELAYED_TCP_TIME_WAIT_SET(so, true)
2950 			return false;
2951 		}
2952 	}
2953 
2954 	inp = sotoinpcb(so);
2955 	tp = inp ? intotcpcb(inp) : NULL;
2956 	if (tp != NULL) {
2957 		add_to_time_wait_now(tp, 2 * tcp_msl);
2958 		SO_DELAYED_TCP_TIME_WAIT_SET(so, false)
2959 		return true;
2960 	}
2961 	return false;
2962 }
2963 
2964 boolean_t
cfil_filter_present(void)2965 cfil_filter_present(void)
2966 {
2967 	return cfil_active_count > 0;
2968 }
2969 
2970 /*
2971  * Entry point from Sockets layer
2972  * The socket is locked.
2973  */
2974 errno_t
cfil_sock_attach(struct socket * so,struct sockaddr * local,struct sockaddr * remote,int dir)2975 cfil_sock_attach(struct socket *so, struct sockaddr *local, struct sockaddr *remote, int dir)
2976 {
2977 	errno_t error = 0;
2978 	uint32_t filter_control_unit;
2979 	int debug = 0;
2980 
2981 	socket_lock_assert_owned(so);
2982 
2983 	if (so->so_flags1 & SOF1_FLOW_DIVERT_SKIP) {
2984 		/*
2985 		 * This socket has already been evaluated (and ultimately skipped) by
2986 		 * flow divert, so it has also already been through content filter if there
2987 		 * is one.
2988 		 */
2989 		goto done;
2990 	}
2991 
2992 	/* Limit ourselves to TCP that are not MPTCP subflows */
2993 	if (SKIP_FILTER_FOR_TCP_SOCKET(so)) {
2994 		goto done;
2995 	}
2996 
2997 	debug = DEBUG_FLOW(sotoinpcb(so), so, local, remote);
2998 	if (debug) {
2999 		CFIL_LOG(LOG_ERR, "CFIL: TCP (dir %d) - debug flow with port %d", dir, cfil_log_port);
3000 	}
3001 
3002 	filter_control_unit = necp_socket_get_content_filter_control_unit(so);
3003 	if (filter_control_unit == 0) {
3004 		goto done;
3005 	}
3006 
3007 	if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) {
3008 		goto done;
3009 	}
3010 	if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
3011 		OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
3012 		goto done;
3013 	}
3014 	if (cfil_active_count == 0) {
3015 		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
3016 		goto done;
3017 	}
3018 	if (so->so_cfil != NULL) {
3019 		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already);
3020 		CFIL_LOG(LOG_ERR, "already attached");
3021 		goto done;
3022 	} else {
3023 		cfil_info_alloc(so, NULL);
3024 		if (so->so_cfil == NULL) {
3025 			error = ENOMEM;
3026 			OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
3027 			goto done;
3028 		}
3029 		so->so_cfil->cfi_dir = dir;
3030 		so->so_cfil->cfi_filter_control_unit = filter_control_unit;
3031 		so->so_cfil->cfi_filter_policy_gencount = necp_socket_get_policy_gencount(so);
3032 		so->so_cfil->cfi_debug = debug;
3033 	}
3034 	if (cfil_info_attach_unit(so, filter_control_unit, so->so_cfil) == 0) {
3035 		CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
3036 		    filter_control_unit);
3037 		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
3038 		goto done;
3039 	}
3040 	CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockID %llu <%llx>",
3041 	    (uint64_t)VM_KERNEL_ADDRPERM(so),
3042 	    filter_control_unit, so->so_cfil->cfi_sock_id, so->so_cfil->cfi_sock_id);
3043 
3044 	so->so_flags |= SOF_CONTENT_FILTER;
3045 	OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
3046 
3047 	/* Hold a reference on the socket */
3048 	so->so_usecount++;
3049 
3050 	/*
3051 	 * Save passed addresses for attach event msg (in case resend
3052 	 * is needed.
3053 	 */
3054 	if (remote != NULL && (remote->sa_len <= sizeof(union sockaddr_in_4_6))) {
3055 		SOCKADDR_COPY(remote, SA(&so->so_cfil->cfi_so_attach_faddr), remote->sa_len);
3056 	}
3057 	if (local != NULL && (local->sa_len <= sizeof(union sockaddr_in_4_6))) {
3058 		SOCKADDR_COPY(local, SA(&so->so_cfil->cfi_so_attach_laddr), local->sa_len);
3059 	}
3060 
3061 	if (so->so_cfil->cfi_debug) {
3062 		cfil_info_log(LOG_ERR, so->so_cfil, "CFIL: ADDED");
3063 	}
3064 
3065 	error = cfil_dispatch_attach_event(so, so->so_cfil, 0, dir);
3066 	/* We can recover from flow control or out of memory errors */
3067 	if (error == ENOBUFS || error == ENOMEM) {
3068 		error = 0;
3069 	} else if (error != 0) {
3070 		goto done;
3071 	}
3072 
3073 	CFIL_INFO_VERIFY(so->so_cfil);
3074 done:
3075 	return error;
3076 }
3077 
3078 /*
3079  * Entry point from Sockets layer
3080  * The socket is locked.
3081  */
3082 errno_t
cfil_sock_detach(struct socket * so)3083 cfil_sock_detach(struct socket *so)
3084 {
3085 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
3086 		return 0;
3087 	}
3088 
3089 	if (so->so_cfil) {
3090 		if (so->so_flags & SOF_CONTENT_FILTER) {
3091 			so->so_flags &= ~SOF_CONTENT_FILTER;
3092 			VERIFY(so->so_usecount > 0);
3093 			so->so_usecount--;
3094 		}
3095 		CFIL_INFO_FREE(so->so_cfil);
3096 		so->so_cfil = NULL;
3097 		OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
3098 	}
3099 	return 0;
3100 }
3101 
3102 /*
3103  * Fill in the address info of an event message from either
3104  * the socket or passed in address info.
3105  */
3106 static void
cfil_fill_event_msg_addresses(struct soflow_hash_entry * entry,struct inpcb * inp,union sockaddr_in_4_6 * sin_src,union sockaddr_in_4_6 * sin_dst,boolean_t isIPv4,boolean_t outgoing)3107 cfil_fill_event_msg_addresses(struct soflow_hash_entry *entry, struct inpcb *inp,
3108     union sockaddr_in_4_6 *sin_src, union sockaddr_in_4_6 *sin_dst,
3109     boolean_t isIPv4, boolean_t outgoing)
3110 {
3111 	if (isIPv4) {
3112 		struct in_addr laddr = {0}, faddr = {0};
3113 		u_int16_t lport = 0, fport = 0;
3114 
3115 		cfil_get_flow_address(entry, inp, &laddr, &faddr, &lport, &fport);
3116 
3117 		if (outgoing) {
3118 			fill_ip_sockaddr_4_6(sin_src, laddr, lport);
3119 			fill_ip_sockaddr_4_6(sin_dst, faddr, fport);
3120 		} else {
3121 			fill_ip_sockaddr_4_6(sin_src, faddr, fport);
3122 			fill_ip_sockaddr_4_6(sin_dst, laddr, lport);
3123 		}
3124 	} else {
3125 		struct in6_addr * __single laddr = NULL, * __single faddr = NULL;
3126 		u_int16_t lport = 0, fport = 0;
3127 		const u_int32_t lifscope = inp ? inp->inp_lifscope : IFSCOPE_UNKNOWN;
3128 		const u_int32_t fifscope = inp ? inp->inp_fifscope : IFSCOPE_UNKNOWN;
3129 
3130 		cfil_get_flow_address_v6(entry, inp, &laddr, &faddr, &lport, &fport);
3131 		if (outgoing) {
3132 			fill_ip6_sockaddr_4_6(sin_src, laddr, lport, lifscope);
3133 			fill_ip6_sockaddr_4_6(sin_dst, faddr, fport, fifscope);
3134 		} else {
3135 			fill_ip6_sockaddr_4_6(sin_src, faddr, fport, fifscope);
3136 			fill_ip6_sockaddr_4_6(sin_dst, laddr, lport, lifscope);
3137 		}
3138 	}
3139 }
3140 
3141 static boolean_t
cfil_dispatch_attach_event_sign(cfil_crypto_state_t crypto_state,struct cfil_info * cfil_info,struct cfil_msg_sock_attached * msg)3142 cfil_dispatch_attach_event_sign(cfil_crypto_state_t crypto_state,
3143     struct cfil_info *cfil_info,
3144     struct cfil_msg_sock_attached *msg)
3145 {
3146 	struct cfil_crypto_data data = {};
3147 	struct iovec extra_data[1] = { { NULL, 0 } };
3148 
3149 	if (crypto_state == NULL || msg == NULL || cfil_info == NULL) {
3150 		return false;
3151 	}
3152 
3153 	data.sock_id = msg->cfs_msghdr.cfm_sock_id;
3154 	data.direction = msg->cfs_conn_dir;
3155 
3156 	data.pid = msg->cfs_pid;
3157 	data.effective_pid = msg->cfs_e_pid;
3158 	data.responsible_pid = msg->cfs_r_pid;
3159 	uuid_copy(data.uuid, msg->cfs_uuid);
3160 	uuid_copy(data.effective_uuid, msg->cfs_e_uuid);
3161 	uuid_copy(data.responsible_uuid, msg->cfs_r_uuid);
3162 	data.socketProtocol = msg->cfs_sock_protocol;
3163 	if (data.direction == CFS_CONNECTION_DIR_OUT) {
3164 		data.remote.sin6 = msg->cfs_dst.sin6;
3165 		data.local.sin6 = msg->cfs_src.sin6;
3166 	} else {
3167 		data.remote.sin6 = msg->cfs_src.sin6;
3168 		data.local.sin6 = msg->cfs_dst.sin6;
3169 	}
3170 
3171 	size_t len = strbuflen(msg->cfs_remote_domain_name, sizeof(msg->cfs_remote_domain_name));
3172 	if (len > 0) {
3173 		extra_data[0].iov_base = msg->cfs_remote_domain_name;
3174 		extra_data[0].iov_len = len;
3175 	}
3176 
3177 	// At attach, if local address is already present, no need to re-sign subsequent data messages.
3178 	if (!NULLADDRESS(data.local)) {
3179 		cfil_info->cfi_isSignatureLatest = true;
3180 	}
3181 
3182 	msg->cfs_signature_length = sizeof(cfil_crypto_signature);
3183 	if (cfil_crypto_sign_data(crypto_state, &data, extra_data, sizeof(extra_data) / sizeof(extra_data[0]), msg->cfs_signature, &msg->cfs_signature_length) != 0) {
3184 		msg->cfs_signature_length = 0;
3185 		CFIL_LOG(LOG_ERR, "CFIL: Failed to sign attached msg <sockID %llu <%llx>>",
3186 		    msg->cfs_msghdr.cfm_sock_id, msg->cfs_msghdr.cfm_sock_id);
3187 		return false;
3188 	}
3189 
3190 	return true;
3191 }
3192 
3193 struct cfil_sign_parameters {
3194 	cfil_crypto_state_t csp_state;
3195 	struct cfil_crypto_data *csp_data;
3196 	uint8_t * __indexable csp_signature;
3197 	uint32_t *csp_signature_size;
3198 };
3199 
3200 static void
cfil_sign_with_domain_name(char * domain_name __null_terminated,void * ctx)3201 cfil_sign_with_domain_name(char *domain_name __null_terminated, void *ctx)
3202 {
3203 	struct cfil_sign_parameters *parameters = (struct cfil_sign_parameters *)ctx;
3204 	struct iovec extra_data[1] = { { NULL, 0 } };
3205 
3206 	if (parameters == NULL) {
3207 		return;
3208 	}
3209 
3210 	if (domain_name != NULL) {
3211 		extra_data[0].iov_base = __unsafe_null_terminated_to_indexable(domain_name);
3212 		extra_data[0].iov_len = strlen(domain_name);
3213 	}
3214 
3215 	*(parameters->csp_signature_size) = sizeof(cfil_crypto_signature);
3216 	if (cfil_crypto_sign_data(parameters->csp_state, parameters->csp_data,
3217 	    extra_data, sizeof(extra_data) / sizeof(extra_data[0]),
3218 	    parameters->csp_signature, parameters->csp_signature_size) != 0) {
3219 		*(parameters->csp_signature_size) = 0;
3220 	}
3221 }
3222 
3223 static boolean_t
cfil_dispatch_data_event_sign(cfil_crypto_state_t crypto_state,struct socket * so,struct cfil_info * cfil_info,struct cfil_msg_data_event * msg)3224 cfil_dispatch_data_event_sign(cfil_crypto_state_t crypto_state,
3225     struct socket *so, struct cfil_info *cfil_info,
3226     struct cfil_msg_data_event *msg)
3227 {
3228 	struct cfil_crypto_data data = {};
3229 
3230 	if (crypto_state == NULL || msg == NULL ||
3231 	    so == NULL || cfil_info == NULL) {
3232 		return false;
3233 	}
3234 
3235 	data.sock_id = cfil_info->cfi_sock_id;
3236 	data.direction = cfil_info->cfi_dir;
3237 	data.pid = so->last_pid;
3238 	memcpy(data.uuid, so->last_uuid, sizeof(uuid_t));
3239 	if (so->so_flags & SOF_DELEGATED) {
3240 		data.effective_pid = so->e_pid;
3241 		memcpy(data.effective_uuid, so->e_uuid, sizeof(uuid_t));
3242 	} else {
3243 		data.effective_pid = so->last_pid;
3244 		memcpy(data.effective_uuid, so->last_uuid, sizeof(uuid_t));
3245 	}
3246 #if defined(XNU_TARGET_OS_OSX)
3247 	if (!uuid_is_null(so->so_ruuid)) {
3248 		data.responsible_pid = so->so_rpid;
3249 		memcpy(data.responsible_uuid, so->so_ruuid, sizeof(uuid_t));
3250 	}
3251 #endif
3252 	data.socketProtocol = GET_SO_PROTO(so);
3253 
3254 	if (data.direction == CFS_CONNECTION_DIR_OUT) {
3255 		data.remote.sin6 = msg->cfc_dst.sin6;
3256 		data.local.sin6 = msg->cfc_src.sin6;
3257 	} else {
3258 		data.remote.sin6 = msg->cfc_src.sin6;
3259 		data.local.sin6 = msg->cfc_dst.sin6;
3260 	}
3261 
3262 	// At first data, local address may show up for the first time, update address cache and
3263 	// no need to re-sign subsequent data messages anymore.
3264 	if (!NULLADDRESS(data.local)) {
3265 		memcpy(&cfil_info->cfi_so_attach_laddr, &data.local, data.local.sa.sa_len);
3266 		cfil_info->cfi_isSignatureLatest = true;
3267 	}
3268 
3269 	struct cfil_sign_parameters parameters = {
3270 		.csp_state = crypto_state,
3271 		.csp_data = &data,
3272 		.csp_signature = msg->cfd_signature,
3273 		.csp_signature_size = &msg->cfd_signature_length,
3274 	};
3275 	necp_with_inp_domain_name(so, &parameters, cfil_sign_with_domain_name);
3276 
3277 	if (msg->cfd_signature_length == 0) {
3278 		CFIL_LOG(LOG_ERR, "CFIL: Failed to sign data msg <sockID %llu <%llx>>",
3279 		    msg->cfd_msghdr.cfm_sock_id, msg->cfd_msghdr.cfm_sock_id);
3280 		return false;
3281 	}
3282 
3283 	return true;
3284 }
3285 
3286 static boolean_t
cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state,struct socket * so,struct cfil_info * cfil_info,struct cfil_msg_sock_closed * msg)3287 cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state,
3288     struct socket *so, struct cfil_info *cfil_info,
3289     struct cfil_msg_sock_closed *msg)
3290 {
3291 	struct cfil_crypto_data data = {};
3292 	struct soflow_hash_entry hash_entry = {};
3293 	struct soflow_hash_entry *hash_entry_ptr = NULL;
3294 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3295 
3296 	if (crypto_state == NULL || msg == NULL ||
3297 	    so == NULL || inp == NULL || cfil_info == NULL) {
3298 		return false;
3299 	}
3300 
3301 	data.sock_id = cfil_info->cfi_sock_id;
3302 	data.direction = cfil_info->cfi_dir;
3303 
3304 	data.pid = so->last_pid;
3305 	memcpy(data.uuid, so->last_uuid, sizeof(uuid_t));
3306 	if (so->so_flags & SOF_DELEGATED) {
3307 		data.effective_pid = so->e_pid;
3308 		memcpy(data.effective_uuid, so->e_uuid, sizeof(uuid_t));
3309 	} else {
3310 		data.effective_pid = so->last_pid;
3311 		memcpy(data.effective_uuid, so->last_uuid, sizeof(uuid_t));
3312 	}
3313 #if defined(XNU_TARGET_OS_OSX)
3314 	if (!uuid_is_null(so->so_ruuid)) {
3315 		data.responsible_pid = so->so_rpid;
3316 		memcpy(data.responsible_uuid, so->so_ruuid, sizeof(uuid_t));
3317 	}
3318 #endif
3319 	data.socketProtocol = GET_SO_PROTO(so);
3320 
3321 	/*
3322 	 * Fill in address info:
3323 	 * For UDP, use the cfil_info hash entry directly.
3324 	 * For TCP, compose an hash entry with the saved addresses.
3325 	 */
3326 	if (cfil_info->cfi_hash_entry != NULL) {
3327 		hash_entry_ptr = cfil_info->cfi_hash_entry;
3328 	} else if (cfil_info->cfi_so_attach_faddr.sa.sa_len > 0 ||
3329 	    cfil_info->cfi_so_attach_laddr.sa.sa_len > 0) {
3330 		soflow_fill_hash_entry_from_address(&hash_entry, TRUE, SA(&cfil_info->cfi_so_attach_laddr), FALSE);
3331 		soflow_fill_hash_entry_from_address(&hash_entry, FALSE, SA(&cfil_info->cfi_so_attach_faddr), FALSE);
3332 		hash_entry_ptr = &hash_entry;
3333 	}
3334 	if (hash_entry_ptr != NULL) {
3335 		boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT);
3336 		union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote;
3337 		union sockaddr_in_4_6 *dst = outgoing ? &data.remote : &data.local;
3338 		cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, !IS_INP_V6(inp), outgoing);
3339 	}
3340 
3341 	data.byte_count_in = cfil_info->cfi_byte_inbound_count;
3342 	data.byte_count_out = cfil_info->cfi_byte_outbound_count;
3343 
3344 	struct cfil_sign_parameters parameters = {
3345 		.csp_state = crypto_state,
3346 		.csp_data = &data,
3347 		.csp_signature = msg->cfc_signature,
3348 		.csp_signature_size = &msg->cfc_signature_length
3349 	};
3350 	necp_with_inp_domain_name(so, &parameters, cfil_sign_with_domain_name);
3351 
3352 	if (msg->cfc_signature_length == 0) {
3353 		CFIL_LOG(LOG_ERR, "CFIL: Failed to sign closed msg <sockID %llu <%llx>>",
3354 		    msg->cfc_msghdr.cfm_sock_id, msg->cfc_msghdr.cfm_sock_id);
3355 		return false;
3356 	}
3357 
3358 	return true;
3359 }
3360 
3361 static void
cfil_populate_attached_msg_domain_name(char * domain_name __null_terminated,void * ctx)3362 cfil_populate_attached_msg_domain_name(char *domain_name __null_terminated, void *ctx)
3363 {
3364 	struct cfil_msg_sock_attached *msg_attached = (struct cfil_msg_sock_attached *)ctx;
3365 
3366 	if (msg_attached == NULL) {
3367 		return;
3368 	}
3369 
3370 	if (domain_name != NULL) {
3371 		strlcpy(msg_attached->cfs_remote_domain_name, domain_name, sizeof(msg_attached->cfs_remote_domain_name));
3372 	}
3373 }
3374 
3375 static bool
cfil_copy_audit_token(pid_t pid,audit_token_t * buffer)3376 cfil_copy_audit_token(pid_t pid, audit_token_t *buffer)
3377 {
3378 	bool success = false;
3379 	proc_t p = proc_find(pid);
3380 	if (p != PROC_NULL) {
3381 		task_t __single t = proc_task(p);
3382 		if (t != TASK_NULL) {
3383 			audit_token_t audit_token = {};
3384 			mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT;
3385 			if (task_info(t, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count) == KERN_SUCCESS) {
3386 				memcpy(buffer, &audit_token, sizeof(audit_token_t));
3387 				success = true;
3388 			}
3389 		}
3390 		proc_rele(p);
3391 	}
3392 	return success;
3393 }
3394 
3395 static int
cfil_dispatch_attach_event(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int conn_dir)3396 cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info,
3397     uint32_t kcunit, int conn_dir)
3398 {
3399 	errno_t error = 0;
3400 	struct cfil_entry *entry = NULL;
3401 	struct cfil_msg_sock_attached * __single msg_attached;
3402 	struct content_filter *cfc = NULL;
3403 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3404 	struct soflow_hash_entry *hash_entry_ptr = NULL;
3405 	struct soflow_hash_entry hash_entry;
3406 
3407 	memset(&hash_entry, 0, sizeof(struct soflow_hash_entry));
3408 
3409 	socket_lock_assert_owned(so);
3410 
3411 	cfil_rw_lock_shared(&cfil_lck_rw);
3412 
3413 	if (so->so_proto == NULL || so->so_proto->pr_domain == NULL) {
3414 		error = EINVAL;
3415 		goto done;
3416 	}
3417 
3418 	if (kcunit == 0) {
3419 		entry = SLIST_FIRST(&cfil_info->cfi_ordered_entries);
3420 	} else {
3421 		entry = &cfil_info->cfi_entries[kcunit - 1];
3422 	}
3423 
3424 	if (entry == NULL) {
3425 		goto done;
3426 	}
3427 
3428 	cfc = entry->cfe_filter;
3429 	if (cfc == NULL) {
3430 		goto done;
3431 	}
3432 
3433 	if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED)) {
3434 		goto done;
3435 	}
3436 
3437 	if (kcunit == 0) {
3438 		kcunit = CFI_ENTRY_KCUNIT(cfil_info, entry);
3439 	}
3440 
3441 	CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u",
3442 	    (uint64_t)VM_KERNEL_ADDRPERM(so), entry->cfe_necp_control_unit, kcunit);
3443 
3444 	/* Would be wasteful to try when flow controlled */
3445 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
3446 		error = ENOBUFS;
3447 		goto done;
3448 	}
3449 
3450 	msg_attached = kalloc_data(sizeof(struct cfil_msg_sock_attached), Z_WAITOK);
3451 	if (msg_attached == NULL) {
3452 		error = ENOMEM;
3453 		goto done;
3454 	}
3455 
3456 	bzero(msg_attached, sizeof(struct cfil_msg_sock_attached));
3457 	msg_attached->cfs_msghdr.cfm_len = sizeof(struct cfil_msg_sock_attached);
3458 	msg_attached->cfs_msghdr.cfm_version = CFM_VERSION_CURRENT;
3459 	msg_attached->cfs_msghdr.cfm_type = CFM_TYPE_EVENT;
3460 	msg_attached->cfs_msghdr.cfm_op = CFM_OP_SOCKET_ATTACHED;
3461 	msg_attached->cfs_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
3462 
3463 	msg_attached->cfs_sock_family = SOCK_DOM(so);
3464 	msg_attached->cfs_sock_type = SOCK_TYPE(so);
3465 	msg_attached->cfs_sock_protocol = GET_SO_PROTO(so);
3466 	msg_attached->cfs_pid = so->last_pid;
3467 	memcpy(msg_attached->cfs_uuid, so->last_uuid, sizeof(uuid_t));
3468 	if (so->so_flags & SOF_DELEGATED) {
3469 		msg_attached->cfs_e_pid = so->e_pid;
3470 		memcpy(msg_attached->cfs_e_uuid, so->e_uuid, sizeof(uuid_t));
3471 	} else {
3472 		msg_attached->cfs_e_pid = so->last_pid;
3473 		memcpy(msg_attached->cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
3474 	}
3475 #if defined(XNU_TARGET_OS_OSX)
3476 	if (!uuid_is_null(so->so_ruuid)) {
3477 		msg_attached->cfs_r_pid = so->so_rpid;
3478 		memcpy(msg_attached->cfs_r_uuid, so->so_ruuid, sizeof(uuid_t));
3479 	}
3480 #endif
3481 	/*
3482 	 * Fill in address info:
3483 	 * For UDP, use the cfil_info hash entry directly.
3484 	 * For TCP, compose an hash entry with the saved addresses.
3485 	 */
3486 	if (cfil_info->cfi_hash_entry != NULL) {
3487 		hash_entry_ptr = cfil_info->cfi_hash_entry;
3488 	} else if (cfil_info->cfi_so_attach_faddr.sa.sa_len > 0 ||
3489 	    cfil_info->cfi_so_attach_laddr.sa.sa_len > 0) {
3490 		soflow_fill_hash_entry_from_address(&hash_entry, TRUE, SA(&cfil_info->cfi_so_attach_laddr), FALSE);
3491 		soflow_fill_hash_entry_from_address(&hash_entry, FALSE, SA(&cfil_info->cfi_so_attach_faddr), FALSE);
3492 		hash_entry_ptr = &hash_entry;
3493 	}
3494 	if (hash_entry_ptr != NULL) {
3495 		cfil_fill_event_msg_addresses(hash_entry_ptr, inp,
3496 		    &msg_attached->cfs_src, &msg_attached->cfs_dst,
3497 		    !IS_INP_V6(inp), conn_dir == CFS_CONNECTION_DIR_OUT);
3498 	}
3499 	msg_attached->cfs_conn_dir = conn_dir;
3500 
3501 	if (msg_attached->cfs_e_pid != 0) {
3502 		if (!cfil_copy_audit_token(msg_attached->cfs_e_pid, (audit_token_t *)&msg_attached->cfs_audit_token)) {
3503 			CFIL_LOG(LOG_ERR, "CFIL: Failed to get effective audit token for <sockID %llu <%llx>> ",
3504 			    entry->cfe_cfil_info->cfi_sock_id, entry->cfe_cfil_info->cfi_sock_id);
3505 		}
3506 	}
3507 
3508 	if (msg_attached->cfs_pid != 0) {
3509 		if (msg_attached->cfs_pid == msg_attached->cfs_e_pid) {
3510 			memcpy(&msg_attached->cfs_real_audit_token, &msg_attached->cfs_audit_token, sizeof(msg_attached->cfs_real_audit_token));
3511 		} else if (!cfil_copy_audit_token(msg_attached->cfs_pid, (audit_token_t *)&msg_attached->cfs_real_audit_token)) {
3512 			CFIL_LOG(LOG_ERR, "CFIL: Failed to get real audit token for <sockID %llu <%llx>> ",
3513 			    entry->cfe_cfil_info->cfi_sock_id, entry->cfe_cfil_info->cfi_sock_id);
3514 		}
3515 	}
3516 
3517 	necp_with_inp_domain_name(so, msg_attached, cfil_populate_attached_msg_domain_name);
3518 
3519 	if (cfil_info->cfi_debug) {
3520 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING ATTACH UP");
3521 	}
3522 
3523 	cfil_dispatch_attach_event_sign(entry->cfe_filter->cf_crypto_state, cfil_info, msg_attached);
3524 
3525 	error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
3526 	    entry->cfe_filter->cf_kcunit,
3527 	    msg_attached,
3528 	    sizeof(struct cfil_msg_sock_attached),
3529 	    CTL_DATA_EOR);
3530 
3531 	kfree_data(msg_attached, sizeof(struct cfil_msg_sock_attached));
3532 
3533 	if (error != 0) {
3534 		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
3535 		goto done;
3536 	}
3537 	microuptime(&entry->cfe_last_event);
3538 	cfil_info->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec;
3539 	cfil_info->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec;
3540 
3541 	entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED;
3542 	OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok);
3543 done:
3544 
3545 	/* We can recover from flow control */
3546 	if (error == ENOBUFS) {
3547 		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
3548 		OSIncrementAtomic(&cfil_stats.cfs_attach_event_flow_control);
3549 
3550 		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
3551 			cfil_rw_lock_exclusive(&cfil_lck_rw);
3552 		}
3553 
3554 		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
3555 
3556 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
3557 	} else {
3558 		if (error != 0) {
3559 			OSIncrementAtomic(&cfil_stats.cfs_attach_event_fail);
3560 		}
3561 
3562 		cfil_rw_unlock_shared(&cfil_lck_rw);
3563 	}
3564 	return error;
3565 }
3566 
3567 static int
cfil_dispatch_disconnect_event(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing)3568 cfil_dispatch_disconnect_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
3569 {
3570 	errno_t error = 0;
3571 	struct mbuf *msg = NULL;
3572 	struct cfil_entry *entry;
3573 	struct cfe_buf *entrybuf;
3574 	struct cfil_msg_hdr msg_disconnected;
3575 	struct content_filter *cfc;
3576 
3577 	socket_lock_assert_owned(so);
3578 
3579 	cfil_rw_lock_shared(&cfil_lck_rw);
3580 
3581 	entry = &cfil_info->cfi_entries[kcunit - 1];
3582 	if (outgoing) {
3583 		entrybuf = &entry->cfe_snd;
3584 	} else {
3585 		entrybuf = &entry->cfe_rcv;
3586 	}
3587 
3588 	cfc = entry->cfe_filter;
3589 	if (cfc == NULL) {
3590 		goto done;
3591 	}
3592 
3593 	// Mark if this flow qualifies for immediate close.
3594 	SET_NO_CLOSE_WAIT(sotoinpcb(so), cfil_info);
3595 
3596 	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
3597 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
3598 
3599 	/*
3600 	 * Send the disconnection event once
3601 	 */
3602 	if ((outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) ||
3603 	    (!outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))) {
3604 		CFIL_LOG(LOG_INFO, "so %llx disconnect already sent",
3605 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
3606 		goto done;
3607 	}
3608 
3609 	/*
3610 	 * We're not disconnected as long as some data is waiting
3611 	 * to be delivered to the filter
3612 	 */
3613 	if (outgoing && cfil_queue_empty(&entrybuf->cfe_ctl_q) == 0) {
3614 		CFIL_LOG(LOG_INFO, "so %llx control queue not empty",
3615 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
3616 		error = EBUSY;
3617 		goto done;
3618 	}
3619 	/* Would be wasteful to try when flow controlled */
3620 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
3621 		error = ENOBUFS;
3622 		goto done;
3623 	}
3624 
3625 	if (cfil_info->cfi_debug) {
3626 		const char * __null_terminated out = "CFIL: OUT - SENDING DISCONNECT UP";
3627 		const char * __null_terminated in = "CFIL: IN - SENDING DISCONNECT UP";
3628 		cfil_info_log(LOG_ERR, cfil_info, outgoing ? out : in);
3629 	}
3630 
3631 	bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr));
3632 	msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr);
3633 	msg_disconnected.cfm_version = CFM_VERSION_CURRENT;
3634 	msg_disconnected.cfm_type = CFM_TYPE_EVENT;
3635 	msg_disconnected.cfm_op = outgoing ? CFM_OP_DISCONNECT_OUT :
3636 	    CFM_OP_DISCONNECT_IN;
3637 	msg_disconnected.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
3638 	error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
3639 	    entry->cfe_filter->cf_kcunit,
3640 	    &msg_disconnected,
3641 	    sizeof(struct cfil_msg_hdr),
3642 	    CTL_DATA_EOR);
3643 	if (error != 0) {
3644 		CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
3645 		mbuf_freem(msg);
3646 		goto done;
3647 	}
3648 	microuptime(&entry->cfe_last_event);
3649 	CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, msg_disconnected.cfm_op);
3650 
3651 	/* Remember we have sent the disconnection message */
3652 	if (outgoing) {
3653 		entry->cfe_flags |= CFEF_SENT_DISCONNECT_OUT;
3654 		OSIncrementAtomic(&cfil_stats.cfs_disconnect_out_event_ok);
3655 	} else {
3656 		entry->cfe_flags |= CFEF_SENT_DISCONNECT_IN;
3657 		OSIncrementAtomic(&cfil_stats.cfs_disconnect_in_event_ok);
3658 	}
3659 done:
3660 	if (error == ENOBUFS) {
3661 		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
3662 		OSIncrementAtomic(
3663 			&cfil_stats.cfs_disconnect_event_flow_control);
3664 
3665 		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
3666 			cfil_rw_lock_exclusive(&cfil_lck_rw);
3667 		}
3668 
3669 		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
3670 
3671 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
3672 	} else {
3673 		if (error != 0) {
3674 			OSIncrementAtomic(
3675 				&cfil_stats.cfs_disconnect_event_fail);
3676 		}
3677 
3678 		cfil_rw_unlock_shared(&cfil_lck_rw);
3679 	}
3680 	return error;
3681 }
3682 
3683 int
cfil_dispatch_closed_event(struct socket * so,struct cfil_info * cfil_info,int kcunit)3684 cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int kcunit)
3685 {
3686 	struct cfil_entry *entry;
3687 	struct cfil_msg_sock_closed msg_closed;
3688 	errno_t error = 0;
3689 	struct content_filter *cfc;
3690 	struct inpcb *inp = NULL;
3691 
3692 	socket_lock_assert_owned(so);
3693 
3694 	cfil_rw_lock_shared(&cfil_lck_rw);
3695 
3696 	entry = &cfil_info->cfi_entries[kcunit - 1];
3697 	cfc = entry->cfe_filter;
3698 	if (cfc == NULL) {
3699 		goto done;
3700 	}
3701 
3702 	CFIL_LOG(LOG_INFO, "so %llx kcunit %d",
3703 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3704 
3705 	/* Would be wasteful to try when flow controlled */
3706 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
3707 		error = ENOBUFS;
3708 		goto done;
3709 	}
3710 	/*
3711 	 * Send a single closed message per filter
3712 	 */
3713 	if ((entry->cfe_flags & CFEF_SENT_SOCK_CLOSED) != 0) {
3714 		goto done;
3715 	}
3716 	if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
3717 		goto done;
3718 	}
3719 
3720 	microuptime(&entry->cfe_last_event);
3721 	CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, CFM_OP_SOCKET_CLOSED);
3722 
3723 	bzero(&msg_closed, sizeof(struct cfil_msg_sock_closed));
3724 	msg_closed.cfc_msghdr.cfm_len = sizeof(struct cfil_msg_sock_closed);
3725 	msg_closed.cfc_msghdr.cfm_version = CFM_VERSION_CURRENT;
3726 	msg_closed.cfc_msghdr.cfm_type = CFM_TYPE_EVENT;
3727 	msg_closed.cfc_msghdr.cfm_op = CFM_OP_SOCKET_CLOSED;
3728 	msg_closed.cfc_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
3729 	msg_closed.cfc_first_event.tv_sec = cfil_info->cfi_first_event.tv_sec;
3730 	msg_closed.cfc_first_event.tv_usec = cfil_info->cfi_first_event.tv_usec;
3731 	memcpy(msg_closed.cfc_op_time, cfil_info->cfi_op_time, sizeof(uint32_t) * CFI_MAX_TIME_LOG_ENTRY);
3732 	memcpy(msg_closed.cfc_op_list, cfil_info->cfi_op_list, sizeof(unsigned char) * CFI_MAX_TIME_LOG_ENTRY);
3733 	msg_closed.cfc_op_list_ctr = cfil_info->cfi_op_list_ctr;
3734 	msg_closed.cfc_byte_inbound_count = cfil_info->cfi_byte_inbound_count;
3735 	msg_closed.cfc_byte_outbound_count = cfil_info->cfi_byte_outbound_count;
3736 
3737 	if (entry->cfe_laddr_sent == false) {
3738 		/* cache it if necessary */
3739 		if (cfil_info->cfi_so_attach_laddr.sa.sa_len == 0) {
3740 			inp = cfil_info->cfi_so ? sotoinpcb(cfil_info->cfi_so) : NULL;
3741 			if (inp != NULL) {
3742 				boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT);
3743 				union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL;
3744 				union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr;
3745 				cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
3746 				    src, dst, !IS_INP_V6(inp), outgoing);
3747 			}
3748 		}
3749 
3750 		if (cfil_info->cfi_so_attach_laddr.sa.sa_len != 0) {
3751 			msg_closed.cfc_laddr.sin6 = cfil_info->cfi_so_attach_laddr.sin6;
3752 			entry->cfe_laddr_sent = true;
3753 		}
3754 	}
3755 
3756 	cfil_dispatch_closed_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, &msg_closed);
3757 
3758 	if (cfil_info->cfi_debug) {
3759 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING CLOSED UP");
3760 	}
3761 
3762 	/* for debugging
3763 	 *  if (msg_closed.cfc_op_list_ctr > CFI_MAX_TIME_LOG_ENTRY) {
3764 	 *       msg_closed.cfc_op_list_ctr  = CFI_MAX_TIME_LOG_ENTRY;       // just in case
3765 	 *  }
3766 	 *  for (unsigned int i = 0; i < msg_closed.cfc_op_list_ctr ; i++) {
3767 	 *       CFIL_LOG(LOG_ERR, "MD: socket %llu event %2u, time + %u msec", msg_closed.cfc_msghdr.cfm_sock_id, (unsigned short)msg_closed.cfc_op_list[i], msg_closed.cfc_op_time[i]);
3768 	 *  }
3769 	 */
3770 
3771 	error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
3772 	    entry->cfe_filter->cf_kcunit,
3773 	    &msg_closed,
3774 	    sizeof(struct cfil_msg_sock_closed),
3775 	    CTL_DATA_EOR);
3776 	if (error != 0) {
3777 		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d",
3778 		    error);
3779 		goto done;
3780 	}
3781 
3782 	entry->cfe_flags |= CFEF_SENT_SOCK_CLOSED;
3783 	OSIncrementAtomic(&cfil_stats.cfs_closed_event_ok);
3784 done:
3785 	/* We can recover from flow control */
3786 	if (error == ENOBUFS) {
3787 		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
3788 		OSIncrementAtomic(&cfil_stats.cfs_closed_event_flow_control);
3789 
3790 		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
3791 			cfil_rw_lock_exclusive(&cfil_lck_rw);
3792 		}
3793 
3794 		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
3795 
3796 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
3797 	} else {
3798 		if (error != 0) {
3799 			OSIncrementAtomic(&cfil_stats.cfs_closed_event_fail);
3800 		}
3801 
3802 		cfil_rw_unlock_shared(&cfil_lck_rw);
3803 	}
3804 
3805 	return error;
3806 }
3807 
3808 static void
fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 * sin46,struct in6_addr * ip6,u_int16_t port,uint32_t ifscope)3809 fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
3810     struct in6_addr *ip6, u_int16_t port, uint32_t ifscope)
3811 {
3812 	if (sin46 == NULL) {
3813 		return;
3814 	}
3815 
3816 	struct sockaddr_in6 *sin6 = &sin46->sin6;
3817 
3818 	sin6->sin6_family = AF_INET6;
3819 	sin6->sin6_len = sizeof(*sin6);
3820 	sin6->sin6_port = port;
3821 	sin6->sin6_addr = *ip6;
3822 	if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) {
3823 		sin6->sin6_scope_id = ifscope;
3824 		if (in6_embedded_scope) {
3825 			in6_verify_ifscope(&sin6->sin6_addr, sin6->sin6_scope_id);
3826 			if (sin6->sin6_addr.s6_addr16[1] != 0) {
3827 				sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]);
3828 				sin6->sin6_addr.s6_addr16[1] = 0;
3829 			}
3830 		}
3831 	}
3832 }
3833 
3834 static void
fill_ip_sockaddr_4_6(union sockaddr_in_4_6 * sin46,struct in_addr ip,u_int16_t port)3835 fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
3836     struct in_addr ip, u_int16_t port)
3837 {
3838 	if (sin46 == NULL) {
3839 		return;
3840 	}
3841 
3842 	struct sockaddr_in *sin = &sin46->sin;
3843 
3844 	sin->sin_family = AF_INET;
3845 	sin->sin_len = sizeof(*sin);
3846 	sin->sin_port = port;
3847 	sin->sin_addr.s_addr = ip.s_addr;
3848 }
3849 
3850 static void
cfil_get_flow_address_v6(struct soflow_hash_entry * entry,struct inpcb * inp,struct in6_addr ** laddr,struct in6_addr ** faddr,u_int16_t * lport,u_int16_t * fport)3851 cfil_get_flow_address_v6(struct soflow_hash_entry *entry, struct inpcb *inp,
3852     struct in6_addr **laddr, struct in6_addr **faddr,
3853     u_int16_t *lport, u_int16_t *fport)
3854 {
3855 	if (entry != NULL) {
3856 		*laddr = &entry->soflow_laddr.addr6;
3857 		*faddr = &entry->soflow_faddr.addr6;
3858 		*lport = entry->soflow_lport;
3859 		*fport = entry->soflow_fport;
3860 	} else {
3861 		*laddr = &inp->in6p_laddr;
3862 		*faddr = &inp->in6p_faddr;
3863 		*lport = inp->inp_lport;
3864 		*fport = inp->inp_fport;
3865 	}
3866 }
3867 
3868 static void
cfil_get_flow_address(struct soflow_hash_entry * entry,struct inpcb * inp,struct in_addr * laddr,struct in_addr * faddr,u_int16_t * lport,u_int16_t * fport)3869 cfil_get_flow_address(struct soflow_hash_entry *entry, struct inpcb *inp,
3870     struct in_addr *laddr, struct in_addr *faddr,
3871     u_int16_t *lport, u_int16_t *fport)
3872 {
3873 	if (entry != NULL) {
3874 		*laddr = entry->soflow_laddr.addr46.ia46_addr4;
3875 		*faddr = entry->soflow_faddr.addr46.ia46_addr4;
3876 		*lport = entry->soflow_lport;
3877 		*fport = entry->soflow_fport;
3878 	} else {
3879 		*laddr = inp->inp_laddr;
3880 		*faddr = inp->inp_faddr;
3881 		*lport = inp->inp_lport;
3882 		*fport = inp->inp_fport;
3883 	}
3884 }
3885 
3886 static int
cfil_dispatch_data_event(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing,struct mbuf * data,unsigned int copyoffset,unsigned int copylen)3887 cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
3888     struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
3889 {
3890 	errno_t error = 0;
3891 	struct mbuf *copy = NULL;
3892 	struct mbuf * __single msg = NULL;
3893 	unsigned int one = 1;
3894 	struct cfil_msg_data_event *data_req;
3895 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
3896 	struct cfil_entry *entry;
3897 	struct cfe_buf *entrybuf;
3898 	struct content_filter *cfc;
3899 	struct timeval tv;
3900 	int inp_flags = 0;
3901 
3902 	cfil_rw_lock_shared(&cfil_lck_rw);
3903 
3904 	entry = &cfil_info->cfi_entries[kcunit - 1];
3905 	if (outgoing) {
3906 		entrybuf = &entry->cfe_snd;
3907 	} else {
3908 		entrybuf = &entry->cfe_rcv;
3909 	}
3910 
3911 	cfc = entry->cfe_filter;
3912 	if (cfc == NULL) {
3913 		goto done;
3914 	}
3915 
3916 	data = cfil_data_start(data);
3917 	if (data == NULL) {
3918 		CFIL_LOG(LOG_ERR, "No data start");
3919 		goto done;
3920 	}
3921 
3922 	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
3923 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
3924 
3925 	socket_lock_assert_owned(so);
3926 
3927 	/* Would be wasteful to try */
3928 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
3929 		error = ENOBUFS;
3930 		goto done;
3931 	}
3932 
3933 	/* Make a copy of the data to pass to kernel control socket */
3934 	copy = m_copym_mode(data, copyoffset, copylen, M_DONTWAIT, NULL, NULL,
3935 	    M_COPYM_NOOP_HDR);
3936 	if (copy == NULL) {
3937 		CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
3938 		error = ENOMEM;
3939 		goto done;
3940 	}
3941 
3942 	/* We need an mbuf packet for the message header */
3943 	const size_t hdrsize = sizeof(struct cfil_msg_data_event);
3944 	error = mbuf_allocpacket(MBUF_DONTWAIT, hdrsize, &one, &msg);
3945 	if (error != 0) {
3946 		CFIL_LOG(LOG_ERR, "mbuf_allocpacket() failed");
3947 		m_freem(copy);
3948 		/*
3949 		 * ENOBUFS is to indicate flow control
3950 		 */
3951 		error = ENOMEM;
3952 		goto done;
3953 	}
3954 	mbuf_setlen(msg, hdrsize);
3955 	mbuf_pkthdr_setlen(msg, hdrsize + copylen);
3956 	msg->m_next = copy;
3957 	data_req = (struct cfil_msg_data_event *)mbuf_data(msg);
3958 	bzero(data_req, hdrsize);
3959 	data_req->cfd_msghdr.cfm_len = (uint32_t)hdrsize + copylen;
3960 	data_req->cfd_msghdr.cfm_version = 1;
3961 	data_req->cfd_msghdr.cfm_type = CFM_TYPE_EVENT;
3962 	data_req->cfd_msghdr.cfm_op =
3963 	    outgoing ? CFM_OP_DATA_OUT : CFM_OP_DATA_IN;
3964 	data_req->cfd_msghdr.cfm_sock_id =
3965 	    entry->cfe_cfil_info->cfi_sock_id;
3966 	data_req->cfd_start_offset = entrybuf->cfe_peeked;
3967 	data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
3968 	// The last_pid or e_pid is set here because a socket could have been
3969 	// accepted by launchd and a new process spawned (with a new pid).
3970 	// So the last pid associated with the socket is appended to the data event.
3971 	// for a provider that is peeking bytes.
3972 	if (so->so_flags & SOF_DELEGATED) {
3973 		data_req->cfd_delegated_pid = so->e_pid;
3974 	} else {
3975 		data_req->cfd_delegated_pid = so->last_pid;
3976 	}
3977 	if (data_req->cfd_delegated_pid != 0) {
3978 		if (!cfil_copy_audit_token(data_req->cfd_delegated_pid, (audit_token_t *)&data_req->cfd_delegated_audit_token)) {
3979 			CFIL_LOG(LOG_ERR, "CFIL: Failed to get audit token for <sockID %llu <%llx>> ",
3980 			    entry->cfe_cfil_info->cfi_sock_id, entry->cfe_cfil_info->cfi_sock_id);
3981 		}
3982 	}
3983 
3984 	data_req->cfd_flags = 0;
3985 	if (OPTIONAL_IP_HEADER(so)) {
3986 		/*
3987 		 * For non-UDP/TCP traffic, indicate to filters if optional
3988 		 * IP header is present:
3989 		 *      outgoing - indicate according to INP_HDRINCL flag
3990 		 *      incoming - For IPv4 only, stripping of IP header is
3991 		 *                 optional.  But for CFIL, we delay stripping
3992 		 *                 at rip_input.  So CFIL always expects IP
3993 		 *                 frames. IP header will be stripped according
3994 		 *                 to INP_STRIPHDR flag later at reinjection.
3995 		 */
3996 		if ((!outgoing && !IS_INP_V6(inp)) ||
3997 		    (outgoing && cfil_dgram_peek_socket_state(data, &inp_flags) && (inp_flags & INP_HDRINCL))) {
3998 			data_req->cfd_flags |= CFD_DATA_FLAG_IP_HEADER;
3999 		}
4000 	}
4001 
4002 	/*
4003 	 * Copy address/port into event msg.
4004 	 * For non connected sockets need to copy addresses from passed
4005 	 * parameters
4006 	 */
4007 	cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
4008 	    &data_req->cfc_src, &data_req->cfc_dst,
4009 	    !IS_INP_V6(inp), outgoing);
4010 
4011 	if (cfil_info->cfi_debug && cfil_log_data) {
4012 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DATA UP");
4013 	}
4014 
4015 	if (cfil_info->cfi_isSignatureLatest == false) {
4016 		cfil_dispatch_data_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, data_req);
4017 	}
4018 
4019 	microuptime(&tv);
4020 	CFI_ADD_TIME_LOG(cfil_info, &tv, &cfil_info->cfi_first_event, data_req->cfd_msghdr.cfm_op);
4021 
4022 	/* Pass the message to the content filter */
4023 	error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref,
4024 	    entry->cfe_filter->cf_kcunit,
4025 	    msg, CTL_DATA_EOR);
4026 	if (error != 0) {
4027 		CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
4028 		mbuf_freem(msg);
4029 		goto done;
4030 	}
4031 	entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED;
4032 	OSIncrementAtomic(&cfil_stats.cfs_data_event_ok);
4033 
4034 	if (cfil_info->cfi_debug && cfil_log_data) {
4035 		CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu <%llx> outgoing %d: mbuf %llx copyoffset %u copylen %u (%s)",
4036 		    (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen,
4037 		    data_req->cfd_flags & CFD_DATA_FLAG_IP_HEADER ? "IP HDR" : "NO IP HDR");
4038 	}
4039 
4040 done:
4041 	if (error == ENOBUFS) {
4042 		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
4043 		OSIncrementAtomic(
4044 			&cfil_stats.cfs_data_event_flow_control);
4045 
4046 		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
4047 			cfil_rw_lock_exclusive(&cfil_lck_rw);
4048 		}
4049 
4050 		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
4051 
4052 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
4053 	} else {
4054 		if (error != 0) {
4055 			OSIncrementAtomic(&cfil_stats.cfs_data_event_fail);
4056 		}
4057 
4058 		cfil_rw_unlock_shared(&cfil_lck_rw);
4059 	}
4060 	return error;
4061 }
4062 
4063 /*
4064  * Process the queue of data waiting to be delivered to content filter
4065  */
4066 static int
cfil_data_service_ctl_q(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing)4067 cfil_data_service_ctl_q(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
4068 {
4069 	errno_t error = 0;
4070 	struct mbuf *data, *tmp = NULL;
4071 	unsigned int datalen = 0, copylen = 0, copyoffset = 0;
4072 	struct cfil_entry *entry;
4073 	struct cfe_buf *entrybuf;
4074 	uint64_t currentoffset = 0;
4075 
4076 	if (cfil_info == NULL) {
4077 		return 0;
4078 	}
4079 
4080 	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
4081 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
4082 
4083 	socket_lock_assert_owned(so);
4084 
4085 	entry = &cfil_info->cfi_entries[kcunit - 1];
4086 	if (outgoing) {
4087 		entrybuf = &entry->cfe_snd;
4088 	} else {
4089 		entrybuf = &entry->cfe_rcv;
4090 	}
4091 
4092 	/* Send attached message if not yet done */
4093 	if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
4094 		error = cfil_dispatch_attach_event(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, entry),
4095 		    cfil_info->cfi_dir);
4096 		if (error != 0) {
4097 			/* We can recover from flow control */
4098 			if (error == ENOBUFS || error == ENOMEM) {
4099 				error = 0;
4100 			}
4101 			goto done;
4102 		}
4103 	} else if ((entry->cfe_flags & CFEF_DATA_START) == 0) {
4104 		OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started);
4105 		goto done;
4106 	}
4107 
4108 	if (cfil_info->cfi_debug && cfil_log_data) {
4109 		CFIL_LOG(LOG_ERR, "CFIL: SERVICE CTL-Q: pass_offset %llu peeked %llu peek_offset %llu",
4110 		    entrybuf->cfe_pass_offset,
4111 		    entrybuf->cfe_peeked,
4112 		    entrybuf->cfe_peek_offset);
4113 	}
4114 
4115 	/* Move all data that can pass */
4116 	while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL &&
4117 	    entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) {
4118 		datalen = cfil_data_length(data, NULL, NULL);
4119 		tmp = data;
4120 
4121 		if (entrybuf->cfe_ctl_q.q_start + datalen <=
4122 		    entrybuf->cfe_pass_offset) {
4123 			/*
4124 			 * The first mbuf can fully pass
4125 			 */
4126 			copylen = datalen;
4127 		} else {
4128 			/*
4129 			 * The first mbuf can partially pass
4130 			 */
4131 			copylen = (unsigned int)(entrybuf->cfe_pass_offset - entrybuf->cfe_ctl_q.q_start);
4132 		}
4133 		VERIFY(copylen <= datalen);
4134 
4135 		if (cfil_info->cfi_debug && cfil_log_data) {
4136 			CFIL_LOG(LOG_ERR,
4137 			    "CFIL: SERVICE CTL-Q PASSING: %llx first %llu peeked %llu pass %llu peek %llu"
4138 			    "datalen %u copylen %u",
4139 			    (uint64_t)VM_KERNEL_ADDRPERM(tmp),
4140 			    entrybuf->cfe_ctl_q.q_start,
4141 			    entrybuf->cfe_peeked,
4142 			    entrybuf->cfe_pass_offset,
4143 			    entrybuf->cfe_peek_offset,
4144 			    datalen, copylen);
4145 		}
4146 
4147 		/*
4148 		 * Data that passes has been peeked at explicitly or
4149 		 * implicitly
4150 		 */
4151 		if (entrybuf->cfe_ctl_q.q_start + copylen >
4152 		    entrybuf->cfe_peeked) {
4153 			entrybuf->cfe_peeked =
4154 			    entrybuf->cfe_ctl_q.q_start + copylen;
4155 		}
4156 		/*
4157 		 * Stop on partial pass
4158 		 */
4159 		if (copylen < datalen) {
4160 			break;
4161 		}
4162 
4163 		/* All good, move full data from ctl queue to pending queue */
4164 		cfil_queue_remove(&entrybuf->cfe_ctl_q, data, datalen);
4165 
4166 		cfil_queue_enqueue(&entrybuf->cfe_pending_q, data, datalen);
4167 		if (outgoing) {
4168 			OSAddAtomic64(datalen,
4169 			    &cfil_stats.cfs_pending_q_out_enqueued);
4170 		} else {
4171 			OSAddAtomic64(datalen,
4172 			    &cfil_stats.cfs_pending_q_in_enqueued);
4173 		}
4174 	}
4175 	CFIL_INFO_VERIFY(cfil_info);
4176 	if (tmp != NULL) {
4177 		CFIL_LOG(LOG_DEBUG,
4178 		    "%llx first %llu peeked %llu pass %llu peek %llu"
4179 		    "datalen %u copylen %u",
4180 		    (uint64_t)VM_KERNEL_ADDRPERM(tmp),
4181 		    entrybuf->cfe_ctl_q.q_start,
4182 		    entrybuf->cfe_peeked,
4183 		    entrybuf->cfe_pass_offset,
4184 		    entrybuf->cfe_peek_offset,
4185 		    datalen, copylen);
4186 	}
4187 	tmp = NULL;
4188 
4189 	/* Now deal with remaining data the filter wants to peek at */
4190 	for (data = cfil_queue_first(&entrybuf->cfe_ctl_q),
4191 	    currentoffset = entrybuf->cfe_ctl_q.q_start;
4192 	    data != NULL && currentoffset < entrybuf->cfe_peek_offset;
4193 	    data = cfil_queue_next(&entrybuf->cfe_ctl_q, data),
4194 	    currentoffset += datalen) {
4195 		datalen = cfil_data_length(data, NULL, NULL);
4196 		tmp = data;
4197 
4198 		/* We've already peeked at this mbuf */
4199 		if (currentoffset + datalen <= entrybuf->cfe_peeked) {
4200 			continue;
4201 		}
4202 		/*
4203 		 * The data in the first mbuf may have been
4204 		 * partially peeked at
4205 		 */
4206 		copyoffset = (unsigned int)(entrybuf->cfe_peeked - currentoffset);
4207 		VERIFY(copyoffset < datalen);
4208 		copylen = datalen - copyoffset;
4209 		VERIFY(copylen <= datalen);
4210 		/*
4211 		 * Do not copy more than needed
4212 		 */
4213 		if (currentoffset + copyoffset + copylen >
4214 		    entrybuf->cfe_peek_offset) {
4215 			copylen = (unsigned int)(entrybuf->cfe_peek_offset -
4216 			    (currentoffset + copyoffset));
4217 		}
4218 
4219 		if (cfil_info->cfi_debug && cfil_log_data) {
4220 			CFIL_LOG(LOG_ERR,
4221 			    "CFIL: SERVICE CTL-Q PEEKING: %llx current %llu peeked %llu pass %llu peek %llu "
4222 			    "datalen %u copylen %u copyoffset %u",
4223 			    (uint64_t)VM_KERNEL_ADDRPERM(tmp),
4224 			    currentoffset,
4225 			    entrybuf->cfe_peeked,
4226 			    entrybuf->cfe_pass_offset,
4227 			    entrybuf->cfe_peek_offset,
4228 			    datalen, copylen, copyoffset);
4229 		}
4230 
4231 		/*
4232 		 * Stop if there is nothing more to peek at
4233 		 */
4234 		if (copylen == 0) {
4235 			break;
4236 		}
4237 		/*
4238 		 * Let the filter get a peek at this span of data
4239 		 */
4240 		error = cfil_dispatch_data_event(so, cfil_info, kcunit,
4241 		    outgoing, data, copyoffset, copylen);
4242 		if (error != 0) {
4243 			/* On error, leave data in ctl_q */
4244 			break;
4245 		}
4246 		entrybuf->cfe_peeked += copylen;
4247 		if (outgoing) {
4248 			OSAddAtomic64(copylen,
4249 			    &cfil_stats.cfs_ctl_q_out_peeked);
4250 		} else {
4251 			OSAddAtomic64(copylen,
4252 			    &cfil_stats.cfs_ctl_q_in_peeked);
4253 		}
4254 
4255 		/* Stop when data could not be fully peeked at */
4256 		if (copylen + copyoffset < datalen) {
4257 			break;
4258 		}
4259 	}
4260 	CFIL_INFO_VERIFY(cfil_info);
4261 	if (tmp != NULL) {
4262 		CFIL_LOG(LOG_DEBUG,
4263 		    "%llx first %llu peeked %llu pass %llu peek %llu"
4264 		    "datalen %u copylen %u copyoffset %u",
4265 		    (uint64_t)VM_KERNEL_ADDRPERM(tmp),
4266 		    currentoffset,
4267 		    entrybuf->cfe_peeked,
4268 		    entrybuf->cfe_pass_offset,
4269 		    entrybuf->cfe_peek_offset,
4270 		    datalen, copylen, copyoffset);
4271 	}
4272 
4273 	/*
4274 	 * Process data that has passed the filter
4275 	 */
4276 	error = cfil_service_pending_queue(so, cfil_info, kcunit, outgoing);
4277 	if (error != 0) {
4278 		CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d",
4279 		    error);
4280 		goto done;
4281 	}
4282 
4283 	/*
4284 	 * Dispatch disconnect events that could not be sent
4285 	 */
4286 	if (cfil_info == NULL) {
4287 		goto done;
4288 	} else if (outgoing) {
4289 		if ((cfil_info->cfi_flags & CFIF_SHUT_WR) &&
4290 		    !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) {
4291 			cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1);
4292 		}
4293 	} else {
4294 		if ((cfil_info->cfi_flags & CFIF_SHUT_RD) &&
4295 		    !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN)) {
4296 			cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0);
4297 		}
4298 	}
4299 
4300 done:
4301 	CFIL_LOG(LOG_DEBUG,
4302 	    "first %llu peeked %llu pass %llu peek %llu",
4303 	    entrybuf->cfe_ctl_q.q_start,
4304 	    entrybuf->cfe_peeked,
4305 	    entrybuf->cfe_pass_offset,
4306 	    entrybuf->cfe_peek_offset);
4307 
4308 	CFIL_INFO_VERIFY(cfil_info);
4309 	return error;
4310 }
4311 
4312 /*
4313  * cfil_data_filter()
4314  *
4315  * Process data for a content filter installed on a socket
4316  */
4317 int
cfil_data_filter(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing,struct mbuf * data,uint32_t datalen)4318 cfil_data_filter(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
4319     struct mbuf *data, uint32_t datalen)
4320 {
4321 	errno_t error = 0;
4322 	struct cfil_entry *entry;
4323 	struct cfe_buf *entrybuf;
4324 
4325 	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
4326 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
4327 
4328 	socket_lock_assert_owned(so);
4329 
4330 	entry = &cfil_info->cfi_entries[kcunit - 1];
4331 	if (outgoing) {
4332 		entrybuf = &entry->cfe_snd;
4333 	} else {
4334 		entrybuf = &entry->cfe_rcv;
4335 	}
4336 
4337 	/* Are we attached to the filter? */
4338 	if (entry->cfe_filter == NULL) {
4339 		error = 0;
4340 		goto done;
4341 	}
4342 
4343 	/* Dispatch to filters */
4344 	cfil_queue_enqueue(&entrybuf->cfe_ctl_q, data, datalen);
4345 	if (outgoing) {
4346 		OSAddAtomic64(datalen,
4347 		    &cfil_stats.cfs_ctl_q_out_enqueued);
4348 	} else {
4349 		OSAddAtomic64(datalen,
4350 		    &cfil_stats.cfs_ctl_q_in_enqueued);
4351 	}
4352 
4353 	error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing);
4354 	if (error != 0) {
4355 		CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
4356 		    error);
4357 	}
4358 	/*
4359 	 * We have to return EJUSTRETURN in all cases to avoid double free
4360 	 * by socket layer
4361 	 */
4362 	error = EJUSTRETURN;
4363 done:
4364 	CFIL_INFO_VERIFY(cfil_info);
4365 
4366 	CFIL_LOG(LOG_INFO, "return %d", error);
4367 	return error;
4368 }
4369 
4370 static void
cfil_strip_ip_header(struct cfil_info * cfil_info,mbuf_t data,struct socket * so)4371 cfil_strip_ip_header(struct cfil_info *cfil_info, mbuf_t data, struct socket *so)
4372 {
4373 	struct ip *ip = NULL;
4374 	unsigned int hlen = 0;
4375 	mbuf_t data_start = NULL;
4376 	struct inpcb *inp = so ? sotoinpcb(so) : NULL;
4377 
4378 	if (inp && (inp->inp_flags & INP_STRIPHDR)) {
4379 		data_start = cfil_data_start(data);
4380 		if (data_start != NULL && (data_start->m_flags & M_PKTHDR)) {
4381 			ip = mtod(data_start, struct ip *);
4382 			hlen = IP_VHL_HL(ip->ip_vhl) << 2;
4383 
4384 			if (cfil_info->cfi_debug && cfil_log_data) {
4385 				CFIL_LOG(LOG_ERR, "CFIL: IPHDR STRIPPING: <so %llx>: <hlen %d m_len %d>",
4386 				    (uint64_t)VM_KERNEL_ADDRPERM(so),
4387 				    hlen, data_start->m_len);
4388 			}
4389 			VERIFY(hlen <= data_start->m_len);
4390 			data_start->m_len -= hlen;
4391 			data_start->m_pkthdr.len -= hlen;
4392 			data_start->m_data += hlen;
4393 		}
4394 	}
4395 }
4396 
4397 /*
4398  * cfil_service_inject_queue() re-inject data that passed the
4399  * content filters
4400  */
4401 static int
cfil_service_inject_queue(struct socket * so,struct cfil_info * cfil_info,int outgoing)4402 cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int outgoing)
4403 {
4404 	mbuf_t data;
4405 	unsigned int datalen;
4406 	int mbcnt = 0;
4407 	int mbnum = 0;
4408 	errno_t error = 0;
4409 	struct cfi_buf *cfi_buf;
4410 	struct cfil_queue *inject_q;
4411 	int need_rwakeup = 0;
4412 	int count = 0;
4413 
4414 	if (cfil_info == NULL) {
4415 		return 0;
4416 	}
4417 
4418 	socket_lock_assert_owned(so);
4419 
4420 	if (so->so_state & SS_DEFUNCT) {
4421 		return 0;
4422 	}
4423 
4424 	if (outgoing) {
4425 		cfi_buf = &cfil_info->cfi_snd;
4426 		cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
4427 	} else {
4428 		cfi_buf = &cfil_info->cfi_rcv;
4429 		cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
4430 	}
4431 	inject_q = &cfi_buf->cfi_inject_q;
4432 
4433 	if (cfil_queue_empty(inject_q)) {
4434 		return 0;
4435 	}
4436 
4437 	if (cfil_info->cfi_debug && cfil_log_data) {
4438 		CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> outgoing %d queue len %llu",
4439 		    (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, cfil_queue_len(inject_q));
4440 	}
4441 
4442 	while ((data = cfil_queue_first(inject_q)) != NULL) {
4443 		datalen = cfil_data_length(data, &mbcnt, &mbnum);
4444 
4445 		if (cfil_info->cfi_debug && cfil_log_data) {
4446 			CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> data %llx datalen %u (mbcnt %u)",
4447 			    (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
4448 		}
4449 
4450 		/* Remove data from queue and adjust stats */
4451 		cfil_queue_remove(inject_q, data, datalen);
4452 		cfi_buf->cfi_pending_first += datalen;
4453 		cfi_buf->cfi_pending_mbcnt -= mbcnt;
4454 		cfi_buf->cfi_pending_mbnum -= mbnum;
4455 		cfil_info_buf_verify(cfi_buf);
4456 
4457 		if (outgoing) {
4458 			error = sosend_reinject(so, NULL, data, NULL, 0);
4459 			if (error != 0) {
4460 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: Error: sosend_reinject() failed");
4461 				CFIL_LOG(LOG_ERR, "CFIL: sosend() failed %d", error);
4462 				break;
4463 			}
4464 			// At least one injection succeeded, need to wake up pending threads.
4465 			need_rwakeup = 1;
4466 		} else {
4467 			data->m_flags |= M_SKIPCFIL;
4468 
4469 			/*
4470 			 * NOTE: We currently only support TCP, UDP, ICMP,
4471 			 * ICMPv6 and RAWIP.  For MPTCP and message TCP we'll
4472 			 * need to call the appropriate sbappendxxx()
4473 			 * of fix sock_inject_data_in()
4474 			 */
4475 			if (NEED_DGRAM_FLOW_TRACKING(so)) {
4476 				if (OPTIONAL_IP_HEADER(so)) {
4477 					cfil_strip_ip_header(cfil_info, data, so);
4478 				}
4479 
4480 				if (sbappendchain(&so->so_rcv, data)) {
4481 					need_rwakeup = 1;
4482 				}
4483 			} else {
4484 				if (sbappendstream(&so->so_rcv, data)) {
4485 					need_rwakeup = 1;
4486 				}
4487 			}
4488 		}
4489 
4490 		if (outgoing) {
4491 			OSAddAtomic64(datalen,
4492 			    &cfil_stats.cfs_inject_q_out_passed);
4493 		} else {
4494 			OSAddAtomic64(datalen,
4495 			    &cfil_stats.cfs_inject_q_in_passed);
4496 		}
4497 
4498 		count++;
4499 	}
4500 
4501 	if (cfil_info->cfi_debug && cfil_log_data) {
4502 		CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
4503 		    (uint64_t)VM_KERNEL_ADDRPERM(so), count);
4504 	}
4505 
4506 	/* A single wakeup is for several packets is more efficient */
4507 	if (need_rwakeup) {
4508 		if (outgoing == TRUE) {
4509 			sowwakeup(so);
4510 		} else {
4511 			sorwakeup(so);
4512 		}
4513 	}
4514 
4515 	if (error != 0 && cfil_info) {
4516 		if (error == ENOBUFS) {
4517 			OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs);
4518 		}
4519 		if (error == ENOMEM) {
4520 			OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem);
4521 		}
4522 
4523 		if (outgoing) {
4524 			cfil_info->cfi_flags |= CFIF_RETRY_INJECT_OUT;
4525 			OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail);
4526 		} else {
4527 			cfil_info->cfi_flags |= CFIF_RETRY_INJECT_IN;
4528 			OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail);
4529 		}
4530 	}
4531 
4532 	/*
4533 	 * Notify
4534 	 */
4535 	if (cfil_info && (cfil_info->cfi_flags & CFIF_SHUT_WR)) {
4536 		cfil_sock_notify_shutdown(so, SHUT_WR);
4537 		if (cfil_sock_data_pending(&so->so_snd) == 0) {
4538 			soshutdownlock_final(so, SHUT_WR);
4539 		}
4540 	}
4541 	if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) {
4542 		if (cfil_filters_attached(so) == 0) {
4543 			CFIL_LOG(LOG_INFO, "so %llx waking",
4544 			    (uint64_t)VM_KERNEL_ADDRPERM(so));
4545 			wakeup((caddr_t)cfil_info);
4546 		}
4547 	}
4548 
4549 	if (SO_DELAYED_DEAD_GET(so)) {
4550 		// Check to see if all data processed for this socket, if so mark it DEAD now.
4551 		const bool is_dead = cfil_sock_is_dead(so);
4552 		if (is_dead && cfil_info->cfi_debug) {
4553 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: Marked previoulsy delayed socket as DEAD");
4554 		}
4555 	}
4556 	if (SO_DELAYED_TCP_TIME_WAIT_GET(so)) {
4557 		// Check to see if all data processed for this socket, if so handle the TCP time wait now
4558 		const bool is_added = cfil_sock_tcp_add_time_wait(so);
4559 		if (is_added && cfil_info->cfi_debug) {
4560 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: Handled previously delayed socket for TCP time wait");
4561 		}
4562 	}
4563 
4564 	CFIL_INFO_VERIFY(cfil_info);
4565 
4566 	return error;
4567 }
4568 
4569 static int
cfil_service_pending_queue(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing)4570 cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
4571 {
4572 	uint64_t passlen, curlen;
4573 	mbuf_t data;
4574 	unsigned int datalen;
4575 	errno_t error = 0;
4576 	struct cfil_entry *entry;
4577 	struct cfe_buf *entrybuf;
4578 	struct cfil_queue *pending_q;
4579 	struct cfil_entry *iter_entry = NULL;
4580 
4581 	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
4582 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
4583 
4584 	socket_lock_assert_owned(so);
4585 
4586 	entry = &cfil_info->cfi_entries[kcunit - 1];
4587 	if (outgoing) {
4588 		entrybuf = &entry->cfe_snd;
4589 	} else {
4590 		entrybuf = &entry->cfe_rcv;
4591 	}
4592 
4593 	pending_q = &entrybuf->cfe_pending_q;
4594 
4595 	passlen = entrybuf->cfe_pass_offset - pending_q->q_start;
4596 
4597 	if (cfil_queue_empty(pending_q)) {
4598 		for (iter_entry = SLIST_NEXT(entry, cfe_order_link);
4599 		    iter_entry != NULL;
4600 		    iter_entry = SLIST_NEXT(iter_entry, cfe_order_link)) {
4601 			error = cfil_data_service_ctl_q(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, iter_entry), outgoing);
4602 			/* 0 means passed so we can continue */
4603 			if (error != 0) {
4604 				break;
4605 			}
4606 		}
4607 		goto done;
4608 	}
4609 
4610 	/*
4611 	 * Locate the chunks of data that we can pass to the next filter
4612 	 * A data chunk must be on mbuf boundaries
4613 	 */
4614 	curlen = 0;
4615 	while ((data = cfil_queue_first(pending_q)) != NULL) {
4616 		datalen = cfil_data_length(data, NULL, NULL);
4617 
4618 		if (cfil_info->cfi_debug && cfil_log_data) {
4619 			CFIL_LOG(LOG_ERR,
4620 			    "CFIL: SERVICE PENDING-Q: data %llx datalen %u passlen %llu curlen %llu",
4621 			    (uint64_t)VM_KERNEL_ADDRPERM(data), datalen,
4622 			    passlen, curlen);
4623 		}
4624 
4625 		if (curlen + datalen > passlen) {
4626 			break;
4627 		}
4628 
4629 		cfil_queue_remove(pending_q, data, datalen);
4630 
4631 		curlen += datalen;
4632 
4633 		for (iter_entry = SLIST_NEXT(entry, cfe_order_link);
4634 		    iter_entry != NULL;
4635 		    iter_entry = SLIST_NEXT(iter_entry, cfe_order_link)) {
4636 			error = cfil_data_filter(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, iter_entry), outgoing,
4637 			    data, datalen);
4638 			/* 0 means passed so we can continue */
4639 			if (error != 0) {
4640 				break;
4641 			}
4642 		}
4643 		/* When data has passed all filters, re-inject */
4644 		if (error == 0) {
4645 			if (outgoing) {
4646 				cfil_queue_enqueue(
4647 					&cfil_info->cfi_snd.cfi_inject_q,
4648 					data, datalen);
4649 				OSAddAtomic64(datalen,
4650 				    &cfil_stats.cfs_inject_q_out_enqueued);
4651 			} else {
4652 				cfil_queue_enqueue(
4653 					&cfil_info->cfi_rcv.cfi_inject_q,
4654 					data, datalen);
4655 				OSAddAtomic64(datalen,
4656 				    &cfil_stats.cfs_inject_q_in_enqueued);
4657 			}
4658 		}
4659 	}
4660 
4661 done:
4662 	CFIL_INFO_VERIFY(cfil_info);
4663 
4664 	return error;
4665 }
4666 
4667 int
cfil_update_data_offsets(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing,uint64_t pass_offset,uint64_t peek_offset)4668 cfil_update_data_offsets(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
4669     uint64_t pass_offset, uint64_t peek_offset)
4670 {
4671 	errno_t error = 0;
4672 	struct cfil_entry *entry = NULL;
4673 	struct cfe_buf *entrybuf;
4674 	int updated = 0;
4675 
4676 	CFIL_LOG(LOG_INFO, "pass %llu peek %llu", pass_offset, peek_offset);
4677 
4678 	socket_lock_assert_owned(so);
4679 
4680 	if (cfil_info == NULL) {
4681 		CFIL_LOG(LOG_ERR, "so %llx cfil detached",
4682 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
4683 		error = 0;
4684 		goto done;
4685 	} else if (cfil_info->cfi_flags & CFIF_DROP) {
4686 		CFIL_LOG(LOG_ERR, "so %llx drop set",
4687 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
4688 		error = EPIPE;
4689 		goto done;
4690 	}
4691 
4692 	entry = &cfil_info->cfi_entries[kcunit - 1];
4693 	if (outgoing) {
4694 		entrybuf = &entry->cfe_snd;
4695 	} else {
4696 		entrybuf = &entry->cfe_rcv;
4697 	}
4698 
4699 	/* Record updated offsets for this content filter */
4700 	if (pass_offset > entrybuf->cfe_pass_offset) {
4701 		entrybuf->cfe_pass_offset = pass_offset;
4702 
4703 		if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset) {
4704 			entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
4705 		}
4706 		updated = 1;
4707 	} else {
4708 		CFIL_LOG(LOG_INFO, "pass_offset %llu <= cfe_pass_offset %llu",
4709 		    pass_offset, entrybuf->cfe_pass_offset);
4710 	}
4711 	/* Filter does not want or need to see data that's allowed to pass */
4712 	if (peek_offset > entrybuf->cfe_pass_offset &&
4713 	    peek_offset > entrybuf->cfe_peek_offset) {
4714 		entrybuf->cfe_peek_offset = peek_offset;
4715 		updated = 1;
4716 	}
4717 	/* Nothing to do */
4718 	if (updated == 0) {
4719 		goto done;
4720 	}
4721 
4722 	/* Move data held in control queue to pending queue if needed */
4723 	error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing);
4724 	if (error != 0) {
4725 		CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
4726 		    error);
4727 		goto done;
4728 	}
4729 	error = EJUSTRETURN;
4730 
4731 done:
4732 	/*
4733 	 * The filter is effectively detached when pass all from both sides
4734 	 * or when the socket is closed and no more data is waiting
4735 	 * to be delivered to the filter
4736 	 */
4737 	if (entry != NULL &&
4738 	    ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET &&
4739 	    entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) ||
4740 	    ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
4741 	    cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) &&
4742 	    cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) {
4743 		entry->cfe_flags |= CFEF_CFIL_DETACHED;
4744 
4745 		if (cfil_info->cfi_debug) {
4746 			const char * __null_terminated out = "CFIL: OUT - PASSED ALL - DETACH";
4747 			const char * __null_terminated in = "CFIL: IN - PASSED ALL - DETACH";
4748 			cfil_info_log(LOG_ERR, cfil_info, outgoing ? out : in);
4749 		}
4750 
4751 		CFIL_LOG(LOG_INFO, "so %llx detached %u",
4752 		    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
4753 		if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
4754 		    cfil_filters_attached(so) == 0) {
4755 			if (cfil_info->cfi_debug) {
4756 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: WAKING");
4757 			}
4758 			CFIL_LOG(LOG_INFO, "so %llx waking",
4759 			    (uint64_t)VM_KERNEL_ADDRPERM(so));
4760 			wakeup((caddr_t)cfil_info);
4761 		}
4762 	}
4763 	CFIL_INFO_VERIFY(cfil_info);
4764 	CFIL_LOG(LOG_INFO, "return %d", error);
4765 	return error;
4766 }
4767 
4768 /*
4769  * Update pass offset for socket when no data is pending
4770  */
4771 static int
cfil_set_socket_pass_offset(struct socket * so,struct cfil_info * cfil_info,int outgoing)4772 cfil_set_socket_pass_offset(struct socket *so, struct cfil_info *cfil_info, int outgoing)
4773 {
4774 	struct cfi_buf *cfi_buf;
4775 	struct cfil_entry *entry;
4776 	struct cfe_buf *entrybuf;
4777 	uint32_t kcunit;
4778 	uint64_t pass_offset = 0;
4779 	boolean_t first = true;
4780 
4781 	if (cfil_info == NULL) {
4782 		return 0;
4783 	}
4784 
4785 	if (cfil_info->cfi_debug && cfil_log_data) {
4786 		CFIL_LOG(LOG_ERR, "so %llx outgoing %d",
4787 		    (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
4788 	}
4789 
4790 	socket_lock_assert_owned(so);
4791 
4792 	if (outgoing) {
4793 		cfi_buf = &cfil_info->cfi_snd;
4794 	} else {
4795 		cfi_buf = &cfil_info->cfi_rcv;
4796 	}
4797 
4798 	if (cfil_info->cfi_debug && cfil_log_data) {
4799 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx, sockID %llu <%llx>> outgoing %d cfi_pending_first %llu cfi_pending_last %llu",
4800 		    (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, cfil_info->cfi_sock_id, outgoing,
4801 		    cfi_buf->cfi_pending_first, cfi_buf->cfi_pending_last);
4802 	}
4803 
4804 	if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) {
4805 		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4806 			entry = &cfil_info->cfi_entries[kcunit - 1];
4807 
4808 			/* Are we attached to a filter? */
4809 			if (entry->cfe_filter == NULL) {
4810 				continue;
4811 			}
4812 
4813 			if (outgoing) {
4814 				entrybuf = &entry->cfe_snd;
4815 			} else {
4816 				entrybuf = &entry->cfe_rcv;
4817 			}
4818 
4819 			// Keep track of the smallest pass_offset among filters.
4820 			if (first == true ||
4821 			    entrybuf->cfe_pass_offset < pass_offset) {
4822 				pass_offset = entrybuf->cfe_pass_offset;
4823 				first = false;
4824 			}
4825 		}
4826 		cfi_buf->cfi_pass_offset = pass_offset;
4827 	}
4828 
4829 	if (cfil_info->cfi_debug && cfil_log_data) {
4830 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx, sockID %llu <%llx>>, cfi_pass_offset %llu",
4831 		    (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, cfil_info->cfi_sock_id, cfi_buf->cfi_pass_offset);
4832 	}
4833 
4834 	return 0;
4835 }
4836 
4837 int
cfil_action_data_pass(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit,int outgoing,uint64_t pass_offset,uint64_t peek_offset)4838 cfil_action_data_pass(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
4839     uint64_t pass_offset, uint64_t peek_offset)
4840 {
4841 	errno_t error = 0;
4842 
4843 	CFIL_LOG(LOG_INFO, "");
4844 
4845 	socket_lock_assert_owned(so);
4846 
4847 	error = cfil_acquire_sockbuf(so, cfil_info, outgoing);
4848 	if (error != 0) {
4849 		CFIL_LOG(LOG_INFO, "so %llx %s dropped",
4850 		    (uint64_t)VM_KERNEL_ADDRPERM(so),
4851 		    outgoing ? "out" : "in");
4852 		goto release;
4853 	}
4854 
4855 	error = cfil_update_data_offsets(so, cfil_info, kcunit, outgoing,
4856 	    pass_offset, peek_offset);
4857 
4858 	cfil_service_inject_queue(so, cfil_info, outgoing);
4859 
4860 	cfil_set_socket_pass_offset(so, cfil_info, outgoing);
4861 release:
4862 	CFIL_INFO_VERIFY(cfil_info);
4863 	cfil_release_sockbuf(so, outgoing);
4864 
4865 	return error;
4866 }
4867 
4868 
4869 static void
cfil_flush_queues(struct socket * so,struct cfil_info * cfil_info)4870 cfil_flush_queues(struct socket *so, struct cfil_info *cfil_info)
4871 {
4872 	struct cfil_entry *entry;
4873 	int kcunit;
4874 	uint64_t drained;
4875 
4876 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL) {
4877 		goto done;
4878 	}
4879 
4880 	socket_lock_assert_owned(so);
4881 
4882 	/*
4883 	 * Flush the output queues and ignore errors as long as
4884 	 * we are attached
4885 	 */
4886 	(void) cfil_acquire_sockbuf(so, cfil_info, 1);
4887 	if (cfil_info != NULL) {
4888 		drained = 0;
4889 		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4890 			entry = &cfil_info->cfi_entries[kcunit - 1];
4891 
4892 			drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
4893 			drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
4894 		}
4895 		drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
4896 
4897 		if (drained) {
4898 			if (cfil_info->cfi_flags & CFIF_DROP) {
4899 				OSIncrementAtomic(
4900 					&cfil_stats.cfs_flush_out_drop);
4901 			} else {
4902 				OSIncrementAtomic(
4903 					&cfil_stats.cfs_flush_out_close);
4904 			}
4905 		}
4906 	}
4907 	cfil_release_sockbuf(so, 1);
4908 
4909 	/*
4910 	 * Flush the input queues
4911 	 */
4912 	(void) cfil_acquire_sockbuf(so, cfil_info, 0);
4913 	if (cfil_info != NULL) {
4914 		drained = 0;
4915 		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4916 			entry = &cfil_info->cfi_entries[kcunit - 1];
4917 
4918 			drained += cfil_queue_drain(
4919 				&entry->cfe_rcv.cfe_ctl_q);
4920 			drained += cfil_queue_drain(
4921 				&entry->cfe_rcv.cfe_pending_q);
4922 		}
4923 		drained += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
4924 
4925 		if (drained) {
4926 			if (cfil_info->cfi_flags & CFIF_DROP) {
4927 				OSIncrementAtomic(
4928 					&cfil_stats.cfs_flush_in_drop);
4929 			} else {
4930 				OSIncrementAtomic(
4931 					&cfil_stats.cfs_flush_in_close);
4932 			}
4933 		}
4934 	}
4935 	cfil_release_sockbuf(so, 0);
4936 done:
4937 	CFIL_INFO_VERIFY(cfil_info);
4938 }
4939 
4940 int
cfil_action_drop(struct socket * so,struct cfil_info * cfil_info,uint32_t kcunit)4941 cfil_action_drop(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit)
4942 {
4943 	errno_t error = 0;
4944 	struct cfil_entry *entry;
4945 	struct proc *p;
4946 
4947 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL) {
4948 		goto done;
4949 	}
4950 
4951 	socket_lock_assert_owned(so);
4952 
4953 	entry = &cfil_info->cfi_entries[kcunit - 1];
4954 
4955 	/* Are we attached to the filter? */
4956 	if (entry->cfe_filter == NULL) {
4957 		goto done;
4958 	}
4959 
4960 	cfil_info->cfi_flags |= CFIF_DROP;
4961 
4962 	p = current_proc();
4963 
4964 	/*
4965 	 * Force the socket to be marked defunct
4966 	 * (forcing fixed along with rdar://19391339)
4967 	 */
4968 	if (so->so_flow_db == NULL) {
4969 		error = sosetdefunct(p, so,
4970 		    SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
4971 		    FALSE);
4972 
4973 		/* Flush the socket buffer and disconnect */
4974 		if (error == 0) {
4975 			error = sodefunct(p, so,
4976 			    SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
4977 		}
4978 	}
4979 
4980 	/* The filter is done, mark as detached */
4981 	entry->cfe_flags |= CFEF_CFIL_DETACHED;
4982 
4983 	if (cfil_info->cfi_debug) {
4984 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: DROP - DETACH");
4985 	}
4986 
4987 	CFIL_LOG(LOG_INFO, "so %llx detached %u",
4988 	    (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
4989 
4990 	/* Pending data needs to go */
4991 	cfil_flush_queues(so, cfil_info);
4992 
4993 	if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) {
4994 		if (cfil_filters_attached(so) == 0) {
4995 			CFIL_LOG(LOG_INFO, "so %llx waking",
4996 			    (uint64_t)VM_KERNEL_ADDRPERM(so));
4997 			wakeup((caddr_t)cfil_info);
4998 		}
4999 	}
5000 done:
5001 	return error;
5002 }
5003 
5004 int
cfil_action_bless_client(uint32_t kcunit,struct cfil_msg_hdr * msghdr)5005 cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
5006 {
5007 	errno_t error = 0;
5008 	struct cfil_info * __single cfil_info = NULL;
5009 
5010 	bool cfil_attached = false;
5011 	struct cfil_msg_bless_client *blessmsg = (struct cfil_msg_bless_client *)msghdr;
5012 
5013 	// Search and lock socket
5014 	struct socket *so = cfil_socket_from_client_uuid(blessmsg->cfb_client_uuid, &cfil_attached);
5015 	if (so == NULL) {
5016 		error = ENOENT;
5017 	} else {
5018 		// The client gets a pass automatically
5019 		cfil_info = (so->so_flow_db != NULL) ?
5020 		    soflow_db_get_feature_context(so->so_flow_db, msghdr->cfm_sock_id) : so->so_cfil;
5021 
5022 		if (cfil_attached) {
5023 			if (cfil_info != NULL && cfil_info->cfi_debug) {
5024 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: VERDICT RECEIVED: BLESS");
5025 			}
5026 			cfil_sock_received_verdict(so);
5027 			(void)cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
5028 			(void)cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
5029 		} else {
5030 			so->so_flags1 |= SOF1_CONTENT_FILTER_SKIP;
5031 		}
5032 		socket_unlock(so, 1);
5033 	}
5034 
5035 	return error;
5036 }
5037 
5038 int
cfil_action_set_crypto_key(uint32_t kcunit,struct cfil_msg_hdr * msghdr)5039 cfil_action_set_crypto_key(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
5040 {
5041 	struct content_filter *cfc = NULL;
5042 	cfil_crypto_state_t crypto_state = NULL;
5043 	struct cfil_msg_set_crypto_key *keymsg = (struct cfil_msg_set_crypto_key *)msghdr;
5044 
5045 	CFIL_LOG(LOG_NOTICE, "");
5046 
5047 	if (kcunit > MAX_CONTENT_FILTER) {
5048 		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
5049 		    kcunit, MAX_CONTENT_FILTER);
5050 		return EINVAL;
5051 	}
5052 	crypto_state = cfil_crypto_init_client((uint8_t *)keymsg->crypto_key);
5053 	if (crypto_state == NULL) {
5054 		CFIL_LOG(LOG_ERR, "failed to initialize crypto state for unit %u)",
5055 		    kcunit);
5056 		return EINVAL;
5057 	}
5058 
5059 	cfil_rw_lock_exclusive(&cfil_lck_rw);
5060 
5061 	cfc = content_filters[kcunit - 1];
5062 	if (cfc->cf_kcunit != kcunit) {
5063 		CFIL_LOG(LOG_ERR, "bad unit info %u)",
5064 		    kcunit);
5065 		cfil_rw_unlock_exclusive(&cfil_lck_rw);
5066 		cfil_crypto_cleanup_state(crypto_state);
5067 		return EINVAL;
5068 	}
5069 	if (cfc->cf_crypto_state != NULL) {
5070 		cfil_crypto_cleanup_state(cfc->cf_crypto_state);
5071 		cfc->cf_crypto_state = NULL;
5072 	}
5073 	cfc->cf_crypto_state = crypto_state;
5074 
5075 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
5076 	return 0;
5077 }
5078 
5079 static int
cfil_update_entry_offsets(struct socket * so,struct cfil_info * cfil_info,int outgoing,unsigned int datalen)5080 cfil_update_entry_offsets(struct socket *so, struct cfil_info *cfil_info, int outgoing, unsigned int datalen)
5081 {
5082 	struct cfil_entry *entry;
5083 	struct cfe_buf *entrybuf;
5084 	uint32_t kcunit;
5085 
5086 	CFIL_LOG(LOG_INFO, "so %llx outgoing %d datalen %u",
5087 	    (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen);
5088 
5089 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5090 		entry = &cfil_info->cfi_entries[kcunit - 1];
5091 
5092 		/* Are we attached to the filter? */
5093 		if (entry->cfe_filter == NULL) {
5094 			continue;
5095 		}
5096 
5097 		if (outgoing) {
5098 			entrybuf = &entry->cfe_snd;
5099 		} else {
5100 			entrybuf = &entry->cfe_rcv;
5101 		}
5102 
5103 		entrybuf->cfe_ctl_q.q_start += datalen;
5104 		if (entrybuf->cfe_pass_offset < entrybuf->cfe_ctl_q.q_start) {
5105 			entrybuf->cfe_pass_offset = entrybuf->cfe_ctl_q.q_start;
5106 		}
5107 		entrybuf->cfe_peeked = entrybuf->cfe_ctl_q.q_start;
5108 		if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset) {
5109 			entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
5110 		}
5111 
5112 		entrybuf->cfe_ctl_q.q_end += datalen;
5113 
5114 		entrybuf->cfe_pending_q.q_start += datalen;
5115 		entrybuf->cfe_pending_q.q_end += datalen;
5116 	}
5117 	CFIL_INFO_VERIFY(cfil_info);
5118 	return 0;
5119 }
5120 
5121 int
cfil_data_common(struct socket * so,struct cfil_info * cfil_info,int outgoing,struct sockaddr * to,struct mbuf * data,struct mbuf * control,uint32_t flags)5122 cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, struct sockaddr *to,
5123     struct mbuf *data, struct mbuf *control, uint32_t flags)
5124 {
5125 #pragma unused(to, control, flags)
5126 	errno_t error = 0;
5127 	unsigned int datalen;
5128 	int mbcnt = 0;
5129 	int mbnum = 0;
5130 	int kcunit;
5131 	struct cfi_buf *cfi_buf;
5132 	struct mbuf *chain = NULL;
5133 
5134 	if (cfil_info == NULL) {
5135 		CFIL_LOG(LOG_ERR, "so %llx cfil detached",
5136 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5137 		error = 0;
5138 		goto done;
5139 	} else if (cfil_info->cfi_flags & CFIF_DROP) {
5140 		CFIL_LOG(LOG_ERR, "so %llx drop set",
5141 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5142 		error = EPIPE;
5143 		goto done;
5144 	}
5145 
5146 	datalen = cfil_data_length(data, &mbcnt, &mbnum);
5147 
5148 	if (datalen == 0) {
5149 		error = 0;
5150 		goto done;
5151 	}
5152 
5153 	if (outgoing) {
5154 		cfi_buf = &cfil_info->cfi_snd;
5155 		cfil_info->cfi_byte_outbound_count += datalen;
5156 	} else {
5157 		cfi_buf = &cfil_info->cfi_rcv;
5158 		cfil_info->cfi_byte_inbound_count += datalen;
5159 	}
5160 
5161 	cfi_buf->cfi_pending_last += datalen;
5162 	cfi_buf->cfi_pending_mbcnt += mbcnt;
5163 	cfi_buf->cfi_pending_mbnum += mbnum;
5164 
5165 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5166 		if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max ||
5167 		    cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) {
5168 			cfi_buf->cfi_tail_drop_cnt++;
5169 			cfi_buf->cfi_pending_mbcnt -= mbcnt;
5170 			cfi_buf->cfi_pending_mbnum -= mbnum;
5171 			return EPIPE;
5172 		}
5173 	}
5174 
5175 	cfil_info_buf_verify(cfi_buf);
5176 
5177 	if (cfil_info->cfi_debug && cfil_log_data) {
5178 		CFIL_LOG(LOG_ERR, "CFIL: QUEUEING DATA: <so %llx> %s: data %llx len %u flags 0x%x nextpkt %llx - cfi_pending_last %llu cfi_pending_mbcnt %u   cfi_pass_offset %llu",
5179 		    (uint64_t)VM_KERNEL_ADDRPERM(so),
5180 		    outgoing ? "OUT" : "IN",
5181 		    (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
5182 		    (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt),
5183 		    cfi_buf->cfi_pending_last,
5184 		    cfi_buf->cfi_pending_mbcnt,
5185 		    cfi_buf->cfi_pass_offset);
5186 	}
5187 
5188 	/* Fast path when below pass offset */
5189 	if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) {
5190 		cfil_update_entry_offsets(so, cfil_info, outgoing, datalen);
5191 		if (cfil_info->cfi_debug && cfil_log_data) {
5192 			CFIL_LOG(LOG_ERR, "CFIL: QUEUEING DATA: <so %llx> %s: FAST PATH",
5193 			    (uint64_t)VM_KERNEL_ADDRPERM(so),
5194 			    outgoing ? "OUT" : "IN");
5195 		}
5196 		// For incoming packets, see if we need to strip off ip header
5197 		if (!outgoing && NEED_DGRAM_FLOW_TRACKING(so) && OPTIONAL_IP_HEADER(so)) {
5198 			cfil_strip_ip_header(cfil_info, data, so);
5199 		}
5200 	} else {
5201 		struct cfil_entry *iter_entry;
5202 		SLIST_FOREACH(iter_entry, &cfil_info->cfi_ordered_entries, cfe_order_link) {
5203 			// Is cfil attached to this filter?
5204 			kcunit = CFI_ENTRY_KCUNIT(cfil_info, iter_entry);
5205 			if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) {
5206 				if (NEED_DGRAM_FLOW_TRACKING(so) && chain == NULL) {
5207 					/* Datagrams only:
5208 					 * Chain addr (incoming only TDB), control (optional) and data into one chain.
5209 					 * This full chain will be reinjected into socket after recieving verdict.
5210 					 */
5211 					(void) cfil_dgram_save_socket_state(cfil_info, data);
5212 					chain = sbconcat_mbufs(NULL, outgoing ? NULL : to, data, control);
5213 					if (chain == NULL) {
5214 						return ENOBUFS;
5215 					}
5216 					data = chain;
5217 				}
5218 				error = cfil_data_filter(so, cfil_info, kcunit, outgoing, data,
5219 				    datalen);
5220 			}
5221 			/* 0 means passed so continue with next filter */
5222 			if (error != 0) {
5223 				break;
5224 			}
5225 		}
5226 	}
5227 
5228 	/* Move cursor if no filter claimed the data */
5229 	if (error == 0) {
5230 		cfi_buf->cfi_pending_first += datalen;
5231 		cfi_buf->cfi_pending_mbcnt -= mbcnt;
5232 		cfi_buf->cfi_pending_mbnum -= mbnum;
5233 		cfil_info_buf_verify(cfi_buf);
5234 	}
5235 done:
5236 	CFIL_INFO_VERIFY(cfil_info);
5237 
5238 	return error;
5239 }
5240 
5241 /*
5242  * Callback from socket layer sosendxxx()
5243  */
5244 int
cfil_sock_data_out(struct socket * so,struct sockaddr * to,struct mbuf * data,struct mbuf * control,uint32_t flags,struct soflow_hash_entry * flow_entry)5245 cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
5246     struct mbuf *data, struct mbuf *control, uint32_t flags, struct soflow_hash_entry *flow_entry)
5247 {
5248 	int error = 0;
5249 	int new_filter_control_unit = 0;
5250 
5251 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5252 		return cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags, flow_entry);
5253 	}
5254 
5255 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5256 		/* Drop pre-existing TCP sockets if filter is enabled now */
5257 		if (!DO_PRESERVE_CONNECTIONS && cfil_active_count > 0 && !SKIP_FILTER_FOR_TCP_SOCKET(so)) {
5258 			new_filter_control_unit = necp_socket_get_content_filter_control_unit(so);
5259 			if (new_filter_control_unit > 0) {
5260 				CFIL_LOG(LOG_NOTICE, "CFIL: TCP(OUT) <so %llx> - filter state changed - dropped pre-existing flow", (uint64_t)VM_KERNEL_ADDRPERM(so));
5261 				return EPIPE;
5262 			}
5263 		}
5264 		return 0;
5265 	}
5266 
5267 	/* Drop pre-existing TCP sockets when filter state changed */
5268 	new_filter_control_unit = necp_socket_get_content_filter_control_unit(so);
5269 	if (new_filter_control_unit > 0 && new_filter_control_unit != so->so_cfil->cfi_filter_control_unit && !SKIP_FILTER_FOR_TCP_SOCKET(so)) {
5270 		if (DO_PRESERVE_CONNECTIONS || (so->so_cfil->cfi_filter_policy_gencount == necp_socket_get_policy_gencount(so))) {
5271 			// CFIL state has changed, but preserve the flow intentionally or if this is not a result of NECP policy change
5272 			so->so_cfil->cfi_filter_control_unit = new_filter_control_unit;
5273 		} else {
5274 			CFIL_LOG(LOG_NOTICE, "CFIL: TCP(OUT) <so %llx> - filter state changed - dropped pre-existing flow (old state 0x%x new state 0x%x)",
5275 			    (uint64_t)VM_KERNEL_ADDRPERM(so),
5276 			    so->so_cfil->cfi_filter_control_unit, new_filter_control_unit);
5277 			return EPIPE;
5278 		}
5279 	}
5280 
5281 	/*
5282 	 * Pass initial data for TFO.
5283 	 */
5284 	if (IS_INITIAL_TFO_DATA(so)) {
5285 		return 0;
5286 	}
5287 
5288 	socket_lock_assert_owned(so);
5289 
5290 	if (so->so_cfil->cfi_flags & CFIF_DROP) {
5291 		CFIL_LOG(LOG_ERR, "so %llx drop set",
5292 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5293 		return EPIPE;
5294 	}
5295 	if (control != NULL) {
5296 		CFIL_LOG(LOG_ERR, "so %llx control",
5297 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5298 		OSIncrementAtomic(&cfil_stats.cfs_data_out_control);
5299 	}
5300 	if ((flags & MSG_OOB)) {
5301 		CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
5302 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5303 		OSIncrementAtomic(&cfil_stats.cfs_data_out_oob);
5304 	}
5305 	/*
5306 	 * Abort if socket is defunct.
5307 	 */
5308 	if (so->so_flags & SOF_DEFUNCT) {
5309 		return EPIPE;
5310 	}
5311 	if ((so->so_snd.sb_flags & SB_LOCK) == 0) {
5312 		panic("so %p SB_LOCK not set", so);
5313 	}
5314 
5315 	if (so->so_snd.sb_cfil_thread != NULL) {
5316 		panic("%s sb_cfil_thread %p not NULL", __func__,
5317 		    so->so_snd.sb_cfil_thread);
5318 	}
5319 
5320 	error = cfil_data_common(so, so->so_cfil, 1, to, data, control, flags);
5321 
5322 	return error;
5323 }
5324 
5325 /*
5326  * Callback from socket layer sbappendxxx()
5327  */
5328 int
cfil_sock_data_in(struct socket * so,struct sockaddr * from,struct mbuf * data,struct mbuf * control,uint32_t flags,struct soflow_hash_entry * flow_entry)5329 cfil_sock_data_in(struct socket *so, struct sockaddr *from,
5330     struct mbuf *data, struct mbuf *control, uint32_t flags, struct soflow_hash_entry *flow_entry)
5331 {
5332 	int error = 0;
5333 	int new_filter_control_unit = 0;
5334 
5335 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5336 		return cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags, flow_entry);
5337 	}
5338 
5339 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5340 		/* Drop pre-existing TCP sockets if filter is enabled now */
5341 		if (!DO_PRESERVE_CONNECTIONS && cfil_active_count > 0 && !SKIP_FILTER_FOR_TCP_SOCKET(so)) {
5342 			new_filter_control_unit = necp_socket_get_content_filter_control_unit(so);
5343 			if (new_filter_control_unit > 0) {
5344 				CFIL_LOG(LOG_NOTICE, "CFIL: TCP(IN) <so %llx> - filter state changed - dropped pre-existing flow", (uint64_t)VM_KERNEL_ADDRPERM(so));
5345 				return EPIPE;
5346 			}
5347 		}
5348 		return 0;
5349 	}
5350 
5351 	/* Drop pre-existing TCP sockets when filter state changed */
5352 	new_filter_control_unit = necp_socket_get_content_filter_control_unit(so);
5353 	if (new_filter_control_unit > 0 && new_filter_control_unit != so->so_cfil->cfi_filter_control_unit && !SKIP_FILTER_FOR_TCP_SOCKET(so)) {
5354 		if (DO_PRESERVE_CONNECTIONS || (so->so_cfil->cfi_filter_policy_gencount == necp_socket_get_policy_gencount(so))) {
5355 			// CFIL state has changed, but preserve the flow intentionally or if this is not a result of NECP policy change
5356 			so->so_cfil->cfi_filter_control_unit = new_filter_control_unit;
5357 		} else {
5358 			CFIL_LOG(LOG_NOTICE, "CFIL: TCP(IN) <so %llx> - filter state changed - dropped pre-existing flow (old state 0x%x new state 0x%x)",
5359 			    (uint64_t)VM_KERNEL_ADDRPERM(so),
5360 			    so->so_cfil->cfi_filter_control_unit, new_filter_control_unit);
5361 			return EPIPE;
5362 		}
5363 	}
5364 
5365 	/*
5366 	 * Pass initial data for TFO.
5367 	 */
5368 	if (IS_INITIAL_TFO_DATA(so)) {
5369 		return 0;
5370 	}
5371 
5372 	socket_lock_assert_owned(so);
5373 
5374 	if (so->so_cfil->cfi_flags & CFIF_DROP) {
5375 		CFIL_LOG(LOG_ERR, "so %llx drop set",
5376 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5377 		return EPIPE;
5378 	}
5379 	if (control != NULL) {
5380 		CFIL_LOG(LOG_ERR, "so %llx control",
5381 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5382 		OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
5383 	}
5384 	if (data->m_type == MT_OOBDATA) {
5385 		CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
5386 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5387 		OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
5388 	}
5389 	error = cfil_data_common(so, so->so_cfil, 0, from, data, control, flags);
5390 
5391 	return error;
5392 }
5393 
5394 /*
5395  * Callback from socket layer soshutdownxxx()
5396  *
5397  * We may delay the shutdown write if there's outgoing data in process.
5398  *
5399  * There is no point in delaying the shutdown read because the process
5400  * indicated that it does not want to read anymore data.
5401  */
5402 int
cfil_sock_shutdown(struct socket * so,int * how)5403 cfil_sock_shutdown(struct socket *so, int *how)
5404 {
5405 	int error = 0;
5406 
5407 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5408 		return cfil_sock_udp_shutdown(so, how);
5409 	}
5410 
5411 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5412 		goto done;
5413 	}
5414 
5415 	socket_lock_assert_owned(so);
5416 
5417 	CFIL_LOG(LOG_INFO, "so %llx how %d",
5418 	    (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
5419 
5420 	/*
5421 	 * Check the state of the socket before the content filter
5422 	 */
5423 	if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
5424 		/* read already shut down */
5425 		error = ENOTCONN;
5426 		goto done;
5427 	}
5428 	if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
5429 		/* write already shut down */
5430 		error = ENOTCONN;
5431 		goto done;
5432 	}
5433 
5434 	if ((so->so_cfil->cfi_flags & CFIF_DROP) != 0) {
5435 		CFIL_LOG(LOG_ERR, "so %llx drop set",
5436 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5437 		goto done;
5438 	}
5439 
5440 	/*
5441 	 * shutdown read: SHUT_RD or SHUT_RDWR
5442 	 */
5443 	if (*how != SHUT_WR) {
5444 		if (so->so_cfil->cfi_flags & CFIF_SHUT_RD) {
5445 			error = ENOTCONN;
5446 			goto done;
5447 		}
5448 		so->so_cfil->cfi_flags |= CFIF_SHUT_RD;
5449 		cfil_sock_notify_shutdown(so, SHUT_RD);
5450 	}
5451 	/*
5452 	 * shutdown write: SHUT_WR or SHUT_RDWR
5453 	 */
5454 	if (*how != SHUT_RD) {
5455 		if (so->so_cfil->cfi_flags & CFIF_SHUT_WR) {
5456 			error = ENOTCONN;
5457 			goto done;
5458 		}
5459 		so->so_cfil->cfi_flags |= CFIF_SHUT_WR;
5460 		cfil_sock_notify_shutdown(so, SHUT_WR);
5461 		/*
5462 		 * When outgoing data is pending, we delay the shutdown at the
5463 		 * protocol level until the content filters give the final
5464 		 * verdict on the pending data.
5465 		 */
5466 		if (cfil_sock_data_pending(&so->so_snd) != 0) {
5467 			/*
5468 			 * When shutting down the read and write sides at once
5469 			 * we can proceed to the final shutdown of the read
5470 			 * side. Otherwise, we just return.
5471 			 */
5472 			if (*how == SHUT_WR) {
5473 				error = EJUSTRETURN;
5474 			} else if (*how == SHUT_RDWR) {
5475 				*how = SHUT_RD;
5476 			}
5477 		}
5478 	}
5479 done:
5480 	return error;
5481 }
5482 
5483 /*
5484  * This is called when the socket is closed and there is no more
5485  * opportunity for filtering
5486  */
5487 void
cfil_sock_is_closed(struct socket * so)5488 cfil_sock_is_closed(struct socket *so)
5489 {
5490 	errno_t error = 0;
5491 	int kcunit;
5492 
5493 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5494 		cfil_sock_udp_is_closed(so);
5495 		return;
5496 	}
5497 
5498 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5499 		return;
5500 	}
5501 
5502 	CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
5503 
5504 	socket_lock_assert_owned(so);
5505 
5506 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5507 		/* Let the filters know of the closing */
5508 		error = cfil_dispatch_closed_event(so, so->so_cfil, kcunit);
5509 	}
5510 
5511 	/* Last chance to push passed data out */
5512 	error = cfil_acquire_sockbuf(so, so->so_cfil, 1);
5513 	if (error == 0) {
5514 		cfil_service_inject_queue(so, so->so_cfil, 1);
5515 	}
5516 	cfil_release_sockbuf(so, 1);
5517 
5518 	if (so->so_cfil != NULL) {
5519 		so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED;
5520 	}
5521 
5522 	/* Pending data needs to go */
5523 	cfil_flush_queues(so, so->so_cfil);
5524 
5525 	CFIL_INFO_VERIFY(so->so_cfil);
5526 }
5527 
5528 /*
5529  * This is called when the socket is disconnected so let the filters
5530  * know about the disconnection and that no more data will come
5531  *
5532  * The how parameter has the same values as soshutown()
5533  */
5534 void
cfil_sock_notify_shutdown(struct socket * so,int how)5535 cfil_sock_notify_shutdown(struct socket *so, int how)
5536 {
5537 	errno_t error = 0;
5538 	int kcunit;
5539 
5540 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5541 		cfil_sock_udp_notify_shutdown(so, how, 0, 0);
5542 		return;
5543 	}
5544 
5545 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5546 		return;
5547 	}
5548 
5549 	CFIL_LOG(LOG_INFO, "so %llx how %d",
5550 	    (uint64_t)VM_KERNEL_ADDRPERM(so), how);
5551 
5552 	socket_lock_assert_owned(so);
5553 
5554 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5555 		/* Disconnect incoming side */
5556 		if (how != SHUT_WR) {
5557 			error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 0);
5558 		}
5559 		/* Disconnect outgoing side */
5560 		if (how != SHUT_RD) {
5561 			error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 1);
5562 		}
5563 	}
5564 }
5565 
5566 static int
cfil_filters_attached(struct socket * so)5567 cfil_filters_attached(struct socket *so)
5568 {
5569 	struct cfil_entry *entry;
5570 	uint32_t kcunit;
5571 	int attached = 0;
5572 
5573 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5574 		return cfil_filters_udp_attached(so, FALSE);
5575 	}
5576 
5577 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5578 		return 0;
5579 	}
5580 
5581 	socket_lock_assert_owned(so);
5582 
5583 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5584 		entry = &so->so_cfil->cfi_entries[kcunit - 1];
5585 
5586 		/* Are we attached to the filter? */
5587 		if (entry->cfe_filter == NULL) {
5588 			continue;
5589 		}
5590 		if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
5591 			continue;
5592 		}
5593 		if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0) {
5594 			continue;
5595 		}
5596 		attached = 1;
5597 		break;
5598 	}
5599 
5600 	return attached;
5601 }
5602 
5603 /*
5604  * This is called when the socket is closed and we are waiting for
5605  * the filters to gives the final pass or drop
5606  */
5607 void
cfil_sock_close_wait(struct socket * so)5608 cfil_sock_close_wait(struct socket *so)
5609 {
5610 	lck_mtx_t *mutex_held;
5611 	struct timespec ts;
5612 	int error;
5613 
5614 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5615 		cfil_sock_udp_close_wait(so);
5616 		return;
5617 	}
5618 
5619 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5620 		return;
5621 	}
5622 
5623 	// This flow does not need to wait for close ack from user-space
5624 	if (IS_NO_CLOSE_WAIT(so->so_cfil)) {
5625 		if (so->so_cfil->cfi_debug) {
5626 			cfil_info_log(LOG_ERR, so->so_cfil, "CFIL: SKIP CLOSE WAIT");
5627 		}
5628 		return;
5629 	}
5630 
5631 	CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
5632 
5633 	if (so->so_proto->pr_getlock != NULL) {
5634 		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
5635 	} else {
5636 		mutex_held = so->so_proto->pr_domain->dom_mtx;
5637 	}
5638 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
5639 
5640 	while (cfil_filters_attached(so)) {
5641 		/*
5642 		 * Notify the filters we are going away so they can detach
5643 		 */
5644 		cfil_sock_notify_shutdown(so, SHUT_RDWR);
5645 
5646 		/*
5647 		 * Make sure we need to wait after the filter are notified
5648 		 * of the disconnection
5649 		 */
5650 		if (cfil_filters_attached(so) == 0) {
5651 			break;
5652 		}
5653 
5654 		CFIL_LOG(LOG_INFO, "so %llx waiting",
5655 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
5656 
5657 		ts.tv_sec = cfil_close_wait_timeout / 1000;
5658 		ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
5659 		    NSEC_PER_USEC * 1000;
5660 
5661 		OSIncrementAtomic(&cfil_stats.cfs_close_wait);
5662 		so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT;
5663 		error = msleep((caddr_t)so->so_cfil, mutex_held,
5664 		    PSOCK | PCATCH, "cfil_sock_close_wait", &ts);
5665 
5666 		// Woke up from sleep, validate if cfil_info is still valid
5667 		if (so->so_cfil == NULL) {
5668 			// cfil_info is not valid, do not continue
5669 			return;
5670 		}
5671 
5672 		so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT;
5673 
5674 		CFIL_LOG(LOG_NOTICE, "so %llx timed out %d",
5675 		    (uint64_t)VM_KERNEL_ADDRPERM(so), (error != 0));
5676 
5677 		/*
5678 		 * Force close in case of timeout
5679 		 */
5680 		if (error != 0) {
5681 			OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
5682 			break;
5683 		}
5684 	}
5685 }
5686 
5687 /*
5688  * Returns the size of the data held by the content filter by using
5689  */
5690 int32_t
cfil_sock_data_pending(struct sockbuf * sb)5691 cfil_sock_data_pending(struct sockbuf *sb)
5692 {
5693 	struct socket *so = sb->sb_so;
5694 	uint64_t pending = 0;
5695 
5696 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5697 		return cfil_sock_udp_data_pending(sb, FALSE);
5698 	}
5699 
5700 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) {
5701 		struct cfi_buf *cfi_buf;
5702 
5703 		socket_lock_assert_owned(so);
5704 
5705 		if ((sb->sb_flags & SB_RECV) == 0) {
5706 			cfi_buf = &so->so_cfil->cfi_snd;
5707 		} else {
5708 			cfi_buf = &so->so_cfil->cfi_rcv;
5709 		}
5710 
5711 		pending = cfi_buf->cfi_pending_last -
5712 		    cfi_buf->cfi_pending_first;
5713 
5714 		/*
5715 		 * If we are limited by the "chars of mbufs used" roughly
5716 		 * adjust so we won't overcommit
5717 		 */
5718 		if (pending > (uint64_t)cfi_buf->cfi_pending_mbcnt) {
5719 			pending = cfi_buf->cfi_pending_mbcnt;
5720 		}
5721 	}
5722 
5723 	VERIFY(pending < INT32_MAX);
5724 
5725 	return (int32_t)(pending);
5726 }
5727 
5728 /*
5729  * Return the socket buffer space used by data being held by content filters
5730  * so processes won't clog the socket buffer
5731  */
5732 int32_t
cfil_sock_data_space(struct sockbuf * sb)5733 cfil_sock_data_space(struct sockbuf *sb)
5734 {
5735 	struct socket *so = sb->sb_so;
5736 	uint64_t pending = 0;
5737 
5738 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5739 		return cfil_sock_udp_data_pending(sb, TRUE);
5740 	}
5741 
5742 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL &&
5743 	    so->so_snd.sb_cfil_thread != current_thread()) {
5744 		struct cfi_buf *cfi_buf;
5745 
5746 		socket_lock_assert_owned(so);
5747 
5748 		if ((sb->sb_flags & SB_RECV) == 0) {
5749 			cfi_buf = &so->so_cfil->cfi_snd;
5750 		} else {
5751 			cfi_buf = &so->so_cfil->cfi_rcv;
5752 		}
5753 
5754 		pending = cfi_buf->cfi_pending_last -
5755 		    cfi_buf->cfi_pending_first;
5756 
5757 		/*
5758 		 * If we are limited by the "chars of mbufs used" roughly
5759 		 * adjust so we won't overcommit
5760 		 */
5761 		if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending) {
5762 			pending = cfi_buf->cfi_pending_mbcnt;
5763 		}
5764 
5765 		VERIFY(pending < INT32_MAX);
5766 	}
5767 
5768 	return (int32_t)(pending);
5769 }
5770 
5771 /*
5772  * A callback from the socket and protocol layer when data becomes
5773  * available in the socket buffer to give a chance for the content filter
5774  * to re-inject data that was held back
5775  */
5776 void
cfil_sock_buf_update(struct sockbuf * sb)5777 cfil_sock_buf_update(struct sockbuf *sb)
5778 {
5779 	int outgoing;
5780 	int error;
5781 	struct socket *so = sb->sb_so;
5782 
5783 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
5784 		cfil_sock_udp_buf_update(sb);
5785 		return;
5786 	}
5787 
5788 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
5789 		return;
5790 	}
5791 
5792 	if (!cfil_sbtrim) {
5793 		return;
5794 	}
5795 
5796 	socket_lock_assert_owned(so);
5797 
5798 	if ((sb->sb_flags & SB_RECV) == 0) {
5799 		if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0) {
5800 			return;
5801 		}
5802 		outgoing = 1;
5803 		OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
5804 	} else {
5805 		if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_IN) == 0) {
5806 			return;
5807 		}
5808 		outgoing = 0;
5809 		OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
5810 	}
5811 
5812 	CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
5813 	    (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
5814 
5815 	error = cfil_acquire_sockbuf(so, so->so_cfil, outgoing);
5816 	if (error == 0) {
5817 		cfil_service_inject_queue(so, so->so_cfil, outgoing);
5818 	}
5819 	cfil_release_sockbuf(so, outgoing);
5820 }
5821 
5822 int
sysctl_cfil_filter_list(struct sysctl_oid * oidp,void * arg1,int arg2,struct sysctl_req * req)5823 sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2,
5824     struct sysctl_req *req)
5825 {
5826 #pragma unused(oidp, arg1, arg2)
5827 	int error = 0;
5828 	size_t len = 0;
5829 	u_int32_t i;
5830 
5831 	/* Read only  */
5832 	if (req->newptr != USER_ADDR_NULL) {
5833 		return EPERM;
5834 	}
5835 
5836 	cfil_rw_lock_shared(&cfil_lck_rw);
5837 
5838 	for (i = 0; i < MAX_CONTENT_FILTER; i++) {
5839 		struct cfil_filter_stat filter_stat;
5840 		struct content_filter *cfc = content_filters[i];
5841 
5842 		if (cfc == NULL) {
5843 			continue;
5844 		}
5845 
5846 		/* If just asking for the size */
5847 		if (req->oldptr == USER_ADDR_NULL) {
5848 			len += sizeof(struct cfil_filter_stat);
5849 			continue;
5850 		}
5851 
5852 		bzero(&filter_stat, sizeof(struct cfil_filter_stat));
5853 		filter_stat.cfs_len = sizeof(struct cfil_filter_stat);
5854 		filter_stat.cfs_filter_id = cfc->cf_kcunit;
5855 		filter_stat.cfs_flags = cfc->cf_flags;
5856 		filter_stat.cfs_sock_count = cfc->cf_sock_count;
5857 		filter_stat.cfs_necp_control_unit = cfc->cf_necp_control_unit;
5858 
5859 		error = SYSCTL_OUT(req, &filter_stat,
5860 		    sizeof(struct cfil_filter_stat));
5861 		if (error != 0) {
5862 			break;
5863 		}
5864 	}
5865 	/* If just asking for the size */
5866 	if (req->oldptr == USER_ADDR_NULL) {
5867 		req->oldidx = len;
5868 	}
5869 
5870 	cfil_rw_unlock_shared(&cfil_lck_rw);
5871 
5872 	if (cfil_log_level >= LOG_DEBUG) {
5873 		if (req->oldptr != USER_ADDR_NULL) {
5874 			for (i = 1; i <= MAX_CONTENT_FILTER; i++) {
5875 				cfil_filter_show(i);
5876 			}
5877 		}
5878 	}
5879 
5880 	return error;
5881 }
5882 
5883 static int
sysctl_cfil_sock_list(struct sysctl_oid * oidp,void * arg1,int arg2,struct sysctl_req * req)5884 sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
5885     struct sysctl_req *req)
5886 {
5887 #pragma unused(oidp, arg1, arg2)
5888 	int error = 0;
5889 	u_int32_t i;
5890 	struct cfil_info *cfi;
5891 
5892 	/* Read only  */
5893 	if (req->newptr != USER_ADDR_NULL) {
5894 		return EPERM;
5895 	}
5896 
5897 	cfil_rw_lock_shared(&cfil_lck_rw);
5898 
5899 	/*
5900 	 * If just asking for the size,
5901 	 */
5902 	if (req->oldptr == USER_ADDR_NULL) {
5903 		req->oldidx = cfil_sock_attached_count *
5904 		    sizeof(struct cfil_sock_stat);
5905 		/* Bump the length in case new sockets gets attached */
5906 		req->oldidx += req->oldidx >> 3;
5907 		goto done;
5908 	}
5909 
5910 	TAILQ_FOREACH(cfi, &cfil_sock_head, cfi_link) {
5911 		struct cfil_entry *entry;
5912 		struct cfil_sock_stat stat;
5913 		struct socket *so = cfi->cfi_so;
5914 
5915 		bzero(&stat, sizeof(struct cfil_sock_stat));
5916 		stat.cfs_len = sizeof(struct cfil_sock_stat);
5917 		stat.cfs_sock_id = cfi->cfi_sock_id;
5918 		stat.cfs_flags = cfi->cfi_flags;
5919 
5920 		if (so != NULL && so->so_proto != NULL && so->so_proto->pr_domain != NULL) {
5921 			stat.cfs_pid = so->last_pid;
5922 			memcpy(stat.cfs_uuid, so->last_uuid,
5923 			    sizeof(uuid_t));
5924 			if (so->so_flags & SOF_DELEGATED) {
5925 				stat.cfs_e_pid = so->e_pid;
5926 				memcpy(stat.cfs_e_uuid, so->e_uuid,
5927 				    sizeof(uuid_t));
5928 			} else {
5929 				stat.cfs_e_pid = so->last_pid;
5930 				memcpy(stat.cfs_e_uuid, so->last_uuid,
5931 				    sizeof(uuid_t));
5932 			}
5933 
5934 			stat.cfs_sock_family = SOCK_DOM(so);
5935 			stat.cfs_sock_type = SOCK_TYPE(so);
5936 			stat.cfs_sock_protocol = GET_SO_PROTO(so);
5937 		}
5938 
5939 		stat.cfs_snd.cbs_pending_first =
5940 		    cfi->cfi_snd.cfi_pending_first;
5941 		stat.cfs_snd.cbs_pending_last =
5942 		    cfi->cfi_snd.cfi_pending_last;
5943 		stat.cfs_snd.cbs_inject_q_len =
5944 		    cfil_queue_len(&cfi->cfi_snd.cfi_inject_q);
5945 		stat.cfs_snd.cbs_pass_offset =
5946 		    cfi->cfi_snd.cfi_pass_offset;
5947 
5948 		stat.cfs_rcv.cbs_pending_first =
5949 		    cfi->cfi_rcv.cfi_pending_first;
5950 		stat.cfs_rcv.cbs_pending_last =
5951 		    cfi->cfi_rcv.cfi_pending_last;
5952 		stat.cfs_rcv.cbs_inject_q_len =
5953 		    cfil_queue_len(&cfi->cfi_rcv.cfi_inject_q);
5954 		stat.cfs_rcv.cbs_pass_offset =
5955 		    cfi->cfi_rcv.cfi_pass_offset;
5956 
5957 		for (i = 0; i < MAX_CONTENT_FILTER; i++) {
5958 			struct cfil_entry_stat *estat;
5959 			struct cfe_buf *ebuf;
5960 			struct cfe_buf_stat *sbuf;
5961 
5962 			entry = &cfi->cfi_entries[i];
5963 
5964 			estat = &stat.ces_entries[i];
5965 
5966 			estat->ces_len = sizeof(struct cfil_entry_stat);
5967 			estat->ces_filter_id = entry->cfe_filter ?
5968 			    entry->cfe_filter->cf_kcunit : 0;
5969 			estat->ces_flags = entry->cfe_flags;
5970 			estat->ces_necp_control_unit =
5971 			    entry->cfe_necp_control_unit;
5972 
5973 			estat->ces_last_event.tv_sec =
5974 			    (int64_t)entry->cfe_last_event.tv_sec;
5975 			estat->ces_last_event.tv_usec =
5976 			    (int64_t)entry->cfe_last_event.tv_usec;
5977 
5978 			estat->ces_last_action.tv_sec =
5979 			    (int64_t)entry->cfe_last_action.tv_sec;
5980 			estat->ces_last_action.tv_usec =
5981 			    (int64_t)entry->cfe_last_action.tv_usec;
5982 
5983 			ebuf = &entry->cfe_snd;
5984 			sbuf = &estat->ces_snd;
5985 			sbuf->cbs_pending_first =
5986 			    cfil_queue_offset_first(&ebuf->cfe_pending_q);
5987 			sbuf->cbs_pending_last =
5988 			    cfil_queue_offset_last(&ebuf->cfe_pending_q);
5989 			sbuf->cbs_ctl_first =
5990 			    cfil_queue_offset_first(&ebuf->cfe_ctl_q);
5991 			sbuf->cbs_ctl_last =
5992 			    cfil_queue_offset_last(&ebuf->cfe_ctl_q);
5993 			sbuf->cbs_pass_offset =  ebuf->cfe_pass_offset;
5994 			sbuf->cbs_peek_offset =  ebuf->cfe_peek_offset;
5995 			sbuf->cbs_peeked =  ebuf->cfe_peeked;
5996 
5997 			ebuf = &entry->cfe_rcv;
5998 			sbuf = &estat->ces_rcv;
5999 			sbuf->cbs_pending_first =
6000 			    cfil_queue_offset_first(&ebuf->cfe_pending_q);
6001 			sbuf->cbs_pending_last =
6002 			    cfil_queue_offset_last(&ebuf->cfe_pending_q);
6003 			sbuf->cbs_ctl_first =
6004 			    cfil_queue_offset_first(&ebuf->cfe_ctl_q);
6005 			sbuf->cbs_ctl_last =
6006 			    cfil_queue_offset_last(&ebuf->cfe_ctl_q);
6007 			sbuf->cbs_pass_offset =  ebuf->cfe_pass_offset;
6008 			sbuf->cbs_peek_offset =  ebuf->cfe_peek_offset;
6009 			sbuf->cbs_peeked =  ebuf->cfe_peeked;
6010 		}
6011 		error = SYSCTL_OUT(req, &stat,
6012 		    sizeof(struct cfil_sock_stat));
6013 		if (error != 0) {
6014 			break;
6015 		}
6016 	}
6017 done:
6018 	cfil_rw_unlock_shared(&cfil_lck_rw);
6019 
6020 	if (cfil_log_level >= LOG_DEBUG) {
6021 		if (req->oldptr != USER_ADDR_NULL) {
6022 			cfil_info_show();
6023 		}
6024 	}
6025 
6026 	return error;
6027 }
6028 
6029 /*
6030  * UDP Socket Support
6031  */
6032 static void
cfil_hash_entry_log(int level,struct socket * so,struct soflow_hash_entry * entry,uint64_t sockId,const char * msg)6033 cfil_hash_entry_log(int level, struct socket *so, struct soflow_hash_entry *entry, uint64_t sockId, const char* msg)
6034 {
6035 	char local[MAX_IPv6_STR_LEN + 6];
6036 	char remote[MAX_IPv6_STR_LEN + 6];
6037 	const void  *addr;
6038 
6039 	// No sock or not UDP, no-op
6040 	if (so == NULL || entry == NULL) {
6041 		return;
6042 	}
6043 
6044 	local[0] = remote[0] = 0x0;
6045 
6046 	switch (entry->soflow_family) {
6047 	case AF_INET6:
6048 		addr = &entry->soflow_laddr.addr6;
6049 		inet_ntop(AF_INET6, addr, local, sizeof(local));
6050 		addr = &entry->soflow_faddr.addr6;
6051 		inet_ntop(AF_INET6, addr, remote, sizeof(local));
6052 		break;
6053 	case AF_INET:
6054 		addr = &entry->soflow_laddr.addr46.ia46_addr4.s_addr;
6055 		inet_ntop(AF_INET, addr, local, sizeof(local));
6056 		addr = &entry->soflow_faddr.addr46.ia46_addr4.s_addr;
6057 		inet_ntop(AF_INET, addr, remote, sizeof(local));
6058 		break;
6059 	default:
6060 		return;
6061 	}
6062 
6063 	CFIL_LOG(level, "<%s>: <%s(%d) so %llx cfil %p, entry %p, sockID %llu <%llx> feat_ctxt_id <%llu> lport %d fport %d laddr %s faddr %s hash %X",
6064 	    msg,
6065 	    IS_UDP(so) ? "UDP" : "proto", GET_SO_PROTO(so),
6066 	    (uint64_t)VM_KERNEL_ADDRPERM(so), entry->soflow_feat_ctxt, entry, sockId, sockId, entry->soflow_feat_ctxt_id,
6067 	    ntohs(entry->soflow_lport), ntohs(entry->soflow_fport), local, remote,
6068 	    entry->soflow_flowhash);
6069 }
6070 
6071 static void
cfil_inp_log(int level,struct socket * so,const char * msg)6072 cfil_inp_log(int level, struct socket *so, const char* msg)
6073 {
6074 	struct inpcb *inp = NULL;
6075 	struct sockaddr_in *sin = NULL;
6076 	struct sockaddr_in6 *sin6 = NULL;
6077 	char local[MAX_IPv6_STR_LEN + 6];
6078 	char remote[MAX_IPv6_STR_LEN + 6];
6079 	ushort lport = 0;
6080 	ushort fport = 0;
6081 	const void  *addr;
6082 
6083 	if (so == NULL) {
6084 		return;
6085 	}
6086 
6087 	inp = sotoinpcb(so);
6088 	if (inp == NULL) {
6089 		return;
6090 	}
6091 
6092 	local[0] = remote[0] = 0x0;
6093 
6094 	if (inp->inp_vflag & INP_IPV6) {
6095 		addr = &inp->in6p_laddr.s6_addr32;
6096 		inet_ntop(AF_INET6, addr, local, sizeof(local));
6097 		addr = &inp->in6p_faddr.s6_addr32;
6098 		inet_ntop(AF_INET6, addr, remote, sizeof(remote));
6099 	} else {
6100 		addr = &inp->inp_laddr.s_addr;
6101 		inet_ntop(AF_INET, addr, local, sizeof(local));
6102 		addr = &inp->inp_faddr.s_addr;
6103 		inet_ntop(AF_INET, addr, remote, sizeof(remote));
6104 	}
6105 	lport = inp->inp_lport;
6106 	fport = inp->inp_fport;
6107 
6108 	if (so->so_cfil && so->so_cfil->cfi_so_attach_faddr.sa.sa_len > 0) {
6109 		if (so->so_cfil->cfi_so_attach_faddr.sa.sa_family == AF_INET6) {
6110 			sin6 = SIN6(&so->so_cfil->cfi_so_attach_faddr.sa);
6111 			addr = &sin6->sin6_addr;
6112 			inet_ntop(AF_INET6, addr, remote, sizeof(remote));
6113 			fport = sin6->sin6_port;
6114 		} else if (so->so_cfil->cfi_so_attach_faddr.sa.sa_family == AF_INET) {
6115 			sin = SIN(&so->so_cfil->cfi_so_attach_faddr.sa);
6116 			addr = &sin->sin_addr.s_addr;
6117 			inet_ntop(AF_INET, addr, remote, sizeof(remote));
6118 			fport = sin->sin_port;
6119 		}
6120 	}
6121 	if (so->so_cfil && so->so_cfil->cfi_so_attach_laddr.sa.sa_len > 0) {
6122 		if (so->so_cfil->cfi_so_attach_laddr.sa.sa_family == AF_INET6) {
6123 			sin6 = SIN6(&so->so_cfil->cfi_so_attach_laddr.sa);
6124 			addr = &sin6->sin6_addr;
6125 			inet_ntop(AF_INET6, addr, local, sizeof(remote));
6126 			fport = sin6->sin6_port;
6127 		} else if (so->so_cfil->cfi_so_attach_laddr.sa.sa_family == AF_INET) {
6128 			sin = SIN(&so->so_cfil->cfi_so_attach_laddr.sa);
6129 			addr = &sin->sin_addr.s_addr;
6130 			inet_ntop(AF_INET, addr, local, sizeof(remote));
6131 			fport = sin->sin_port;
6132 		}
6133 	}
6134 
6135 	if (so->so_cfil != NULL) {
6136 		CFIL_LOG(level, "<%s>: <%s so %llx cfil %p - flags 0x%x 0x%x, sockID %llu <%llx>> lport %d fport %d laddr %s faddr %s",
6137 		    msg, IS_UDP(so) ? "UDP" : "TCP",
6138 		    (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_cfil, inp->inp_flags, inp->inp_socket->so_flags, so->so_cfil->cfi_sock_id, so->so_cfil->cfi_sock_id,
6139 		    ntohs(lport), ntohs(fport), local, remote);
6140 	} else {
6141 		CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x> lport %d fport %d laddr %s faddr %s",
6142 		    msg, IS_UDP(so) ? "UDP" : "TCP",
6143 		    (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags,
6144 		    ntohs(lport), ntohs(fport), local, remote);
6145 	}
6146 }
6147 
6148 static void
cfil_info_log(int level,struct cfil_info * cfil_info,const char * msg)6149 cfil_info_log(int level, struct cfil_info *cfil_info, const char* msg)
6150 {
6151 	if (cfil_info == NULL) {
6152 		return;
6153 	}
6154 
6155 	if (cfil_info->cfi_hash_entry != NULL) {
6156 		cfil_hash_entry_log(level, cfil_info->cfi_so, cfil_info->cfi_hash_entry, cfil_info->cfi_sock_id, msg);
6157 	} else {
6158 		cfil_inp_log(level, cfil_info->cfi_so, msg);
6159 	}
6160 }
6161 
6162 static void
cfil_sock_udp_unlink_flow(struct socket * so,struct soflow_hash_entry * hash_entry,struct cfil_info * cfil_info)6163 cfil_sock_udp_unlink_flow(struct socket *so, struct soflow_hash_entry *hash_entry, struct cfil_info *cfil_info)
6164 {
6165 	if (so == NULL || hash_entry == NULL || cfil_info == NULL) {
6166 		return;
6167 	}
6168 
6169 	if (so->so_flags & SOF_CONTENT_FILTER) {
6170 		VERIFY(so->so_usecount > 0);
6171 		so->so_usecount--;
6172 	}
6173 
6174 	// Hold exclusive lock before clearing cfil_info hash entry link
6175 	cfil_rw_lock_exclusive(&cfil_lck_rw);
6176 
6177 	cfil_info->cfi_hash_entry = NULL;
6178 
6179 	if (cfil_info->cfi_debug) {
6180 		CFIL_LOG(LOG_ERR, "CFIL <%s>: <so %llx> - use count %d",
6181 		    IS_UDP(so) ? "UDP" : "TCP", (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
6182 	}
6183 
6184 	cfil_rw_unlock_exclusive(&cfil_lck_rw);
6185 }
6186 
6187 bool
check_port(struct sockaddr * addr,u_short port)6188 check_port(struct sockaddr *addr, u_short port)
6189 {
6190 	struct sockaddr_in *sin = NULL;
6191 	struct sockaddr_in6 *sin6 = NULL;
6192 
6193 	if (addr == NULL || port == 0) {
6194 		return FALSE;
6195 	}
6196 
6197 	switch (addr->sa_family) {
6198 	case AF_INET:
6199 		sin = SIN(addr);
6200 		if (sin->sin_len < sizeof(*sin)) {
6201 			return FALSE;
6202 		}
6203 		if (port == ntohs(sin->sin_port)) {
6204 			return TRUE;
6205 		}
6206 		break;
6207 	case AF_INET6:
6208 		sin6 = SIN6(addr);
6209 		if (sin6->sin6_len < sizeof(*sin6)) {
6210 			return FALSE;
6211 		}
6212 		if (port == ntohs(sin6->sin6_port)) {
6213 			return TRUE;
6214 		}
6215 		break;
6216 	default:
6217 		break;
6218 	}
6219 	return FALSE;
6220 }
6221 
6222 cfil_sock_id_t
cfil_sock_id_from_datagram_socket(struct socket * so,struct sockaddr * local,struct sockaddr * remote)6223 cfil_sock_id_from_datagram_socket(struct socket *so, struct sockaddr *local, struct sockaddr *remote)
6224 {
6225 	socket_lock_assert_owned(so);
6226 
6227 	if (so->so_flow_db == NULL) {
6228 		return CFIL_SOCK_ID_NONE;
6229 	}
6230 	return (cfil_sock_id_t)soflow_db_get_feature_context_id(so->so_flow_db, local, remote);
6231 }
6232 
6233 static struct cfil_info *
cfil_sock_udp_get_info(struct socket * so,uint32_t filter_control_unit,bool outgoing,struct soflow_hash_entry * hash_entry,struct sockaddr * local,struct sockaddr * remote)6234 cfil_sock_udp_get_info(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct soflow_hash_entry *hash_entry,
6235     struct sockaddr *local, struct sockaddr *remote)
6236 {
6237 	int new_filter_control_unit = 0;
6238 	struct cfil_info *cfil_info = NULL;
6239 
6240 	errno_t error = 0;
6241 	socket_lock_assert_owned(so);
6242 
6243 	if (hash_entry == NULL || hash_entry->soflow_db == NULL) {
6244 		return NULL;
6245 	}
6246 
6247 	if (hash_entry->soflow_feat_ctxt != NULL && hash_entry->soflow_feat_ctxt_id != 0) {
6248 		/* Drop pre-existing UDP flow if filter state changed */
6249 		cfil_info = (struct cfil_info *) hash_entry->soflow_feat_ctxt;
6250 		new_filter_control_unit = necp_socket_get_content_filter_control_unit(so);
6251 		if (new_filter_control_unit > 0 &&
6252 		    new_filter_control_unit != cfil_info->cfi_filter_control_unit) {
6253 			if (DO_PRESERVE_CONNECTIONS || (cfil_info->cfi_filter_policy_gencount == necp_socket_get_policy_gencount(so))) {
6254 				// CFIL state has changed, but preserve the flow intentionally or if this is not a result of NECP policy change
6255 				cfil_info->cfi_filter_control_unit = new_filter_control_unit;
6256 			} else {
6257 				CFIL_LOG(LOG_NOTICE, "CFIL: UDP(%s) <so %llx> - filter state changed - dropped pre-existing flow (old state 0x%x new state 0x%x)",
6258 				    outgoing ? "OUT" : "IN", (uint64_t)VM_KERNEL_ADDRPERM(so),
6259 				    cfil_info->cfi_filter_control_unit, new_filter_control_unit);
6260 				return NULL;
6261 			}
6262 		}
6263 		return cfil_info;
6264 	}
6265 
6266 	cfil_info = cfil_info_alloc(so, hash_entry);
6267 	if (cfil_info == NULL) {
6268 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx> UDP failed to alloc cfil_info", (uint64_t)VM_KERNEL_ADDRPERM(so));
6269 		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
6270 		return NULL;
6271 	}
6272 	cfil_info->cfi_filter_control_unit = filter_control_unit;
6273 	cfil_info->cfi_dir = outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN;
6274 	cfil_info->cfi_debug = DEBUG_FLOW(sotoinpcb(so), so, local, remote);
6275 	if (cfil_info->cfi_debug) {
6276 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx> UDP (outgoing %d) - debug flow with port %d", (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, cfil_log_port);
6277 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx> UDP so_gencnt %llx entry flowhash %x cfil %p sockID %llu <%llx>",
6278 		    (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_gencnt, hash_entry->soflow_flowhash, cfil_info, cfil_info->cfi_sock_id, cfil_info->cfi_sock_id);
6279 	}
6280 
6281 	if (cfil_info_attach_unit(so, filter_control_unit, cfil_info) == 0) {
6282 		CFIL_INFO_FREE(cfil_info);
6283 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx> UDP cfil_info_attach_unit(%u) failed",
6284 		    (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit);
6285 		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
6286 		return NULL;
6287 	}
6288 
6289 	if (cfil_info->cfi_debug) {
6290 		CFIL_LOG(LOG_ERR, "CFIL: UDP <so %llx> filter_control_unit %u sockID %llu <%llx> attached",
6291 		    (uint64_t)VM_KERNEL_ADDRPERM(so),
6292 		    filter_control_unit, cfil_info->cfi_sock_id, cfil_info->cfi_sock_id);
6293 	}
6294 
6295 	so->so_flags |= SOF_CONTENT_FILTER;
6296 	OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
6297 
6298 	/* Hold a reference on the socket for each flow */
6299 	so->so_usecount++;
6300 
6301 	/* link cfil_info to flow */
6302 	hash_entry->soflow_feat_ctxt = cfil_info;
6303 	hash_entry->soflow_feat_ctxt_id = cfil_info->cfi_sock_id;
6304 
6305 	if (cfil_info->cfi_debug) {
6306 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: ADDED");
6307 	}
6308 
6309 	error = cfil_dispatch_attach_event(so, cfil_info, 0,
6310 	    outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN);
6311 	/* We can recover from flow control or out of memory errors */
6312 	if (error != 0 && error != ENOBUFS && error != ENOMEM) {
6313 		CFIL_LOG(LOG_ERR, "CFIL: UDP <so %llx> cfil_dispatch_attach_event failed <error %d>",
6314 		    (uint64_t)VM_KERNEL_ADDRPERM(so), error);
6315 		return NULL;
6316 	}
6317 
6318 	CFIL_INFO_VERIFY(cfil_info);
6319 	return cfil_info;
6320 }
6321 
6322 errno_t
cfil_sock_udp_handle_data(bool outgoing,struct socket * so,struct sockaddr * local,struct sockaddr * remote,struct mbuf * data,struct mbuf * control,uint32_t flags,struct soflow_hash_entry * hash_entry)6323 cfil_sock_udp_handle_data(bool outgoing, struct socket *so,
6324     struct sockaddr *local, struct sockaddr *remote,
6325     struct mbuf *data, struct mbuf *control, uint32_t flags,
6326     struct soflow_hash_entry *hash_entry)
6327 {
6328 #pragma unused(outgoing, so, local, remote, data, control, flags)
6329 	errno_t error = 0;
6330 	uint32_t filter_control_unit;
6331 	struct cfil_info *cfil_info = NULL;
6332 
6333 	socket_lock_assert_owned(so);
6334 
6335 	if (cfil_active_count == 0) {
6336 		CFIL_LOG(LOG_DEBUG, "CFIL: UDP no active filter");
6337 		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
6338 		return error;
6339 	}
6340 
6341 	// Socket has been blessed
6342 	if ((so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) != 0) {
6343 		return error;
6344 	}
6345 
6346 	filter_control_unit = necp_socket_get_content_filter_control_unit(so);
6347 	if (filter_control_unit == 0) {
6348 		CFIL_LOG(LOG_DEBUG, "CFIL: UDP failed to get control unit");
6349 		return error;
6350 	}
6351 
6352 	if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) {
6353 		return error;
6354 	}
6355 
6356 	if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
6357 		CFIL_LOG(LOG_DEBUG, "CFIL: UDP user space only");
6358 		OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
6359 		return error;
6360 	}
6361 
6362 	if (hash_entry == NULL) {
6363 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx> NULL soflow_hash_entry", (uint64_t)VM_KERNEL_ADDRPERM(so));
6364 		return EPIPE;
6365 	}
6366 
6367 	if (hash_entry->soflow_db == NULL) {
6368 		CFIL_LOG(LOG_ERR, "CFIL: <so %llx> NULL soflow_hash_entry db", (uint64_t)VM_KERNEL_ADDRPERM(so));
6369 		return EPIPE;
6370 	}
6371 
6372 	cfil_info = cfil_sock_udp_get_info(so, filter_control_unit, outgoing, hash_entry, local, remote);
6373 	if (cfil_info == NULL) {
6374 		return EPIPE;
6375 	}
6376 	// Update last used timestamp, this is for flow Idle TO
6377 
6378 	if (cfil_info->cfi_debug) {
6379 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: Got flow");
6380 	}
6381 
6382 	if (cfil_info->cfi_flags & CFIF_DROP) {
6383 		if (cfil_info->cfi_debug) {
6384 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP DROP");
6385 		}
6386 		return EPIPE;
6387 	}
6388 	if (control != NULL) {
6389 		OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
6390 	}
6391 	if (data->m_type == MT_OOBDATA) {
6392 		CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
6393 		    (uint64_t)VM_KERNEL_ADDRPERM(so));
6394 		OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
6395 	}
6396 
6397 	error = cfil_data_common(so, cfil_info, outgoing, remote, data, control, flags);
6398 
6399 	return error;
6400 }
6401 
6402 struct cfil_udp_attached_context {
6403 	bool need_wait;
6404 	lck_mtx_t *mutex_held;
6405 	int attached;
6406 };
6407 
6408 static bool
cfil_filters_udp_attached_per_flow(struct socket * so,struct soflow_hash_entry * hash_entry,void * context)6409 cfil_filters_udp_attached_per_flow(struct socket *so,
6410     struct soflow_hash_entry *hash_entry,
6411     void *context)
6412 {
6413 	struct cfil_udp_attached_context *apply_context = NULL;
6414 	struct cfil_info * __single cfil_info = NULL;
6415 	struct cfil_entry *entry = NULL;
6416 	uint64_t sock_flow_id = 0;
6417 	struct timespec ts;
6418 	errno_t error = 0;
6419 	int kcunit;
6420 
6421 	if (hash_entry->soflow_feat_ctxt == NULL || context == NULL) {
6422 		return true;
6423 	}
6424 
6425 	cfil_info = hash_entry->soflow_feat_ctxt;
6426 	apply_context = (struct cfil_udp_attached_context *)context;
6427 
6428 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
6429 		entry = &cfil_info->cfi_entries[kcunit - 1];
6430 
6431 		/* Are we attached to the filter? */
6432 		if (entry->cfe_filter == NULL) {
6433 			continue;
6434 		}
6435 
6436 		if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
6437 			continue;
6438 		}
6439 		if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0) {
6440 			continue;
6441 		}
6442 
6443 		if (apply_context->need_wait == TRUE) {
6444 			if (cfil_info->cfi_debug) {
6445 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW WAIT FOR FLOW TO FINISH");
6446 			}
6447 
6448 			ts.tv_sec = cfil_close_wait_timeout / 1000;
6449 			ts.tv_nsec = (cfil_close_wait_timeout % 1000) * NSEC_PER_USEC * 1000;
6450 
6451 			OSIncrementAtomic(&cfil_stats.cfs_close_wait);
6452 			cfil_info->cfi_flags |= CFIF_CLOSE_WAIT;
6453 			sock_flow_id = cfil_info->cfi_sock_id;
6454 
6455 			error = msleep((caddr_t)cfil_info, apply_context->mutex_held,
6456 			    PSOCK | PCATCH, "cfil_filters_udp_attached_per_flow", &ts);
6457 
6458 			// Woke up from sleep, validate if cfil_info is still valid
6459 			if (so->so_flow_db == NULL ||
6460 			    (cfil_info != soflow_db_get_feature_context(so->so_flow_db, sock_flow_id))) {
6461 				// cfil_info is not valid, do not continue
6462 				return false;
6463 			}
6464 
6465 			cfil_info->cfi_flags &= ~CFIF_CLOSE_WAIT;
6466 
6467 			if (cfil_info->cfi_debug) {
6468 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW WAIT FOR FLOW DONE");
6469 			}
6470 
6471 			/*
6472 			 * Force close in case of timeout
6473 			 */
6474 			if (error != 0) {
6475 				OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
6476 
6477 				if (cfil_info->cfi_debug) {
6478 					cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW WAIT FOR FLOW TIMED OUT, FORCE DETACH");
6479 				}
6480 
6481 				entry->cfe_flags |= CFEF_CFIL_DETACHED;
6482 				return false;
6483 			}
6484 		}
6485 		apply_context->attached = 1;
6486 		return false;
6487 	}
6488 	return true;
6489 }
6490 
6491 /*
6492  * Go through all UDP flows for specified socket and returns TRUE if
6493  * any flow is still attached.  If need_wait is TRUE, wait on first
6494  * attached flow.
6495  */
6496 static int
cfil_filters_udp_attached(struct socket * so,bool need_wait)6497 cfil_filters_udp_attached(struct socket *so, bool need_wait)
6498 {
6499 	struct cfil_udp_attached_context apply_context = { 0 };
6500 	lck_mtx_t *mutex_held;
6501 
6502 	socket_lock_assert_owned(so);
6503 
6504 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_flow_db != NULL) {
6505 		if (so->so_proto->pr_getlock != NULL) {
6506 			mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
6507 		} else {
6508 			mutex_held = so->so_proto->pr_domain->dom_mtx;
6509 		}
6510 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6511 
6512 		apply_context.need_wait = need_wait;
6513 		apply_context.mutex_held = mutex_held;
6514 		soflow_db_apply(so->so_flow_db, cfil_filters_udp_attached_per_flow, (void *)&apply_context);
6515 	}
6516 
6517 	return apply_context.attached;
6518 }
6519 
6520 struct cfil_udp_data_pending_context {
6521 	struct sockbuf *sb;
6522 	uint64_t total_pending;
6523 };
6524 
6525 static bool
cfil_sock_udp_data_pending_per_flow(struct socket * so,struct soflow_hash_entry * hash_entry,void * context)6526 cfil_sock_udp_data_pending_per_flow(struct socket *so,
6527     struct soflow_hash_entry *hash_entry,
6528     void *context)
6529 {
6530 #pragma unused(so)
6531 	struct cfil_udp_data_pending_context *apply_context = NULL;
6532 	struct cfil_info * __single cfil_info = NULL;
6533 	struct cfi_buf *cfi_buf;
6534 
6535 	uint64_t pending = 0;
6536 
6537 	if (hash_entry->soflow_feat_ctxt == NULL || context == NULL) {
6538 		return true;
6539 	}
6540 
6541 	cfil_info = hash_entry->soflow_feat_ctxt;
6542 	apply_context = (struct cfil_udp_data_pending_context *)context;
6543 
6544 	if (apply_context->sb == NULL) {
6545 		return true;
6546 	}
6547 
6548 	if ((apply_context->sb->sb_flags & SB_RECV) == 0) {
6549 		cfi_buf = &cfil_info->cfi_snd;
6550 	} else {
6551 		cfi_buf = &cfil_info->cfi_rcv;
6552 	}
6553 
6554 	pending = cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first;
6555 	/*
6556 	 * If we are limited by the "chars of mbufs used" roughly
6557 	 * adjust so we won't overcommit
6558 	 */
6559 	if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending) {
6560 		pending = cfi_buf->cfi_pending_mbcnt;
6561 	}
6562 
6563 	apply_context->total_pending += pending;
6564 	return true;
6565 }
6566 
6567 int32_t
cfil_sock_udp_data_pending(struct sockbuf * sb,bool check_thread)6568 cfil_sock_udp_data_pending(struct sockbuf *sb, bool check_thread)
6569 {
6570 	struct cfil_udp_data_pending_context apply_context = { 0 };
6571 	struct socket *so = sb->sb_so;
6572 
6573 	socket_lock_assert_owned(so);
6574 
6575 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_flow_db != NULL &&
6576 	    (check_thread == FALSE || so->so_snd.sb_cfil_thread != current_thread())) {
6577 		apply_context.sb = sb;
6578 		soflow_db_apply(so->so_flow_db, cfil_sock_udp_data_pending_per_flow, (void *)&apply_context);
6579 
6580 		VERIFY(apply_context.total_pending < INT32_MAX);
6581 	}
6582 
6583 	return (int32_t)(apply_context.total_pending);
6584 }
6585 
6586 struct cfil_udp_notify_shutdown_context {
6587 	int how;
6588 	int drop_flag;
6589 	int shut_flag;
6590 	int done_count;
6591 };
6592 
6593 static bool
cfil_sock_udp_notify_shutdown_per_flow(struct socket * so,struct soflow_hash_entry * hash_entry,void * context)6594 cfil_sock_udp_notify_shutdown_per_flow(struct socket *so,
6595     struct soflow_hash_entry *hash_entry,
6596     void *context)
6597 {
6598 	struct cfil_udp_notify_shutdown_context *apply_context = NULL;
6599 	struct cfil_info * __single cfil_info = NULL;
6600 	errno_t error = 0;
6601 	int kcunit;
6602 
6603 	if (hash_entry->soflow_feat_ctxt == NULL || context == NULL) {
6604 		return true;
6605 	}
6606 
6607 	cfil_info = hash_entry->soflow_feat_ctxt;
6608 	apply_context = (struct cfil_udp_notify_shutdown_context *)context;
6609 
6610 	// This flow is marked as DROP
6611 	if (cfil_info->cfi_flags & apply_context->drop_flag) {
6612 		apply_context->done_count++;
6613 		return true;
6614 	}
6615 
6616 	// This flow has been shut already, skip
6617 	if (cfil_info->cfi_flags & apply_context->shut_flag) {
6618 		return true;
6619 	}
6620 	// Mark flow as shut
6621 	cfil_info->cfi_flags |= apply_context->shut_flag;
6622 	apply_context->done_count++;
6623 
6624 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
6625 		/* Disconnect incoming side */
6626 		if (apply_context->how != SHUT_WR) {
6627 			error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0);
6628 		}
6629 		/* Disconnect outgoing side */
6630 		if (apply_context->how != SHUT_RD) {
6631 			error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1);
6632 		}
6633 	}
6634 
6635 	if (cfil_info->cfi_debug) {
6636 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW NOTIFY_SHUTDOWN");
6637 	}
6638 
6639 	return true;
6640 }
6641 
6642 int
cfil_sock_udp_notify_shutdown(struct socket * so,int how,int drop_flag,int shut_flag)6643 cfil_sock_udp_notify_shutdown(struct socket *so, int how, int drop_flag, int shut_flag)
6644 {
6645 	struct cfil_udp_notify_shutdown_context apply_context = { 0 };
6646 	errno_t error = 0;
6647 
6648 	socket_lock_assert_owned(so);
6649 
6650 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_flow_db != NULL) {
6651 		apply_context.how = how;
6652 		apply_context.drop_flag = drop_flag;
6653 		apply_context.shut_flag = shut_flag;
6654 
6655 		soflow_db_apply(so->so_flow_db, cfil_sock_udp_notify_shutdown_per_flow, (void *)&apply_context);
6656 	}
6657 
6658 	if (apply_context.done_count == 0) {
6659 		error = ENOTCONN;
6660 	}
6661 	return error;
6662 }
6663 
6664 int
cfil_sock_udp_shutdown(struct socket * so,int * how)6665 cfil_sock_udp_shutdown(struct socket *so, int *how)
6666 {
6667 	int error = 0;
6668 
6669 	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || (so->so_flow_db == NULL)) {
6670 		goto done;
6671 	}
6672 
6673 	socket_lock_assert_owned(so);
6674 
6675 	CFIL_LOG(LOG_INFO, "so %llx how %d",
6676 	    (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
6677 
6678 	/*
6679 	 * Check the state of the socket before the content filter
6680 	 */
6681 	if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
6682 		/* read already shut down */
6683 		error = ENOTCONN;
6684 		goto done;
6685 	}
6686 	if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
6687 		/* write already shut down */
6688 		error = ENOTCONN;
6689 		goto done;
6690 	}
6691 
6692 	/*
6693 	 * shutdown read: SHUT_RD or SHUT_RDWR
6694 	 */
6695 	if (*how != SHUT_WR) {
6696 		error = cfil_sock_udp_notify_shutdown(so, SHUT_RD, CFIF_DROP, CFIF_SHUT_RD);
6697 		if (error != 0) {
6698 			goto done;
6699 		}
6700 	}
6701 	/*
6702 	 * shutdown write: SHUT_WR or SHUT_RDWR
6703 	 */
6704 	if (*how != SHUT_RD) {
6705 		error = cfil_sock_udp_notify_shutdown(so, SHUT_WR, CFIF_DROP, CFIF_SHUT_WR);
6706 		if (error != 0) {
6707 			goto done;
6708 		}
6709 
6710 		/*
6711 		 * When outgoing data is pending, we delay the shutdown at the
6712 		 * protocol level until the content filters give the final
6713 		 * verdict on the pending data.
6714 		 */
6715 		if (cfil_sock_data_pending(&so->so_snd) != 0) {
6716 			/*
6717 			 * When shutting down the read and write sides at once
6718 			 * we can proceed to the final shutdown of the read
6719 			 * side. Otherwise, we just return.
6720 			 */
6721 			if (*how == SHUT_WR) {
6722 				error = EJUSTRETURN;
6723 			} else if (*how == SHUT_RDWR) {
6724 				*how = SHUT_RD;
6725 			}
6726 		}
6727 	}
6728 done:
6729 	return error;
6730 }
6731 
6732 void
cfil_sock_udp_close_wait(struct socket * so)6733 cfil_sock_udp_close_wait(struct socket *so)
6734 {
6735 	socket_lock_assert_owned(so);
6736 
6737 	while (cfil_filters_udp_attached(so, FALSE)) {
6738 		/*
6739 		 * Notify the filters we are going away so they can detach
6740 		 */
6741 		cfil_sock_udp_notify_shutdown(so, SHUT_RDWR, 0, 0);
6742 
6743 		/*
6744 		 * Make sure we need to wait after the filter are notified
6745 		 * of the disconnection
6746 		 */
6747 		if (cfil_filters_udp_attached(so, TRUE) == 0) {
6748 			break;
6749 		}
6750 	}
6751 }
6752 
6753 static bool
cfil_sock_udp_is_closed_per_flow(struct socket * so,struct soflow_hash_entry * hash_entry,void * context)6754 cfil_sock_udp_is_closed_per_flow(struct socket *so,
6755     struct soflow_hash_entry *hash_entry,
6756     void *context)
6757 {
6758 #pragma unused(context)
6759 	struct cfil_info * __single cfil_info = NULL;
6760 	errno_t error = 0;
6761 	int kcunit;
6762 
6763 	if (hash_entry->soflow_feat_ctxt == NULL) {
6764 		return true;
6765 	}
6766 
6767 	cfil_info = hash_entry->soflow_feat_ctxt;
6768 
6769 	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
6770 		/* Let the filters know of the closing */
6771 		error = cfil_dispatch_closed_event(so, cfil_info, kcunit);
6772 	}
6773 
6774 	/* Last chance to push passed data out */
6775 	error = cfil_acquire_sockbuf(so, cfil_info, 1);
6776 	if (error == 0) {
6777 		cfil_service_inject_queue(so, cfil_info, 1);
6778 	}
6779 	cfil_release_sockbuf(so, 1);
6780 
6781 	cfil_info->cfi_flags |= CFIF_SOCK_CLOSED;
6782 
6783 	/* Pending data needs to go */
6784 	cfil_flush_queues(so, cfil_info);
6785 
6786 	CFIL_INFO_VERIFY(cfil_info);
6787 
6788 	if (cfil_info->cfi_debug) {
6789 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW IS_CLOSED");
6790 	}
6791 
6792 	return true;
6793 }
6794 
6795 void
cfil_sock_udp_is_closed(struct socket * so)6796 cfil_sock_udp_is_closed(struct socket *so)
6797 {
6798 	socket_lock_assert_owned(so);
6799 
6800 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_flow_db != NULL) {
6801 		soflow_db_apply(so->so_flow_db, cfil_sock_udp_is_closed_per_flow, NULL);
6802 	}
6803 }
6804 
6805 static bool
cfil_sock_udp_buf_update_per_flow(struct socket * so,struct soflow_hash_entry * hash_entry,void * context)6806 cfil_sock_udp_buf_update_per_flow(struct socket *so,
6807     struct soflow_hash_entry *hash_entry,
6808     void *context)
6809 {
6810 	struct cfil_info * __single cfil_info = NULL;
6811 	struct sockbuf *sb = NULL;
6812 	errno_t error = 0;
6813 	int outgoing;
6814 
6815 	if (hash_entry->soflow_feat_ctxt == NULL || context == NULL) {
6816 		return true;
6817 	}
6818 
6819 	cfil_info = hash_entry->soflow_feat_ctxt;
6820 	sb = (struct sockbuf *) context;
6821 
6822 	if ((sb->sb_flags & SB_RECV) == 0) {
6823 		if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0) {
6824 			return true;
6825 		}
6826 		outgoing = 1;
6827 		OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
6828 	} else {
6829 		if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) == 0) {
6830 			return true;
6831 		}
6832 		outgoing = 0;
6833 		OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
6834 	}
6835 
6836 	CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
6837 	    (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
6838 
6839 	error = cfil_acquire_sockbuf(so, cfil_info, outgoing);
6840 	if (error == 0) {
6841 		cfil_service_inject_queue(so, cfil_info, outgoing);
6842 	}
6843 	cfil_release_sockbuf(so, outgoing);
6844 	return true;
6845 }
6846 
6847 void
cfil_sock_udp_buf_update(struct sockbuf * sb)6848 cfil_sock_udp_buf_update(struct sockbuf *sb)
6849 {
6850 	struct socket *so = sb->sb_so;
6851 
6852 	socket_lock_assert_owned(so);
6853 
6854 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_flow_db != NULL) {
6855 		if (!cfil_sbtrim) {
6856 			return;
6857 		}
6858 		soflow_db_apply(so->so_flow_db, cfil_sock_udp_buf_update_per_flow, (void *)sb);
6859 	}
6860 }
6861 
6862 void
cfil_filter_show(u_int32_t kcunit)6863 cfil_filter_show(u_int32_t kcunit)
6864 {
6865 	struct content_filter *cfc = NULL;
6866 	struct cfil_entry *entry;
6867 	int count = 0;
6868 
6869 	if (kcunit > MAX_CONTENT_FILTER) {
6870 		return;
6871 	}
6872 
6873 	cfil_rw_lock_shared(&cfil_lck_rw);
6874 
6875 	if (content_filters[kcunit - 1] == NULL) {
6876 		cfil_rw_unlock_shared(&cfil_lck_rw);
6877 		return;
6878 	}
6879 	cfc = content_filters[kcunit - 1];
6880 
6881 	CFIL_LOG(LOG_DEBUG, "CFIL: FILTER SHOW: Filter <unit %d, entry count %d> flags <%lx>:",
6882 	    kcunit, cfc->cf_sock_count, (unsigned long)cfc->cf_flags);
6883 	if (cfc->cf_flags & CFF_DETACHING) {
6884 		CFIL_LOG(LOG_DEBUG, "CFIL: FILTER SHOW:-DETACHING");
6885 	}
6886 	if (cfc->cf_flags & CFF_ACTIVE) {
6887 		CFIL_LOG(LOG_DEBUG, "CFIL: FILTER SHOW:-ACTIVE");
6888 	}
6889 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
6890 		CFIL_LOG(LOG_DEBUG, "CFIL: FILTER SHOW:-FLOW CONTROLLED");
6891 	}
6892 
6893 	TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
6894 		if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
6895 			struct cfil_info *cfil_info = entry->cfe_cfil_info;
6896 
6897 			count++;
6898 
6899 			if (entry->cfe_flags & CFEF_CFIL_DETACHED) {
6900 				cfil_info_log(LOG_DEBUG, cfil_info, "CFIL: FILTER SHOW:-DETACHED");
6901 			} else {
6902 				cfil_info_log(LOG_DEBUG, cfil_info, "CFIL: FILTER SHOW:-ATTACHED");
6903 			}
6904 		}
6905 	}
6906 
6907 	CFIL_LOG(LOG_DEBUG, "CFIL: FILTER SHOW:Filter - total entries shown: %d", count);
6908 
6909 	cfil_rw_unlock_shared(&cfil_lck_rw);
6910 }
6911 
6912 void
cfil_info_show(void)6913 cfil_info_show(void)
6914 {
6915 	struct cfil_info *cfil_info;
6916 	int count = 0;
6917 
6918 	cfil_rw_lock_shared(&cfil_lck_rw);
6919 
6920 	CFIL_LOG(LOG_DEBUG, "CFIL: INFO SHOW:count %d", cfil_sock_attached_count);
6921 
6922 	TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) {
6923 		count++;
6924 
6925 		cfil_info_log(LOG_DEBUG, cfil_info, "CFIL: INFO SHOW");
6926 
6927 		if (cfil_info->cfi_flags & CFIF_DROP) {
6928 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - DROP");
6929 		}
6930 		if (cfil_info->cfi_flags & CFIF_CLOSE_WAIT) {
6931 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - CLOSE_WAIT");
6932 		}
6933 		if (cfil_info->cfi_flags & CFIF_SOCK_CLOSED) {
6934 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - SOCK_CLOSED");
6935 		}
6936 		if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) {
6937 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - RETRY_INJECT_IN");
6938 		}
6939 		if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) {
6940 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - RETRY_INJECT_OUT");
6941 		}
6942 		if (cfil_info->cfi_flags & CFIF_SHUT_WR) {
6943 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - SHUT_WR");
6944 		}
6945 		if (cfil_info->cfi_flags & CFIF_SHUT_RD) {
6946 			CFIL_LOG(LOG_DEBUG, "CFIL: INFO FLAG - SHUT_RD");
6947 		}
6948 	}
6949 
6950 	CFIL_LOG(LOG_DEBUG, "CFIL: INFO SHOW:total cfil_info shown: %d", count);
6951 
6952 	cfil_rw_unlock_shared(&cfil_lck_rw);
6953 }
6954 
6955 bool
cfil_info_action_timed_out(struct cfil_info * cfil_info,int timeout)6956 cfil_info_action_timed_out(struct cfil_info *cfil_info, int timeout)
6957 {
6958 	struct cfil_entry *entry;
6959 	struct timeval current_tv;
6960 	struct timeval diff_time;
6961 
6962 	if (cfil_info == NULL) {
6963 		return false;
6964 	}
6965 
6966 	/*
6967 	 * If we have queued up more data than passed offset and we haven't received
6968 	 * an action from user space for a while (the user space filter might have crashed),
6969 	 * return action timed out.
6970 	 */
6971 	if (cfil_info->cfi_snd.cfi_pending_last > cfil_info->cfi_snd.cfi_pass_offset ||
6972 	    cfil_info->cfi_rcv.cfi_pending_last > cfil_info->cfi_rcv.cfi_pass_offset) {
6973 		microuptime(&current_tv);
6974 
6975 		for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
6976 			entry = &cfil_info->cfi_entries[kcunit - 1];
6977 
6978 			if (entry->cfe_filter == NULL) {
6979 				continue;
6980 			}
6981 
6982 			if (cfil_info->cfi_snd.cfi_pending_last > entry->cfe_snd.cfe_pass_offset ||
6983 			    cfil_info->cfi_rcv.cfi_pending_last > entry->cfe_rcv.cfe_pass_offset) {
6984 				// haven't gotten an action from this filter, check timeout
6985 				timersub(&current_tv, &entry->cfe_last_action, &diff_time);
6986 				if (diff_time.tv_sec >= timeout) {
6987 					if (cfil_info->cfi_debug) {
6988 						cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow ACTION timeout expired");
6989 					}
6990 					return true;
6991 				}
6992 			}
6993 		}
6994 	}
6995 	return false;
6996 }
6997 
6998 bool
cfil_info_buffer_threshold_exceeded(struct cfil_info * cfil_info)6999 cfil_info_buffer_threshold_exceeded(struct cfil_info *cfil_info)
7000 {
7001 	if (cfil_info == NULL) {
7002 		return false;
7003 	}
7004 
7005 	/*
7006 	 * Clean up flow if it exceeded queue thresholds
7007 	 */
7008 	if (cfil_info->cfi_snd.cfi_tail_drop_cnt ||
7009 	    cfil_info->cfi_rcv.cfi_tail_drop_cnt) {
7010 		if (cfil_info->cfi_debug) {
7011 			CFIL_LOG(LOG_ERR, "CFIL: queue threshold exceeded:mbuf max < count: %d bytes: %d > tail drop count < OUT: %d IN: %d > ",
7012 			    cfil_udp_gc_mbuf_num_max,
7013 			    cfil_udp_gc_mbuf_cnt_max,
7014 			    cfil_info->cfi_snd.cfi_tail_drop_cnt,
7015 			    cfil_info->cfi_rcv.cfi_tail_drop_cnt);
7016 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: queue threshold exceeded");
7017 		}
7018 		return true;
7019 	}
7020 
7021 	return false;
7022 }
7023 
7024 static bool
cfil_dgram_gc_needed(struct socket * so,struct soflow_hash_entry * hash_entry,u_int64_t current_time)7025 cfil_dgram_gc_needed(struct socket *so, struct soflow_hash_entry *hash_entry, u_int64_t current_time)
7026 {
7027 #pragma unused(current_time)
7028 	struct cfil_info *cfil_info = NULL;
7029 
7030 	if (so == NULL || hash_entry == NULL || hash_entry->soflow_feat_ctxt == NULL) {
7031 		return false;
7032 	}
7033 	cfil_info = (struct cfil_info *) hash_entry->soflow_feat_ctxt;
7034 
7035 	cfil_rw_lock_shared(&cfil_lck_rw);
7036 
7037 	if (cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) ||
7038 	    cfil_info_buffer_threshold_exceeded(cfil_info)) {
7039 		if (cfil_info->cfi_debug) {
7040 			cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW GC NEEDED");
7041 		}
7042 		cfil_rw_unlock_shared(&cfil_lck_rw);
7043 		return true;
7044 	}
7045 
7046 	cfil_rw_unlock_shared(&cfil_lck_rw);
7047 	return false;
7048 }
7049 
7050 static bool
cfil_dgram_gc_perform(struct socket * so,struct soflow_hash_entry * hash_entry)7051 cfil_dgram_gc_perform(struct socket *so, struct soflow_hash_entry *hash_entry)
7052 {
7053 	struct cfil_info *cfil_info = NULL;
7054 
7055 	if (so == NULL || hash_entry == NULL || hash_entry->soflow_feat_ctxt == NULL) {
7056 		return false;
7057 	}
7058 	cfil_info = (struct cfil_info *) hash_entry->soflow_feat_ctxt;
7059 
7060 	if (cfil_info->cfi_debug) {
7061 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: UDP PER-FLOW GC PERFORM");
7062 	}
7063 
7064 	for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
7065 		/* Let the filters know of the closing */
7066 		cfil_dispatch_closed_event(so, cfil_info, kcunit);
7067 	}
7068 	cfil_sock_udp_unlink_flow(so, hash_entry, cfil_info);
7069 	CFIL_INFO_FREE(cfil_info);
7070 	OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
7071 	return true;
7072 }
7073 
7074 static bool
cfil_dgram_detach_entry(struct socket * so,struct soflow_hash_entry * hash_entry)7075 cfil_dgram_detach_entry(struct socket *so, struct soflow_hash_entry *hash_entry)
7076 {
7077 	struct cfil_info *cfil_info = NULL;
7078 
7079 	if (hash_entry == NULL || hash_entry->soflow_feat_ctxt == NULL) {
7080 		return true;
7081 	}
7082 	cfil_info = (struct cfil_info *) hash_entry->soflow_feat_ctxt;
7083 
7084 	if (cfil_info->cfi_debug) {
7085 		cfil_info_log(LOG_ERR, cfil_info, "CFIL: DGRAM DETACH ENTRY");
7086 	}
7087 
7088 	cfil_sock_udp_unlink_flow(so, hash_entry, cfil_info);
7089 	CFIL_INFO_FREE(cfil_info);
7090 	OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
7091 
7092 	return true;
7093 }
7094 
7095 static bool
cfil_dgram_detach_db(struct socket * so,struct soflow_db * db)7096 cfil_dgram_detach_db(struct socket *so, struct soflow_db *db)
7097 {
7098 #pragma unused(db)
7099 	if (so && so->so_flags & SOF_CONTENT_FILTER) {
7100 		so->so_flags &= ~SOF_CONTENT_FILTER;
7101 		CFIL_LOG(LOG_DEBUG, "CFIL: DGRAM DETACH DB <so %llx>", (uint64_t)VM_KERNEL_ADDRPERM(so));
7102 	}
7103 	return true;
7104 }
7105 
7106 struct m_tag *
cfil_dgram_save_socket_state(struct cfil_info * cfil_info,struct mbuf * m)7107 cfil_dgram_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
7108 {
7109 	struct m_tag *tag = NULL;
7110 	struct cfil_tag *ctag = NULL;
7111 	struct soflow_hash_entry *hash_entry = NULL;
7112 	struct inpcb *inp = NULL;
7113 
7114 	if (cfil_info == NULL || cfil_info->cfi_so == NULL ||
7115 	    cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) {
7116 		return NULL;
7117 	}
7118 
7119 	inp = sotoinpcb(cfil_info->cfi_so);
7120 
7121 	/* Allocate a tag */
7122 	tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP,
7123 	    sizeof(struct cfil_tag), M_DONTWAIT, m);
7124 
7125 	if (tag) {
7126 		ctag = (struct cfil_tag *)(tag->m_tag_data);
7127 		ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt;
7128 		ctag->cfil_so_options = cfil_info->cfi_so->so_options;
7129 		ctag->cfil_inp_flags = inp ? inp->inp_flags : 0;
7130 
7131 		hash_entry = cfil_info->cfi_hash_entry;
7132 		if (hash_entry->soflow_family == AF_INET6) {
7133 			fill_ip6_sockaddr_4_6(&ctag->cfil_faddr,
7134 			    &hash_entry->soflow_faddr.addr6,
7135 			    hash_entry->soflow_fport, hash_entry->soflow_faddr6_ifscope);
7136 		} else if (hash_entry->soflow_family == AF_INET) {
7137 			fill_ip_sockaddr_4_6(&ctag->cfil_faddr,
7138 			    hash_entry->soflow_faddr.addr46.ia46_addr4,
7139 			    hash_entry->soflow_fport);
7140 		}
7141 		m_tag_prepend(m, tag);
7142 		return tag;
7143 	}
7144 	return NULL;
7145 }
7146 
7147 struct m_tag *
cfil_dgram_get_socket_state(struct mbuf * m,uint32_t * state_change_cnt,uint32_t * options,struct sockaddr ** faddr,int * inp_flags)7148 cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, uint32_t *options,
7149     struct sockaddr **faddr, int *inp_flags)
7150 {
7151 	struct m_tag *tag = NULL;
7152 	struct cfil_tag *ctag = NULL;
7153 
7154 	tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP);
7155 	if (tag) {
7156 		ctag = (struct cfil_tag *)(tag->m_tag_data);
7157 		if (state_change_cnt) {
7158 			*state_change_cnt = ctag->cfil_so_state_change_cnt;
7159 		}
7160 		if (options) {
7161 			*options = ctag->cfil_so_options;
7162 		}
7163 		if (faddr) {
7164 			*faddr = SA(&ctag->cfil_faddr);
7165 		}
7166 		if (inp_flags) {
7167 			*inp_flags = ctag->cfil_inp_flags;
7168 		}
7169 
7170 		/*
7171 		 * Unlink tag and hand it over to caller.
7172 		 * Note that caller will be responsible to free it.
7173 		 */
7174 		m_tag_unlink(m, tag);
7175 		return tag;
7176 	}
7177 	return NULL;
7178 }
7179 
7180 boolean_t
cfil_dgram_peek_socket_state(struct mbuf * m,int * inp_flags)7181 cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags)
7182 {
7183 	struct m_tag *tag = NULL;
7184 	struct cfil_tag *ctag = NULL;
7185 
7186 	tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP);
7187 	if (tag) {
7188 		ctag = (struct cfil_tag *)(tag->m_tag_data);
7189 		if (inp_flags) {
7190 			*inp_flags = ctag->cfil_inp_flags;
7191 		}
7192 		return true;
7193 	}
7194 	return false;
7195 }
7196 
7197 static int
cfil_dispatch_stats_event_locked(int kcunit,struct cfil_stats_report_buffer * buffer,uint32_t stats_count)7198 cfil_dispatch_stats_event_locked(int kcunit, struct cfil_stats_report_buffer *buffer, uint32_t stats_count)
7199 {
7200 	struct content_filter *cfc = NULL;
7201 	errno_t error = 0;
7202 	size_t msgsize = 0;
7203 
7204 	if (buffer == NULL || stats_count == 0) {
7205 		return error;
7206 	}
7207 
7208 	if (kcunit > MAX_CONTENT_FILTER) {
7209 		return error;
7210 	}
7211 
7212 	cfc = content_filters[kcunit - 1];
7213 	if (cfc == NULL) {
7214 		return error;
7215 	}
7216 
7217 	/* Would be wasteful to try */
7218 	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
7219 		error = ENOBUFS;
7220 		goto done;
7221 	}
7222 
7223 	msgsize = sizeof(struct cfil_msg_stats_report) + (sizeof(struct cfil_msg_sock_stats) * stats_count);
7224 	buffer->msghdr.cfm_len = (uint32_t)msgsize;
7225 	buffer->msghdr.cfm_version = 1;
7226 	buffer->msghdr.cfm_type = CFM_TYPE_EVENT;
7227 	buffer->msghdr.cfm_op = CFM_OP_STATS;
7228 	buffer->msghdr.cfm_sock_id = 0;
7229 	buffer->count = stats_count;
7230 
7231 	if (cfil_log_stats) {
7232 		CFIL_LOG(LOG_DEBUG, "STATS (kcunit %d): msg size %lu - %lu %lu %lu",
7233 		    kcunit,
7234 		    (unsigned long)msgsize,
7235 		    (unsigned long)sizeof(struct cfil_msg_stats_report),
7236 		    (unsigned long)sizeof(struct cfil_msg_sock_stats),
7237 		    (unsigned long)stats_count);
7238 	}
7239 
7240 	error = ctl_enqueuedata(cfc->cf_kcref, cfc->cf_kcunit,
7241 	    buffer,
7242 	    sizeof(struct cfil_stats_report_buffer),
7243 	    CTL_DATA_EOR);
7244 	if (error != 0) {
7245 		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed:%d", error);
7246 		goto done;
7247 	}
7248 	OSIncrementAtomic(&cfil_stats.cfs_stats_event_ok);
7249 
7250 	if (cfil_log_stats) {
7251 		CFIL_LOG(LOG_DEBUG, "CFIL: STATS REPORT:send msg to %d", kcunit);
7252 	}
7253 done:
7254 
7255 	if (error == ENOBUFS) {
7256 		OSIncrementAtomic(
7257 			&cfil_stats.cfs_stats_event_flow_control);
7258 
7259 		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
7260 			cfil_rw_lock_exclusive(&cfil_lck_rw);
7261 		}
7262 
7263 		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
7264 
7265 		cfil_rw_lock_exclusive_to_shared(&cfil_lck_rw);
7266 	} else if (error != 0) {
7267 		OSIncrementAtomic(&cfil_stats.cfs_stats_event_fail);
7268 	}
7269 
7270 	return error;
7271 }
7272 
7273 static void
cfil_stats_report_thread_sleep(bool forever)7274 cfil_stats_report_thread_sleep(bool forever)
7275 {
7276 	if (cfil_log_stats) {
7277 		CFIL_LOG(LOG_DEBUG, "CFIL: STATS COLLECTION SLEEP");
7278 	}
7279 
7280 	if (forever) {
7281 		(void) assert_wait((event_t) &cfil_sock_attached_stats_count,
7282 		    THREAD_INTERRUPTIBLE);
7283 	} else {
7284 		uint64_t deadline = 0;
7285 		nanoseconds_to_absolutetime(CFIL_STATS_REPORT_RUN_INTERVAL_NSEC, &deadline);
7286 		clock_absolutetime_interval_to_deadline(deadline, &deadline);
7287 
7288 		(void) assert_wait_deadline(&cfil_sock_attached_stats_count,
7289 		    THREAD_INTERRUPTIBLE, deadline);
7290 	}
7291 }
7292 
7293 static void
cfil_stats_report_thread_func(void * v,wait_result_t w)7294 cfil_stats_report_thread_func(void *v, wait_result_t w)
7295 {
7296 #pragma unused(v, w)
7297 
7298 	ASSERT(cfil_stats_report_thread == current_thread());
7299 	thread_set_thread_name(current_thread(), "CFIL_STATS_REPORT");
7300 
7301 	// Kick off gc shortly
7302 	cfil_stats_report_thread_sleep(false);
7303 	thread_block_parameter((thread_continue_t) cfil_stats_report, NULL);
7304 	/* NOTREACHED */
7305 }
7306 
7307 static bool
cfil_stats_collect_flow_stats_for_filter(int kcunit,struct cfil_info * cfil_info,struct cfil_entry * entry,struct timeval current_tv)7308 cfil_stats_collect_flow_stats_for_filter(int kcunit,
7309     struct cfil_info *cfil_info,
7310     struct cfil_entry *entry,
7311     struct timeval current_tv)
7312 {
7313 	struct cfil_stats_report_buffer *buffer = NULL;
7314 	struct cfil_msg_sock_stats *flow_array = NULL;
7315 	struct cfil_msg_sock_stats *stats = NULL;
7316 	struct inpcb *inp = NULL;
7317 	struct timeval diff_time;
7318 	uint64_t diff_time_usecs;
7319 	int index = 0;
7320 
7321 	if (entry->cfe_stats_report_frequency == 0) {
7322 		return false;
7323 	}
7324 
7325 	buffer = global_cfil_stats_report_buffers[kcunit - 1];
7326 	if (buffer == NULL) {
7327 		CFIL_LOG(LOG_ERR, "CFIL: STATS: no buffer");
7328 		return false;
7329 	}
7330 
7331 	timersub(&current_tv, &entry->cfe_stats_report_ts, &diff_time);
7332 	diff_time_usecs = (diff_time.tv_sec * USEC_PER_SEC) + diff_time.tv_usec;
7333 
7334 	if (cfil_info->cfi_debug && cfil_log_stats) {
7335 		CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - elapsed time - ts %llu %llu cur ts %llu %llu diff %llu %llu(usecs %llu) @freq %llu usecs sockID %llu <%llx>",
7336 		    (unsigned long long)entry->cfe_stats_report_ts.tv_sec,
7337 		    (unsigned long long)entry->cfe_stats_report_ts.tv_usec,
7338 		    (unsigned long long)current_tv.tv_sec,
7339 		    (unsigned long long)current_tv.tv_usec,
7340 		    (unsigned long long)diff_time.tv_sec,
7341 		    (unsigned long long)diff_time.tv_usec,
7342 		    (unsigned long long)diff_time_usecs,
7343 		    (unsigned long long)((entry->cfe_stats_report_frequency * NSEC_PER_MSEC) / NSEC_PER_USEC),
7344 		    cfil_info->cfi_sock_id, cfil_info->cfi_sock_id);
7345 	}
7346 
7347 	// Compare elapsed time in usecs
7348 	if (diff_time_usecs >= (entry->cfe_stats_report_frequency * NSEC_PER_MSEC) / NSEC_PER_USEC) {
7349 		if (cfil_info->cfi_debug && cfil_log_stats) {
7350 			CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - in %llu reported %llu",
7351 			    cfil_info->cfi_byte_inbound_count,
7352 			    entry->cfe_byte_inbound_count_reported);
7353 			CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - out %llu reported %llu",
7354 			    cfil_info->cfi_byte_outbound_count,
7355 			    entry->cfe_byte_outbound_count_reported);
7356 		}
7357 		// Check if flow has new bytes that have not been reported
7358 		if (entry->cfe_byte_inbound_count_reported < cfil_info->cfi_byte_inbound_count ||
7359 		    entry->cfe_byte_outbound_count_reported < cfil_info->cfi_byte_outbound_count) {
7360 			flow_array = (struct cfil_msg_sock_stats *)&buffer->stats;
7361 			index = global_cfil_stats_counts[kcunit - 1];
7362 
7363 			stats = &flow_array[index];
7364 			stats->cfs_sock_id = cfil_info->cfi_sock_id;
7365 			stats->cfs_byte_inbound_count = cfil_info->cfi_byte_inbound_count;
7366 			stats->cfs_byte_outbound_count = cfil_info->cfi_byte_outbound_count;
7367 
7368 			if (entry->cfe_laddr_sent == false) {
7369 				/* cache it if necessary */
7370 				if (cfil_info->cfi_so_attach_laddr.sa.sa_len == 0) {
7371 					inp = cfil_info->cfi_so ? sotoinpcb(cfil_info->cfi_so) : NULL;
7372 					if (inp != NULL) {
7373 						boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT);
7374 						union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL;
7375 						union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr;
7376 						cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
7377 						    src, dst, !IS_INP_V6(inp), outgoing);
7378 					}
7379 				}
7380 
7381 				if (cfil_info->cfi_so_attach_laddr.sa.sa_len != 0) {
7382 					stats->cfs_laddr.sin6 = cfil_info->cfi_so_attach_laddr.sin6;
7383 					entry->cfe_laddr_sent = true;
7384 				}
7385 			}
7386 
7387 			global_cfil_stats_counts[kcunit - 1]++;
7388 
7389 			entry->cfe_stats_report_ts = current_tv;
7390 			entry->cfe_byte_inbound_count_reported = cfil_info->cfi_byte_inbound_count;
7391 			entry->cfe_byte_outbound_count_reported = cfil_info->cfi_byte_outbound_count;
7392 			if (cfil_info->cfi_debug && cfil_log_stats) {
7393 				cfil_info_log(LOG_ERR, cfil_info, "CFIL: STATS COLLECTED");
7394 			}
7395 			CFI_ADD_TIME_LOG(cfil_info, &current_tv, &cfil_info->cfi_first_event, CFM_OP_STATS);
7396 			return true;
7397 		}
7398 	}
7399 	return false;
7400 }
7401 
7402 static void
cfil_stats_report(void * v,wait_result_t w)7403 cfil_stats_report(void *v, wait_result_t w)
7404 {
7405 #pragma unused(v, w)
7406 
7407 	struct cfil_info *cfil_info = NULL;
7408 	struct cfil_entry *entry = NULL;
7409 	struct timeval current_tv;
7410 	uint32_t flow_count = 0;
7411 	uint64_t saved_next_sock_id = 0; // Next sock id to be reported for next loop
7412 	bool flow_reported = false;
7413 
7414 	if (cfil_log_stats) {
7415 		CFIL_LOG(LOG_DEBUG, "CFIL: STATS COLLECTION RUNNING");
7416 	}
7417 
7418 	do {
7419 		// Collect all sock ids of flows that has new stats
7420 		cfil_rw_lock_shared(&cfil_lck_rw);
7421 
7422 		if (cfil_sock_attached_stats_count == 0) {
7423 			if (cfil_log_stats) {
7424 				CFIL_LOG(LOG_DEBUG, "CFIL: STATS: no flow");
7425 			}
7426 			cfil_rw_unlock_shared(&cfil_lck_rw);
7427 			goto go_sleep;
7428 		}
7429 
7430 		for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
7431 			if (global_cfil_stats_report_buffers[kcunit - 1] != NULL) {
7432 				memset(global_cfil_stats_report_buffers[kcunit - 1], 0, sizeof(struct cfil_stats_report_buffer));
7433 			}
7434 			global_cfil_stats_counts[kcunit - 1] = 0;
7435 		}
7436 
7437 		microuptime(&current_tv);
7438 		flow_count = 0;
7439 
7440 		TAILQ_FOREACH(cfil_info, &cfil_sock_head_stats, cfi_link_stats) {
7441 			if (saved_next_sock_id != 0 &&
7442 			    saved_next_sock_id == cfil_info->cfi_sock_id) {
7443 				// Here is where we left off previously, start accumulating
7444 				saved_next_sock_id = 0;
7445 			}
7446 
7447 			if (saved_next_sock_id == 0) {
7448 				if (flow_count >= CFIL_STATS_REPORT_MAX_COUNT) {
7449 					// Examine a fixed number of flows each round.  Remember the current flow
7450 					// so we can start from here for next loop
7451 					saved_next_sock_id = cfil_info->cfi_sock_id;
7452 					break;
7453 				}
7454 
7455 				flow_reported = false;
7456 				for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
7457 					entry = &cfil_info->cfi_entries[kcunit - 1];
7458 					if (entry->cfe_filter == NULL) {
7459 						if (cfil_info->cfi_debug && cfil_log_stats) {
7460 							CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - so %llx no filter",
7461 							    cfil_info->cfi_so ? (uint64_t)VM_KERNEL_ADDRPERM(cfil_info->cfi_so) : 0);
7462 						}
7463 						continue;
7464 					}
7465 
7466 					if ((entry->cfe_stats_report_frequency > 0) &&
7467 					    cfil_stats_collect_flow_stats_for_filter(kcunit, cfil_info, entry, current_tv) == true) {
7468 						flow_reported = true;
7469 					}
7470 				}
7471 				if (flow_reported == true) {
7472 					flow_count++;
7473 				}
7474 			}
7475 		}
7476 
7477 		if (flow_count > 0) {
7478 			if (cfil_log_stats) {
7479 				CFIL_LOG(LOG_DEBUG, "CFIL: STATS reporting for %d flows", flow_count);
7480 			}
7481 			for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
7482 				if (global_cfil_stats_report_buffers[kcunit - 1] != NULL &&
7483 				    global_cfil_stats_counts[kcunit - 1] > 0) {
7484 					cfil_dispatch_stats_event_locked(kcunit,
7485 					    global_cfil_stats_report_buffers[kcunit - 1],
7486 					    global_cfil_stats_counts[kcunit - 1]);
7487 				}
7488 			}
7489 		} else {
7490 			cfil_rw_unlock_shared(&cfil_lck_rw);
7491 			goto go_sleep;
7492 		}
7493 
7494 		cfil_rw_unlock_shared(&cfil_lck_rw);
7495 
7496 		// Loop again if we haven't finished the whole cfil_info list
7497 	} while (saved_next_sock_id != 0);
7498 
7499 go_sleep:
7500 
7501 	// Sleep forever (until waken up) if no more flow to report
7502 	cfil_rw_lock_shared(&cfil_lck_rw);
7503 	cfil_stats_report_thread_sleep(cfil_sock_attached_stats_count == 0 ? true : false);
7504 	cfil_rw_unlock_shared(&cfil_lck_rw);
7505 	thread_block_parameter((thread_continue_t) cfil_stats_report, NULL);
7506 	/* NOTREACHED */
7507 }
7508