/*
 * Copyright (c) 1998-2022, 2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <sys/persona.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <net/sockaddr_utils.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <os/log.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

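/* Round "a" up to a multiple of "b"; the mask form assumes "b" is a power of two. */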
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

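/*
 * On DEBUG/DEVELOPMENT kernels, log pointers verbatim to ease debugging;
 * on release kernels, permute them with VM_KERNEL_ADDRPERM() so raw
 * kernel addresses do not leak into logs.
 */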
#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t        so_cache_hw;    /* High water mark for socache */
static u_int32_t        so_cache_timeouts;      /* number of timeouts */
static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
static u_int32_t        cached_sock_count = 0;
STAILQ_HEAD(, socket)   so_cache_head;
int     max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t        so_cache_time;
static int              socketinit_done;
static struct zone      *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

#include <machine/limits.h>

static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);
static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

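/*
 * Illustration (sketch, userspace side): these filterops back the socket
 * kevent filters, e.g. EVFILT_READ on a socket descriptor is served by
 * soread_filtops and EVFILT_SOCK by sock_filtops:
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sockfd, EVFILT_SOCK, EV_ADD, NOTE_CONNECTED, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */
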
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

int sosend_assert_panic = DEFAULT_SOSEND_ASSERT_PANIC;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t        so_gencnt;      /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy  = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t       so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}

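/*
 * Cached sockets: for PF_INET/SOCK_STREAM sockets (see soalloc()), the
 * socket, its inpcb and its protocol pcb are carved out of a single
 * so_cache_zone element sized in socketinit() above, so a close/open
 * cycle can recycle the whole block via so_cache_head.
 */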
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero(*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		uint8_t *so_mem = zalloc_flags_buf(so_cache_zone, how | Z_ZERO);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
		*so = (struct socket *)so_mem;

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */
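		/*
		 * Resulting layout of the single block (sketch):
		 *
		 *	so_mem -> | struct socket | pad | inpcb | pad | tcp pcb |
		 *	                           ALIGN()^      ALIGN()^
		 */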

		offset = (uintptr_t)so_mem;
		offset += sizeof(struct socket);
		offset = ALIGN(offset);
		struct inpcb *pcb = (struct inpcb *)(so_mem + (offset - (uintptr_t)so_mem));
#pragma clang diagnostic pop
		(*so)->so_saved_pcb = (caddr_t)pcb;

		offset += get_inpcb_str_size();
		offset = ALIGN(offset);
		pcb->inp_saved_ppcb = (caddr_t)(so_mem + (offset - (uintptr_t)so_mem));
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket   *p;
	int             n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
	struct socket *__single so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, how);
	} else {
		so = zalloc_flags(socket_zone, how | Z_ZERO);
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}

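/*
 * Example (sketch): in-kernel creation and teardown of a TCP socket via
 * the helper above, similar to what the sock_socket() KPI does:
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		...use so...
 *		soclose(so);
 *	}
 */
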
int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

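/*
 * Example (sketch): creating a socket on behalf of another process; the
 * socket is marked SOF_DELEGATED and accounted to 'epid' (the required
 * delegation privilege is checked by this function's callers):
 *
 *	struct socket *so = NULL;
 *	int error = socreate_delegate(PF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    delegate_pid);
 */
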
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}

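/*
 * Example (sketch): minimal in-kernel listener using the routines above;
 * 'sa' is a caller-provided sockaddr holding the local address to bind:
 *
 *	struct socket *lso = NULL;
 *	if (socreate(PF_INET, &lso, SOCK_STREAM, IPPROTO_TCP) == 0 &&
 *	    sobindlock(lso, sa, 1) == 0 &&
 *	    solisten(lso, SOMAXCONN) == 0) {
 *		...completed connections appear on lso->so_comp...
 *	}
 */
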
/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_incqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 * A usage sketch follows so_release_accept_list() below.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}

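/*
 * Usage sketch for the pair above (as done in sofreelastref() and
 * soclose_locked()):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);	// may drop and retake locks
 *	...walk or edit head->so_comp / head->so_incomp...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */
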
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * Skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * Otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_incomp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}

	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
				lck_mtx_t *mutex_held;

				if (so->so_proto->pr_getlock != NULL) {
					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
				} else {
					mutex_held = so->so_proto->pr_domain->dom_mtx;
				}
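				/*
				 * so_linger is stored in hundredths of a
				 * second; split it into the timespec fed
				 * to msleep() below.
				 */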
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK if the timer fired;
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away but the socket is
		 * retained in the kernel, remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *__single local = NULL, *__single remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

1639 /*
1640  * Returns:	0			Success
1641  *		EOPNOTSUPP		Operation not supported on socket
1642  *		EISCONN			Socket is connected
1643  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1644  *	<pru_connect>:EINVAL		Invalid argument
1645  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1646  *	<pru_connect>:EACCES		Permission denied
1647  *	<pru_connect>:EADDRINUSE	Address in use
1648  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1649  *	<pru_connect>:EPERM		Operation not permitted
1650  *	<sf_connect_out>:???		[anything a filter writer might set]
1651  */
1652 int
soconnectlock(struct socket * so,struct sockaddr * nam,int dolock)1653 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1654 {
1655 	int error;
1656 	struct proc *p = current_proc();
1657 	tracker_metadata_t metadata = { };
1658 
1659 	if (dolock) {
1660 		socket_lock(so, 1);
1661 	}
1662 
1663 	so_update_last_owner_locked(so, p);
1664 	so_update_policy(so);
1665 
1666 	/*
1667 	 * If this is a listening socket or if this is a previously-accepted
1668 	 * socket that has been marked as inactive, reject the connect request.
1669 	 */
1670 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1671 		error = EOPNOTSUPP;
1672 		if (so->so_flags & SOF_DEFUNCT) {
1673 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1674 			    "(%d)\n", __func__, proc_pid(p),
1675 			    proc_best_name(p),
1676 			    so->so_gencnt,
1677 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1678 		}
1679 		if (dolock) {
1680 			socket_unlock(so, 1);
1681 		}
1682 		return error;
1683 	}
1684 
1685 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1686 		if (dolock) {
1687 			socket_unlock(so, 1);
1688 		}
1689 		return EPERM;
1690 	}
1691 
1692 	/*
1693 	 * If protocol is connection-based, can only connect once.
1694 	 * Otherwise, if connected, try to disconnect first.
1695 	 * This allows user to disconnect by connecting to, e.g.,
1696 	 * a null address.
1697 	 */
1698 #if NECP
1699 	bool set_domain_from_tracker_lookup = false;
1700 #endif /* NECP */
1701 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1702 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1703 	    (error = sodisconnectlocked(so)))) {
1704 		error = EISCONN;
1705 	} else {
1706 		/*
1707 		 * For connected v4/v6 sockets, check if the destination address is associated with a domain name and if it is
1708 		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
1709 		 */
1710 		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1711 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1712 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1713 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1714 				}
1715 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1716 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1717 				}
1718 #if NECP
1719 				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
1720 #endif /* NECP */
1721 				necp_set_socket_domain_attributes(so,
1722 				    __unsafe_null_terminated_from_indexable(metadata.domain),
1723 				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
1724 			}
1725 		}
1726 
1727 #if NECP
1728 		/* Update NECP evaluation after setting any domain via the tracker checks */
1729 		so_update_necp_policy(so, NULL, nam);
1730 		if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
1731 			// Mark extended timeout on tracker lookup to ensure that the entry stays around
1732 			tracker_metadata_t update_metadata = { };
1733 			update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
1734 			(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata);
1735 		}
1736 #endif /* NECP */
1737 
1738 		/*
1739 		 * Run connect filter before calling protocol:
1740 		 *  - non-blocking connect returns before completion;
1741 		 */
1742 		error = sflt_connectout(so, nam);
1743 		if (error != 0) {
1744 			if (error == EJUSTRETURN) {
1745 				error = 0;
1746 			}
1747 		} else {
1748 			error = (*so->so_proto->pr_usrreqs->pru_connect)
1749 			    (so, nam, p);
1750 			if (error != 0) {
1751 				so->so_state &= ~SS_ISCONNECTING;
1752 			}
1753 		}
1754 	}
1755 	if (dolock) {
1756 		socket_unlock(so, 1);
1757 	}
1758 	return error;
1759 }
1760 
1761 int
1762 soconnect(struct socket *so, struct sockaddr *nam)
1763 {
1764 	return soconnectlock(so, nam, 1);
1765 }
1766 
1767 /*
1768  * Returns:	0			Success
1769  *	<pru_connect2>:EINVAL[AF_UNIX]
1770  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1771  *	<pru_connect2>:???		[other protocol families]
1772  *
1773  * Notes:	<pru_connect2> is not supported by [TCP].
1774  */
1775 int
1776 soconnect2(struct socket *so1, struct socket *so2)
1777 {
1778 	int error;
1779 
1780 	socket_lock(so1, 1);
1781 	if (so2->so_proto->pr_lock) {
1782 		socket_lock(so2, 1);
1783 	}
1784 
1785 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1786 
1787 	socket_unlock(so1, 1);
1788 	if (so2->so_proto->pr_lock) {
1789 		socket_unlock(so2, 1);
1790 	}
1791 	return error;
1792 }
1793 
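/*
 * Illustrative sketch (editor's addition, not part of XNU): soconnect2()
 * cross-connects two sockets through <pru_connect2>; in-kernel it backs
 * paths such as socketpair(2) for AF_UNIX.  From user space:
 *
 *	int sv[2];
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		// sv[0] and sv[1] are now connected to each other
 *	}
 */
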
1794 int
1795 soconnectxlocked(struct socket *so, struct sockaddr *src,
1796     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1797     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1798     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1799 {
1800 	int error;
1801 	tracker_metadata_t metadata = { };
1802 
1803 	so_update_last_owner_locked(so, p);
1804 	so_update_policy(so);
1805 
1806 	/*
1807 	 * If this is a listening socket or if this is a previously-accepted
1808 	 * socket that has been marked as inactive, reject the connect request.
1809 	 */
1810 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1811 		error = EOPNOTSUPP;
1812 		if (so->so_flags & SOF_DEFUNCT) {
1813 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1814 			    "(%d)\n", __func__, proc_pid(p),
1815 			    proc_best_name(p),
1816 			    so->so_gencnt,
1817 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1818 		}
1819 		return error;
1820 	}
1821 
1822 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1823 		return EPERM;
1824 	}
1825 
1826 	/*
1827 	 * If protocol is connection-based, can only connect once
1828 	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1829 	 * try to disconnect first.  This allows user to disconnect
1830 	 * by connecting to, e.g., a null address.
1831 	 */
1832 #if NECP
1833 	bool set_domain_from_tracker_lookup = false;
1834 #endif /* NECP */
1835 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1836 	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1837 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1838 	    (error = sodisconnectlocked(so)) != 0)) {
1839 		error = EISCONN;
1840 	} else {
1841 		/*
1842 		 * For TCP, check if destination address is a tracker and mark the socket accordingly
1843 		 * (only if it hasn't been marked yet).
1844 		 */
1845 		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
1846 		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1847 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1848 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1849 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1850 				}
1851 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1852 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1853 				}
1854 #if NECP
1855 				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
1856 #endif /* NECP */
1857 				necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
1858 				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
1859 			}
1860 		}
1861 
1862 		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1863 		    (flags & CONNECT_DATA_IDEMPOTENT)) {
1864 			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1865 
1866 			if (flags & CONNECT_DATA_AUTHENTICATED) {
1867 				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1868 			}
1869 		}
1870 
1871 		/*
1872 		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1873 		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1874 		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1875 		 * Case 3 allows user to combine write with connect even if they have
1876 		 * no use for TFO (such as regular TCP, and UDP).
1877 		 * no use for TFO (such as regular TCP and UDP).
1878 		 */
1879 		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1880 		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1881 			so->so_flags1 |= SOF1_PRECONNECT_DATA;
1882 		}
1883 
1884 		/*
1885 		 * If a user sets data idempotent and does not pass an uio, or
1886 		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
1887 		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1888 		 */
1889 		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1890 		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1891 			/* We should return EINVAL instead perhaps. */
1892 			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1893 		}
1894 
1895 		/*
1896 		 * Run connect filter before calling protocol:
1897 		 *  - non-blocking connect returns before completion;
1898 		 */
1899 		error = sflt_connectout(so, dst);
1900 		if (error != 0) {
1901 			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1902 			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1903 			if (error == EJUSTRETURN) {
1904 				error = 0;
1905 			}
1906 		} else {
1907 			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1908 			    (so, src, dst, p, ifscope, aid, pcid,
1909 			    flags, arg, arglen, auio, bytes_written);
1910 			if (error != 0) {
1911 				so->so_state &= ~SS_ISCONNECTING;
1912 				if (error != EINPROGRESS) {
1913 					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1914 				}
1915 			}
1916 
1917 #if NECP
1918 			if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
1919 				// Mark extended timeout on tracker lookup to ensure that the entry stays around
1920 				tracker_metadata_t update_metadata = { };
1921 				update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
1922 				(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata);
1923 			}
1924 #endif /* NECP */
1925 		}
1926 	}
1927 
1928 	return error;
1929 }
1930 
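/*
 * Illustrative sketch (editor's addition, not part of XNU): the CONNECT_*
 * flags handled above are the ones connectx(2) callers pass, e.g. for TCP
 * Fast Open-style idempotent data (addresses and buffers below are
 * placeholders):
 *
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = res->ai_addr,
 *		.sae_dstaddrlen = res->ai_addrlen,
 *	};
 *	struct iovec iov = { .iov_base = req, .iov_len = reqlen };
 *	size_t sent = 0;
 *	connectx(fd, &sae, SAE_ASSOCID_ANY, CONNECT_DATA_IDEMPOTENT,
 *	    &iov, 1, &sent, NULL);
 *	// without an iovec (or with CONNECT_RESUME_ON_READ_WRITE and no
 *	// data), SOF1_DATA_IDEMPOTENT is cleared again above
 */
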
1931 int
1932 sodisconnectlocked(struct socket *so)
1933 {
1934 	int error;
1935 
1936 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1937 		error = ENOTCONN;
1938 		goto bad;
1939 	}
1940 	if (so->so_state & SS_ISDISCONNECTING) {
1941 		error = EALREADY;
1942 		goto bad;
1943 	}
1944 
1945 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1946 	if (error == 0) {
1947 		sflt_notify(so, sock_evt_disconnected, NULL);
1948 	}
1949 
1950 bad:
1951 	return error;
1952 }
1953 
1954 /* Locking version */
1955 int
1956 sodisconnect(struct socket *so)
1957 {
1958 	int error;
1959 
1960 	socket_lock(so, 1);
1961 	error = sodisconnectlocked(so);
1962 	socket_unlock(so, 1);
1963 	return error;
1964 }
1965 
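/*
 * Illustrative sketch (editor's addition, not part of XNU): the
 * *locked/unlocked pairs in this file follow one pattern; callers that
 * already hold the socket lock use the locked variant directly:
 *
 *	socket_lock(so, 1);
 *	error = sodisconnectlocked(so);	// lock already held
 *	// ... more work under the same lock ...
 *	socket_unlock(so, 1);
 *
 * while sodisconnect(so) takes and drops the lock itself.
 */
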
1966 int
1967 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1968 {
1969 	int error;
1970 
1971 	/*
1972 	 * Call the protocol disconnectx handler; let it handle all
1973 	 * matters related to the connection state of this session.
1974 	 */
1975 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1976 	if (error == 0) {
1977 		/*
1978 		 * The event applies only for the session, not for
1979 		 * the disconnection of individual subflows.
1980 		 */
1981 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1982 			sflt_notify(so, sock_evt_disconnected, NULL);
1983 		}
1984 	}
1985 	return error;
1986 }
1987 
1988 int
1989 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1990 {
1991 	int error;
1992 
1993 	socket_lock(so, 1);
1994 	error = sodisconnectxlocked(so, aid, cid);
1995 	socket_unlock(so, 1);
1996 	return error;
1997 }
1998 
1999 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
2000 
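/*
 * Illustrative sketch (editor's addition, not part of XNU): SBLOCKWAIT()
 * maps the caller's MSG_DONTWAIT intent onto sblock()'s wait flag:
 *
 *	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
 *	// flags & MSG_DONTWAIT -> sblock(..., 0): fail with EWOULDBLOCK
 *	// otherwise            -> sblock(..., SBL_WAIT): sleep for the lock
 */
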
2001 /*
2002  * sosendcheck will lock the socket buffer if it isn't locked and
2003  * verify that there is space for the data being inserted.
2004  *
2005  * Returns:	0			Success
2006  *		EPIPE
2007  *	sblock:EWOULDBLOCK
2008  *	sblock:EINTR
2009  *	sbwait:EBADF
2010  *	sbwait:EINTR
2011  *	[so_error]:???
2012  */
2013 int
2014 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
2015     int32_t clen, int32_t atomic, int flags, int *sblocked)
2016 {
2017 	int     error = 0;
2018 	int32_t space;
2019 	int     assumelock = 0;
2020 
2021 restart:
2022 	if (*sblocked == 0) {
2023 		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2024 		    so->so_send_filt_thread != 0 &&
2025 		    so->so_send_filt_thread == current_thread()) {
2026 			/*
2027 			 * We're being called recursively from a filter,
2028 			 * allow this to continue. Radar 4150520.
2029 			 * Don't set sblocked because we don't want
2030 			 * to perform an unlock later.
2031 			 */
2032 			assumelock = 1;
2033 		} else {
2034 			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2035 			if (error) {
2036 				if (so->so_flags & SOF_DEFUNCT) {
2037 					goto defunct;
2038 				}
2039 				return error;
2040 			}
2041 			*sblocked = 1;
2042 		}
2043 	}
2044 
2045 	/*
2046 	 * If a send attempt is made on a socket that has been marked
2047 	 * as inactive (disconnected), reject the request.
2048 	 */
2049 	if (so->so_flags & SOF_DEFUNCT) {
2050 defunct:
2051 		error = EPIPE;
2052 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2053 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
2054 		    so->so_gencnt,
2055 		    SOCK_DOM(so), SOCK_TYPE(so), error);
2056 		return error;
2057 	}
2058 
2059 	if (so->so_state & SS_CANTSENDMORE) {
2060 #if CONTENT_FILTER
2061 		/*
2062 		 * Can re-inject data of half closed connections
2063 		 */
2064 		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2065 		    so->so_snd.sb_cfil_thread == current_thread() &&
2066 		    cfil_sock_data_pending(&so->so_snd) != 0) {
2067 			CFIL_LOG(LOG_INFO,
2068 			    "so %llx ignore SS_CANTSENDMORE",
2069 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2070 		} else
2071 #endif /* CONTENT_FILTER */
2072 		return EPIPE;
2073 	}
2074 	if (so->so_error) {
2075 		error = so->so_error;
2076 		so->so_error = 0;
2077 		return error;
2078 	}
2079 
2080 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2081 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2082 			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2083 			    (resid != 0 || clen == 0) &&
2084 			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2085 				return ENOTCONN;
2086 			}
2087 		} else if (addr == 0) {
2088 			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2089 			       ENOTCONN : EDESTADDRREQ;
2090 		}
2091 	}
2092 
2093 	space = sbspace(&so->so_snd);
2094 
2095 	if (flags & MSG_OOB) {
2096 		space += 1024;
2097 	}
2098 	if ((atomic && resid > so->so_snd.sb_hiwat) ||
2099 	    clen > so->so_snd.sb_hiwat) {
2100 		return EMSGSIZE;
2101 	}
2102 
2103 	if ((space < resid + clen &&
2104 	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2105 	    space < clen)) ||
2106 	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2107 		/*
2108 		 * don't block the connectx call when there's more data
2109 		 * than can be copied.
2110 		 */
2111 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2112 			if (space == 0) {
2113 				return EWOULDBLOCK;
2114 			}
2115 			if (space < (int32_t)so->so_snd.sb_lowat) {
2116 				return 0;
2117 			}
2118 		}
2119 		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2120 		    assumelock) {
2121 			return EWOULDBLOCK;
2122 		}
2123 		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2124 		*sblocked = 0;
2125 		error = sbwait(&so->so_snd);
2126 		if (error) {
2127 			if (so->so_flags & SOF_DEFUNCT) {
2128 				goto defunct;
2129 			}
2130 			return error;
2131 		}
2132 		goto restart;
2133 	}
2134 	return 0;
2135 }
2136 
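/*
 * Illustrative sketch (editor's addition, not part of XNU): sosend() below
 * drives sosendcheck() in a loop, and *sblocked records whether the send
 * buffer lock is still held on the way out:
 *
 *	int sblocked = 0;
 *	error = sosendcheck(so, addr, resid, clen, atomic, flags, &sblocked);
 *	// ... queue data ...
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	// will unlock socket
 *	else
 *		socket_unlock(so, 1);
 */
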
2137 /*
2138  * Send on a socket.
2139  * If send must go all at once and message is larger than
2140  * send buffering, then hard error.
2141  * Lock against other senders.
2142  * If must go all at once and not enough room now, then
2143  * inform user that this would block and do nothing.
2144  * Otherwise, if nonblocking, send as much as possible.
2145  * The data to be sent is described by "uio" if nonzero,
2146  * otherwise by the mbuf chain "top" (which must be null
2147  * if uio is not).  Data provided in mbuf chain must be small
2148  * enough to send all at once.
2149  *
2150  * Returns nonzero on error, timeout or signal; callers
2151  * must check for short counts if EINTR/ERESTART are returned.
2152  * Data and control buffers are freed on return.
2153  *
2154  * Returns:	0			Success
2155  *		EOPNOTSUPP
2156  *		EINVAL
2157  *		ENOBUFS
2158  *	uiomove:EFAULT
2159  *	sosendcheck:EPIPE
2160  *	sosendcheck:EWOULDBLOCK
2161  *	sosendcheck:EINTR
2162  *	sosendcheck:EBADF
2163  *	sosendcheck:EINTR
2164  *	sosendcheck:???			[value from so_error]
2165  *	<pru_send>:ECONNRESET[TCP]
2166  *	<pru_send>:EINVAL[TCP]
2167  *	<pru_send>:ENOBUFS[TCP]
2168  *	<pru_send>:EADDRINUSE[TCP]
2169  *	<pru_send>:EADDRNOTAVAIL[TCP]
2170  *	<pru_send>:EAFNOSUPPORT[TCP]
2171  *	<pru_send>:EACCES[TCP]
2172  *	<pru_send>:EAGAIN[TCP]
2173  *	<pru_send>:EPERM[TCP]
2174  *	<pru_send>:EMSGSIZE[TCP]
2175  *	<pru_send>:EHOSTUNREACH[TCP]
2176  *	<pru_send>:ENETUNREACH[TCP]
2177  *	<pru_send>:ENETDOWN[TCP]
2178  *	<pru_send>:ENOMEM[TCP]
2179  *	<pru_send>:ENOBUFS[TCP]
2180  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2181  *	<pru_send>:EINVAL[AF_UNIX]
2182  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2183  *	<pru_send>:EPIPE[AF_UNIX]
2184  *	<pru_send>:ENOTCONN[AF_UNIX]
2185  *	<pru_send>:EISCONN[AF_UNIX]
2186  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2187  *	<sf_data_out>:???		[whatever a filter author chooses]
2188  *
2189  * Notes:	Other <pru_send> returns depend on the protocol family; all
2190  *		<sf_data_out> returns depend on what the filter author causes
2191  *		their filter to return.
2192  */
2193 int
2194 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2195     struct mbuf *top, struct mbuf *control, int flags)
2196 {
2197 	mbuf_ref_ref_t mp;
2198 	mbuf_ref_t m, freelist = NULL;
2199 	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2200 	user_ssize_t space, len, resid, orig_resid;
2201 	int clen = 0, error, dontroute, sendflags;
2202 	int atomic = sosendallatonce(so) || top;
2203 	int sblocked = 0;
2204 	struct proc *p = current_proc();
2205 	uint16_t headroom = 0;
2206 	ssize_t mlen;
2207 	boolean_t en_tracing = FALSE;
2208 
2209 	if (uio != NULL) {
2210 		resid = uio_resid(uio);
2211 	} else {
2212 		resid = top->m_pkthdr.len;
2213 	}
2214 	orig_resid = resid;
2215 
2216 	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2217 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2218 
2219 	socket_lock(so, 1);
2220 
2221 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2222 		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, SOFLOW_DIRECTION_OUTBOUND, 0);
2223 	}
2224 
2225 	/*
2226 	 * Trace only when tracing is enabled, for network (vs. unix)
2227 	 * sockets, and for non-loopback traffic.
2228 	 */
2229 	if (ENTR_SHOULDTRACE &&
2230 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2231 		struct inpcb *inp = sotoinpcb(so);
2232 		if (inp->inp_last_outifp != NULL &&
2233 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2234 			en_tracing = TRUE;
2235 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2236 			    VM_KERNEL_ADDRPERM(so),
2237 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2238 			    (int64_t)resid);
2239 		}
2240 	}
2241 
2242 	/*
2243 	 * Re-injection should not affect process accounting
2244 	 */
2245 	if ((flags & MSG_SKIPCFIL) == 0) {
2246 		so_update_last_owner_locked(so, p);
2247 		so_update_policy(so);
2248 
2249 #if NECP
2250 		so_update_necp_policy(so, NULL, addr);
2251 #endif /* NECP */
2252 	}
2253 
2254 	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2255 		error = EOPNOTSUPP;
2256 		goto out_locked;
2257 	}
2258 
2259 	/*
2260 	 * In theory resid should be unsigned.
2261 	 * However, space must be signed, as it might be less than 0
2262 	 * if we over-committed, and we must use a signed comparison
2263 	 * of space and resid.  On the other hand, a negative resid
2264 	 * causes us to loop sending 0-length segments to the protocol.
2265 	 *
2266 	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2267 	 *
2268 	 * Note: We limit resid to be a positive int value as we use
2269 	 * imin() to set bytes_to_copy -- radr://14558484
2270 	 */
2271 	if (resid < 0 || resid > INT_MAX ||
2272 	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2273 		error = EINVAL;
2274 		goto out_locked;
2275 	}
2276 
2277 	dontroute = (flags & MSG_DONTROUTE) &&
2278 	    (so->so_options & SO_DONTROUTE) == 0 &&
2279 	    (so->so_proto->pr_flags & PR_ATOMIC);
2280 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2281 
2282 	if (control != NULL) {
2283 		clen = control->m_len;
2284 	}
2285 
2286 	if (soreserveheadroom != 0) {
2287 		headroom = so->so_pktheadroom;
2288 	}
2289 
2290 	do {
2291 		error = sosendcheck(so, addr, resid, clen, atomic, flags,
2292 		    &sblocked);
2293 		if (error) {
2294 			goto out_locked;
2295 		}
2296 
2297 		mp = &top;
2298 		space = sbspace(&so->so_snd) - clen;
2299 		space += ((flags & MSG_OOB) ? 1024 : 0);
2300 
2301 		do {
2302 			if (uio == NULL) {
2303 				/*
2304 				 * Data is prepackaged in "top".
2305 				 */
2306 				resid = 0;
2307 				if (flags & MSG_EOR) {
2308 					top->m_flags |= M_EOR;
2309 				}
2310 			} else {
2311 				int chainlength;
2312 				int bytes_to_copy;
2313 				boolean_t jumbocl;
2314 				boolean_t bigcl;
2315 				int bytes_to_alloc;
2316 
2317 				bytes_to_copy = imin((int)resid, (int)space);
2318 
2319 				bytes_to_alloc = bytes_to_copy;
2320 				if (top == NULL) {
2321 					bytes_to_alloc += headroom;
2322 				}
2323 
2324 				if (sosendminchain > 0) {
2325 					chainlength = 0;
2326 				} else {
2327 					chainlength = sosendmaxchain;
2328 				}
2329 
2330 				/*
2331 				 * Use big 4 KB cluster when the outgoing interface
2332 				 * does not prefer 2 KB clusters
2333 				 */
2334 				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2335 				    sosendbigcl_ignore_capab;
2336 
2337 				/*
2338 				 * Attempt to use larger than system page-size
2339 				 * clusters for large writes only if there is
2340 				 * a jumbo cluster pool and if the socket is
2341 				 * marked accordingly.
2342 				 */
2343 				jumbocl = sosendjcl && njcl > 0 &&
2344 				    ((so->so_flags & SOF_MULTIPAGES) ||
2345 				    sosendjcl_ignore_capab) &&
2346 				    bigcl;
2347 
2348 				socket_unlock(so, 0);
2349 
2350 				do {
2351 					int num_needed;
2352 					int hdrs_needed = (top == NULL) ? 1 : 0;
2353 
2354 					/*
2355 					 * Try to maintain a local cache of mbuf
2356 					 * clusters needed to complete this
2357 					 * write.  The list is further limited
2358 					 * to the number that are currently
2359 					 * needed to fill the socket.  This
2360 					 * mechanism allows a large number of
2361 					 * mbufs/clusters to be grabbed under a
2362 					 * single mbuf lock... if we can't get
2363 					 * any clusters, then fall back to
2364 					 * trying for mbufs.  If we fail early
2365 					 * (or miscalculate the number needed),
2366 					 * make sure to release any clusters we
2367 					 * haven't yet consumed.
2368 					 */
2369 					if (freelist == NULL &&
2370 					    bytes_to_alloc > MBIGCLBYTES &&
2371 					    jumbocl) {
2372 						num_needed =
2373 						    bytes_to_alloc / M16KCLBYTES;
2374 
2375 						if ((bytes_to_alloc -
2376 						    (num_needed * M16KCLBYTES))
2377 						    >= MINCLSIZE) {
2378 							num_needed++;
2379 						}
2380 
2381 						freelist =
2382 						    m_getpackets_internal(
2383 							(unsigned int *)&num_needed,
2384 							hdrs_needed, M_WAIT, 0,
2385 							M16KCLBYTES);
2386 						/*
2387 						 * Fall back to 4K cluster size
2388 						 * if allocation failed
2389 						 */
2390 					}
2391 
2392 					if (freelist == NULL &&
2393 					    bytes_to_alloc > MCLBYTES &&
2394 					    bigcl) {
2395 						num_needed =
2396 						    bytes_to_alloc / MBIGCLBYTES;
2397 
2398 						if ((bytes_to_alloc -
2399 						    (num_needed * MBIGCLBYTES)) >=
2400 						    MINCLSIZE) {
2401 							num_needed++;
2402 						}
2403 
2404 						freelist =
2405 						    m_getpackets_internal(
2406 							(unsigned int *)&num_needed,
2407 							hdrs_needed, M_WAIT, 0,
2408 							MBIGCLBYTES);
2409 						/*
2410 						 * Fall back to cluster size
2411 						 * if allocation failed
2412 						 */
2413 					}
2414 
2415 					/*
2416 					 * Allocate a cluster as we want to
2417 					 * avoid splitting the data into more
2418 					 * than one segment; using MINCLSIZE
2419 					 * would lead us to allocate two mbufs
2420 					 */
2421 					if (soreserveheadroom != 0 &&
2422 					    freelist == NULL &&
2423 					    ((top == NULL &&
2424 					    bytes_to_alloc > _MHLEN) ||
2425 					    bytes_to_alloc > _MLEN)) {
2426 						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2427 						    MCLBYTES;
2428 						freelist =
2429 						    m_getpackets_internal(
2430 							(unsigned int *)&num_needed,
2431 							hdrs_needed, M_WAIT, 0,
2432 							MCLBYTES);
2433 						/*
2434 						 * Fall back to a single mbuf
2435 						 * if allocation failed
2436 						 */
2437 					} else if (freelist == NULL &&
2438 					    bytes_to_alloc > MINCLSIZE) {
2439 						num_needed =
2440 						    bytes_to_alloc / MCLBYTES;
2441 
2442 						if ((bytes_to_alloc -
2443 						    (num_needed * MCLBYTES)) >=
2444 						    MINCLSIZE) {
2445 							num_needed++;
2446 						}
2447 
2448 						freelist =
2449 						    m_getpackets_internal(
2450 							(unsigned int *)&num_needed,
2451 							hdrs_needed, M_WAIT, 0,
2452 							MCLBYTES);
2453 						/*
2454 						 * Fall back to a single mbuf
2455 						 * if allocation failed
2456 						 */
2457 					}
2458 					/*
2459 					 * For datagram protocols, leave
2460 					 * headroom for protocol headers
2461 					 * in the first cluster of the chain
2462 					 */
2463 					if (freelist != NULL && atomic &&
2464 					    top == NULL && headroom > 0) {
2465 						freelist->m_data += headroom;
2466 					}
2467 
2468 					/*
2469 					 * Fall back to regular mbufs without
2470 					 * reserving the socket headroom
2471 					 */
2472 					if (freelist == NULL) {
2473 						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2474 							if (top == NULL) {
2475 								MGETHDR(freelist,
2476 								    M_WAIT, MT_DATA);
2477 							} else {
2478 								MGET(freelist,
2479 								    M_WAIT, MT_DATA);
2480 							}
2481 						}
2482 
2483 						if (freelist == NULL) {
2484 							error = ENOBUFS;
2485 							socket_lock(so, 0);
2486 							goto out_locked;
2487 						}
2488 						/*
2489 						 * For datagram protocols,
2490 						 * leave room for protocol
2491 						 * headers in first mbuf.
2492 						 */
2493 						if (atomic && top == NULL &&
2494 						    bytes_to_copy > 0 &&
2495 						    bytes_to_copy < MHLEN) {
2496 							MH_ALIGN(freelist,
2497 							    bytes_to_copy);
2498 						}
2499 					}
2500 					m = freelist;
2501 					freelist = m->m_next;
2502 					m->m_next = NULL;
2503 
2504 					if ((m->m_flags & M_EXT)) {
2505 						mlen = m->m_ext.ext_size -
2506 						    M_LEADINGSPACE(m);
2507 					} else if ((m->m_flags & M_PKTHDR)) {
2508 						mlen = MHLEN - M_LEADINGSPACE(m);
2509 						m_add_crumb(m, PKT_CRUMB_SOSEND);
2510 					} else {
2511 						mlen = MLEN - M_LEADINGSPACE(m);
2512 					}
2513 					len = imin((int)mlen, bytes_to_copy);
2514 
2515 					chainlength += len;
2516 
2517 					space -= len;
2518 
2519 					error = uiomove(mtod(m, caddr_t),
2520 					    (int)len, uio);
2521 
2522 					resid = uio_resid(uio);
2523 
2524 					m->m_len = (int32_t)len;
2525 					*mp = m;
2526 					top->m_pkthdr.len += len;
2527 					if (error) {
2528 						break;
2529 					}
2530 					mp = &m->m_next;
2531 					if (resid <= 0) {
2532 						if (flags & MSG_EOR) {
2533 							top->m_flags |= M_EOR;
2534 						}
2535 						break;
2536 					}
2537 					bytes_to_copy = imin((int)resid, (int)space);
2538 				} while (space > 0 &&
2539 				    (chainlength < sosendmaxchain || atomic ||
2540 				    resid < MINCLSIZE));
2541 
2542 				socket_lock(so, 0);
2543 
2544 				if (error) {
2545 					goto out_locked;
2546 				}
2547 			}
2548 
2549 			if (dontroute) {
2550 				so->so_options |= SO_DONTROUTE;
2551 			}
2552 
2553 			/*
2554 			 * Compute flags here, for pru_send and NKEs
2555 			 *
2556 			 * If the user set MSG_EOF, the protocol
2557 			 * understands this flag, and there is nothing left
2558 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2559 			 */
2560 			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2561 			    ((flags & MSG_EOF) &&
2562 			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2563 			    (resid <= 0)) ? PRUS_EOF :
2564 			    /* If there is more to send set PRUS_MORETOCOME */
2565 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2566 
2567 			if ((flags & MSG_SKIPCFIL) == 0) {
2568 				/*
2569 				 * Socket filter processing
2570 				 */
2571 				error = sflt_data_out(so, addr, &top,
2572 				    &control, (sendflags & MSG_OOB) ?
2573 				    sock_data_filt_flag_oob : 0);
2574 				if (error) {
2575 					if (error == EJUSTRETURN) {
2576 						error = 0;
2577 						goto packet_consumed;
2578 					}
2579 					goto out_locked;
2580 				}
2581 #if CONTENT_FILTER
2582 				/*
2583 				 * Content filter processing
2584 				 */
2585 				error = cfil_sock_data_out(so, addr, top,
2586 				    control, sendflags, dgram_flow_entry);
2587 				if (error) {
2588 					if (error == EJUSTRETURN) {
2589 						error = 0;
2590 						goto packet_consumed;
2591 					}
2592 					goto out_locked;
2593 				}
2594 #endif /* CONTENT_FILTER */
2595 			}
2596 			error = (*so->so_proto->pr_usrreqs->pru_send)
2597 			    (so, sendflags, top, addr, control, p);
2598 
2599 packet_consumed:
2600 			if (dontroute) {
2601 				so->so_options &= ~SO_DONTROUTE;
2602 			}
2603 
2604 			clen = 0;
2605 			control = NULL;
2606 			top = NULL;
2607 			mp = &top;
2608 			if (error) {
2609 				goto out_locked;
2610 			}
2611 		} while (resid && space > 0);
2612 	} while (resid);
2613 
2614 
2615 out_locked:
2616 	if (resid > orig_resid) {
2617 		char pname[MAXCOMLEN] = {};
2618 		pid_t current_pid = proc_pid(current_proc());
2619 		proc_name(current_pid, pname, sizeof(pname));
2620 
2621 		if (sosend_assert_panic != 0) {
2622 			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2623 			    so, resid, orig_resid, pname, current_pid);
2624 		} else {
2625 			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2626 			    so->so_gencnt, resid, orig_resid, pname, current_pid);
2627 		}
2628 	}
2629 
2630 	if (sblocked) {
2631 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2632 	} else {
2633 		socket_unlock(so, 1);
2634 	}
2635 	if (top != NULL) {
2636 		m_freem(top);
2637 	}
2638 	if (control != NULL) {
2639 		m_freem(control);
2640 	}
2641 	if (freelist != NULL) {
2642 		m_freem_list(freelist);
2643 	}
2644 
2645 	if (dgram_flow_entry != NULL) {
2646 		soflow_free_flow(dgram_flow_entry);
2647 	}
2648 
2649 	soclearfastopen(so);
2650 
2651 	if (en_tracing) {
2652 		/* resid passed here is the bytes left in uio */
2653 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2654 		    VM_KERNEL_ADDRPERM(so),
2655 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2656 		    (int64_t)(orig_resid - resid));
2657 	}
2658 	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2659 	    so->so_snd.sb_cc, space, error);
2660 
2661 	return error;
2662 }
2663 
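/*
 * Illustrative sketch (editor's addition, not part of XNU): per the
 * contract above, sosend() takes its data either from a uio or from a
 * prepackaged mbuf chain, never both:
 *
 *	// user data described by a uio:
 *	error = sosend(so, addr, uio, NULL, control, flags);
 *
 *	// prepackaged data; "top" carries M_PKTHDR and a valid m_pkthdr.len:
 *	error = sosend(so, addr, NULL, top, control, flags);
 */
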
2664 int
2665 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2666 {
2667 	struct mbuf *m0 = NULL, *control_end = NULL;
2668 
2669 	socket_lock_assert_owned(so);
2670 
2671 	/*
2672 	 * top must point to the mbuf chain to be sent.
2673 	 * If control is not NULL, top must be a packet header.
2674 	 */
2675 	VERIFY(top != NULL &&
2676 	    (control == NULL || top->m_flags & M_PKTHDR));
2677 
2678 	/*
2679 	 * If control is not passed in, see if we can get it
2680 	 * from top.
2681 	 */
2682 	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2683 		// Locate start of control if present and start of data
2684 		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2685 			if (m0->m_flags & M_PKTHDR) {
2686 				top = m0;
2687 				break;
2688 			} else if (m0->m_type == MT_CONTROL) {
2689 				if (control == NULL) {
2690 					// Found start of control
2691 					control = m0;
2692 				}
2693 				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2694 					// Found end of control
2695 					control_end = m0;
2696 				}
2697 			}
2698 		}
2699 		if (control_end != NULL) {
2700 			control_end->m_next = NULL;
2701 		}
2702 	}
2703 
2704 	int error = (*so->so_proto->pr_usrreqs->pru_send)
2705 	    (so, sendflags, top, addr, control, current_proc());
2706 
2707 	return error;
2708 }
2709 
2710 static struct mbuf *
2711 mbuf_detach_control_from_list(struct mbuf **mp, struct mbuf **last_control)
2712 {
2713 	struct mbuf *control = NULL;
2714 	struct mbuf *m = *mp;
2715 
2716 	if (m->m_type == MT_CONTROL) {
2717 		struct mbuf *control_end;
2718 		struct mbuf *n;
2719 
2720 		n = control_end = control = m;
2721 
2722 		/*
2723 		 * Break the chain per mbuf type
2724 		 */
2725 		while (n != NULL && n->m_type == MT_CONTROL) {
2726 			control_end = n;
2727 			n = n->m_next;
2728 		}
2729 		control_end->m_next = NULL;
2730 		*mp = n;
2731 		if (last_control != NULL) {
2732 			*last_control = control_end;
2733 		}
2734 	}
2735 	VERIFY(*mp != NULL);
2736 
2737 	return control;
2738 }
2739 
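/*
 * Illustrative sketch (editor's addition, not part of XNU): given a packet
 * laid out as
 *
 *	MT_CONTROL -> MT_CONTROL -> MT_DATA -> MT_DATA
 *
 * mbuf_detach_control_from_list() returns the control run and leaves *mp
 * pointing at the first data mbuf:
 *
 *	struct mbuf *last_control = NULL;
 *	struct mbuf *control = mbuf_detach_control_from_list(&m, &last_control);
 *	// control      -> MT_CONTROL -> MT_CONTROL (NULL-terminated)
 *	// m            -> MT_DATA -> MT_DATA
 *	// last_control -> the second MT_CONTROL mbuf
 */
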
2740 /*
2741  * Supported only for connected sockets (no address) without ancillary data
2742  * (control mbuf), for atomic protocols.
2743  */
2744 int
2745 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2746 {
2747 	mbuf_ref_t m, control = NULL;
2748 	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2749 	int error, dontroute;
2750 	int atomic = sosendallatonce(so);
2751 	int sblocked = 0;
2752 	struct proc *p = current_proc();
2753 	struct mbuf *top = pktlist;
2754 	bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2755 
2756 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, total_len,
2757 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2758 
2759 	if (so->so_type != SOCK_DGRAM) {
2760 		error = EINVAL;
2761 		os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2762 		    error);
2763 		goto out;
2764 	}
2765 	if (atomic == 0) {
2766 		error = EINVAL;
2767 		os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2768 		    error);
2769 		goto out;
2770 	}
2771 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2772 		error = ENOTCONN;
2773 		os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2774 		    error);
2775 		goto out;
2776 	}
2777 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2778 		error = EINVAL;
2779 		os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2780 		    flags, error);
2781 		goto out;
2782 	}
2783 
2784 	socket_lock(so, 1);
2785 	so_update_last_owner_locked(so, p);
2786 	so_update_policy(so);
2787 
2788 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2789 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, SOFLOW_DIRECTION_OUTBOUND, 0);
2790 	}
2791 
2792 #if NECP
2793 	so_update_necp_policy(so, NULL, NULL);
2794 #endif /* NECP */
2795 
2796 	dontroute = (flags & MSG_DONTROUTE) &&
2797 	    (so->so_options & SO_DONTROUTE) == 0 &&
2798 	    (so->so_proto->pr_flags & PR_ATOMIC);
2799 	if (dontroute) {
2800 		so->so_options |= SO_DONTROUTE;
2801 	}
2802 
2803 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2804 
2805 	error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2806 	if (error) {
2807 		os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2808 		    error);
2809 		goto release;
2810 	}
2811 
2812 	if (!skip_filt) {
2813 		mbuf_ref_ref_t prevnextp = NULL;
2814 
2815 		for (m = top; m != NULL; m = m->m_nextpkt) {
2816 			mbuf_ref_t nextpkt, last_control;
2817 
2818 			/*
2819 			 * Remove packet from the list of packets
2820 			 */
2821 			nextpkt = m->m_nextpkt;
2822 			if (prevnextp != NULL) {
2823 				*prevnextp = nextpkt;
2824 			} else {
2825 				top = nextpkt;
2826 			}
2827 			m->m_nextpkt = NULL;
2828 
2829 			/*
2830 			 * Break the chain per mbuf type
2831 			 */
2832 			if (m->m_type == MT_CONTROL) {
2833 				control = mbuf_detach_control_from_list(&m, &last_control);
2834 			}
2835 			/*
2836 			 * Socket filter processing
2837 			 */
2838 			error = sflt_data_out(so, NULL, &m,
2839 			    &control, 0);
2840 			if (error != 0 && error != EJUSTRETURN) {
2841 				os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2842 				    error);
2843 				m_freem(m);
2844 				goto release;
2845 			}
2846 
2847 #if CONTENT_FILTER
2848 			if (error == 0) {
2849 				/*
2850 				 * Content filter processing
2851 				 */
2852 				error = cfil_sock_data_out(so, NULL, m,
2853 				    control, 0, dgram_flow_entry);
2854 				if (error != 0 && error != EJUSTRETURN) {
2855 					os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2856 					    error);
2857 					m_freem(m);
2858 					goto release;
2859 				}
2860 			}
2861 #endif /* CONTENT_FILTER */
2862 			if (error == EJUSTRETURN) {
2863 				/*
2864 				 * When swallowed by a filter, the packet is not
2865 				 * in the list anymore
2866 				 */
2867 				error = 0;
2868 			} else {
2869 				/*
2870 				 * Rebuild the mbuf chain of the packet
2871 				 */
2872 				if (control != NULL) {
2873 					last_control->m_next = m;
2874 					m = control;
2875 				}
2876 				/*
2877 				 * Reinsert the packet in the list of packets
2878 				 */
2879 				m->m_nextpkt = nextpkt;
2880 				if (prevnextp != NULL) {
2881 					*prevnextp = m;
2882 				} else {
2883 					top = m;
2884 				}
2885 				prevnextp = &m->m_nextpkt;
2886 			}
2887 			control = NULL;
2888 		}
2889 	}
2890 
2891 	if (top != NULL) {
2892 		if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2893 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2894 			    (so, top, pktcnt, flags);
2895 			if (error != 0 && error != ENOBUFS) {
2896 				os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2897 				    error);
2898 			}
2899 			top = NULL;
2900 		} else {
2901 			*pktcnt = 0;
2902 			control = NULL;
2903 			for (m = top; m != NULL; m = top) {
2904 				top = m->m_nextpkt;
2905 				m->m_nextpkt = NULL;
2906 
2907 				/*
2908 				 * Break the chain per mbuf type
2909 				 */
2910 				if (m->m_type == MT_CONTROL) {
2911 					control = mbuf_detach_control_from_list(&m, NULL);
2912 				}
2913 
2914 				error = (*so->so_proto->pr_usrreqs->pru_send)
2915 				    (so, 0, m, NULL, control, current_proc());
2916 				if (error != 0) {
2917 					if (error != ENOBUFS) {
2918 						os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2919 						    error);
2920 					}
2921 					control = NULL;
2922 					goto release;
2923 				}
2924 				*pktcnt += 1;
2925 				control = NULL;
2926 			}
2927 		}
2928 	}
2929 
2930 release:
2931 	if (dontroute) {
2932 		so->so_options &= ~SO_DONTROUTE;
2933 	}
2934 	if (sblocked) {
2935 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2936 	} else {
2937 		socket_unlock(so, 1);
2938 	}
2939 out:
2940 	if (control != NULL) {
2941 		m_freem(control);
2942 	}
2943 	if (top != NULL) {
2944 		if (error != ENOBUFS) {
2945 			os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2946 			    error);
2947 		}
2948 		m_freem_list(top);
2949 	}
2950 
2951 	if (dgram_flow_entry != NULL) {
2952 		soflow_free_flow(dgram_flow_entry);
2953 	}
2954 
2955 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, total_len,
2956 	    so->so_snd.sb_cc, 0, error);
2957 
2958 	return error;
2959 }
2960 
2961 /*
2962  * May return ERESTART when packet is dropped by MAC policy check
2963  */
2964 static int
2965 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2966     struct mbuf **maddrp,
2967     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2968 {
2969 	int error = 0;
2970 	struct mbuf *m = *mp;
2971 	struct mbuf *nextrecord = *nextrecordp;
2972 
2973 	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2974 #if CONFIG_MACF_SOCKET_SUBSET
2975 	/*
2976 	 * Call the MAC framework for policy checking if we're in
2977 	 * the user process context and the socket isn't connected.
2978 	 */
2979 	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2980 		struct mbuf *m0 = m;
2981 		/*
2982 		 * Dequeue this record (temporarily) from the receive
2983 		 * list since we're about to drop the socket's lock
2984 		 * where a new record may arrive and be appended to
2985 		 * the list.  Upon MAC policy failure, the record
2986 		 * will be freed.  Otherwise, we'll add it back to
2987 		 * the head of the list.  We cannot rely on SB_LOCK
2988 		 * because append operation uses the socket's lock.
2989 		 */
2990 		do {
2991 			m->m_nextpkt = NULL;
2992 			sbfree(&so->so_rcv, m);
2993 			m = m->m_next;
2994 		} while (m != NULL);
2995 		m = m0;
2996 		so->so_rcv.sb_mb = nextrecord;
2997 		SB_EMPTY_FIXUP(&so->so_rcv);
2998 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2999 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3000 		socket_unlock(so, 0);
3001 
3002 		error = mac_socket_check_received(kauth_cred_get(), so,
3003 		    mtod(m, struct sockaddr *));
3004 
3005 		if (error != 0) {
3006 			/*
3007 			 * MAC policy failure; free this record and
3008 			 * process the next record (or block until
3009 			 * one is available).  We have adjusted sb_cc
3010 			 * and sb_mbcnt above so there is no need to
3011 			 * call sbfree() again.
3012 			 */
3013 			m_freem(m);
3014 			/*
3015 			 * Clear SB_LOCK but don't unlock the socket.
3016 			 * Process the next record or wait for one.
3017 			 */
3018 			socket_lock(so, 0);
3019 			sbunlock(&so->so_rcv, TRUE); /* stay locked */
3020 			error = ERESTART;
3021 			goto done;
3022 		}
3023 		socket_lock(so, 0);
3024 		/*
3025 		 * If the socket has been defunct'd, drop it.
3026 		 */
3027 		if (so->so_flags & SOF_DEFUNCT) {
3028 			m_freem(m);
3029 			error = ENOTCONN;
3030 			goto done;
3031 		}
3032 		/*
3033 		 * Re-adjust the socket receive list and re-enqueue
3034 		 * the record in front of any packets which may have
3035 		 * been appended while we dropped the lock.
3036 		 */
3037 		for (m = m0; m->m_next != NULL; m = m->m_next) {
3038 			sballoc(&so->so_rcv, m);
3039 		}
3040 		sballoc(&so->so_rcv, m);
3041 		if (so->so_rcv.sb_mb == NULL) {
3042 			so->so_rcv.sb_lastrecord = m0;
3043 			so->so_rcv.sb_mbtail = m;
3044 		}
3045 		m = m0;
3046 		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3047 		so->so_rcv.sb_mb = m;
3048 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3049 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3050 	}
3051 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3052 	if (psa != NULL) {
3053 		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3054 		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3055 			error = EWOULDBLOCK;
3056 			goto done;
3057 		}
3058 	} else if (maddrp != NULL) {
3059 		*maddrp = m;
3060 	}
3061 	if (flags & MSG_PEEK) {
3062 		m = m->m_next;
3063 	} else {
3064 		sbfree(&so->so_rcv, m);
3065 		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3066 			panic("%s: about to create invalid socketbuf",
3067 			    __func__);
3068 			/* NOTREACHED */
3069 		}
3070 		if (maddrp == NULL) {
3071 			MFREE(m, so->so_rcv.sb_mb);
3072 		} else {
3073 			so->so_rcv.sb_mb = m->m_next;
3074 			m->m_next = NULL;
3075 		}
3076 		m = so->so_rcv.sb_mb;
3077 		if (m != NULL) {
3078 			m->m_nextpkt = nextrecord;
3079 		} else {
3080 			so->so_rcv.sb_mb = nextrecord;
3081 			SB_EMPTY_FIXUP(&so->so_rcv);
3082 		}
3083 	}
3084 done:
3085 	*mp = m;
3086 	*nextrecordp = nextrecord;
3087 
3088 	return error;
3089 }
3090 
3091 /*
3092  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3093  * so clear the data portion in order not to leak the file pointers.
3094  */
3095 static void
3096 sopeek_scm_rights(struct mbuf *rights)
3097 {
3098 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3099 
3100 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3101 		VERIFY(cm->cmsg_len <= rights->m_len);
3102 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3103 	}
3104 }
3105 
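/*
 * Illustrative sketch (editor's addition, not part of XNU): for an AF_UNIX
 * receiver the effect is visible through recvmsg(2) with MSG_PEEK; the cmsg
 * header survives but the descriptor slots read as zero:
 *
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *	if (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
 *	    cm->cmsg_type == SCM_RIGHTS) {
 *		int *fds = (int *)(void *)CMSG_DATA(cm);
 *		// with MSG_PEEK, fds[i] == 0; real descriptors are only
 *		// created when the message is consumed without MSG_PEEK
 *	}
 */
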
3106 /*
3107  * Process one or more MT_CONTROL mbufs present before any data mbufs
3108  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3109  * just copy the data; if !MSG_PEEK, we call into the protocol to
3110  * perform externalization.
3111  */
3112 static int
3113 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3114     struct mbuf **mp, struct mbuf **nextrecordp)
3115 {
3116 	int error = 0;
3117 	mbuf_ref_t cm = NULL, cmn;
3118 	mbuf_ref_ref_t cme = &cm;
3119 	struct sockbuf *sb_rcv = &so->so_rcv;
3120 	mbuf_ref_ref_t msgpcm = NULL;
3121 	mbuf_ref_t m = *mp;
3122 	mbuf_ref_t nextrecord = *nextrecordp;
3123 	struct protosw *pr = so->so_proto;
3124 
3125 	/*
3126 	 * Externalizing the control messages would require us to
3127 	 * drop the socket's lock below.  Once we re-acquire the
3128 	 * lock, the mbuf chain might change.  In order to preserve
3129 	 * consistency, we unlink all control messages from the
3130 	 * first mbuf chain in one shot and link them separately
3131 	 * onto a different chain.
3132 	 */
3133 	do {
3134 		if (flags & MSG_PEEK) {
3135 			if (controlp != NULL) {
3136 				if (*controlp == NULL) {
3137 					msgpcm = controlp;
3138 				}
3139 				*controlp = m_copy(m, 0, m->m_len);
3140 
3141 				/*
3142 				 * If we failed to allocate an mbuf,
3143 				 * release any previously allocated
3144 				 * mbufs for control data. Return
3145 				 * an error. Keep the mbufs in the
3146 				 * socket as this is using
3147 				 * MSG_PEEK flag.
3148 				 */
3149 				if (*controlp == NULL) {
3150 					m_freem(*msgpcm);
3151 					error = ENOBUFS;
3152 					goto done;
3153 				}
3154 
3155 				if (pr->pr_domain->dom_externalize != NULL) {
3156 					sopeek_scm_rights(*controlp);
3157 				}
3158 
3159 				controlp = &(*controlp)->m_next;
3160 			}
3161 			m = m->m_next;
3162 		} else {
3163 			m->m_nextpkt = NULL;
3164 			sbfree(sb_rcv, m);
3165 			sb_rcv->sb_mb = m->m_next;
3166 			m->m_next = NULL;
3167 			*cme = m;
3168 			cme = &(*cme)->m_next;
3169 			m = sb_rcv->sb_mb;
3170 		}
3171 	} while (m != NULL && m->m_type == MT_CONTROL);
3172 
3173 	if (!(flags & MSG_PEEK)) {
3174 		if (sb_rcv->sb_mb != NULL) {
3175 			sb_rcv->sb_mb->m_nextpkt = nextrecord;
3176 		} else {
3177 			sb_rcv->sb_mb = nextrecord;
3178 			SB_EMPTY_FIXUP(sb_rcv);
3179 		}
3180 		if (nextrecord == NULL) {
3181 			sb_rcv->sb_lastrecord = m;
3182 		}
3183 	}
3184 
3185 	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3186 	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3187 
3188 	while (cm != NULL) {
3189 		int cmsg_level;
3190 		int cmsg_type;
3191 
3192 		cmn = cm->m_next;
3193 		cm->m_next = NULL;
3194 		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3195 		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3196 
3197 		/*
3198 		 * Call the protocol to externalize SCM_RIGHTS message
3199 		 * and return the modified message to the caller upon
3200 		 * success.  Otherwise, all other control messages are
3201 		 * returned unmodified to the caller.  Note that we
3202 		 * only get into this loop if MSG_PEEK is not set.
3203 		 */
3204 		if (pr->pr_domain->dom_externalize != NULL &&
3205 		    cmsg_level == SOL_SOCKET &&
3206 		    cmsg_type == SCM_RIGHTS) {
3207 			/*
3208 			 * Release socket lock: see 3903171.  This
3209 			 * would also allow more records to be appended
3210 			 * to the socket buffer.  We still have SB_LOCK
3211 			 * set on it, so we can be sure that the head
3212 			 * of the mbuf chain won't change.
3213 			 */
3214 			socket_unlock(so, 0);
3215 			error = (*pr->pr_domain->dom_externalize)(cm);
3216 			socket_lock(so, 0);
3217 		} else {
3218 			error = 0;
3219 		}
3220 
3221 		if (controlp != NULL && error == 0) {
3222 			*controlp = cm;
3223 			controlp = &(*controlp)->m_next;
3224 		} else {
3225 			(void) m_free(cm);
3226 		}
3227 		cm = cmn;
3228 	}
3229 	/*
3230 	 * Update the value of nextrecord in case we received new
3231 	 * records when the socket was unlocked above for
3232 	 * externalizing SCM_RIGHTS.
3233 	 */
3234 	if (m != NULL) {
3235 		nextrecord = sb_rcv->sb_mb->m_nextpkt;
3236 	} else {
3237 		nextrecord = sb_rcv->sb_mb;
3238 	}
3239 
3240 done:
3241 	*mp = m;
3242 	*nextrecordp = nextrecord;
3243 
3244 	return error;
3245 }
3246 
3247 /*
3248  * If we have less data than requested, block awaiting more
3249  * (subject to any timeout) if:
3250  *   1. the current count is less than the low water mark, or
3251  *   2. MSG_WAITALL is set, and it is possible to do the entire
3252  *	receive operation at once if we block (resid <= hiwat).
3253  *   3. MSG_DONTWAIT is not set
3254  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3255  * we have to do the receive in sections, and thus risk returning
3256  * a short count if a timeout or signal occurs after we start.
3257  */
3258 static boolean_t
3259 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3260 {
3261 	struct protosw *pr = so->so_proto;
3262 
3263 	/* No mbufs in the receive-queue? Wait! */
3264 	if (m == NULL) {
3265 		return true;
3266 	}
3267 
3268 	/* Not enough data in the receive socket-buffer - we may have to wait */
3269 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3270 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3271 		/*
3272 		 * Application did set the low-water mark, so we should wait for
3273 		 * this data to be present.
3274 		 */
3275 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3276 			return true;
3277 		}
3278 
3279 		/*
3280 		 * Application wants all the data - so let's try to do the
3281 		 * receive-operation at once by waiting for everything to
3282 		 * be there.
3283 		 */
3284 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3285 			return true;
3286 		}
3287 	}
3288 
3289 	return false;
3290 }
3291 
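/*
 * Illustrative sketch (editor's addition, not part of XNU): from user space
 * the two wait conditions above correspond to SO_RCVLOWAT and MSG_WAITALL:
 *
 *	int lowat = 512;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);	// blocks until >= 512 bytes
 *	n = recv(fd, buf, sizeof(buf), MSG_WAITALL);	// blocks for the full
 *							// request (resid <= hiwat)
 */
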
3292 /*
3293  * Implement receive operations on a socket.
3294  * We depend on the way that records are added to the sockbuf
3295  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3296  * must begin with an address if the protocol so specifies,
3297  * followed by an optional mbuf or mbufs containing ancillary data,
3298  * and then zero or more mbufs of data.
3299  * In order to avoid blocking network interrupts for the entire time here,
3300  * we splx() while doing the actual copy to user space.
3301  * Although the sockbuf is locked, new data may still be appended,
3302  * and thus we must maintain consistency of the sockbuf during that time.
3303  *
3304  * The caller may receive the data as a single mbuf chain by supplying
3305  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3306  * only for the count in uio_resid.
3307  *
3308  * Returns:	0			Success
3309  *		ENOBUFS
3310  *		ENOTCONN
3311  *		EWOULDBLOCK
3312  *	uiomove:EFAULT
3313  *	sblock:EWOULDBLOCK
3314  *	sblock:EINTR
3315  *	sbwait:EBADF
3316  *	sbwait:EINTR
3317  *	sodelayed_copy:EFAULT
3318  *	<pru_rcvoob>:EINVAL[TCP]
3319  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3320  *	<pru_rcvoob>:???
3321  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3322  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3323  *	<pr_domain->dom_externalize>:???
3324  *
3325  * Notes:	Additional return values from calls through <pru_rcvoob> and
3326  *		<pr_domain->dom_externalize> depend on protocols other than
3327  *		TCP or AF_UNIX, which are documented above.
3328  */
3329 int
3330 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3331     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3332 {
3333 	mbuf_ref_t m;
3334 	mbuf_ref_ref_t mp;
3335 	mbuf_ref_t ml = NULL;
3336 	mbuf_ref_t nextrecord, free_list;
3337 	int flags, error, offset;
3338 	user_ssize_t len;
3339 	struct protosw *pr = so->so_proto;
3340 	int moff, type = 0;
3341 	user_ssize_t orig_resid = uio_resid(uio);
3342 	user_ssize_t delayed_copy_len;
3343 	int can_delay;
3344 	struct proc *p = current_proc();
3345 	boolean_t en_tracing = FALSE;
3346 
3347 	/*
3348 	 * Sanity check on the length passed by caller as we are making 'int'
3349 	 * comparisons
3350 	 */
3351 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3352 		return EINVAL;
3353 	}
3354 
3355 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3356 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3357 	    so->so_rcv.sb_hiwat);
3358 
3359 	socket_lock(so, 1);
3360 	so_update_last_owner_locked(so, p);
3361 	so_update_policy(so);
3362 
3363 #ifdef MORE_LOCKING_DEBUG
3364 	if (so->so_usecount == 1) {
3365 		panic("%s: so=%x no other reference on socket", __func__, so);
3366 		/* NOTREACHED */
3367 	}
3368 #endif
3369 	mp = mp0;
3370 	if (psa != NULL) {
3371 		*psa = NULL;
3372 	}
3373 	if (controlp != NULL) {
3374 		*controlp = NULL;
3375 	}
3376 	if (flagsp != NULL) {
3377 		flags = *flagsp & ~MSG_EOR;
3378 	} else {
3379 		flags = 0;
3380 	}
3381 
3382 	/*
3383 	 * If a recv attempt is made on a previously-accepted socket
3384 	 * that has been marked as inactive (disconnected), reject
3385 	 * the request.
3386 	 */
3387 	if (so->so_flags & SOF_DEFUNCT) {
3388 		struct sockbuf *sb = &so->so_rcv;
3389 
3390 		error = ENOTCONN;
3391 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3392 		    __func__, proc_pid(p), proc_best_name(p),
3393 		    so->so_gencnt,
3394 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3395 		/*
3396 		 * This socket should have been disconnected and flushed
3397 		 * prior to being returned from sodefunct(); there should
3398 		 * be no data on its receive list, so panic otherwise.
3399 		 */
3400 		if (so->so_state & SS_DEFUNCT) {
3401 			sb_empty_assert(sb, __func__);
3402 		}
3403 		socket_unlock(so, 1);
3404 		return error;
3405 	}
3406 
3407 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3408 	    pr->pr_usrreqs->pru_preconnect) {
3409 		/*
3410 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3411 		 * calling write() right after this. *If* the app calls a read
3412 		 * we do not want to block this read indefinitely. Thus,
3413 		 * we trigger a connect so that the session gets initiated.
3414 		 */
3415 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3416 
3417 		if (error) {
3418 			socket_unlock(so, 1);
3419 			return error;
3420 		}
3421 	}
3422 
3423 	if (ENTR_SHOULDTRACE &&
3424 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3425 		/*
3426 		 * enable energy tracing for inet sockets that go over
3427 		 * non-loopback interfaces only.
3428 		 */
3429 		struct inpcb *inp = sotoinpcb(so);
3430 		if (inp->inp_last_outifp != NULL &&
3431 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3432 			en_tracing = TRUE;
3433 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3434 			    VM_KERNEL_ADDRPERM(so),
3435 			    ((so->so_state & SS_NBIO) ?
3436 			    kEnTrFlagNonBlocking : 0),
3437 			    (int64_t)orig_resid);
3438 		}
3439 	}
3440 
3441 	/*
3442 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3443 	 * regardless of the flags argument. Here is the case where
3444 	 * out-of-band data is not inline.
3445 	 */
3446 	if ((flags & MSG_OOB) ||
3447 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3448 	    (so->so_options & SO_OOBINLINE) == 0 &&
3449 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3450 		m = m_get(M_WAIT, MT_DATA);
3451 		if (m == NULL) {
3452 			socket_unlock(so, 1);
3453 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3454 			    ENOBUFS, 0, 0, 0, 0);
3455 			return ENOBUFS;
3456 		}
3457 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3458 		if (error) {
3459 			goto bad;
3460 		}
3461 		socket_unlock(so, 0);
3462 		do {
3463 			error = uiomove(mtod(m, caddr_t),
3464 			    imin((int)uio_resid(uio), m->m_len), uio);
3465 			m = m_free(m);
3466 		} while (uio_resid(uio) && error == 0 && m != NULL);
3467 		socket_lock(so, 0);
3468 bad:
3469 		if (m != NULL) {
3470 			m_freem(m);
3471 		}
3472 
3473 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3474 			if (error == EWOULDBLOCK || error == EINVAL) {
3475 				/*
3476 				 * Let's try to get normal data:
3477 				 * EWOULDBLOCK: out-of-band data not
3478 				 * received yet. EINVAL: out-of-band data
3479 				 * already read.
3480 				 */
3481 				error = 0;
3482 				goto nooob;
3483 			} else if (error == 0 && flagsp != NULL) {
3484 				*flagsp |= MSG_OOB;
3485 			}
3486 		}
3487 		socket_unlock(so, 1);
3488 		if (en_tracing) {
3489 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3490 			    VM_KERNEL_ADDRPERM(so), 0,
3491 			    (int64_t)(orig_resid - uio_resid(uio)));
3492 		}
3493 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3494 		    0, 0, 0, 0);
3495 
3496 		return error;
3497 	}
3498 nooob:
3499 	if (mp != NULL) {
3500 		*mp = NULL;
3501 	}
3502 
3503 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3504 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3505 	}
3506 
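	/*
	 * free_list collects mbufs whose contents have not yet been copied
	 * to the caller; delayed_copy_len counts the bytes held there.
	 * Deferring the uiomove() lets sodelayed_copy() batch the copies
	 * instead of dropping and retaking the socket lock per mbuf.
	 */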
3507 	free_list = NULL;
3508 	delayed_copy_len = 0;
3509 restart:
3510 #ifdef MORE_LOCKING_DEBUG
3511 	if (so->so_usecount <= 1) {
3512 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3513 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3514 	}
3515 #endif
3516 	/*
3517 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3518 	 * and if so just return to the caller.  This could happen when
3519 	 * soreceive() is called by a socket upcall function during the
3520 	 * time the socket is freed.  The socket buffer would have been
3521 	 * locked across the upcall, therefore we cannot put this thread
3522 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3523 	 * we may livelock), because the lock on the socket buffer will
3524 	 * only be released when the upcall routine returns to its caller.
3525 	 * Because the socket has been officially closed, there can be
3526 	 * no further read on it.
3527 	 *
3528 	 * A multipath subflow socket would have its SS_NOFDREF set by
3529 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3530 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3531 	 */
3532 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3533 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3534 		socket_unlock(so, 1);
3535 		return 0;
3536 	}
3537 
3538 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3539 	if (error) {
3540 		socket_unlock(so, 1);
3541 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3542 		    0, 0, 0, 0);
3543 		if (en_tracing) {
3544 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3545 			    VM_KERNEL_ADDRPERM(so), 0,
3546 			    (int64_t)(orig_resid - uio_resid(uio)));
3547 		}
3548 		return error;
3549 	}
3550 
3551 	m = so->so_rcv.sb_mb;
3552 	if (so_should_wait(so, uio, m, flags)) {
3553 		/*
3554 		 * Panic if we notice inconsistencies in the socket's
3555 		 * receive list; both sb_mb and sb_cc should correctly
3556 		 * reflect the contents of the list, otherwise we may
3557 		 * end up with false positives during select() or poll()
3558 		 * which could put the application in a bad state.
3559 		 */
3560 		SB_MB_CHECK(&so->so_rcv);
3561 
3562 		if (so->so_error) {
3563 			if (m != NULL) {
3564 				goto dontblock;
3565 			}
3566 			error = so->so_error;
3567 			if ((flags & MSG_PEEK) == 0) {
3568 				so->so_error = 0;
3569 			}
3570 			goto release;
3571 		}
3572 		if (so->so_state & SS_CANTRCVMORE) {
3573 #if CONTENT_FILTER
3574 			/*
3575 			 * Deal with half closed connections
3576 			 */
3577 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3578 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3579 				CFIL_LOG(LOG_INFO,
3580 				    "so %llx ignore SS_CANTRCVMORE",
3581 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3582 			} else
3583 #endif /* CONTENT_FILTER */
3584 			if (m != NULL) {
3585 				goto dontblock;
3586 			} else {
3587 				goto release;
3588 			}
3589 		}
3590 		for (; m != NULL; m = m->m_next) {
3591 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3592 				m = so->so_rcv.sb_mb;
3593 				goto dontblock;
3594 			}
3595 		}
3596 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3597 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3598 			error = ENOTCONN;
3599 			goto release;
3600 		}
3601 		if (uio_resid(uio) == 0) {
3602 			goto release;
3603 		}
3604 
3605 		if ((so->so_state & SS_NBIO) ||
3606 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3607 			error = EWOULDBLOCK;
3608 			goto release;
3609 		}
3610 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3611 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3612 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3613 #if EVEN_MORE_LOCKING_DEBUG
3614 		if (socket_debug) {
3615 			printf("Waiting for socket data\n");
3616 		}
3617 #endif
3618 
3619 		/*
3620 		 * Depending on the protocol (e.g. TCP), the following
3621 		 * might cause the socket lock to be dropped and later
3622 		 * be reacquired, and more data could have arrived and
3623 		 * have been appended to the receive socket buffer by
3624 	 * the time it returns.  Therefore, we sleep in
3625 	 * sbwait() below only if the wait-condition is still
3626 	 * true.
3627 		 */
3628 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3629 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3630 		}
3631 
3632 		error = 0;
3633 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3634 			error = sbwait(&so->so_rcv);
3635 		}
3636 
3637 #if EVEN_MORE_LOCKING_DEBUG
3638 		if (socket_debug) {
3639 			printf("SORECEIVE - sbwait returned %d\n", error);
3640 		}
3641 #endif
3642 		if (so->so_usecount < 1) {
3643 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3644 			    __func__, so, so->so_usecount);
3645 			/* NOTREACHED */
3646 		}
3647 		if (error) {
3648 			socket_unlock(so, 1);
3649 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3650 			    0, 0, 0, 0);
3651 			if (en_tracing) {
3652 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3653 				    VM_KERNEL_ADDRPERM(so), 0,
3654 				    (int64_t)(orig_resid - uio_resid(uio)));
3655 			}
3656 			return error;
3657 		}
3658 		goto restart;
3659 	}
3660 dontblock:
3661 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3662 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3663 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3664 	nextrecord = m->m_nextpkt;
3665 
3666 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3667 		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
3668 		    mp0 == NULL);
3669 		if (error == ERESTART) {
3670 			goto restart;
3671 		} else if (error != 0) {
3672 			goto release;
3673 		}
3674 		orig_resid = 0;
3675 	}
3676 
3677 	/*
3678 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3679 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3680 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3681 	 * perform externalization.
3682 	 */
3683 	if (m != NULL && m->m_type == MT_CONTROL) {
3684 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3685 		if (error != 0) {
3686 			goto release;
3687 		}
3688 		orig_resid = 0;
3689 	}
3690 
3691 	if (m != NULL) {
3692 		if (!(flags & MSG_PEEK)) {
3693 			/*
3694 			 * We get here because m points to an mbuf following
3695 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3696 			 * processed above.  In any case, m should be pointing
3697 			 * to the head of the mbuf chain, and the nextrecord
3698 			 * should be either NULL or equal to m->m_nextpkt.
3699 			 * See comments above about SB_LOCK.
3700 			 */
3701 			if (m != so->so_rcv.sb_mb ||
3702 			    m->m_nextpkt != nextrecord) {
3703 				panic("%s: post-control !sync so=%p m=%p "
3704 				    "nextrecord=%p\n", __func__, so, m,
3705 				    nextrecord);
3706 				/* NOTREACHED */
3707 			}
3708 			if (nextrecord == NULL) {
3709 				so->so_rcv.sb_lastrecord = m;
3710 			}
3711 		}
3712 		type = m->m_type;
3713 		if (type == MT_OOBDATA) {
3714 			flags |= MSG_OOB;
3715 		}
3716 	} else {
3717 		if (!(flags & MSG_PEEK)) {
3718 			SB_EMPTY_FIXUP(&so->so_rcv);
3719 		}
3720 	}
3721 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3722 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3723 
3724 	moff = 0;
3725 	offset = 0;
3726 
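	/*
	 * Only consider delaying copies when we are consuming the data
	 * (not MSG_PEEK) and the request is large enough to make batching
	 * the copies to user space worthwhile.
	 */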
3727 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3728 		can_delay = 1;
3729 	} else {
3730 		can_delay = 0;
3731 	}
3732 
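	/*
	 * Main receive loop: walk the mbufs of the current record and move
	 * data to the caller until the request is satisfied, the record is
	 * exhausted, or an error occurs.
	 */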
3733 	while (m != NULL &&
3734 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3735 		if (m->m_type == MT_OOBDATA) {
3736 			if (type != MT_OOBDATA) {
3737 				break;
3738 			}
3739 		} else if (type == MT_OOBDATA) {
3740 			break;
3741 		}
3742 
3743 		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
3744 			break;
3745 		}
3746 		/*
3747 	 * Make sure to always set the MSG_OOB flag when getting
3748 		 * out of band data inline.
3749 		 */
3750 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3751 		    (so->so_options & SO_OOBINLINE) != 0 &&
3752 		    (so->so_state & SS_RCVATMARK) != 0) {
3753 			flags |= MSG_OOB;
3754 		}
3755 		so->so_state &= ~SS_RCVATMARK;
3756 		len = uio_resid(uio) - delayed_copy_len;
3757 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3758 			len = so->so_oobmark - offset;
3759 		}
3760 		if (len > m->m_len - moff) {
3761 			len = m->m_len - moff;
3762 		}
3763 		/*
3764 		 * If mp is set, just pass back the mbufs.
3765 		 * Otherwise copy them out via the uio, then free.
3766 	 * The sockbuf must be consistent here (sb_mb points to the
3767 	 * current mbuf, nextrecord to the next record) when we drop
3768 	 * the socket lock; we must note any additions to the sockbuf
3769 	 * when we reacquire it.
3770 		 */
3771 		if (mp == NULL) {
3772 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3773 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3774 			if (can_delay && len == m->m_len) {
3775 				/*
3776 				 * only delay the copy if we're consuming the
3777 				 * mbuf and we're NOT in MSG_PEEK mode
3778 				 * and we have enough data to make it worthwhile
3779 				 * to drop and retake the lock... can_delay
3780 				 * reflects the state of the two latter
3781 				 * constraints; moff should always be zero
3782 				 * in these cases
3783 				 */
3784 				delayed_copy_len += len;
3785 			} else {
3786 				if (delayed_copy_len) {
3787 					error = sodelayed_copy(so, uio,
3788 					    &free_list, &delayed_copy_len);
3789 
3790 					if (error) {
3791 						goto release;
3792 					}
3793 					/*
3794 					 * We can only get here if MSG_PEEK is
3795 					 * not set; therefore, m should point at
3796 					 * the head of the rcv queue. If it
3797 					 * doesn't, something drastically changed
3798 					 * while we were out from behind the lock
3799 					 * in sodelayed_copy(), perhaps a RST on
3800 					 * the stream. In any event, the stream
3801 					 * has been interrupted; it's probably
3802 					 * best just to return whatever data
3803 					 * we've moved and let the caller sort
3804 					 * it out...
3805 					 */
3806 					if (m != so->so_rcv.sb_mb) {
3807 						break;
3808 					}
3809 				}
3810 				socket_unlock(so, 0);
3811 				error = uiomove(mtod(m, caddr_t) + moff,
3812 				    (int)len, uio);
3813 				socket_lock(so, 0);
3814 
3815 				if (error) {
3816 					goto release;
3817 				}
3818 			}
3819 		} else {
3820 			uio_setresid(uio, (uio_resid(uio) - len));
3821 		}
3822 		if (len == m->m_len - moff) {
3823 			if (m->m_flags & M_EOR) {
3824 				flags |= MSG_EOR;
3825 			}
3826 			if (flags & MSG_PEEK) {
3827 				m = m->m_next;
3828 				moff = 0;
3829 			} else {
3830 				nextrecord = m->m_nextpkt;
3831 				sbfree(&so->so_rcv, m);
3832 				m->m_nextpkt = NULL;
3833 
3834 				if (mp != NULL) {
3835 					*mp = m;
3836 					mp = &m->m_next;
3837 					so->so_rcv.sb_mb = m = m->m_next;
3838 					*mp = NULL;
3839 				} else {
3840 					if (free_list == NULL) {
3841 						free_list = m;
3842 					} else {
3843 						ml->m_next = m;
3844 					}
3845 					ml = m;
3846 					so->so_rcv.sb_mb = m = m->m_next;
3847 					ml->m_next = NULL;
3848 				}
3849 				if (m != NULL) {
3850 					m->m_nextpkt = nextrecord;
3851 					if (nextrecord == NULL) {
3852 						so->so_rcv.sb_lastrecord = m;
3853 					}
3854 				} else {
3855 					so->so_rcv.sb_mb = nextrecord;
3856 					SB_EMPTY_FIXUP(&so->so_rcv);
3857 				}
3858 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3859 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3860 			}
3861 		} else {
3862 			if (flags & MSG_PEEK) {
3863 				moff += len;
3864 			} else {
3865 				if (mp != NULL) {
3866 					int copy_flag;
3867 
3868 					if (flags & MSG_DONTWAIT) {
3869 						copy_flag = M_DONTWAIT;
3870 					} else {
3871 						copy_flag = M_WAIT;
3872 					}
3873 					*mp = m_copym(m, 0, (int)len, copy_flag);
3874 					/*
3875 					 * Failed to allocate an mbuf?
3876 					 * Adjust uio_resid back, it was
3877 					 * adjusted down by len bytes which
3878 					 * we didn't copy over.
3879 					 */
3880 					if (*mp == NULL) {
3881 						uio_setresid(uio,
3882 						    (uio_resid(uio) + len));
3883 						break;
3884 					}
3885 				}
3886 				m->m_data += len;
3887 				m->m_len -= len;
3888 				so->so_rcv.sb_cc -= len;
3889 			}
3890 		}
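		/*
		 * Track the out-of-band mark: when consuming, draw the mark
		 * closer and set SS_RCVATMARK once it is reached; when
		 * peeking, track the offset instead.  Either way, stop at
		 * the mark so a single read never spans it.
		 */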
3891 		if (so->so_oobmark) {
3892 			if ((flags & MSG_PEEK) == 0) {
3893 				so->so_oobmark -= len;
3894 				if (so->so_oobmark == 0) {
3895 					so->so_state |= SS_RCVATMARK;
3896 					break;
3897 				}
3898 			} else {
3899 				offset += len;
3900 				if (offset == so->so_oobmark) {
3901 					break;
3902 				}
3903 			}
3904 		}
3905 		if (flags & MSG_EOR) {
3906 			break;
3907 		}
3908 		/*
3909 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3910 		 * (for non-atomic socket), we must not quit until
3911 		 * "uio->uio_resid == 0" or an error termination.
3912 		 * If a signal/timeout occurs, return with a short
3913 		 * count but without error.  Keep sockbuf locked
3914 		 * against other readers.
3915 		 */
3916 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3917 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3918 		    !sosendallatonce(so) && !nextrecord) {
3919 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3920 #if CONTENT_FILTER
3921 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3922 #endif /* CONTENT_FILTER */
3923 			    )) {
3924 				goto release;
3925 			}
3926 
3927 			/*
3928 			 * Depending on the protocol (e.g. TCP), the following
3929 			 * might cause the socket lock to be dropped and later
3930 			 * be reacquired, and more data could have arrived and
3931 			 * have been appended to the receive socket buffer by
3932 			 * the time it returns.  Therefore, we sleep in
3933 			 * sbwait() below only if the socket buffer is
3934 			 * empty, in order to avoid a false sleep.
3935 			 */
3936 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3937 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3938 			}
3939 
3940 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3941 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3942 
3943 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3944 				error = 0;
3945 				goto release;
3946 			}
3947 			/*
3948 			 * We have to wait until after we get back from the
3949 			 * sbwait to do the copy because we will drop the lock
3950 			 * if we have enough data that has been delayed... By
3951 			 * dropping the lock we open up a window allowing the
3952 			 * netisr thread to process the incoming packets and to
3953 			 * change the state of this socket... We're issuing the
3954 			 * sbwait because the socket is empty and we're
3955 			 * expecting the netisr thread to wake us up when more
3956 			 * packets arrive; if we allowed that processing to
3957 			 * happen before the sbwait, we could stall forever with
3958 			 * packets sitting in the socket if no further packets
3959 			 * arrive from the remote side.
3960 			 *
3961 			 * We want to copy before we've collected all the data
3962 			 * to satisfy this request, to allow the copy to overlap
3963 			 * the incoming packet processing on an MP system.
3964 			 */
3965 			if (delayed_copy_len > sorecvmincopy &&
3966 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3967 				error = sodelayed_copy(so, uio,
3968 				    &free_list, &delayed_copy_len);
3969 
3970 				if (error) {
3971 					goto release;
3972 				}
3973 			}
3974 			m = so->so_rcv.sb_mb;
3975 			if (m != NULL) {
3976 				nextrecord = m->m_nextpkt;
3977 			}
3978 			SB_MB_CHECK(&so->so_rcv);
3979 		}
3980 	}
3981 #ifdef MORE_LOCKING_DEBUG
3982 	if (so->so_usecount <= 1) {
3983 		panic("%s: after big while so=%p ref=%d on socket",
3984 		    __func__, so, so->so_usecount);
3985 		/* NOTREACHED */
3986 	}
3987 #endif
3988 
3989 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3990 		if (so->so_options & SO_DONTTRUNC) {
3991 			flags |= MSG_RCVMORE;
3992 		} else {
3993 			flags |= MSG_TRUNC;
3994 			if ((flags & MSG_PEEK) == 0) {
3995 				(void) sbdroprecord(&so->so_rcv);
3996 			}
3997 		}
3998 	}
3999 
4000 	/*
4001 	 * pru_rcvd below (for TCP) may cause more data to be received
4002 	 * if the socket lock is dropped prior to sending the ACK; some
4003 	 * legacy OpenTransport applications don't handle this well
4004 	 * (if it receives less data than requested while MSG_HAVEMORE
4005 	 * is set), and so we set the flag now based on what we know
4006 	 * prior to calling pru_rcvd.
4007 	 */
4008 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4009 		flags |= MSG_HAVEMORE;
4010 	}
4011 
4012 	if ((flags & MSG_PEEK) == 0) {
4013 		if (m == NULL) {
4014 			so->so_rcv.sb_mb = nextrecord;
4015 			/*
4016 			 * First part is an inline SB_EMPTY_FIXUP().  Second
4017 			 * part makes sure sb_lastrecord is up-to-date if
4018 			 * there is still data in the socket buffer.
4019 			 */
4020 			if (so->so_rcv.sb_mb == NULL) {
4021 				so->so_rcv.sb_mbtail = NULL;
4022 				so->so_rcv.sb_lastrecord = NULL;
4023 			} else if (nextrecord->m_nextpkt == NULL) {
4024 				so->so_rcv.sb_lastrecord = nextrecord;
4025 			}
4026 			SB_MB_CHECK(&so->so_rcv);
4027 		}
4028 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4029 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4030 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4031 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4032 		}
4033 	}
4034 
4035 	if (delayed_copy_len) {
4036 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4037 		if (error) {
4038 			goto release;
4039 		}
4040 	}
4041 	if (free_list != NULL) {
4042 		m_freem_list(free_list);
4043 		free_list = NULL;
4044 	}
4045 
4046 	if (orig_resid == uio_resid(uio) && orig_resid &&
4047 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4048 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4049 		goto restart;
4050 	}
4051 
4052 	if (flagsp != NULL) {
4053 		*flagsp |= flags;
4054 	}
4055 release:
4056 #ifdef MORE_LOCKING_DEBUG
4057 	if (so->so_usecount <= 1) {
4058 		panic("%s: release so=%p ref=%d on socket", __func__,
4059 		    so, so->so_usecount);
4060 		/* NOTREACHED */
4061 	}
4062 #endif
4063 	if (delayed_copy_len) {
4064 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4065 	}
4066 
4067 	if (free_list != NULL) {
4068 		m_freem_list(free_list);
4069 	}
4070 
4071 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4072 
4073 	if (en_tracing) {
4074 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4075 		    VM_KERNEL_ADDRPERM(so),
4076 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4077 		    (int64_t)(orig_resid - uio_resid(uio)));
4078 	}
4079 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4080 	    so->so_rcv.sb_cc, 0, error);
4081 
4082 	return error;
4083 }
4084 
4085 /*
4086  * Returns:	0			Success
4087  *	uiomove:EFAULT
4088  */
4089 static int
4090 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4091     user_ssize_t *resid)
4092 {
4093 	int error = 0;
4094 	struct mbuf *m;
4095 
4096 	m = *free_list;
4097 
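	/*
	 * The mbufs on free_list were already unlinked from the receive
	 * buffer by the caller, so the socket lock can be dropped while
	 * their contents are copied out to user space.
	 */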
4098 	socket_unlock(so, 0);
4099 
4100 	while (m != NULL && error == 0) {
4101 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4102 		m = m->m_next;
4103 	}
4104 	m_freem_list(*free_list);
4105 
4106 	*free_list = NULL;
4107 	*resid = 0;
4108 
4109 	socket_lock(so, 0);
4110 
4111 	return error;
4112 }
4113 
4114 int
4115 soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
4116     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
4117 {
4118 	mbuf_ref_t m;
4119 	mbuf_ref_ref_t mp;
4120 	mbuf_ref_t nextrecord;
4121 	int flags, error;
4122 	struct protosw *pr = so->so_proto;
4123 	struct proc *p = current_proc();
4124 	u_int npkts = 0;
4125 	mbuf_ref_t free_list = NULL;
4126 	int sblocked = 0;
4127 
4128 	/*
4129 	 * Sanity check on the parameters passed by caller
4130 	 */
4131 	if (mp0 == NULL || pktcntp == NULL) {
4132 		return EINVAL;
4133 	}
4134 	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
4135 		return EINVAL;
4136 	}
4137 
4138 	mp = mp0;
4139 	*mp0 = NULL;
4140 	if (controlp != NULL) {
4141 		*controlp = NULL;
4142 	}
4143 	if (maddrp != NULL) {
4144 		*maddrp = NULL;
4145 	}
4146 	if (flagsp != NULL) {
4147 		flags = *flagsp;
4148 	} else {
4149 		flags = 0;
4150 	}
4151 
4152 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
4153 	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
4154 	    so->so_rcv.sb_hiwat);
4155 
4156 	socket_lock(so, 1);
4157 	so_update_last_owner_locked(so, p);
4158 	so_update_policy(so);
4159 
4160 #if NECP
4161 	so_update_necp_policy(so, NULL, NULL);
4162 #endif /* NECP */
4163 
4164 	/*
4165 	 * If a recv attempt is made on a previously-accepted socket
4166 	 * that has been marked as inactive (disconnected), reject
4167 	 * the request.
4168 	 */
4169 	if (so->so_flags & SOF_DEFUNCT) {
4170 		struct sockbuf *sb = &so->so_rcv;
4171 
4172 		error = ENOTCONN;
4173 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4174 		    __func__, proc_pid(p), proc_best_name(p),
4175 		    so->so_gencnt,
4176 		    SOCK_DOM(so), SOCK_TYPE(so), error);
4177 		/*
4178 		 * This socket should have been disconnected and flushed
4179 		 * prior to being returned from sodefunct(); there should
4180 		 * be no data on its receive list, so panic otherwise.
4181 		 */
4182 		if (so->so_state & SS_DEFUNCT) {
4183 			sb_empty_assert(sb, __func__);
4184 		}
4185 		goto release;
4186 	}
4187 
4188 	*mp = NULL;
4189 
4190 restart:
4191 	/*
4192 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4193 	 * and if so just return to the caller.  This could happen when
4194 	 * soreceive() is called by a socket upcall function during the
4195 	 * time the socket is freed.  The socket buffer would have been
4196 	 * locked across the upcall, therefore we cannot put this thread
4197 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4198 	 * we may livelock), because the lock on the socket buffer will
4199 	 * only be released when the upcall routine returns to its caller.
4200 	 * Because the socket has been officially closed, there can be
4201 	 * no further read on it.
4202 	 */
4203 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4204 	    (SS_NOFDREF | SS_CANTRCVMORE)) {
4205 		error = 0;
4206 		goto release;
4207 	}
4208 
4209 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4210 	if (error) {
4211 		goto release;
4212 	}
4213 	sblocked = 1;
4214 
4215 	m = so->so_rcv.sb_mb;
4216 	/*
4217 	 * Block awaiting more datagrams if needed
4218 	 */
4219 	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
4220 	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
4221 		/*
4222 		 * Panic if we notice inconsistencies in the socket's
4223 		 * receive list; both sb_mb and sb_cc should correctly
4224 		 * reflect the contents of the list, otherwise we may
4225 		 * end up with false positives during select() or poll()
4226 		 * which could put the application in a bad state.
4227 		 */
4228 		SB_MB_CHECK(&so->so_rcv);
4229 
4230 		if (so->so_error) {
4231 			if (m != NULL) {
4232 				goto dontblock;
4233 			}
4234 			error = so->so_error;
4235 			if ((flags & MSG_PEEK) == 0) {
4236 				so->so_error = 0;
4237 			}
4238 			goto release;
4239 		}
4240 		if (so->so_state & SS_CANTRCVMORE) {
4241 			if (m != NULL) {
4242 				goto dontblock;
4243 			} else {
4244 				goto release;
4245 			}
4246 		}
4247 		for (; m != NULL; m = m->m_next) {
4248 			if (m->m_flags & M_EOR) {
4249 				m = so->so_rcv.sb_mb;
4250 				goto dontblock;
4251 			}
4252 		}
4253 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4254 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4255 			error = ENOTCONN;
4256 			goto release;
4257 		}
4258 		if ((so->so_state & SS_NBIO) ||
4259 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4260 			error = EWOULDBLOCK;
4261 			goto release;
4262 		}
4263 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4264 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4265 
4266 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4267 		sblocked = 0;
4268 
4269 		error = sbwait(&so->so_rcv);
4270 		if (error != 0) {
4271 			goto release;
4272 		}
4273 		goto restart;
4274 	}
4275 dontblock:
4276 	m = so->so_rcv.sb_mb;
4277 	if (m == NULL) {
4278 		goto release;
4279 	}
4280 
4281 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4282 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4283 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4284 	nextrecord = m->m_nextpkt;
4285 
4286 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4287 		mbuf_ref_t maddr = NULL;
4288 
4289 		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
4290 		    &nextrecord, 1);
4291 		if (error == ERESTART) {
4292 			goto restart;
4293 		} else if (error != 0) {
4294 			goto release;
4295 		}
4296 
4297 		if (maddr != NULL) {
4298 			maddr->m_nextpkt = NULL;
4299 			maddr->m_next = NULL;
4300 			if (maddrp != NULL) {
4301 				*maddrp = maddr;
4302 				maddrp = &maddr->m_nextpkt;
4303 			} else {
4304 				maddr->m_next = free_list;
4305 				free_list = maddr;
4306 			}
4307 		}
4308 	}
4309 
4310 	/*
4311 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
4312 	 * in the first mbuf chain on the socket buffer.
4313 	 * We call into the protocol to perform externalization.
4314 	 */
4315 	if (m != NULL && m->m_type == MT_CONTROL) {
4316 		mbuf_ref_t control = NULL;
4317 
4318 		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
4319 		if (error != 0) {
4320 			goto release;
4321 		}
4322 		if (control != NULL) {
4323 			control->m_nextpkt = NULL;
4324 			control->m_next = NULL;
4325 			if (controlp != NULL) {
4326 				*controlp = control;
4327 				controlp = &control->m_nextpkt;
4328 			} else {
4329 				control->m_next = free_list;
4330 				free_list = control;
4331 			}
4332 		}
4333 	}
4334 
4335 	/*
4336 	 * Link the packet to the list
4337 	 */
4338 	if (m != NULL) {
4339 		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
4340 			panic("%s: m %p has unexpected m_type %d", __func__, m, m->m_type);
4341 		}
4342 		m->m_nextpkt = NULL;
4343 		*mp = m;
4344 		mp = &m->m_nextpkt;
4345 	}
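	/*
	 * The record's mbuf chain was handed to the caller above; walk it
	 * to remove its bytes and mbuf counts from the receive buffer
	 * accounting.
	 */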
4346 	while (m != NULL) {
4347 		sbfree(&so->so_rcv, m);
4348 
4349 		m = m->m_next;
4350 	}
4351 
4352 	so->so_rcv.sb_mb = nextrecord;
4353 	/*
4354 	 * First part is an inline SB_EMPTY_FIXUP().  Second
4355 	 * part makes sure sb_lastrecord is up-to-date if
4356 	 * there is still data in the socket buffer.
4357 	 */
4358 	if (so->so_rcv.sb_mb == NULL) {
4359 		so->so_rcv.sb_mbtail = NULL;
4360 		so->so_rcv.sb_lastrecord = NULL;
4361 	} else if (nextrecord->m_nextpkt == NULL) {
4362 		so->so_rcv.sb_lastrecord = nextrecord;
4363 	}
4364 	SB_MB_CHECK(&so->so_rcv);
4365 
4366 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4367 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4368 
4369 	npkts += 1;
4370 
4371 	/*
4372 	 * We continue as long as we have fewer packets than requested
4373 	 * and the socket buffer is not empty.
4374 	 */
4375 	if (npkts < *pktcntp) {
4376 		if (so->so_rcv.sb_mb != NULL) {
4377 			goto dontblock;
4378 		}
4379 		if ((flags & MSG_WAITALL) != 0) {
4380 			goto restart;
4381 		}
4382 	}
4383 
4384 	if (flagsp != NULL) {
4385 		*flagsp |= flags;
4386 	}
4387 
4388 release:
4389 	/*
4390 	 * pru_rcvd may cause more data to be received if the socket lock
4391 	 * is dropped so we set MSG_HAVEMORE now based on what we know.
4392 	 * That way the caller won't be surprised if it receives less data
4393 	 * than requested.
4394 	 */
4395 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4396 		flags |= MSG_HAVEMORE;
4397 	}
4398 
4399 	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
4400 		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4401 	}
4402 
4403 	if (sblocked) {
4404 		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4405 	} else {
4406 		socket_unlock(so, 1);
4407 	}
4408 
4409 	*pktcntp = npkts;
4410 	/*
4411 	 * Amortize the cost of freeing the mbufs
4412 	 */
4413 	if (free_list != NULL) {
4414 		m_freem_list(free_list);
4415 	}
4416 
4417 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4418 	    0, 0, 0, 0);
4419 	return error;
4420 }
4421 
4422 static int
4423 so_statistics_event_to_nstat_event(int64_t *input_options,
4424     uint64_t *nstat_event)
4425 {
4426 	int error = 0;
4427 	switch (*input_options) {
4428 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4429 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4430 		break;
4431 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4432 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4433 		break;
4434 #if (DEBUG || DEVELOPMENT)
4435 	case SO_STATISTICS_EVENT_RESERVED_1:
4436 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4437 		break;
4438 	case SO_STATISTICS_EVENT_RESERVED_2:
4439 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4440 		break;
4441 #endif /* (DEBUG || DEVELOPMENT) */
4442 	default:
4443 		error = EINVAL;
4444 		break;
4445 	}
4446 	return error;
4447 }
4448 
4449 /*
4450  * Returns:	0			Success
4451  *		EINVAL
4452  *		ENOTCONN
4453  *	<pru_shutdown>:EINVAL
4454  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4455  *	<pru_shutdown>:ENOBUFS[TCP]
4456  *	<pru_shutdown>:EMSGSIZE[TCP]
4457  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4458  *	<pru_shutdown>:ENETUNREACH[TCP]
4459  *	<pru_shutdown>:ENETDOWN[TCP]
4460  *	<pru_shutdown>:ENOMEM[TCP]
4461  *	<pru_shutdown>:EACCES[TCP]
4462  *	<pru_shutdown>:EMSGSIZE[TCP]
4463  *	<pru_shutdown>:ENOBUFS[TCP]
4464  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4465  *	<pru_shutdown>:???		[other protocol families]
4466  */
4467 int
4468 soshutdown(struct socket *so, int how)
4469 {
4470 	int error;
4471 
4472 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4473 
4474 	switch (how) {
4475 	case SHUT_RD:
4476 	case SHUT_WR:
4477 	case SHUT_RDWR:
4478 		socket_lock(so, 1);
4479 		if ((so->so_state &
4480 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4481 			error = ENOTCONN;
4482 		} else {
4483 			error = soshutdownlock(so, how);
4484 		}
4485 		socket_unlock(so, 1);
4486 		break;
4487 	default:
4488 		error = EINVAL;
4489 		break;
4490 	}
4491 
4492 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4493 
4494 	return error;
4495 }
4496 
4497 int
4498 soshutdownlock_final(struct socket *so, int how)
4499 {
4500 	struct protosw *pr = so->so_proto;
4501 	int error = 0;
4502 
4503 	sflt_notify(so, sock_evt_shutdown, &how);
4504 
4505 	if (how != SHUT_WR) {
4506 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4507 			/* read already shut down */
4508 			error = ENOTCONN;
4509 			goto done;
4510 		}
4511 		sorflush(so);
4512 	}
4513 	if (how != SHUT_RD) {
4514 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4515 			/* write already shut down */
4516 			error = ENOTCONN;
4517 			goto done;
4518 		}
4519 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4520 	}
4521 done:
4522 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4523 	return error;
4524 }
4525 
4526 int
4527 soshutdownlock(struct socket *so, int how)
4528 {
4529 	int error = 0;
4530 
4531 #if CONTENT_FILTER
4532 	/*
4533 	 * A content filter may delay the actual shutdown until it
4534 	 * has processed the pending data
4535 	 */
4536 	if (so->so_flags & SOF_CONTENT_FILTER) {
4537 		error = cfil_sock_shutdown(so, &how);
4538 		if (error == EJUSTRETURN) {
4539 			error = 0;
4540 			goto done;
4541 		} else if (error != 0) {
4542 			goto done;
4543 		}
4544 	}
4545 #endif /* CONTENT_FILTER */
4546 
4547 	error = soshutdownlock_final(so, how);
4548 
4549 done:
4550 	return error;
4551 }
4552 
4553 void
4554 sowflush(struct socket *so)
4555 {
4556 	struct sockbuf *sb = &so->so_snd;
4557 
4558 	/*
4559 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4560 	 * to prevent the socket buffer from being unexpectedly altered
4561 	 * while it is used by another thread in socket send/receive.
4562 	 *
4563 	 * sblock() must not fail here, hence the assertion.
4564 	 */
4565 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4566 	VERIFY(sb->sb_flags & SB_LOCK);
4567 
4568 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4569 	sb->sb_flags            |= SB_DROP;
4570 	sb->sb_upcall           = NULL;
4571 	sb->sb_upcallarg        = NULL;
4572 
4573 	sbunlock(sb, TRUE);     /* keep socket locked */
4574 
4575 	selthreadclear(&sb->sb_sel);
4576 	sbrelease(sb);
4577 }
4578 
4579 void
4580 sorflush(struct socket *so)
4581 {
4582 	struct sockbuf *sb = &so->so_rcv;
4583 	struct protosw *pr = so->so_proto;
4584 	struct sockbuf asb;
4585 #ifdef notyet
4586 	lck_mtx_t *mutex_held;
4587 	/*
4588 	 * XXX: This code is currently commented out, because we may get here
4589 	 * as part of sofreelastref(), and at that time, pr_getlock() may no
4590 	 * longer be able to return us the lock; this will be fixed in future.
4591 	 */
4592 	if (so->so_proto->pr_getlock != NULL) {
4593 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4594 	} else {
4595 		mutex_held = so->so_proto->pr_domain->dom_mtx;
4596 	}
4597 
4598 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4599 #endif /* notyet */
4600 
4601 	sflt_notify(so, sock_evt_flush_read, NULL);
4602 
4603 	socantrcvmore(so);
4604 
4605 	/*
4606 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4607 	 * to prevent the socket buffer from being unexpectedly altered
4608 	 * while it is used by another thread in socket send/receive.
4609 	 *
4610 	 * sblock() must not fail here, hence the assertion.
4611 	 */
4612 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4613 	VERIFY(sb->sb_flags & SB_LOCK);
4614 
4615 	/*
4616 	 * Copy only the relevant fields from "sb" to "asb" which we
4617 	 * need for sbrelease() to function.  In particular, skip
4618 	 * sb_sel as it contains the wait queue linkage, which would
4619 	 * wreak havoc if we were to issue selthreadclear() on "asb".
4620 	 * Make sure to not carry over SB_LOCK in "asb", as we need
4621 	 * to acquire it later as part of sbrelease().
4622 	 */
4623 	bzero(&asb, sizeof(asb));
4624 	asb.sb_cc               = sb->sb_cc;
4625 	asb.sb_hiwat            = sb->sb_hiwat;
4626 	asb.sb_mbcnt            = sb->sb_mbcnt;
4627 	asb.sb_mbmax            = sb->sb_mbmax;
4628 	asb.sb_ctl              = sb->sb_ctl;
4629 	asb.sb_lowat            = sb->sb_lowat;
4630 	asb.sb_mb               = sb->sb_mb;
4631 	asb.sb_mbtail           = sb->sb_mbtail;
4632 	asb.sb_lastrecord       = sb->sb_lastrecord;
4633 	asb.sb_so               = sb->sb_so;
4634 	asb.sb_flags            = sb->sb_flags;
4635 	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4636 	asb.sb_flags            |= SB_DROP;
4637 
4638 	/*
4639 	 * Ideally we'd bzero() these and preserve the ones we need;
4640 	 * but to do that we'd need to shuffle things around in the
4641 	 * sockbuf, and we can't do it now because there are KEXTS
4642 	 * that are directly referring to the socket structure.
4643 	 *
4644 	 * Setting SB_DROP acts as a barrier to prevent further appends.
4645 	 * Clearing SB_SEL is done for selthreadclear() below.
4646 	 */
4647 	sb->sb_cc               = 0;
4648 	sb->sb_hiwat            = 0;
4649 	sb->sb_mbcnt            = 0;
4650 	sb->sb_mbmax            = 0;
4651 	sb->sb_ctl              = 0;
4652 	sb->sb_lowat            = 0;
4653 	sb->sb_mb               = NULL;
4654 	sb->sb_mbtail           = NULL;
4655 	sb->sb_lastrecord       = NULL;
4656 	sb->sb_timeo.tv_sec     = 0;
4657 	sb->sb_timeo.tv_usec    = 0;
4658 	sb->sb_upcall           = NULL;
4659 	sb->sb_upcallarg        = NULL;
4660 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4661 	sb->sb_flags            |= SB_DROP;
4662 
4663 	sbunlock(sb, TRUE);     /* keep socket locked */
4664 
4665 	/*
4666 	 * Note that selthreadclear() is called on the original "sb" and
4667 	 * not the local "asb" because of the way wait queue linkage is
4668 	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
4669 	 * should no longer be set (cleared above.)
4670 	 */
4671 	selthreadclear(&sb->sb_sel);
4672 
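	/*
	 * For protocols that can pass access rights (PR_RIGHTS, e.g.
	 * UNIX-domain sockets), let the domain dispose of any rights
	 * still attached to the flushed data.
	 */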
4673 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4674 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4675 	}
4676 
4677 	sbrelease(&asb);
4678 }
4679 
4680 /*
4681  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4682  * an additional variant to handle the case where the option value needs
4683  * to be some kind of integer, but not a specific size.
4684  * In addition to their use here, these functions are also called by the
4685  * protocol-level pr_ctloutput() routines.
4686  *
4687  * Returns:	0			Success
4688  *		EINVAL
4689  *	copyin:EFAULT
4690  */
4691 int
4692 sooptcopyin(struct sockopt *sopt, void *__sized_by(len) buf, size_t len, size_t minlen)
4693 {
4694 	size_t  valsize;
4695 
4696 	/*
4697 	 * If the user gives us more than we wanted, we ignore it,
4698 	 * but if we don't get the minimum length the caller
4699 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4700 	 * is set to however much we actually retrieved.
4701 	 */
4702 	if ((valsize = sopt->sopt_valsize) < minlen) {
4703 		return EINVAL;
4704 	}
4705 	if (valsize > len) {
4706 		sopt->sopt_valsize = valsize = len;
4707 	}
4708 
4709 	if (sopt->sopt_p != kernproc) {
4710 		return copyin(sopt->sopt_val, buf, valsize);
4711 	}
4712 
4713 	caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4714 	    CAST_DOWN(caddr_t, sopt->sopt_val),
4715 	    valsize);
4716 	bcopy(tmp, buf, valsize);
4717 
4718 	return 0;
4719 }
4720 
4721 /*
4722  * sooptcopyin_timeval
4723  *   Copy in a timeval value into tv_p, taking into account whether the
4724  *   calling process is 64-bit or 32-bit.  Moved the sanity checking
4725  *   code here so that we can verify the 64-bit tv_sec value before we lose
4726  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4727  */
4728 static int
4729 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4730 {
4731 	int                     error;
4732 
4733 	if (proc_is64bit(sopt->sopt_p)) {
4734 		struct user64_timeval   tv64;
4735 
4736 		if (sopt->sopt_valsize < sizeof(tv64)) {
4737 			return EINVAL;
4738 		}
4739 
4740 		sopt->sopt_valsize = sizeof(tv64);
4741 		if (sopt->sopt_p != kernproc) {
4742 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4743 			if (error != 0) {
4744 				return error;
4745 			}
4746 		} else {
4747 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4748 			    CAST_DOWN(caddr_t, sopt->sopt_val),
4749 			    sizeof(tv64));
4750 			bcopy(tmp, &tv64, sizeof(tv64));
4751 		}
4752 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4753 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4754 			return EDOM;
4755 		}
4756 
4757 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4758 		tv_p->tv_usec = tv64.tv_usec;
4759 	} else {
4760 		struct user32_timeval   tv32;
4761 
4762 		if (sopt->sopt_valsize < sizeof(tv32)) {
4763 			return EINVAL;
4764 		}
4765 
4766 		sopt->sopt_valsize = sizeof(tv32);
4767 		if (sopt->sopt_p != kernproc) {
4768 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4769 			if (error != 0) {
4770 				return error;
4771 			}
4772 		} else {
4773 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4774 			    CAST_DOWN(caddr_t, sopt->sopt_val),
4775 			    sizeof(tv32));
4776 			bcopy(tmp, &tv32, sizeof(tv32));
4777 		}
4778 #ifndef __LP64__
4779 		/*
4780 		 * K64todo "comparison is always false due to
4781 		 * limited range of data type"
4782 		 */
4783 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4784 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4785 			return EDOM;
4786 		}
4787 #endif
4788 		tv_p->tv_sec = tv32.tv_sec;
4789 		tv_p->tv_usec = tv32.tv_usec;
4790 	}
4791 	return 0;
4792 }
4793 
4794 int
4795 sooptcopyin_bindtodevice(struct sockopt *sopt, char * __sized_by(bufsize) buf, size_t bufsize)
4796 {
4797 #define MIN_BINDTODEVICE_NAME_SIZE    2
4798 	size_t maxlen = bufsize - 1;             /* the max string length that fits in the buffer */
4799 
4800 	if (bufsize < MIN_BINDTODEVICE_NAME_SIZE) {
4801 #if DEBUG || DEVELOPMENT
4802 		os_log(OS_LOG_DEFAULT, "%s: bufsize %lu < MIN_BINDTODEVICE_NAME_SIZE %d",
4803 		    __func__, bufsize, MIN_BINDTODEVICE_NAME_SIZE);
4804 #endif /* DEBUG || DEVELOPMENT */
4805 		return EINVAL;
4806 	}
4807 
4808 	memset(buf, 0, bufsize);
4809 
4810 	/*
4811 	 * bufsize includes the terminating NUL because of the uncertainty whether
4812 	 * interface names are passed as strings or byte buffers.
4813 	 * If the user gives us more than the max string length, return EINVAL.
4814 	 * On success, sopt->sopt_valsize is not modified.
4815 	 */
4816 	maxlen = bufsize - 1;
4817 	if (sopt->sopt_valsize > maxlen) {
4818 		os_log(OS_LOG_DEFAULT, "%s: sopt_valsize %lu > maxlen %lu",
4819 		    __func__, sopt->sopt_valsize, maxlen);
4820 		return EINVAL;
4821 	}
4822 
4823 	if (sopt->sopt_p != kernproc) {
4824 		return copyin(sopt->sopt_val, buf, sopt->sopt_valsize);
4825 	} else {
4826 		caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4827 		    CAST_DOWN(caddr_t, sopt->sopt_val),
4828 		    sopt->sopt_valsize);
4829 		bcopy(tmp, buf, sopt->sopt_valsize);
4830 	}
4831 
4832 	return 0;
4833 #undef MIN_BINDTODEVICE_NAME_SIZE
4834 }
4835 
4836 int
4837 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4838     boolean_t ignore_delegate)
4839 {
4840 	kauth_cred_t cred = NULL;
4841 	proc_t ep = PROC_NULL;
4842 	uid_t uid;
4843 	int error = 0;
4844 
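	/*
	 * For a delegated socket, check the privilege against the
	 * credential of the process the socket was delegated to, unless
	 * the caller asked to ignore delegation.
	 */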
4845 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4846 		ep = proc_find(so->e_pid);
4847 		if (ep) {
4848 			cred = kauth_cred_proc_ref(ep);
4849 		}
4850 	}
4851 
4852 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4853 
4854 	/* uid is 0 for root */
4855 	if (uid != 0 || !allow_root) {
4856 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4857 	}
4858 	if (cred) {
4859 		kauth_cred_unref(&cred);
4860 	}
4861 	if (ep != PROC_NULL) {
4862 		proc_rele(ep);
4863 	}
4864 
4865 	return error;
4866 }
4867 
4868 /*
4869  * Returns:	0			Success
4870  *		EINVAL
4871  *		ENOPROTOOPT
4872  *		ENOBUFS
4873  *		EDOM
4874  *	sooptcopyin:EINVAL
4875  *	sooptcopyin:EFAULT
4876  *	sooptcopyin_timeval:EINVAL
4877  *	sooptcopyin_timeval:EFAULT
4878  *	sooptcopyin_timeval:EDOM
4879  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4880  *	<pr_ctloutput>:???
4881  *	sflt_attach_private:???		[whatever a filter author chooses]
4882  *	<sf_setoption>:???		[whatever a filter author chooses]
4883  *
4884  * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
4885  *		<sf_setoption> returns depend on what the filter author causes
4886  *		their filter to return.
4887  */
4888 int
4889 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4890 {
4891 	int     error, optval;
4892 	int64_t long_optval;
4893 	struct  linger l;
4894 	struct  timeval tv;
4895 
4896 	if (sopt->sopt_dir != SOPT_SET) {
4897 		sopt->sopt_dir = SOPT_SET;
4898 	}
4899 
4900 	if (dolock) {
4901 		socket_lock(so, 1);
4902 	}
4903 
4904 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4905 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4906 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4907 		/* the socket has been shut down, no more sockopts */
4908 		error = EINVAL;
4909 		goto out;
4910 	}
4911 
4912 	error = sflt_setsockopt(so, sopt);
4913 	if (error != 0) {
4914 		if (error == EJUSTRETURN) {
4915 			error = 0;
4916 		}
4917 		goto out;
4918 	}
4919 
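	/*
	 * Options not at the socket level (and SO_BINDTODEVICE, which is
	 * handled per-protocol) are passed down to the protocol's
	 * pr_ctloutput handler.
	 */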
4920 	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
4921 		if (so->so_proto != NULL &&
4922 		    so->so_proto->pr_ctloutput != NULL) {
4923 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4924 			goto out;
4925 		}
4926 		error = ENOPROTOOPT;
4927 	} else {
4928 		/*
4929 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4930 		 * the protocol layer, if needed.  A zero value returned from
4931 		 * the handler means use default socket-level processing as
4932 		 * done by the rest of this routine.  Otherwise, any other
4933 		 * return value indicates that the option is unsupported.
4934 		 */
4935 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4936 		    pru_socheckopt(so, sopt)) != 0) {
4937 			goto out;
4938 		}
4939 
4940 		error = 0;
4941 		switch (sopt->sopt_name) {
4942 		case SO_LINGER:
4943 		case SO_LINGER_SEC: {
4944 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4945 			if (error != 0) {
4946 				goto out;
4947 			}
4948 			/* Make sure to use sane values */
4949 			if (sopt->sopt_name == SO_LINGER) {
4950 				so->so_linger = (short)l.l_linger;
4951 			} else {
4952 				so->so_linger = (short)((long)l.l_linger * hz);
4953 			}
4954 			if (l.l_onoff != 0) {
4955 				so->so_options |= SO_LINGER;
4956 			} else {
4957 				so->so_options &= ~SO_LINGER;
4958 			}
4959 			break;
4960 		}
4961 		case SO_DEBUG:
4962 		case SO_KEEPALIVE:
4963 		case SO_DONTROUTE:
4964 		case SO_USELOOPBACK:
4965 		case SO_BROADCAST:
4966 		case SO_REUSEADDR:
4967 		case SO_REUSEPORT:
4968 		case SO_OOBINLINE:
4969 		case SO_TIMESTAMP:
4970 		case SO_TIMESTAMP_MONOTONIC:
4971 		case SO_TIMESTAMP_CONTINUOUS:
4972 		case SO_DONTTRUNC:
4973 		case SO_WANTMORE:
4974 		case SO_WANTOOBFLAG:
4975 		case SO_NOWAKEFROMSLEEP:
4976 		case SO_NOAPNFALLBK:
4977 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4978 			    sizeof(optval));
4979 			if (error != 0) {
4980 				goto out;
4981 			}
4982 			if (optval) {
4983 				so->so_options |= sopt->sopt_name;
4984 			} else {
4985 				so->so_options &= ~sopt->sopt_name;
4986 			}
4987 #if SKYWALK
4988 			inp_update_netns_flags(so);
4989 #endif /* SKYWALK */
4990 			break;
4991 
4992 		case SO_SNDBUF:
4993 		case SO_RCVBUF:
4994 		case SO_SNDLOWAT:
4995 		case SO_RCVLOWAT:
4996 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4997 			    sizeof(optval));
4998 			if (error != 0) {
4999 				goto out;
5000 			}
5001 
5002 			/*
5003 			 * Values < 1 make no sense for any of these
5004 			 * options, so disallow them.
5005 			 */
5006 			if (optval < 1) {
5007 				error = EINVAL;
5008 				goto out;
5009 			}
5010 
5011 			switch (sopt->sopt_name) {
5012 			case SO_SNDBUF:
5013 			case SO_RCVBUF: {
5014 				struct sockbuf *sb =
5015 				    (sopt->sopt_name == SO_SNDBUF) ?
5016 				    &so->so_snd : &so->so_rcv;
5017 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
5018 					error = ENOBUFS;
5019 					goto out;
5020 				}
5021 				sb->sb_flags |= SB_USRSIZE;
5022 				sb->sb_flags &= ~SB_AUTOSIZE;
5023 				sb->sb_idealsize = (u_int32_t)optval;
5024 				break;
5025 			}
5026 			/*
5027 			 * Make sure the low-water is never greater than
5028 			 * the high-water.
5029 			 */
5030 			case SO_SNDLOWAT: {
5031 				int space = sbspace(&so->so_snd);
5032 				uint32_t hiwat = so->so_snd.sb_hiwat;
5033 
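				/*
				 * For a connected UNIX-domain socket, the
				 * usable send space is really the peer's
				 * receive buffer, so compute the high-water
				 * mark and free space from the peer side.
				 */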
5034 				if (so->so_snd.sb_flags & SB_UNIX) {
5035 					struct unpcb *unp =
5036 					    (struct unpcb *)(so->so_pcb);
5037 					if (unp != NULL &&
5038 					    unp->unp_conn != NULL) {
5039 						struct socket *so2 = unp->unp_conn->unp_socket;
5040 						hiwat += unp->unp_conn->unp_cc;
5041 						space = sbspace(&so2->so_rcv);
5042 					}
5043 				}
5044 
5045 				so->so_snd.sb_lowat =
5046 				    (optval > hiwat) ?
5047 				    hiwat : optval;
5048 
5049 				if (space >= so->so_snd.sb_lowat) {
5050 					sowwakeup(so);
5051 				}
5052 				break;
5053 			}
5054 			case SO_RCVLOWAT: {
5055 				int64_t data_len;
5056 				so->so_rcv.sb_lowat =
5057 				    (optval > so->so_rcv.sb_hiwat) ?
5058 				    so->so_rcv.sb_hiwat : optval;
5059 				if (so->so_rcv.sb_flags & SB_UNIX) {
5060 					struct unpcb *unp =
5061 					    (struct unpcb *)(so->so_pcb);
5062 					if (unp != NULL &&
5063 					    unp->unp_conn != NULL) {
5064 						struct socket *so2 = unp->unp_conn->unp_socket;
5065 						data_len = so2->so_snd.sb_cc
5066 						    - so2->so_snd.sb_ctl;
5067 					} else {
5068 						data_len = so->so_rcv.sb_cc
5069 						    - so->so_rcv.sb_ctl;
5070 					}
5071 				} else {
5072 					data_len = so->so_rcv.sb_cc
5073 					    - so->so_rcv.sb_ctl;
5074 				}
5075 
5076 				if (data_len >= so->so_rcv.sb_lowat) {
5077 					sorwakeup(so);
5078 				}
5079 				break;
5080 			}
5081 			}
5082 			break;
5083 
5084 		case SO_SNDTIMEO:
5085 		case SO_RCVTIMEO:
5086 			error = sooptcopyin_timeval(sopt, &tv);
5087 			if (error != 0) {
5088 				goto out;
5089 			}
5090 
5091 			switch (sopt->sopt_name) {
5092 			case SO_SNDTIMEO:
5093 				so->so_snd.sb_timeo = tv;
5094 				break;
5095 			case SO_RCVTIMEO:
5096 				so->so_rcv.sb_timeo = tv;
5097 				break;
5098 			}
5099 			break;
5100 
5101 		case SO_NKE: {
5102 			struct so_nke nke;
5103 
5104 			error = sooptcopyin(sopt, &nke, sizeof(nke),
5105 			    sizeof(nke));
5106 			if (error != 0) {
5107 				goto out;
5108 			}
5109 
5110 			error = sflt_attach_internal(so, nke.nke_handle);
5111 			break;
5112 		}
5113 
5114 		case SO_NOSIGPIPE:
5115 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5116 			    sizeof(optval));
5117 			if (error != 0) {
5118 				goto out;
5119 			}
5120 			if (optval != 0) {
5121 				so->so_flags |= SOF_NOSIGPIPE;
5122 			} else {
5123 				so->so_flags &= ~SOF_NOSIGPIPE;
5124 			}
5125 			break;
5126 
5127 		case SO_NOADDRERR:
5128 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5129 			    sizeof(optval));
5130 			if (error != 0) {
5131 				goto out;
5132 			}
5133 			if (optval != 0) {
5134 				so->so_flags |= SOF_NOADDRAVAIL;
5135 			} else {
5136 				so->so_flags &= ~SOF_NOADDRAVAIL;
5137 			}
5138 			break;
5139 
5140 		case SO_REUSESHAREUID:
5141 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5142 			    sizeof(optval));
5143 			if (error != 0) {
5144 				goto out;
5145 			}
5146 			if (optval != 0) {
5147 				so->so_flags |= SOF_REUSESHAREUID;
5148 			} else {
5149 				so->so_flags &= ~SOF_REUSESHAREUID;
5150 			}
5151 			break;
5152 
5153 		case SO_NOTIFYCONFLICT:
5154 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5155 				error = EPERM;
5156 				goto out;
5157 			}
5158 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5159 			    sizeof(optval));
5160 			if (error != 0) {
5161 				goto out;
5162 			}
5163 			if (optval != 0) {
5164 				so->so_flags |= SOF_NOTIFYCONFLICT;
5165 			} else {
5166 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5167 			}
5168 			break;
5169 
5170 		case SO_RESTRICTIONS:
5171 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5172 			    sizeof(optval));
5173 			if (error != 0) {
5174 				goto out;
5175 			}
5176 
5177 			error = so_set_restrictions(so, optval);
5178 			break;
5179 
5180 		case SO_AWDL_UNRESTRICTED:
5181 			if (SOCK_DOM(so) != PF_INET &&
5182 			    SOCK_DOM(so) != PF_INET6) {
5183 				error = EOPNOTSUPP;
5184 				goto out;
5185 			}
5186 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5187 			    sizeof(optval));
5188 			if (error != 0) {
5189 				goto out;
5190 			}
5191 			if (optval != 0) {
5192 				error = soopt_cred_check(so,
5193 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5194 				if (error == 0) {
5195 					inp_set_awdl_unrestricted(
5196 						sotoinpcb(so));
5197 				}
5198 			} else {
5199 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5200 			}
5201 			break;
5202 		case SO_INTCOPROC_ALLOW:
5203 			if (SOCK_DOM(so) != PF_INET6) {
5204 				error = EOPNOTSUPP;
5205 				goto out;
5206 			}
5207 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5208 			    sizeof(optval));
5209 			if (error != 0) {
5210 				goto out;
5211 			}
5212 			if (optval != 0 &&
5213 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5214 				error = soopt_cred_check(so,
5215 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5216 				if (error == 0) {
5217 					inp_set_intcoproc_allowed(
5218 						sotoinpcb(so));
5219 				}
5220 			} else if (optval == 0) {
5221 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5222 			}
5223 			break;
5224 
5225 		case SO_LABEL:
5226 			error = EOPNOTSUPP;
5227 			break;
5228 
5229 		case SO_UPCALLCLOSEWAIT:
5230 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5231 			    sizeof(optval));
5232 			if (error != 0) {
5233 				goto out;
5234 			}
5235 			if (optval != 0) {
5236 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5237 			} else {
5238 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5239 			}
5240 			break;
5241 
5242 		case SO_RANDOMPORT:
5243 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5244 			    sizeof(optval));
5245 			if (error != 0) {
5246 				goto out;
5247 			}
5248 			if (optval != 0) {
5249 				so->so_flags |= SOF_BINDRANDOMPORT;
5250 			} else {
5251 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5252 			}
5253 			break;
5254 
5255 		case SO_NP_EXTENSIONS: {
5256 			struct so_np_extensions sonpx;
5257 
5258 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5259 			    sizeof(sonpx));
5260 			if (error != 0) {
5261 				goto out;
5262 			}
5263 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5264 				error = EINVAL;
5265 				goto out;
5266 			}
5267 			/*
5268 			 * Only one bit defined for now
5269 			 */
5270 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5271 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5272 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5273 				} else {
5274 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5275 				}
5276 			}
5277 			break;
5278 		}
5279 
5280 		case SO_TRAFFIC_CLASS: {
5281 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5282 			    sizeof(optval));
5283 			if (error != 0) {
5284 				goto out;
5285 			}
5286 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5287 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5288 				error = so_set_net_service_type(so, netsvc);
5289 				goto out;
5290 			}
5291 			error = so_set_traffic_class(so, optval);
5292 			if (error != 0) {
5293 				goto out;
5294 			}
5295 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5296 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5297 			break;
5298 		}
5299 
5300 		case SO_RECV_TRAFFIC_CLASS: {
5301 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5302 			    sizeof(optval));
5303 			if (error != 0) {
5304 				goto out;
5305 			}
5306 			if (optval == 0) {
5307 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5308 			} else {
5309 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5310 			}
5311 			break;
5312 		}
5313 
5314 #if (DEVELOPMENT || DEBUG)
5315 		case SO_TRAFFIC_CLASS_DBG: {
5316 			struct so_tcdbg so_tcdbg;
5317 
5318 			error = sooptcopyin(sopt, &so_tcdbg,
5319 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5320 			if (error != 0) {
5321 				goto out;
5322 			}
5323 			error = so_set_tcdbg(so, &so_tcdbg);
5324 			if (error != 0) {
5325 				goto out;
5326 			}
5327 			break;
5328 		}
5329 #endif /* (DEVELOPMENT || DEBUG) */
5330 
5331 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5332 			error = priv_check_cred(kauth_cred_get(),
5333 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5334 			if (error != 0) {
5335 				goto out;
5336 			}
5337 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5338 			    sizeof(optval));
5339 			if (error != 0) {
5340 				goto out;
5341 			}
5342 			if (optval == 0) {
5343 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5344 			} else {
5345 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5346 			}
5347 			break;
5348 
5349 #if (DEVELOPMENT || DEBUG)
5350 		case SO_DEFUNCTIT:
5351 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5352 			if (error == 0) {
5353 				error = sodefunct(current_proc(), so, 0);
5354 			}
5355 
5356 			break;
5357 #endif /* (DEVELOPMENT || DEBUG) */
5358 
5359 		case SO_DEFUNCTOK:
5360 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5361 			    sizeof(optval));
5362 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5363 				if (error == 0) {
5364 					error = EBADF;
5365 				}
5366 				goto out;
5367 			}
5368 			/*
5369 			 * Any process can set SO_DEFUNCTOK (clear
5370 			 * SOF_NODEFUNCT), but only root can clear
5371 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5372 			 */
5373 			if (optval == 0 &&
5374 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5375 				error = EPERM;
5376 				goto out;
5377 			}
5378 			if (optval) {
5379 				so->so_flags &= ~SOF_NODEFUNCT;
5380 			} else {
5381 				so->so_flags |= SOF_NODEFUNCT;
5382 			}
5383 
5384 			if (SOCK_DOM(so) == PF_INET ||
5385 			    SOCK_DOM(so) == PF_INET6) {
5386 				char s[MAX_IPv6_STR_LEN];
5387 				char d[MAX_IPv6_STR_LEN];
5388 				struct inpcb *inp = sotoinpcb(so);
5389 
5390 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5391 				    "[%s %s:%d -> %s:%d] is now marked "
5392 				    "as %seligible for "
5393 				    "defunct\n", __func__, proc_selfpid(),
5394 				    proc_best_name(current_proc()),
5395 				    so->so_gencnt,
5396 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5397 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5398 				    ((SOCK_DOM(so) == PF_INET) ?
5399 				    (void *)&inp->inp_laddr.s_addr :
5400 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5401 				    ntohs(inp->in6p_lport),
5402 				    inet_ntop(SOCK_DOM(so),
5403 				    (SOCK_DOM(so) == PF_INET) ?
5404 				    (void *)&inp->inp_faddr.s_addr :
5405 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5406 				    ntohs(inp->in6p_fport),
5407 				    (so->so_flags & SOF_NODEFUNCT) ?
5408 				    "not " : "");
5409 			} else {
5410 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5411 				    "is now marked as %seligible for "
5412 				    "defunct\n",
5413 				    __func__, proc_selfpid(),
5414 				    proc_best_name(current_proc()),
5415 				    so->so_gencnt,
5416 				    SOCK_DOM(so), SOCK_TYPE(so),
5417 				    (so->so_flags & SOF_NODEFUNCT) ?
5418 				    "not " : "");
5419 			}
5420 			break;
5421 
5422 		case SO_ISDEFUNCT:
5423 			/* This option is not settable */
5424 			error = EINVAL;
5425 			break;
5426 
5427 		case SO_OPPORTUNISTIC:
5428 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5429 			    sizeof(optval));
5430 			if (error == 0) {
5431 				error = so_set_opportunistic(so, optval);
5432 			}
5433 			break;
5434 
5435 		case SO_FLUSH:
5436 			/* This option is handled by lower layer(s) */
5437 			error = 0;
5438 			break;
5439 
5440 		case SO_RECV_ANYIF:
5441 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5442 			    sizeof(optval));
5443 			if (error == 0) {
5444 				error = so_set_recv_anyif(so, optval);
5445 			}
5446 			break;
5447 
5448 		case SO_TRAFFIC_MGT_BACKGROUND: {
5449 			/* This option is handled by lower layer(s) */
5450 			error = 0;
5451 			break;
5452 		}
5453 
5454 #if FLOW_DIVERT
5455 		case SO_FLOW_DIVERT_TOKEN:
5456 			error = flow_divert_token_set(so, sopt);
5457 			break;
5458 #endif  /* FLOW_DIVERT */
5459 
5460 
5461 		case SO_DELEGATED:
5462 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5463 			    sizeof(optval))) != 0) {
5464 				break;
5465 			}
5466 
5467 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5468 			break;
5469 
5470 		case SO_DELEGATED_UUID: {
5471 			uuid_t euuid;
5472 
5473 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5474 			    sizeof(euuid))) != 0) {
5475 				break;
5476 			}
5477 
5478 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5479 			break;
5480 		}
5481 
5482 #if NECP
5483 		case SO_NECP_ATTRIBUTES:
5484 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5485 				/* Handled by MPTCP itself */
5486 				break;
5487 			}
5488 
5489 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5490 				error = EINVAL;
5491 				goto out;
5492 			}
5493 
5494 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5495 			break;
5496 
5497 		case SO_NECP_CLIENTUUID: {
5498 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5499 				/* Handled by MPTCP itself */
5500 				break;
5501 			}
5502 
5503 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5504 				error = EINVAL;
5505 				goto out;
5506 			}
5507 
5508 			struct inpcb *inp = sotoinpcb(so);
5509 			if (!uuid_is_null(inp->necp_client_uuid)) {
5510 				// Clear out the old client UUID if present
5511 				necp_inpcb_remove_cb(inp);
5512 			}
5513 
5514 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5515 			    sizeof(uuid_t), sizeof(uuid_t));
5516 			if (error != 0) {
5517 				goto out;
5518 			}
5519 
5520 			if (uuid_is_null(inp->necp_client_uuid)) {
5521 				error = EINVAL;
5522 				goto out;
5523 			}
5524 
5525 			pid_t current_pid = proc_pid(current_proc());
5526 			error = necp_client_register_socket_flow(current_pid,
5527 			    inp->necp_client_uuid, inp);
5528 			if (error != 0) {
5529 				uuid_clear(inp->necp_client_uuid);
5530 				goto out;
5531 			}
5532 
5533 			if (inp->inp_lport != 0) {
5534 				// There is a bound local port, so this is not
5535 				// a fresh socket. Assign to the client.
5536 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5537 			}
5538 
5539 			break;
5540 		}
5541 		case SO_NECP_LISTENUUID: {
5542 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5543 				error = EINVAL;
5544 				goto out;
5545 			}
5546 
5547 			struct inpcb *inp = sotoinpcb(so);
5548 			if (!uuid_is_null(inp->necp_client_uuid)) {
5549 				error = EINVAL;
5550 				goto out;
5551 			}
5552 
5553 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5554 			    sizeof(uuid_t), sizeof(uuid_t));
5555 			if (error != 0) {
5556 				goto out;
5557 			}
5558 
5559 			if (uuid_is_null(inp->necp_client_uuid)) {
5560 				error = EINVAL;
5561 				goto out;
5562 			}
5563 
5564 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5565 			    inp->necp_client_uuid, inp);
5566 			if (error != 0) {
5567 				uuid_clear(inp->necp_client_uuid);
5568 				goto out;
5569 			}
5570 
5571 			// Mark that the port registration is held by NECP
5572 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5573 
5574 			break;
5575 		}
5576 
5577 		case SO_RESOLVER_SIGNATURE: {
5578 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5579 				error = EINVAL;
5580 				goto out;
5581 			}
5582 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5583 			break;
5584 		}
5585 #endif /* NECP */
5586 
5587 		case SO_EXTENDED_BK_IDLE:
5588 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5589 			    sizeof(optval));
5590 			if (error == 0) {
5591 				error = so_set_extended_bk_idle(so, optval);
5592 			}
5593 			break;
5594 
5595 		case SO_MARK_CELLFALLBACK:
5596 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5597 			    sizeof(optval));
5598 			if (error != 0) {
5599 				goto out;
5600 			}
5601 			if (optval < 0) {
5602 				error = EINVAL;
5603 				goto out;
5604 			}
5605 			if (optval == 0) {
5606 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5607 			} else {
5608 				so->so_flags1 |= SOF1_CELLFALLBACK;
5609 			}
5610 			break;
5611 
5612 		case SO_MARK_CELLFALLBACK_UUID:
5613 		{
5614 			struct so_mark_cellfallback_uuid_args args;
5615 
5616 			error = sooptcopyin(sopt, &args, sizeof(args),
5617 			    sizeof(args));
5618 			if (error != 0) {
5619 				goto out;
5620 			}
5621 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5622 			    args.flow_cellfallback);
5623 			break;
5624 		}
5625 
5626 		case SO_FALLBACK_MODE:
5627 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5628 			    sizeof(optval));
5629 			if (error != 0) {
5630 				goto out;
5631 			}
5632 			if (optval < SO_FALLBACK_MODE_NONE ||
5633 			    optval > SO_FALLBACK_MODE_PREFER) {
5634 				error = EINVAL;
5635 				goto out;
5636 			}
5637 			so->so_fallback_mode = (u_int8_t)optval;
5638 			break;
5639 
5640 		case SO_MARK_KNOWN_TRACKER: {
5641 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5642 			    sizeof(optval));
5643 			if (error != 0) {
5644 				goto out;
5645 			}
5646 			if (optval < 0) {
5647 				error = EINVAL;
5648 				goto out;
5649 			}
5650 			if (optval == 0) {
5651 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5652 			} else {
5653 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5654 			}
5655 			break;
5656 		}
5657 
5658 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5659 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5660 			    sizeof(optval));
5661 			if (error != 0) {
5662 				goto out;
5663 			}
5664 			if (optval < 0) {
5665 				error = EINVAL;
5666 				goto out;
5667 			}
5668 			if (optval == 0) {
5669 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5670 			} else {
5671 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5672 			}
5673 			break;
5674 		}
5675 
5676 		case SO_MARK_APPROVED_APP_DOMAIN: {
5677 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5678 			    sizeof(optval));
5679 			if (error != 0) {
5680 				goto out;
5681 			}
5682 			if (optval < 0) {
5683 				error = EINVAL;
5684 				goto out;
5685 			}
5686 			if (optval == 0) {
5687 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5688 			} else {
5689 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5690 			}
5691 			break;
5692 		}
5693 
5694 		case SO_STATISTICS_EVENT:
5695 			error = sooptcopyin(sopt, &long_optval,
5696 			    sizeof(long_optval), sizeof(long_optval));
5697 			if (error != 0) {
5698 				goto out;
5699 			}
5700 			u_int64_t nstat_event = 0;
5701 			error = so_statistics_event_to_nstat_event(
5702 				&long_optval, &nstat_event);
5703 			if (error != 0) {
5704 				goto out;
5705 			}
5706 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5707 			break;
5708 
5709 		case SO_NET_SERVICE_TYPE: {
5710 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5711 			    sizeof(optval));
5712 			if (error != 0) {
5713 				goto out;
5714 			}
5715 			error = so_set_net_service_type(so, optval);
5716 			break;
5717 		}
5718 
5719 		case SO_QOSMARKING_POLICY_OVERRIDE:
5720 			error = priv_check_cred(kauth_cred_get(),
5721 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5722 			if (error != 0) {
5723 				goto out;
5724 			}
5725 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5726 			    sizeof(optval));
5727 			if (error != 0) {
5728 				goto out;
5729 			}
5730 			if (optval == 0) {
5731 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5732 			} else {
5733 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5734 			}
5735 			break;
5736 
5737 		case SO_MPKL_SEND_INFO: {
5738 			struct so_mpkl_send_info so_mpkl_send_info;
5739 
5740 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5741 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5742 			if (error != 0) {
5743 				goto out;
5744 			}
5745 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5746 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5747 
5748 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5749 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5750 			} else {
5751 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5752 			}
5753 			break;
5754 		}
5755 		case SO_WANT_KEV_SOCKET_CLOSED: {
5756 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5757 			    sizeof(optval));
5758 			if (error != 0) {
5759 				goto out;
5760 			}
5761 			if (optval == 0) {
5762 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5763 			} else {
5764 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5765 			}
5766 			break;
5767 		}
5768 		case SO_MARK_WAKE_PKT: {
5769 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5770 			    sizeof(optval));
5771 			if (error != 0) {
5772 				goto out;
5773 			}
5774 			if (optval == 0) {
5775 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5776 			} else {
5777 				so->so_flags |= SOF_MARK_WAKE_PKT;
5778 			}
5779 			break;
5780 		}
5781 		case SO_RECV_WAKE_PKT: {
5782 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5783 			    sizeof(optval));
5784 			if (error != 0) {
5785 				goto out;
5786 			}
5787 			if (optval == 0) {
5788 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5789 			} else {
5790 				so->so_flags |= SOF_RECV_WAKE_PKT;
5791 			}
5792 			break;
5793 		}
5794 		case SO_APPLICATION_ID: {
5795 			so_application_id_t application_id = { 0 };
5796 
5797 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5798 				error = EINVAL;
5799 				goto out;
5800 			}
5801 			error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5802 			    sizeof(application_id));
5803 			if (error != 0) {
5804 				goto out;
5805 			}
5806 
5807 			// The caller's uid must match the socket owner's uid
5808 			if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5809 				error = EINVAL;
5810 				printf("setsockopt: SO_APPLICATION_ID - wrong uid\n");
5811 				goto out;
5812 			}
5813 			error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5814 			if (error != 0) {
5815 				printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid\n");
5816 				goto out;
5817 			}
5818 			if (application_id.persona_id != PERSONA_ID_NONE) {
5819 				so->so_persona_id = application_id.persona_id;
5820 			}
5821 			break;
5822 		}
5823 		case SO_MARK_DOMAIN_INFO_SILENT:
5824 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5825 			    sizeof(optval));
5826 			if (error != 0) {
5827 				goto out;
5828 			}
5829 			if (optval < 0) {
5830 				error = EINVAL;
5831 				goto out;
5832 			}
5833 			if (optval == 0) {
5834 				so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5835 			} else {
5836 				so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5837 			}
5838 			break;
5839 
5840 		default:
5841 			error = ENOPROTOOPT;
5842 			break;
5843 		}
5844 		if (error == 0 && so->so_proto != NULL &&
5845 		    so->so_proto->pr_ctloutput != NULL) {
5846 			(void) so->so_proto->pr_ctloutput(so, sopt);
5847 		}
5848 	}
5849 out:
5850 	if (dolock) {
5851 		socket_unlock(so, 1);
5852 	}
5853 	return error;
5854 }
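
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * the SO_NET_SERVICE_TYPE case handled above is reachable from user
 * space as a plain integer option:
 *
 *	#include <sys/socket.h>
 *
 *	static int
 *	mark_background(int fd)
 *	{
 *		int nst = NET_SERVICE_TYPE_BK;	// background traffic class
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_NET_SERVICE_TYPE,
 *		    &nst, sizeof(nst));
 *	}
 */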
5855 
5856 /* Helper routines for getsockopt */
5857 int
5858 sooptcopyout(struct sockopt *sopt, void *__sized_by(len) buf, size_t len)
5859 {
5860 	int     error;
5861 	size_t  valsize;
5862 
5863 	error = 0;
5864 
5865 	/*
5866 	 * Documented get behavior is that we always return a value,
5867 	 * possibly truncated to fit in the user's buffer.
5868 	 * Traditional behavior is that we always tell the user
5869 	 * precisely how much we copied, rather than something useful
5870 	 * like the total amount we had available for her.
5871 	 * Note that this interface is not idempotent; the entire answer must
5872 	 * be generated ahead of time.
5873 	 */
5874 	valsize = MIN(len, sopt->sopt_valsize);
5875 	sopt->sopt_valsize = valsize;
5876 	if (sopt->sopt_valsize != 0 && sopt->sopt_val != USER_ADDR_NULL) {
5877 		if (sopt->sopt_p != kernproc) {
5878 			error = copyout(buf, sopt->sopt_val, valsize);
5879 		} else {
5880 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5881 			    CAST_DOWN(caddr_t, sopt->sopt_val),
5882 			    valsize);
5883 			bcopy(buf, tmp, valsize);
5884 		}
5885 	}
5886 	return error;
5887 }
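
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * seen from user space, the truncation behavior above means that
 * getsockopt(2) succeeds even when the caller's buffer is too small;
 * the value is cut down and the copied size comes back through *optlen:
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_truncation(int fd)
 *	{
 *		char one_byte;			// too small for the int value
 *		socklen_t len = sizeof(one_byte);
 *
 *		// Succeeds; len is rewritten to 1, the amount copied.
 *		if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &one_byte, &len) == 0)
 *			printf("copied %u byte(s)\n", (unsigned)len);
 *	}
 */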
5888 
5889 static int
5890 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5891 {
5892 	int                     error;
5893 	size_t                  len;
5894 	struct user64_timeval   tv64 = {};
5895 	struct user32_timeval   tv32 = {};
5896 	const void *            val;
5897 	size_t                  valsize;
5898 
5899 	error = 0;
5900 	if (proc_is64bit(sopt->sopt_p)) {
5901 		len = sizeof(tv64);
5902 		tv64.tv_sec = tv_p->tv_sec;
5903 		tv64.tv_usec = tv_p->tv_usec;
5904 		val = &tv64;
5905 	} else {
5906 		len = sizeof(tv32);
5907 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5908 		tv32.tv_usec = tv_p->tv_usec;
5909 		val = &tv32;
5910 	}
5911 	valsize = MIN(len, sopt->sopt_valsize);
5912 	sopt->sopt_valsize = valsize;
5913 	if (sopt->sopt_val != USER_ADDR_NULL) {
5914 		if (sopt->sopt_p != kernproc) {
5915 			error = copyout(val, sopt->sopt_val, valsize);
5916 		} else {
5917 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5918 			    CAST_DOWN(caddr_t, sopt->sopt_val),
5919 			    valsize);
5920 			bcopy(val, tmp, valsize);
5921 		}
5922 	}
5923 	return error;
5924 }
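
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * the 32-/64-bit split above is invisible to user space; a caller always
 * passes its native struct timeval and the kernel picks the layout that
 * matches the process:
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	static int
 *	get_rcv_timeout(int fd, struct timeval *tv)
 *	{
 *		socklen_t len = sizeof(*tv);
 *
 *		return getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, tv, &len);
 *	}
 */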
5925 
5926 /*
5927  * Return:	0			Success
5928  *		ENOPROTOOPT
5929  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5930  *	<pr_ctloutput>:???
5931  *	<sf_getoption>:???
5932  */
5933 int
5934 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5935 {
5936 	int     error, optval;
5937 	struct  linger l;
5938 	struct  timeval tv;
5939 
5940 	if (sopt->sopt_dir != SOPT_GET) {
5941 		sopt->sopt_dir = SOPT_GET;
5942 	}
5943 
5944 	if (dolock) {
5945 		socket_lock(so, 1);
5946 	}
5947 
5948 	error = sflt_getsockopt(so, sopt);
5949 	if (error != 0) {
5950 		if (error == EJUSTRETURN) {
5951 			error = 0;
5952 		}
5953 		goto out;
5954 	}
5955 
5956 	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
5957 		if (so->so_proto != NULL &&
5958 		    so->so_proto->pr_ctloutput != NULL) {
5959 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5960 			goto out;
5961 		}
5962 		error = ENOPROTOOPT;
5963 	} else {
5964 		/*
5965 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5966 		 * the protocol layer, if needed.  A zero value returned from
5967 		 * the handler means use default socket-level processing as
5968 		 * done by the rest of this routine.  Otherwise, any other
5969 		 * return value indicates that the option is unsupported.
5970 		 */
5971 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5972 		    pru_socheckopt(so, sopt)) != 0) {
5973 			goto out;
5974 		}
5975 
5976 		error = 0;
5977 		switch (sopt->sopt_name) {
5978 		case SO_LINGER:
5979 		case SO_LINGER_SEC:
5980 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5981 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5982 			    so->so_linger : so->so_linger / hz;
5983 			error = sooptcopyout(sopt, &l, sizeof(l));
5984 			break;
5985 
5986 		case SO_USELOOPBACK:
5987 		case SO_DONTROUTE:
5988 		case SO_DEBUG:
5989 		case SO_KEEPALIVE:
5990 		case SO_REUSEADDR:
5991 		case SO_REUSEPORT:
5992 		case SO_BROADCAST:
5993 		case SO_OOBINLINE:
5994 		case SO_TIMESTAMP:
5995 		case SO_TIMESTAMP_MONOTONIC:
5996 		case SO_TIMESTAMP_CONTINUOUS:
5997 		case SO_DONTTRUNC:
5998 		case SO_WANTMORE:
5999 		case SO_WANTOOBFLAG:
6000 		case SO_NOWAKEFROMSLEEP:
6001 		case SO_NOAPNFALLBK:
6002 			optval = so->so_options & sopt->sopt_name;
6003 integer:
6004 			error = sooptcopyout(sopt, &optval, sizeof(optval));
6005 			break;
6006 
6007 		case SO_TYPE:
6008 			optval = so->so_type;
6009 			goto integer;
6010 
6011 		case SO_NREAD:
6012 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6013 				int pkt_total;
6014 				struct mbuf *m1;
6015 
6016 				pkt_total = 0;
6017 				m1 = so->so_rcv.sb_mb;
6018 				while (m1 != NULL) {
6019 					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
6020 						pkt_total += m1->m_len;
6021 					}
6022 					m1 = m1->m_next;
6023 				}
6024 				optval = pkt_total;
6025 			} else {
6026 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6027 			}
6028 			goto integer;
6029 
6030 		case SO_NUMRCVPKT:
6031 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6032 				int cnt = 0;
6033 				struct mbuf *m1;
6034 
6035 				m1 = so->so_rcv.sb_mb;
6036 				while (m1 != NULL) {
6037 					cnt += 1;
6038 					m1 = m1->m_nextpkt;
6039 				}
6040 				optval = cnt;
6041 				goto integer;
6042 			} else {
6043 				error = ENOPROTOOPT;
6044 				break;
6045 			}
6046 
6047 		case SO_NWRITE:
6048 			optval = so->so_snd.sb_cc;
6049 			goto integer;
6050 
6051 		case SO_ERROR:
6052 			optval = so->so_error;
6053 			so->so_error = 0;
6054 			goto integer;
6055 
6056 		case SO_SNDBUF: {
6057 			u_int32_t hiwat = so->so_snd.sb_hiwat;
6058 
6059 			if (so->so_snd.sb_flags & SB_UNIX) {
6060 				struct unpcb *unp =
6061 				    (struct unpcb *)(so->so_pcb);
6062 				if (unp != NULL && unp->unp_conn != NULL) {
6063 					hiwat += unp->unp_conn->unp_cc;
6064 				}
6065 			}
6066 
6067 			optval = hiwat;
6068 			goto integer;
6069 		}
6070 		case SO_RCVBUF:
6071 			optval = so->so_rcv.sb_hiwat;
6072 			goto integer;
6073 
6074 		case SO_SNDLOWAT:
6075 			optval = so->so_snd.sb_lowat;
6076 			goto integer;
6077 
6078 		case SO_RCVLOWAT:
6079 			optval = so->so_rcv.sb_lowat;
6080 			goto integer;
6081 
6082 		case SO_SNDTIMEO:
6083 		case SO_RCVTIMEO:
6084 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
6085 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6086 
6087 			error = sooptcopyout_timeval(sopt, &tv);
6088 			break;
6089 
6090 		case SO_NOSIGPIPE:
6091 			optval = (so->so_flags & SOF_NOSIGPIPE);
6092 			goto integer;
6093 
6094 		case SO_NOADDRERR:
6095 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6096 			goto integer;
6097 
6098 		case SO_REUSESHAREUID:
6099 			optval = (so->so_flags & SOF_REUSESHAREUID);
6100 			goto integer;
6101 
6102 
6103 		case SO_NOTIFYCONFLICT:
6104 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6105 			goto integer;
6106 
6107 		case SO_RESTRICTIONS:
6108 			optval = so_get_restrictions(so);
6109 			goto integer;
6110 
6111 		case SO_AWDL_UNRESTRICTED:
6112 			if (SOCK_DOM(so) == PF_INET ||
6113 			    SOCK_DOM(so) == PF_INET6) {
6114 				optval = inp_get_awdl_unrestricted(
6115 					sotoinpcb(so));
6116 				goto integer;
6117 			} else {
6118 				error = EOPNOTSUPP;
6119 			}
6120 			break;
6121 
6122 		case SO_INTCOPROC_ALLOW:
6123 			if (SOCK_DOM(so) == PF_INET6) {
6124 				optval = inp_get_intcoproc_allowed(
6125 					sotoinpcb(so));
6126 				goto integer;
6127 			} else {
6128 				error = EOPNOTSUPP;
6129 			}
6130 			break;
6131 
6132 		case SO_LABEL:
6133 			error = EOPNOTSUPP;
6134 			break;
6135 
6136 		case SO_PEERLABEL:
6137 			error = EOPNOTSUPP;
6138 			break;
6139 
6140 #ifdef __APPLE_API_PRIVATE
6141 		case SO_UPCALLCLOSEWAIT:
6142 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6143 			goto integer;
6144 #endif
6145 		case SO_RANDOMPORT:
6146 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6147 			goto integer;
6148 
6149 		case SO_NP_EXTENSIONS: {
6150 			struct so_np_extensions sonpx = {};
6151 
6152 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6153 			    SONPX_SETOPTSHUT : 0;
6154 			sonpx.npx_mask = SONPX_MASK_VALID;
6155 
6156 			error = sooptcopyout(sopt, &sonpx,
6157 			    sizeof(struct so_np_extensions));
6158 			break;
6159 		}
6160 
6161 		case SO_TRAFFIC_CLASS:
6162 			optval = so->so_traffic_class;
6163 			goto integer;
6164 
6165 		case SO_RECV_TRAFFIC_CLASS:
6166 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6167 			goto integer;
6168 
6169 #if (DEVELOPMENT || DEBUG)
6170 		case SO_TRAFFIC_CLASS_DBG:
6171 			error = sogetopt_tcdbg(so, sopt);
6172 			break;
6173 #endif /* (DEVELOPMENT || DEBUG) */
6174 
6175 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6176 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6177 			goto integer;
6178 
6179 		case SO_DEFUNCTOK:
6180 			optval = !(so->so_flags & SOF_NODEFUNCT);
6181 			goto integer;
6182 
6183 		case SO_ISDEFUNCT:
6184 			optval = (so->so_flags & SOF_DEFUNCT);
6185 			goto integer;
6186 
6187 		case SO_OPPORTUNISTIC:
6188 			optval = so_get_opportunistic(so);
6189 			goto integer;
6190 
6191 		case SO_FLUSH:
6192 			/* This option is not gettable */
6193 			error = EINVAL;
6194 			break;
6195 
6196 		case SO_RECV_ANYIF:
6197 			optval = so_get_recv_anyif(so);
6198 			goto integer;
6199 
6200 		case SO_TRAFFIC_MGT_BACKGROUND:
6201 			/* This option is handled by lower layer(s) */
6202 			if (so->so_proto != NULL &&
6203 			    so->so_proto->pr_ctloutput != NULL) {
6204 				(void) so->so_proto->pr_ctloutput(so, sopt);
6205 			}
6206 			break;
6207 
6208 #if FLOW_DIVERT
6209 		case SO_FLOW_DIVERT_TOKEN:
6210 			error = flow_divert_token_get(so, sopt);
6211 			break;
6212 #endif  /* FLOW_DIVERT */
6213 
6214 #if NECP
6215 		case SO_NECP_ATTRIBUTES:
6216 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6217 				/* Handled by MPTCP itself */
6218 				break;
6219 			}
6220 
6221 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6222 				error = EINVAL;
6223 				goto out;
6224 			}
6225 
6226 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6227 			break;
6228 
6229 		case SO_NECP_CLIENTUUID: {
6230 			uuid_t *ncu;
6231 
6232 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6233 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6234 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6235 				ncu = &sotoinpcb(so)->necp_client_uuid;
6236 			} else {
6237 				error = EINVAL;
6238 				goto out;
6239 			}
6240 
6241 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6242 			break;
6243 		}
6244 
6245 		case SO_NECP_LISTENUUID: {
6246 			uuid_t *nlu;
6247 
6248 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6249 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6250 					nlu = &sotoinpcb(so)->necp_client_uuid;
6251 				} else {
6252 					error = ENOENT;
6253 					goto out;
6254 				}
6255 			} else {
6256 				error = EINVAL;
6257 				goto out;
6258 			}
6259 
6260 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6261 			break;
6262 		}
6263 
6264 		case SO_RESOLVER_SIGNATURE: {
6265 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6266 				error = EINVAL;
6267 				goto out;
6268 			}
6269 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6270 			break;
6271 		}
6272 
6273 #endif /* NECP */
6274 
6275 #if CONTENT_FILTER
6276 		case SO_CFIL_SOCK_ID: {
6277 			cfil_sock_id_t sock_id;
6278 
6279 			sock_id = cfil_sock_id_from_socket(so);
6280 
6281 			error = sooptcopyout(sopt, &sock_id,
6282 			    sizeof(cfil_sock_id_t));
6283 			break;
6284 		}
6285 #endif  /* CONTENT_FILTER */
6286 
6287 		case SO_EXTENDED_BK_IDLE:
6288 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6289 			goto integer;
6290 		case SO_MARK_CELLFALLBACK:
6291 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6292 			    ? 1 : 0;
6293 			goto integer;
6294 		case SO_FALLBACK_MODE:
6295 			optval = so->so_fallback_mode;
6296 			goto integer;
6297 		case SO_MARK_KNOWN_TRACKER: {
6298 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6299 			    ? 1 : 0;
6300 			goto integer;
6301 		}
6302 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6303 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6304 			    ? 1 : 0;
6305 			goto integer;
6306 		}
6307 		case SO_MARK_APPROVED_APP_DOMAIN: {
6308 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6309 			    ? 1 : 0;
6310 			goto integer;
6311 		}
6312 		case SO_NET_SERVICE_TYPE: {
6313 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6314 				optval = so->so_netsvctype;
6315 			} else {
6316 				optval = NET_SERVICE_TYPE_BE;
6317 			}
6318 			goto integer;
6319 		}
6320 		case SO_NETSVC_MARKING_LEVEL:
6321 			optval = so_get_netsvc_marking_level(so);
6322 			goto integer;
6323 
6324 		case SO_MPKL_SEND_INFO: {
6325 			struct so_mpkl_send_info so_mpkl_send_info;
6326 
6327 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6328 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6329 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6330 			    sizeof(struct so_mpkl_send_info));
6331 			break;
6332 		}
6333 		case SO_MARK_WAKE_PKT:
6334 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6335 			goto integer;
6336 		case SO_RECV_WAKE_PKT:
6337 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6338 			goto integer;
6339 		case SO_APPLICATION_ID: {
6340 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6341 				error = EINVAL;
6342 				goto out;
6343 			}
6344 			so_application_id_t application_id = { 0 };
6345 			application_id.uid = kauth_cred_getuid(so->so_cred);
6346 			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
6347 			application_id.persona_id = so->so_persona_id;
6348 			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
6349 			break;
6350 		}
6351 		case SO_MARK_DOMAIN_INFO_SILENT:
6352 			optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
6353 			    ? 1 : 0;
6354 			goto integer;
6355 		default:
6356 			error = ENOPROTOOPT;
6357 			break;
6358 		}
6359 	}
6360 out:
6361 	if (dolock) {
6362 		socket_unlock(so, 1);
6363 	}
6364 	return error;
6365 }
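
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * most SOL_SOCKET options above funnel through the `integer:' label and
 * read back as a plain int, e.g. SO_NREAD (bytes of protocol data
 * pending) or SO_ERROR (pending error, cleared by the read):
 *
 *	#include <sys/socket.h>
 *
 *	static int
 *	pending_bytes(int fd)
 *	{
 *		int nread = 0;
 *		socklen_t len = sizeof(nread);
 *
 *		if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) != 0)
 *			return -1;
 *		return nread;
 *	}
 */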
6366 
6367 /*
6368  * The size limits on our soopt_getm are different from those on FreeBSD.
6369  * We limit the size of options to MCLBYTES. This will have to change
6370  * if we need to define options that need more space than MCLBYTES.
6371  */
6372 int
6373 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6374 {
6375 	struct mbuf *m, *m_prev;
6376 	int sopt_size = (int)sopt->sopt_valsize;
6377 	int how;
6378 
6379 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6380 		return EMSGSIZE;
6381 	}
6382 
6383 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6384 	MGET(m, how, MT_DATA);
6385 	if (m == NULL) {
6386 		return ENOBUFS;
6387 	}
6388 	if (sopt_size > MLEN) {
6389 		MCLGET(m, how);
6390 		if ((m->m_flags & M_EXT) == 0) {
6391 			m_free(m);
6392 			return ENOBUFS;
6393 		}
6394 		m->m_len = min(MCLBYTES, sopt_size);
6395 	} else {
6396 		m->m_len = min(MLEN, sopt_size);
6397 	}
6398 	sopt_size -= m->m_len;
6399 	*mp = m;
6400 	m_prev = m;
6401 
6402 	while (sopt_size > 0) {
6403 		MGET(m, how, MT_DATA);
6404 		if (m == NULL) {
6405 			m_freem(*mp);
6406 			return ENOBUFS;
6407 		}
6408 		if (sopt_size > MLEN) {
6409 			MCLGET(m, how);
6410 			if ((m->m_flags & M_EXT) == 0) {
6411 				m_freem(*mp);
6412 				m_freem(m);
6413 				return ENOBUFS;
6414 			}
6415 			m->m_len = min(MCLBYTES, sopt_size);
6416 		} else {
6417 			m->m_len = min(MLEN, sopt_size);
6418 		}
6419 		sopt_size -= m->m_len;
6420 		m_prev->m_next = m;
6421 		m_prev = m;
6422 	}
6423 	return 0;
6424 }
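
/*
 * Editor's note (not part of the xnu source): a worked example of the
 * sizing above.  With sopt_valsize == 1500, the first MGET gets an mbuf;
 * 1500 > MLEN forces an MCLGET, m_len becomes min(MCLBYTES, 1500) == 1500
 * and sopt_size drops to 0.  Because sopt_size is capped at MCLBYTES on
 * entry, the first mbuf (or cluster) always covers the whole option, so
 * the chain-extension while loop never executes today and is retained
 * only as a safeguard.
 */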
6425 
6426 /* copyin sopt data into mbuf chain */
6427 int
6428 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6429 {
6430 	struct mbuf *m0 = m;
6431 
6432 	if (sopt->sopt_val == USER_ADDR_NULL) {
6433 		return 0;
6434 	}
6435 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6436 		if (sopt->sopt_p != kernproc) {
6437 			int error;
6438 
6439 			error = copyin(sopt->sopt_val, mtod(m, char *),
6440 			    m->m_len);
6441 			if (error != 0) {
6442 				m_freem(m0);
6443 				return error;
6444 			}
6445 		} else {
6446 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6447 			    CAST_DOWN(caddr_t, sopt->sopt_val),
6448 			    m->m_len);
6449 			bcopy(tmp, mtod(m, char *), m->m_len);
6450 		}
6451 		sopt->sopt_valsize -= m->m_len;
6452 		sopt->sopt_val += m->m_len;
6453 		m = m->m_next;
6454 	}
6455 	/* the chain should have been allocated with enough space by ip6_sooptmcopyin() */
6456 	if (m != NULL) {
6457 		panic("soopt_mcopyin");
6458 		/* NOTREACHED */
6459 	}
6460 	return 0;
6461 }
6462 
6463 /* copyout mbuf chain data into soopt */
6464 int
6465 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6466 {
6467 	struct mbuf *m0 = m;
6468 	size_t valsize = 0;
6469 
6470 	if (sopt->sopt_val == USER_ADDR_NULL) {
6471 		return 0;
6472 	}
6473 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6474 		if (sopt->sopt_p != kernproc) {
6475 			int error;
6476 
6477 			error = copyout(mtod(m, char *), sopt->sopt_val,
6478 			    m->m_len);
6479 			if (error != 0) {
6480 				m_freem(m0);
6481 				return error;
6482 			}
6483 		} else {
6484 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6485 			    CAST_DOWN(caddr_t, sopt->sopt_val),
6486 			    m->m_len);
6487 
6488 			bcopy(mtod(m, char *), tmp, m->m_len);
6489 		}
6490 		sopt->sopt_valsize -= m->m_len;
6491 		sopt->sopt_val += m->m_len;
6492 		valsize += m->m_len;
6493 		m = m->m_next;
6494 	}
6495 	if (m != NULL) {
6496 		/* user-land should have supplied a large enough soopt buffer */
6497 		m_freem(m0);
6498 		return EINVAL;
6499 	}
6500 	sopt->sopt_valsize = valsize;
6501 	return 0;
6502 }
6503 
6504 void
6505 sohasoutofband(struct socket *so)
6506 {
6507 	if (so->so_pgid < 0) {
6508 		gsignal(-so->so_pgid, SIGURG);
6509 	} else if (so->so_pgid > 0) {
6510 		proc_signal(so->so_pgid, SIGURG);
6511 	}
6512 	selwakeup(&so->so_rcv.sb_sel);
6513 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6514 		KNOTE(&so->so_rcv.sb_sel.si_note,
6515 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6516 	}
6517 }
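
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * so_pgid is the owner set via fcntl(2), so a process arms itself for
 * the SIGURG delivered above like this ('on_urg' is a hypothetical
 * signal handler):
 *
 *	#include <fcntl.h>
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	static void
 *	arm_sigurg(int fd)
 *	{
 *		signal(SIGURG, on_urg);		// handle urgent data
 *		fcntl(fd, F_SETOWN, getpid());	// direct SIGURG at this pid
 *	}
 */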
6518 
6519 int
6520 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6521 {
6522 #pragma unused(cred)
6523 	struct proc *p = current_proc();
6524 	int revents = 0;
6525 
6526 	socket_lock(so, 1);
6527 	so_update_last_owner_locked(so, PROC_NULL);
6528 	so_update_policy(so);
6529 
6530 	if (events & (POLLIN | POLLRDNORM)) {
6531 		if (soreadable(so)) {
6532 			revents |= events & (POLLIN | POLLRDNORM);
6533 		}
6534 	}
6535 
6536 	if (events & (POLLOUT | POLLWRNORM)) {
6537 		if (sowriteable(so)) {
6538 			revents |= events & (POLLOUT | POLLWRNORM);
6539 		}
6540 	}
6541 
6542 	if (events & (POLLPRI | POLLRDBAND)) {
6543 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6544 			revents |= events & (POLLPRI | POLLRDBAND);
6545 		}
6546 	}
6547 
6548 	if (revents == 0) {
6549 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6550 			/*
6551 			 * Darwin sets the flag first,
6552 			 * BSD calls selrecord first
6553 			 */
6554 			so->so_rcv.sb_flags |= SB_SEL;
6555 			selrecord(p, &so->so_rcv.sb_sel, wql);
6556 		}
6557 
6558 		if (events & (POLLOUT | POLLWRNORM)) {
6559 			/*
6560 			 * Darwin sets the flag first,
6561 			 * BSD calls selrecord first
6562 			 */
6563 			so->so_snd.sb_flags |= SB_SEL;
6564 			selrecord(p, &so->so_snd.sb_sel, wql);
6565 		}
6566 	}
6567 
6568 	socket_unlock(so, 1);
6569 	return revents;
6570 }
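
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * the revents computed above surface directly through poll(2); POLLPRI
 * reports urgent data, matching the oobmark/SS_RCVATMARK test:
 *
 *	#include <poll.h>
 *
 *	static int
 *	wait_readable(int fd, int timeout_ms)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *
 *		return poll(&pfd, 1, timeout_ms);  // >0 ready, 0 timeout, <0 error
 *	}
 */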
6571 
6572 int
6573 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6574 {
6575 	struct socket *so = (struct socket *)fp_get_data(fp);
6576 	int result;
6577 
6578 	socket_lock(so, 1);
6579 	so_update_last_owner_locked(so, PROC_NULL);
6580 	so_update_policy(so);
6581 
6582 	switch (kn->kn_filter) {
6583 	case EVFILT_READ:
6584 		kn->kn_filtid = EVFILTID_SOREAD;
6585 		break;
6586 	case EVFILT_WRITE:
6587 		kn->kn_filtid = EVFILTID_SOWRITE;
6588 		break;
6589 	case EVFILT_SOCK:
6590 		kn->kn_filtid = EVFILTID_SCK;
6591 		break;
6592 	case EVFILT_EXCEPT:
6593 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6594 		break;
6595 	default:
6596 		socket_unlock(so, 1);
6597 		knote_set_error(kn, EINVAL);
6598 		return 0;
6599 	}
6600 
6601 	/*
6602 	 * call the appropriate sub-filter attach
6603 	 * with the socket still locked
6604 	 */
6605 	result = knote_fops(kn)->f_attach(kn, kev);
6606 
6607 	socket_unlock(so, 1);
6608 
6609 	return result;
6610 }
6611 
6612 static int
6613 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6614 {
6615 	int retval = 0;
6616 	int64_t data = 0;
6617 
6618 	if (so->so_options & SO_ACCEPTCONN) {
6619 		/*
6620 		 * Radar 6615193: handle the listen case dynamically for the
6621 		 * kqueue read filter. This allows listen() to be called after
6622 		 * registering the kqueue EVFILT_READ.
6623 		 */
6624 
6625 		retval = !TAILQ_EMPTY(&so->so_comp);
6626 		data = so->so_qlen;
6627 		goto out;
6628 	}
6629 
6630 	/* socket isn't a listener */
6631 	/*
6632 	 * NOTE_LOWAT specifies new low water mark in data, i.e.
6633 	 * the bytes of protocol data. We therefore exclude any
6634 	 * control bytes.
6635 	 */
6636 	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6637 
6638 	if (kn->kn_sfflags & NOTE_OOB) {
6639 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6640 			kn->kn_fflags |= NOTE_OOB;
6641 			data -= so->so_oobmark;
6642 			retval = 1;
6643 			goto out;
6644 		}
6645 	}
6646 
6647 	if ((so->so_state & SS_CANTRCVMORE)
6648 #if CONTENT_FILTER
6649 	    && cfil_sock_data_pending(&so->so_rcv) == 0
6650 #endif /* CONTENT_FILTER */
6651 	    ) {
6652 		kn->kn_flags |= EV_EOF;
6653 		kn->kn_fflags = so->so_error;
6654 		retval = 1;
6655 		goto out;
6656 	}
6657 
6658 	if (so->so_error) {     /* temporary udp error */
6659 		retval = 1;
6660 		goto out;
6661 	}
6662 
6663 	int64_t lowwat = so->so_rcv.sb_lowat;
6664 	/*
6665 	 * Ensure that when NOTE_LOWAT is used, the derived
6666 	 * low water mark is bounded by socket's rcv buf's
6667 	 * high and low water mark values.
6668 	 */
6669 	if (kn->kn_sfflags & NOTE_LOWAT) {
6670 		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6671 			lowwat = so->so_rcv.sb_hiwat;
6672 		} else if (kn->kn_sdata > lowwat) {
6673 			lowwat = kn->kn_sdata;
6674 		}
6675 	}
6676 
6677 	/*
6678 	 * While the `data` field is the amount of data to read,
6679 	 * 0-sized packets need to wake up the kqueue, see 58140856,
6680 	 * so we need to take control bytes into account too.
6681 	 */
6682 	retval = (so->so_rcv.sb_cc >= lowwat);
6683 
6684 out:
6685 	if (retval && kev) {
6686 		knote_fill_kevent(kn, kev, data);
6687 	}
6688 	return retval;
6689 }
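
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * driving the read filter above with NOTE_LOWAT; the requested threshold
 * is clamped by the code above to the receive buffer's high water mark:
 *
 *	#include <sys/event.h>
 *	#include <unistd.h>
 *
 *	static long
 *	wait_for_1k(int fd)
 *	{
 *		int kq = kqueue();
 *		struct kevent kev;
 *		long nbytes = -1;
 *
 *		// Fire only once at least 1024 bytes are readable.
 *		EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 1024, NULL);
 *		if (kevent(kq, &kev, 1, NULL, 0, NULL) == 0 &&
 *		    kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
 *			nbytes = (long)kev.data;	// bytes ready to read
 *		close(kq);
 *		return nbytes;
 *	}
 */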
6690 
6691 static int
6692 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6693 {
6694 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6695 
6696 	/* socket locked */
6697 
6698 	/*
6699 	 * If the caller explicitly asked for OOB results (e.g. poll())
6700 	 * from EVFILT_READ, then save that off in the hookid field
6701 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6702 	 */
6703 	if (kn->kn_filter == EVFILT_READ &&
6704 	    kn->kn_flags & EV_OOBAND) {
6705 		kn->kn_flags &= ~EV_OOBAND;
6706 		kn->kn_hook32 = EV_OOBAND;
6707 	} else {
6708 		kn->kn_hook32 = 0;
6709 	}
6710 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6711 		so->so_rcv.sb_flags |= SB_KNOTE;
6712 	}
6713 
6714 	/* indicate if event is already fired */
6715 	return filt_soread_common(kn, NULL, so);
6716 }
6717 
6718 static void
6719 filt_sordetach(struct knote *kn)
6720 {
6721 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6722 
6723 	socket_lock(so, 1);
6724 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6725 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6726 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6727 		}
6728 	}
6729 	socket_unlock(so, 1);
6730 }
6731 
6732 /*ARGSUSED*/
6733 static int
6734 filt_soread(struct knote *kn, long hint)
6735 {
6736 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6737 	int retval;
6738 
6739 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6740 		socket_lock(so, 1);
6741 	}
6742 
6743 	retval = filt_soread_common(kn, NULL, so);
6744 
6745 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6746 		socket_unlock(so, 1);
6747 	}
6748 
6749 	return retval;
6750 }
6751 
6752 static int
6753 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6754 {
6755 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6756 	int retval;
6757 
6758 	socket_lock(so, 1);
6759 
6760 	/* save off the new input fflags and data */
6761 	kn->kn_sfflags = kev->fflags;
6762 	kn->kn_sdata = kev->data;
6763 
6764 	/* determine if changes result in fired events */
6765 	retval = filt_soread_common(kn, NULL, so);
6766 
6767 	socket_unlock(so, 1);
6768 
6769 	return retval;
6770 }
6771 
6772 static int
6773 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6774 {
6775 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6776 	int retval;
6777 
6778 	socket_lock(so, 1);
6779 	retval = filt_soread_common(kn, kev, so);
6780 	socket_unlock(so, 1);
6781 
6782 	return retval;
6783 }
6784 
6785 int
6786 so_wait_for_if_feedback(struct socket *so)
6787 {
6788 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6789 	    (so->so_state & SS_ISCONNECTED)) {
6790 		struct inpcb *inp = sotoinpcb(so);
6791 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6792 			return 1;
6793 		}
6794 	}
6795 	return 0;
6796 }
6797 
6798 static int
6799 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6800 {
6801 	int ret = 0;
6802 	int64_t data = sbspace(&so->so_snd);
6803 
6804 	if (so->so_state & SS_CANTSENDMORE) {
6805 		kn->kn_flags |= EV_EOF;
6806 		kn->kn_fflags = so->so_error;
6807 		ret = 1;
6808 		goto out;
6809 	}
6810 
6811 	if (so->so_error) {     /* temporary udp error */
6812 		ret = 1;
6813 		goto out;
6814 	}
6815 
6816 	if (!socanwrite(so)) {
6817 		ret = 0;
6818 		goto out;
6819 	}
6820 
6821 	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6822 		ret = 1;
6823 		goto out;
6824 	}
6825 
6826 	int64_t lowwat = so->so_snd.sb_lowat;
6827 	const int64_t hiwat = so->so_snd.sb_hiwat;
6828 	/*
6829 	 * Deal with connected UNIX domain sockets which
6830 	 * rely on the fact that the sender's socket buffer is
6831 	 * actually the receiver's socket buffer.
6832 	 */
6833 	if (SOCK_DOM(so) == PF_LOCAL) {
6834 		struct unpcb *unp = sotounpcb(so);
6835 		if (unp != NULL && unp->unp_conn != NULL &&
6836 		    unp->unp_conn->unp_socket != NULL) {
6837 			struct socket *so2 = unp->unp_conn->unp_socket;
6838 			/*
6839 			 * At this point we know that `so' is locked
6840 			 * and that `unp_conn` isn't going to change.
6841 			 * However, we don't lock `so2` because doing so
6842 			 * may require unlocking `so'
6843 			 * (see unp_get_locks_in_order()).
6844 			 *
6845 			 * Two cases can happen:
6846 			 *
6847 			 * 1) we return 1 and tell the application that
6848 			 *    it can write.  Meanwhile, another thread
6849 			 *    fills up the socket buffer.  This will either
6850 			 *    lead to a blocking send or EWOULDBLOCK
6851 			 *    which the application should deal with.
6852 			 * 2) we return 0 and tell the application that
6853 			 *    the socket is not writable.  Meanwhile,
6854 			 *    another thread depletes the receive socket
6855 			 *    buffer. In this case the application will
6856 			 *    be woken up by sb_notify().
6857 			 *
6858 			 * MIN() is required because otherwise sosendcheck()
6859 			 * may return EWOULDBLOCK since it only considers
6860 			 * so->so_snd.
6861 			 */
6862 			data = MIN(data, sbspace(&so2->so_rcv));
6863 		}
6864 	}
6865 
6866 	if (kn->kn_sfflags & NOTE_LOWAT) {
6867 		if (kn->kn_sdata > hiwat) {
6868 			lowwat = hiwat;
6869 		} else if (kn->kn_sdata > lowwat) {
6870 			lowwat = kn->kn_sdata;
6871 		}
6872 	}
6873 
6874 	if (data > 0 && data >= lowwat) {
6875 		if ((so->so_flags & SOF_NOTSENT_LOWAT)
6876 #if (DEBUG || DEVELOPMENT)
6877 		    && so_notsent_lowat_check == 1
6878 #endif /* DEBUG || DEVELOPMENT */
6879 		    ) {
6880 			if ((SOCK_DOM(so) == PF_INET ||
6881 			    SOCK_DOM(so) == PF_INET6) &&
6882 			    so->so_type == SOCK_STREAM) {
6883 				ret = tcp_notsent_lowat_check(so);
6884 			}
6885 #if MPTCP
6886 			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6887 			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
6888 				ret = mptcp_notsent_lowat_check(so);
6889 			}
6890 #endif
6891 			else {
6892 				ret = 1;
6893 				goto out;
6894 			}
6895 		} else {
6896 			ret = 1;
6897 		}
6898 	}
6899 	if (so_wait_for_if_feedback(so)) {
6900 		ret = 0;
6901 	}
6902 
6903 out:
6904 	if (ret && kev) {
6905 		knote_fill_kevent(kn, kev, data);
6906 	}
6907 	return ret;
6908 }
6909 
6910 static int
6911 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6912 {
6913 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6914 
6915 	/* socket locked */
6916 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6917 		so->so_snd.sb_flags |= SB_KNOTE;
6918 	}
6919 
6920 	/* determine if it's already fired */
6921 	return filt_sowrite_common(kn, NULL, so);
6922 }
6923 
6924 static void
6925 filt_sowdetach(struct knote *kn)
6926 {
6927 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6928 	socket_lock(so, 1);
6929 
6930 	if (so->so_snd.sb_flags & SB_KNOTE) {
6931 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6932 			so->so_snd.sb_flags &= ~SB_KNOTE;
6933 		}
6934 	}
6935 	socket_unlock(so, 1);
6936 }
6937 
6938 /*ARGSUSED*/
6939 static int
6940 filt_sowrite(struct knote *kn, long hint)
6941 {
6942 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6943 	int ret;
6944 
6945 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6946 		socket_lock(so, 1);
6947 	}
6948 
6949 	ret = filt_sowrite_common(kn, NULL, so);
6950 
6951 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6952 		socket_unlock(so, 1);
6953 	}
6954 
6955 	return ret;
6956 }
6957 
6958 static int
6959 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6960 {
6961 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6962 	int ret;
6963 
6964 	socket_lock(so, 1);
6965 
6966 	/* save off the new input fflags and data */
6967 	kn->kn_sfflags = kev->fflags;
6968 	kn->kn_sdata = kev->data;
6969 
6970 	/* determine if these changes result in a triggered event */
6971 	ret = filt_sowrite_common(kn, NULL, so);
6972 
6973 	socket_unlock(so, 1);
6974 
6975 	return ret;
6976 }
6977 
6978 static int
6979 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6980 {
6981 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6982 	int ret;
6983 
6984 	socket_lock(so, 1);
6985 	ret = filt_sowrite_common(kn, kev, so);
6986 	socket_unlock(so, 1);
6987 
6988 	return ret;
6989 }
6990 
6991 static int
6992 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6993     struct socket *so, long ev_hint)
6994 {
6995 	int ret = 0;
6996 	int64_t data = 0;
6997 	uint32_t level_trigger = 0;
6998 
6999 	if (ev_hint & SO_FILT_HINT_CONNRESET) {
7000 		kn->kn_fflags |= NOTE_CONNRESET;
7001 	}
7002 	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
7003 		kn->kn_fflags |= NOTE_TIMEOUT;
7004 	}
7005 	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
7006 		kn->kn_fflags |= NOTE_NOSRCADDR;
7007 	}
7008 	if (ev_hint & SO_FILT_HINT_IFDENIED) {
7009 		kn->kn_fflags |= NOTE_IFDENIED;
7010 	}
7011 	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
7012 		kn->kn_fflags |= NOTE_KEEPALIVE;
7013 	}
7014 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7015 		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7016 	}
7017 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7018 		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7019 	}
7020 	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7021 	    (so->so_state & SS_ISCONNECTED)) {
7022 		kn->kn_fflags |= NOTE_CONNECTED;
7023 		level_trigger |= NOTE_CONNECTED;
7024 	}
7025 	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7026 	    (so->so_state & SS_ISDISCONNECTED)) {
7027 		kn->kn_fflags |= NOTE_DISCONNECTED;
7028 		level_trigger |= NOTE_DISCONNECTED;
7029 	}
7030 	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7031 		if (so->so_proto != NULL &&
7032 		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7033 			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7034 		}
7035 	}
7036 	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7037 	    tcp_notify_ack_active(so)) {
7038 		kn->kn_fflags |= NOTE_NOTIFY_ACK;
7039 	}
7040 	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
7041 		kn->kn_fflags |= NOTE_WAKE_PKT;
7042 	}
7043 
7044 	if ((so->so_state & SS_CANTRCVMORE)
7045 #if CONTENT_FILTER
7046 	    && cfil_sock_data_pending(&so->so_rcv) == 0
7047 #endif /* CONTENT_FILTER */
7048 	    ) {
7049 		kn->kn_fflags |= NOTE_READCLOSED;
7050 		level_trigger |= NOTE_READCLOSED;
7051 	}
7052 
7053 	if (so->so_state & SS_CANTSENDMORE) {
7054 		kn->kn_fflags |= NOTE_WRITECLOSED;
7055 		level_trigger |= NOTE_WRITECLOSED;
7056 	}
7057 
7058 	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7059 	    (so->so_flags & SOF_SUSPENDED)) {
7060 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7061 
7062 		/* If resume event was delivered before, reset it */
7063 		kn->kn_hook32 &= ~NOTE_RESUME;
7064 
7065 		kn->kn_fflags |= NOTE_SUSPEND;
7066 		level_trigger |= NOTE_SUSPEND;
7067 	}
7068 
7069 	if ((ev_hint & SO_FILT_HINT_RESUME) ||
7070 	    (so->so_flags & SOF_SUSPENDED) == 0) {
7071 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7072 
7073 		/* If suspend event was delivered before, reset it */
7074 		kn->kn_hook32 &= ~NOTE_SUSPEND;
7075 
7076 		kn->kn_fflags |= NOTE_RESUME;
7077 		level_trigger |= NOTE_RESUME;
7078 	}
7079 
7080 	if (so->so_error != 0) {
7081 		ret = 1;
7082 		data = so->so_error;
7083 		kn->kn_flags |= EV_EOF;
7084 	} else {
7085 		u_int32_t data32 = 0;
7086 		get_sockev_state(so, &data32);
7087 		data = data32;
7088 	}
7089 
7090 	/* Reset any events that are not requested on this knote */
7091 	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7092 	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7093 
7094 	/* Find the level-triggered events that have already been delivered */
7095 	level_trigger &= kn->kn_hook32;
7096 	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7097 
7098 	/* Do not deliver level-triggered events more than once */
7099 	if ((kn->kn_fflags & ~level_trigger) != 0) {
7100 		ret = 1;
7101 	}
7102 
7103 	if (ret && kev) {
7104 		/*
7105 		 * Store the state of the events being delivered. This
7106 		 * state can be used to deliver level triggered events
7107 		 * at least once and still avoid waking up the application
7108 		 * multiple times as long as the event is active.
7109 		 */
7110 		if (kn->kn_fflags != 0) {
7111 			kn->kn_hook32 |= (kn->kn_fflags &
7112 			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7113 		}
7114 
7115 		/*
7116 		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
7117 		 * only one of them and remember which one was
7118 		 * delivered last.
7119 		 */
7120 		if (kn->kn_fflags & NOTE_SUSPEND) {
7121 			kn->kn_hook32 &= ~NOTE_RESUME;
7122 		}
7123 		if (kn->kn_fflags & NOTE_RESUME) {
7124 			kn->kn_hook32 &= ~NOTE_SUSPEND;
7125 		}
7126 
7127 		knote_fill_kevent(kn, kev, data);
7128 	}
7129 	return ret;
7130 }
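
/*
 * Illustrative sketch (editor's addition, not part of the xnu source):
 * EVFILT_SOCK and its NOTE_* bits are PRIVATE in <sys/event.h>, so the
 * fragment below builds only with private headers; it subscribes to the
 * connect/disconnect transitions tracked above and reads the fired bits
 * back from kev.fflags:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_SOCK, EV_ADD,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNRESET, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register
 *	// a later kevent() wait returns with kev.fflags naming the events
 */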
7131 
7132 static int
7133 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7134 {
7135 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7136 
7137 	/* socket locked */
7138 	kn->kn_hook32 = 0;
7139 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7140 		so->so_flags |= SOF_KNOTE;
7141 	}
7142 
7143 	/* determine if event already fired */
7144 	return filt_sockev_common(kn, NULL, so, 0);
7145 }
7146 
7147 static void
7148 filt_sockdetach(struct knote *kn)
7149 {
7150 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7151 	socket_lock(so, 1);
7152 
7153 	if ((so->so_flags & SOF_KNOTE) != 0) {
7154 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7155 			so->so_flags &= ~SOF_KNOTE;
7156 		}
7157 	}
7158 	socket_unlock(so, 1);
7159 }
7160 
7161 static int
7162 filt_sockev(struct knote *kn, long hint)
7163 {
7164 	int ret = 0, locked = 0;
7165 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7166 	long ev_hint = (hint & SO_FILT_HINT_EV);
7167 
7168 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7169 		socket_lock(so, 1);
7170 		locked = 1;
7171 	}
7172 
7173 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7174 
7175 	if (locked) {
7176 		socket_unlock(so, 1);
7177 	}
7178 
7179 	return ret;
7180 }
7181 
7182 
7183 
7184 /*
7185  *	filt_socktouch - update event state
7186  */
7187 static int
7188 filt_socktouch(
7189 	struct knote *kn,
7190 	struct kevent_qos_s *kev)
7191 {
7192 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7193 	uint32_t changed_flags;
7194 	int ret;
7195 
7196 	socket_lock(so, 1);
7197 
7198 	/* save off the [result] data and fflags */
7199 	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7200 
7201 	/* save off the new input fflags and data */
7202 	kn->kn_sfflags = kev->fflags;
7203 	kn->kn_sdata = kev->data;
7204 
7205 	/* restrict the current results to the (smaller?) set of new interest */
7206 	/*
7207 	 * For compatibility with previous implementations, we leave kn_fflags
7208 	 * as they were before.
7209 	 */
7210 	//kn->kn_fflags &= kev->fflags;
7211 
7212 	/*
7213 	 * Since we keep track of events that are already
7214 	 * delivered, if any of those events are not requested
7215 	 * anymore the state related to them can be reset
7216 	 */
7217 	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7218 
7219 	/* determine if we have events to deliver */
7220 	ret = filt_sockev_common(kn, NULL, so, 0);
7221 
7222 	socket_unlock(so, 1);
7223 
7224 	return ret;
7225 }
7226 
7227 /*
7228  *	filt_sockprocess - query event fired state and return data
7229  */
7230 static int
7231 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7232 {
7233 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7234 	int ret = 0;
7235 
7236 	socket_lock(so, 1);
7237 
7238 	ret = filt_sockev_common(kn, kev, so, 0);
7239 
7240 	socket_unlock(so, 1);
7241 
7242 	return ret;
7243 }
7244 
7245 void
7246 get_sockev_state(struct socket *so, u_int32_t *statep)
7247 {
7248 	u_int32_t state = *(statep);
7249 
7250 	/*
7251 	 * If the state variable was already set by a previous event,
7252 	 * leave it untouched.
7253 	 */
7254 	if (state != 0) {
7255 		return;
7256 	}
7257 
7258 	if (so->so_state & SS_ISCONNECTED) {
7259 		state |= SOCKEV_CONNECTED;
7260 	} else {
7261 		state &= ~(SOCKEV_CONNECTED);
7262 	}
7263 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7264 	*(statep) = state;
7265 }
7266 
7267 #define SO_LOCK_HISTORY_STR_LEN \
7268 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7269 
7270 __private_extern__ const char *
7271 solockhistory_nr(struct socket *so)
7272 {
7273 	size_t n = 0;
7274 	int i;
7275 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7276 
7277 	bzero(lock_history_str, sizeof(lock_history_str));
7278 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7279 		n += scnprintf(lock_history_str + n,
7280 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7281 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7282 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7283 	}
7284 	return __unsafe_null_terminated_from_indexable(lock_history_str);
7285 }
7286 
7287 lck_mtx_t *
7288 socket_getlock(struct socket *so, int flags)
7289 {
7290 	if (so->so_proto->pr_getlock != NULL) {
7291 		return (*so->so_proto->pr_getlock)(so, flags);
7292 	} else {
7293 		return so->so_proto->pr_domain->dom_mtx;
7294 	}
7295 }
7296 
7297 void
7298 socket_lock(struct socket *so, int refcount)
7299 {
7300 	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
7301 
7302 	if (so->so_proto->pr_lock) {
7303 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7304 	} else {
7305 #ifdef MORE_LOCKING_DEBUG
7306 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7307 		    LCK_MTX_ASSERT_NOTOWNED);
7308 #endif
7309 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7310 		if (refcount) {
7311 			so->so_usecount++;
7312 		}
7313 		so->lock_lr[so->next_lock_lr] = lr_saved;
7314 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7315 	}
7316 }
7317 
7318 void
7319 socket_lock_assert_owned(struct socket *so)
7320 {
7321 	lck_mtx_t *mutex_held;
7322 
7323 	if (so->so_proto->pr_getlock != NULL) {
7324 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7325 	} else {
7326 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7327 	}
7328 
7329 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7330 }
7331 
7332 int
7333 socket_try_lock(struct socket *so)
7334 {
7335 	lck_mtx_t *mtx;
7336 
7337 	if (so->so_proto->pr_getlock != NULL) {
7338 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7339 	} else {
7340 		mtx = so->so_proto->pr_domain->dom_mtx;
7341 	}
7342 
7343 	return lck_mtx_try_lock(mtx);
7344 }
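
/*
 * Usage sketch (illustrative only; "other_mtx" is hypothetical): a
 * caller already holding another mutex can avoid a lock-order reversal
 * by trying for the socket lock and backing off on contention:
 *
 *	if (socket_try_lock(so) == 0) {
 *		lck_mtx_unlock(other_mtx);	// drop ours first
 *		socket_lock(so, 0);		// then take both in order
 *		lck_mtx_lock(other_mtx);
 *	}
 *	...
 *	socket_unlock(so, 0);
 */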
7345 
7346 void
7347 socket_unlock(struct socket *so, int refcount)
7348 {
7349 	lck_mtx_t *mutex_held;
7350 	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
7351 
7352 	if (so == NULL || so->so_proto == NULL) {
7353 		panic("%s: null so_proto so=%p", __func__, so);
7354 		/* NOTREACHED */
7355 	}
7356 
7357 	if (so->so_proto->pr_unlock) {
7358 		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7359 	} else {
7360 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7361 #ifdef MORE_LOCKING_DEBUG
7362 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7363 #endif
7364 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
7365 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7366 
7367 		if (refcount) {
7368 			if (so->so_usecount <= 0) {
7369 				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7370 				    "lrh=%s", __func__, so->so_usecount, so,
7371 				    SOCK_DOM(so), so->so_type,
7372 				    SOCK_PROTO(so), solockhistory_nr(so));
7373 				/* NOTREACHED */
7374 			}
7375 
7376 			so->so_usecount--;
7377 			if (so->so_usecount == 0) {
7378 				sofreelastref(so, 1);
7379 			}
7380 		}
7381 		lck_mtx_unlock(mutex_held);
7382 	}
7383 }
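
/*
 * Usage sketch (illustrative, not part of the original file): the
 * refcount argument pairs a use count with the mutex so the socket
 * cannot be freed out from under a thread that may sleep:
 *
 *	socket_lock(so, 1);		// lock and take a use count
 *	error = sbwait(&so->so_rcv);	// may sleep; so stays alive
 *	socket_unlock(so, 1);		// drop use count; last one frees
 *
 * Passing 0 locks or unlocks without touching so_usecount.
 */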
7384 
7385 /* Called with socket locked, will unlock socket */
7386 void
7387 sofree(struct socket *so)
7388 {
7389 	lck_mtx_t *mutex_held;
7390 
7391 	if (so->so_proto->pr_getlock != NULL) {
7392 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7393 	} else {
7394 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7395 	}
7396 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7397 
7398 	sofreelastref(so, 0);
7399 }
7400 
7401 void
7402 soreference(struct socket *so)
7403 {
7404 	socket_lock(so, 1);     /* lock and take one reference on socket */
7405 	socket_unlock(so, 0);   /* unlock only */
7406 }
7407 
7408 void
7409 sodereference(struct socket *so)
7410 {
7411 	socket_lock(so, 0);
7412 	socket_unlock(so, 1);
7413 }
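
/*
 * Illustrative sketch (not from the original source): the two wrappers
 * above bracket unlocked use of a socket by a use count:
 *
 *	soreference(so);	// so_usecount++ under the lock
 *	... work on so without holding its mutex ...
 *	sodereference(so);	// so_usecount--; may call sofreelastref()
 */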
7414 
7415 /*
7416  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7417  * possibility of using jumbo clusters.  Caller must ensure to hold
7418  * the socket lock.
7419  */
7420 void
7421 somultipages(struct socket *so, boolean_t set)
7422 {
7423 	if (set) {
7424 		so->so_flags |= SOF_MULTIPAGES;
7425 	} else {
7426 		so->so_flags &= ~SOF_MULTIPAGES;
7427 	}
7428 }
7429 
7430 void
7431 soif2kcl(struct socket *so, boolean_t set)
7432 {
7433 	if (set) {
7434 		so->so_flags1 |= SOF1_IF_2KCL;
7435 	} else {
7436 		so->so_flags1 &= ~SOF1_IF_2KCL;
7437 	}
7438 }
7439 
7440 int
7441 so_isdstlocal(struct socket *so)
7442 {
7443 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7444 
7445 	if (SOCK_DOM(so) == PF_INET) {
7446 		return inaddr_local(inp->inp_faddr);
7447 	} else if (SOCK_DOM(so) == PF_INET6) {
7448 		return in6addr_local(&inp->in6p_faddr);
7449 	}
7450 
7451 	return 0;
7452 }
7453 
7454 int
7455 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7456 {
7457 	struct sockbuf *rcv, *snd;
7458 	int err = 0, defunct;
7459 
7460 	rcv = &so->so_rcv;
7461 	snd = &so->so_snd;
7462 
7463 	defunct = (so->so_flags & SOF_DEFUNCT);
7464 	if (defunct) {
7465 		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7466 			panic("%s: SB_DROP not set", __func__);
7467 			/* NOTREACHED */
7468 		}
7469 		goto done;
7470 	}
7471 
7472 	if (so->so_flags & SOF_NODEFUNCT) {
7473 		if (noforce) {
7474 			err = EOPNOTSUPP;
7475 			if (p != PROC_NULL) {
7476 				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7477 				    "name %s level %d) so 0x%llx [%d,%d] "
7478 				    "is not eligible for defunct "
7479 				    "(%d)\n", __func__, proc_selfpid(),
7480 				    proc_best_name(current_proc()), proc_pid(p),
7481 				    proc_best_name(p), level,
7482 				    so->so_gencnt,
7483 				    SOCK_DOM(so), SOCK_TYPE(so), err);
7484 			}
7485 			return err;
7486 		}
7487 		so->so_flags &= ~SOF_NODEFUNCT;
7488 		if (p != PROC_NULL) {
7489 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7490 			    "name %s level %d) so 0x%llx [%d,%d] "
7491 			    "defunct by force "
7492 			    "(%d)\n", __func__, proc_selfpid(),
7493 			    proc_best_name(current_proc()), proc_pid(p),
7494 			    proc_best_name(p), level,
7495 			    so->so_gencnt,
7496 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7497 		}
7498 	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7499 		struct inpcb *inp = (struct inpcb *)so->so_pcb;
7500 		struct ifnet *ifp = inp->inp_last_outifp;
7501 
7502 		if (ifp && IFNET_IS_CELLULAR(ifp)) {
7503 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7504 		} else if (so->so_flags & SOF_DELEGATED) {
7505 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7506 		} else if (soextbkidlestat.so_xbkidle_time == 0) {
7507 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7508 		} else if (noforce && p != PROC_NULL) {
7509 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7510 
7511 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7512 			so->so_extended_bk_start = net_uptime();
7513 			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7514 
7515 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7516 
7517 			err = EOPNOTSUPP;
7518 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7519 			    "name %s level %d) so 0x%llx [%d,%d] "
7520 			    "extend bk idle "
7521 			    "(%d)\n", __func__, proc_selfpid(),
7522 			    proc_best_name(current_proc()), proc_pid(p),
7523 			    proc_best_name(p), level,
7524 			    so->so_gencnt,
7525 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7526 			return err;
7527 		} else {
7528 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7529 		}
7530 	}
7531 
7532 	so->so_flags |= SOF_DEFUNCT;
7533 
7534 	/* Prevent further data from being appended to the socket buffers */
7535 	snd->sb_flags |= SB_DROP;
7536 	rcv->sb_flags |= SB_DROP;
7537 
7538 	/* Flush any existing data in the socket buffers */
7539 	if (rcv->sb_cc != 0) {
7540 		rcv->sb_flags &= ~SB_SEL;
7541 		selthreadclear(&rcv->sb_sel);
7542 		sbrelease(rcv);
7543 	}
7544 	if (snd->sb_cc != 0) {
7545 		snd->sb_flags &= ~SB_SEL;
7546 		selthreadclear(&snd->sb_sel);
7547 		sbrelease(snd);
7548 	}
7549 
7550 done:
7551 	if (p != PROC_NULL) {
7552 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7553 		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7554 		    proc_selfpid(), proc_best_name(current_proc()),
7555 		    proc_pid(p), proc_best_name(p), level,
7556 		    so->so_gencnt, SOCK_DOM(so),
7557 		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
7558 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7559 		    " extbkidle" : "");
7560 	}
7561 	return err;
7562 }
7563 
7564 int
7565 sodefunct(struct proc *p, struct socket *so, int level)
7566 {
7567 	struct sockbuf *rcv, *snd;
7568 
7569 	if (!(so->so_flags & SOF_DEFUNCT)) {
7570 		panic("%s improperly called", __func__);
7571 		/* NOTREACHED */
7572 	}
7573 	if (so->so_state & SS_DEFUNCT) {
7574 		goto done;
7575 	}
7576 
7577 	rcv = &so->so_rcv;
7578 	snd = &so->so_snd;
7579 
7580 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7581 		char s[MAX_IPv6_STR_LEN];
7582 		char d[MAX_IPv6_STR_LEN];
7583 		struct inpcb *inp = sotoinpcb(so);
7584 
7585 		if (p != PROC_NULL) {
7586 			SODEFUNCTLOG(
7587 				"%s[%d, %s]: (target pid %d name %s level %d) "
7588 				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7589 				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7590 				" snd_fl 0x%x]\n", __func__,
7591 				proc_selfpid(), proc_best_name(current_proc()),
7592 				proc_pid(p), proc_best_name(p), level,
7593 				so->so_gencnt,
7594 				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7595 				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7596 				(void *)&inp->inp_laddr.s_addr :
7597 				(void *)&inp->in6p_laddr),
7598 				s, sizeof(s)), ntohs(inp->in6p_lport),
7599 				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7600 				(void *)&inp->inp_faddr.s_addr :
7601 				(void *)&inp->in6p_faddr,
7602 				d, sizeof(d)), ntohs(inp->in6p_fport),
7603 				(uint32_t)rcv->sb_sel.si_flags,
7604 				(uint32_t)snd->sb_sel.si_flags,
7605 				rcv->sb_flags, snd->sb_flags);
7606 		}
7607 	} else if (p != PROC_NULL) {
7608 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7609 		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7610 		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7611 		    proc_selfpid(), proc_best_name(current_proc()),
7612 		    proc_pid(p), proc_best_name(p), level,
7613 		    so->so_gencnt,
7614 		    SOCK_DOM(so), SOCK_TYPE(so),
7615 		    (uint32_t)rcv->sb_sel.si_flags,
7616 		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7617 		    snd->sb_flags);
7618 	}
7619 
7620 	/*
7621 	 * First tell the protocol the flow is defunct
7622 	 */
7623 	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);
7624 
7625 	/*
7626 	 * Unwedge threads blocked on sbwait() and sb_lock().
7627 	 */
7628 	sbwakeup(rcv);
7629 	sbwakeup(snd);
7630 
7631 	so->so_flags1 |= SOF1_DEFUNCTINPROG;
7632 	if (rcv->sb_flags & SB_LOCK) {
7633 		sbunlock(rcv, TRUE);    /* keep socket locked */
7634 	}
7635 	if (snd->sb_flags & SB_LOCK) {
7636 		sbunlock(snd, TRUE);    /* keep socket locked */
7637 	}
7638 	/*
7639 	 * Flush the buffers and disconnect.  We explicitly call shutdown
7640 	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7641 	 * states are set for the socket.  This would also flush out data
7642 	 * hanging off the receive list of this socket.
7643 	 */
7644 	(void) soshutdownlock_final(so, SHUT_RD);
7645 	(void) soshutdownlock_final(so, SHUT_WR);
7646 	(void) sodisconnectlocked(so);
7647 
7648 	/*
7649 	 * Explicitly handle connectionless-protocol disconnection
7650 	 * and release any remaining data in the socket buffers.
7651 	 */
7652 	if (!(so->so_state & SS_ISDISCONNECTED)) {
7653 		(void) soisdisconnected(so);
7654 	}
7655 
7656 	if (so->so_error == 0) {
7657 		so->so_error = EBADF;
7658 	}
7659 
7660 	if (rcv->sb_cc != 0) {
7661 		rcv->sb_flags &= ~SB_SEL;
7662 		selthreadclear(&rcv->sb_sel);
7663 		sbrelease(rcv);
7664 	}
7665 	if (snd->sb_cc != 0) {
7666 		snd->sb_flags &= ~SB_SEL;
7667 		selthreadclear(&snd->sb_sel);
7668 		sbrelease(snd);
7669 	}
7670 	so->so_state |= SS_DEFUNCT;
7671 	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7672 
7673 done:
7674 	return 0;
7675 }
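
/*
 * Usage sketch (illustrative only): defuncting is a two-phase operation,
 * as done by so_stop_extended_bk_idle() below -- first mark the socket,
 * then tear it down, with the socket lock held throughout:
 *
 *	sosetdefunct(p, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
 *	if (so->so_flags & SOF_DEFUNCT) {
 *		sodefunct(p, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
 *	}
 */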
7676 
7677 int
7678 soresume(struct proc *p, struct socket *so, int locked)
7679 {
7680 	if (locked == 0) {
7681 		socket_lock(so, 1);
7682 	}
7683 
7684 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7685 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7686 		    "[%d,%d] resumed from bk idle\n",
7687 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7688 		    proc_pid(p), proc_best_name(p),
7689 		    so->so_gencnt,
7690 		    SOCK_DOM(so), SOCK_TYPE(so));
7691 
7692 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7693 		so->so_extended_bk_start = 0;
7694 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7695 
7696 		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7697 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7698 		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7699 	}
7700 	if (locked == 0) {
7701 		socket_unlock(so, 1);
7702 	}
7703 
7704 	return 0;
7705 }
7706 
7707 /*
7708  * Does not attempt to account for sockets that are delegated from
7709  * the current process
7710  */
7711 int
7712 so_set_extended_bk_idle(struct socket *so, int optval)
7713 {
7714 	int error = 0;
7715 
7716 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7717 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7718 		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7719 		error = EOPNOTSUPP;
7720 	} else if (optval == 0) {
7721 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7722 
7723 		soresume(current_proc(), so, 1);
7724 	} else {
7725 		struct proc *p = current_proc();
7726 		struct fileproc *fp;
7727 		int count = 0;
7728 
7729 		/*
7730 		 * Unlock socket to avoid lock ordering issue with
7731 		 * the proc fd table lock
7732 		 */
7733 		socket_unlock(so, 0);
7734 
7735 		proc_fdlock(p);
7736 		fdt_foreach(fp, p) {
7737 			struct socket *so2;
7738 
7739 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7740 				continue;
7741 			}
7742 
7743 			so2 = (struct socket *)fp_get_data(fp);
7744 			if (so != so2 &&
7745 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7746 				count++;
7747 			}
7748 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7749 				break;
7750 			}
7751 		}
7752 		proc_fdunlock(p);
7753 
7754 		socket_lock(so, 0);
7755 
7756 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7757 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7758 			error = EBUSY;
7759 		} else if (so->so_flags & SOF_DELEGATED) {
7760 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7761 			error = EBUSY;
7762 		} else {
7763 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7764 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7765 		}
7766 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7767 		    "%s marked for extended bk idle\n",
7768 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7769 		    so->so_gencnt,
7770 		    SOCK_DOM(so), SOCK_TYPE(so),
7771 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7772 		    "is" : "not");
7773 	}
7774 
7775 	return error;
7776 }
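
/*
 * Usage sketch (illustrative; assumes the private SO_EXTENDED_BK_IDLE
 * socket option as the path into this routine from sosetoptlock()):
 *
 *	int one = 1;
 *	if (setsockopt(s, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
 *	    &one, sizeof(one)) == -1)
 *		err(1, "SO_EXTENDED_BK_IDLE");	// EBUSY once the
 *						// per-process cap is hit
 *
 * Only PF_INET/PF_INET6 TCP sockets qualify, and at most
 * so_xbkidle_maxperproc of them per process.
 */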
7777 
7778 static void
7779 so_stop_extended_bk_idle(struct socket *so)
7780 {
7781 	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7782 	so->so_extended_bk_start = 0;
7783 
7784 	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7785 	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7786 	/*
7787 	 * Force defunct
7788 	 */
7789 	sosetdefunct(current_proc(), so,
7790 	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7791 	if (so->so_flags & SOF_DEFUNCT) {
7792 		sodefunct(current_proc(), so,
7793 		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7794 	}
7795 }
7796 
7797 void
7798 so_drain_extended_bk_idle(struct socket *so)
7799 {
7800 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7801 		/*
7802 		 * Only penalize sockets that have outstanding data
7803 		 */
7804 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7805 			so_stop_extended_bk_idle(so);
7806 
7807 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7808 		}
7809 	}
7810 }
7811 
7812 /*
7813  * Return value tells whether the socket is still in extended background idle
7814  */
7815 int
7816 so_check_extended_bk_idle_time(struct socket *so)
7817 {
7818 	int ret = 1;
7819 
7820 	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7821 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7822 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7823 		    so->so_gencnt,
7824 		    SOCK_DOM(so), SOCK_TYPE(so));
7825 		if (net_uptime() - so->so_extended_bk_start >
7826 		    soextbkidlestat.so_xbkidle_time) {
7827 			so_stop_extended_bk_idle(so);
7828 
7829 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7830 
7831 			ret = 0;
7832 		} else {
7833 			struct inpcb *inp = (struct inpcb *)so->so_pcb;
7834 
7835 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7836 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7837 		}
7838 	}
7839 
7840 	return ret;
7841 }
7842 
7843 void
7844 resume_proc_sockets(proc_t p)
7845 {
7846 	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7847 		struct fileproc *fp;
7848 		struct socket *so;
7849 
7850 		proc_fdlock(p);
7851 		fdt_foreach(fp, p) {
7852 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7853 				continue;
7854 			}
7855 
7856 			so = (struct socket *)fp_get_data(fp);
7857 			(void) soresume(p, so, 0);
7858 		}
7859 		proc_fdunlock(p);
7860 
7861 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7862 	}
7863 }
7864 
7865 __private_extern__ int
7866 so_set_recv_anyif(struct socket *so, int optval)
7867 {
7868 	int ret = 0;
7869 
7870 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7871 		if (optval) {
7872 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7873 		} else {
7874 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7875 		}
7876 #if SKYWALK
7877 		inp_update_netns_flags(so);
7878 #endif /* SKYWALK */
7879 	}
7880 
7881 
7882 	return ret;
7883 }
7884 
7885 __private_extern__ int
7886 so_get_recv_anyif(struct socket *so)
7887 {
7888 	int ret = 0;
7889 
7890 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7891 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7892 	}
7893 
7894 	return ret;
7895 }
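
/*
 * Usage sketch (illustrative; assumes the private SO_RECV_ANYIF option
 * is what routes into the two helpers above):
 *
 *	int on = 1;
 *	setsockopt(s, SOL_SOCKET, SO_RECV_ANYIF, &on, sizeof(on));
 *
 * With INP_RECV_ANYIF set, the PCB accepts inbound packets regardless
 * of the interface they arrived on.
 */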
7896 
7897 int
7898 so_set_restrictions(struct socket *so, uint32_t vals)
7899 {
7900 	int nocell_old, nocell_new;
7901 	int noexpensive_old, noexpensive_new;
7902 	int noconstrained_old, noconstrained_new;
7903 
7904 	/*
7905 	 * Deny-type restrictions are trapdoors; once set they cannot be
7906 	 * unset for the lifetime of the socket.  This allows them to be
7907 	 * issued by a framework on behalf of the application without
7908 	 * having to worry that they can be undone.
7909 	 *
7910 	 * Note here that socket-level restrictions override any protocol
7911 	 * level restrictions.  For instance, the SO_RESTRICT_DENY_CELLULAR
7912 	 * restriction issued on the socket has a higher precedence
7913 	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7914 	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7915 	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7916 	 */
7917 	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7918 	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7919 	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7920 	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7921 	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7922 	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7923 	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7924 	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7925 	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7926 
7927 	/* we can only set, not clear restrictions */
7928 	if ((nocell_new - nocell_old) == 0 &&
7929 	    (noexpensive_new - noexpensive_old) == 0 &&
7930 	    (noconstrained_new - noconstrained_old) == 0) {
7931 		return 0;
7932 	}
7933 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7934 		if (nocell_new - nocell_old != 0) {
7935 			/*
7936 			 * if deny cellular is now set, do what's needed
7937 			 * for INPCB
7938 			 */
7939 			inp_set_nocellular(sotoinpcb(so));
7940 		}
7941 		if (noexpensive_new - noexpensive_old != 0) {
7942 			inp_set_noexpensive(sotoinpcb(so));
7943 		}
7944 		if (noconstrained_new - noconstrained_old != 0) {
7945 			inp_set_noconstrained(sotoinpcb(so));
7946 		}
7947 	}
7948 
7949 	if (SOCK_DOM(so) == PF_MULTIPATH) {
7950 		mptcp_set_restrictions(so);
7951 	}
7952 
7953 	return 0;
7954 }
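
/*
 * Usage sketch (illustrative; assumes the private SO_RESTRICTIONS option
 * as the entry point).  Because the deny bits are trapdoors, a later
 * call with a zero mask leaves the socket restricted:
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR |
 *	    SO_RESTRICT_DENY_EXPENSIVE;
 *	setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 */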
7955 
7956 uint32_t
7957 so_get_restrictions(struct socket *so)
7958 {
7959 	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7960 	       SO_RESTRICT_DENY_OUT |
7961 	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7962 }
7963 
7964 int
7965 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7966 {
7967 	struct proc *ep = PROC_NULL;
7968 	int error = 0;
7969 
7970 	/* pid 0 is reserved for kernel */
7971 	if (epid == 0) {
7972 		error = EINVAL;
7973 		goto done;
7974 	}
7975 
7976 	/*
7977 	 * If this is an in-kernel socket, prevent its delegate
7978 	 * association from changing unless the socket option is
7979 	 * coming from within the kernel itself.
7980 	 */
7981 	if (so->last_pid == 0 && p != kernproc) {
7982 		error = EACCES;
7983 		goto done;
7984 	}
7985 
7986 	/*
7987 	 * If this is issued by a process that's recorded as the
7988 	 * real owner of the socket, or if the pid is the same as
7989 	 * the process's own pid, then proceed.  Otherwise ensure
7990 	 * that the issuing process has the necessary privileges.
7991 	 */
7992 	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7993 		if ((error = priv_check_cred(kauth_cred_get(),
7994 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7995 			error = EACCES;
7996 			goto done;
7997 		}
7998 	}
7999 
8000 	/* Find the process that corresponds to the effective pid */
8001 	if ((ep = proc_find(epid)) == PROC_NULL) {
8002 		error = ESRCH;
8003 		goto done;
8004 	}
8005 
8006 	/*
8007 	 * If a process tries to delegate the socket to itself, then
8008 	 * there's really nothing to do; treat it as a way for the
8009 	 * delegate association to be cleared.  Note that we check
8010 	 * the passed-in proc rather than calling proc_selfpid(),
8011 	 * as we need to check the process issuing the socket option
8012 	 * which could be kernproc.  Given that we don't allow 0 for
8013 	 * effective pid, it means that a delegated in-kernel socket
8014 	 * stays delegated during its lifetime (which is probably OK.)
8015 	 */
8016 	if (epid == proc_pid(p)) {
8017 		so->so_flags &= ~SOF_DELEGATED;
8018 		so->e_upid = 0;
8019 		so->e_pid = 0;
8020 		uuid_clear(so->e_uuid);
8021 	} else {
8022 		so->so_flags |= SOF_DELEGATED;
8023 		so->e_upid = proc_uniqueid(ep);
8024 		so->e_pid = proc_pid(ep);
8025 		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
8026 
8027 #if defined(XNU_TARGET_OS_OSX)
8028 		if (ep->p_responsible_pid != so->e_pid) {
8029 			proc_t rp = proc_find(ep->p_responsible_pid);
8030 			if (rp != PROC_NULL) {
8031 				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
8032 				so->so_rpid = ep->p_responsible_pid;
8033 				proc_rele(rp);
8034 			} else {
8035 				uuid_clear(so->so_ruuid);
8036 				so->so_rpid = -1;
8037 			}
8038 		}
8039 #endif
8040 	}
8041 	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8042 		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
8043 	}
8044 done:
8045 	if (error == 0 && net_io_policy_log) {
8046 		uuid_string_t buf;
8047 
8048 		uuid_unparse(so->e_uuid, buf);
8049 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8050 		    "euuid %s%s\n", __func__, proc_name_address(p),
8051 		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8052 		    SOCK_DOM(so), SOCK_TYPE(so),
8053 		    so->e_pid, proc_name_address(ep), buf,
8054 		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8055 	} else if (error != 0 && net_io_policy_log) {
8056 		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8057 		    "ERROR (%d)\n", __func__, proc_name_address(p),
8058 		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8059 		    SOCK_DOM(so), SOCK_TYPE(so),
8060 		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
8061 		    proc_name_address(ep), error);
8062 	}
8063 
8064 	/* Update this socket's policy upon success */
8065 	if (error == 0) {
8066 		so->so_policy_gencnt *= -1;
8067 		so_update_policy(so);
8068 #if NECP
8069 		so_update_necp_policy(so, NULL, NULL);
8070 #endif /* NECP */
8071 	}
8072 
8073 	if (ep != PROC_NULL) {
8074 		proc_rele(ep);
8075 	}
8076 
8077 	return error;
8078 }
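
/*
 * Usage sketch (illustrative; "client_pid" is hypothetical and the
 * private SO_DELEGATED option is assumed as the entry point): a
 * framework opening sockets on behalf of another process delegates by
 * pid, which requires PRIV_NET_PRIVILEGED_SOCKET_DELEGATE unless it
 * owns the socket or delegates to itself:
 *
 *	pid_t epid = client_pid;
 *	setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 */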
8079 
8080 int
8081 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
8082 {
8083 	uuid_string_t buf;
8084 	uuid_t uuid;
8085 	int error = 0;
8086 
8087 	/* UUID must not be all-zeroes (reserved for kernel) */
8088 	if (uuid_is_null(euuid)) {
8089 		error = EINVAL;
8090 		goto done;
8091 	}
8092 
8093 	/*
8094 	 * If this is an in-kernel socket, prevent its delegate
8095 	 * association from changing unless the socket option is
8096 	 * coming from within the kernel itself.
8097 	 */
8098 	if (so->last_pid == 0 && p != kernproc) {
8099 		error = EACCES;
8100 		goto done;
8101 	}
8102 
8103 	/* Get the UUID of the issuing process */
8104 	proc_getexecutableuuid(p, uuid, sizeof(uuid));
8105 
8106 	/*
8107 	 * If this is issued by a process that's recorded as the
8108 	 * real owner of the socket, or if the uuid is the same as
8109 	 * the process's own uuid, then proceed.  Otherwise ensure
8110 	 * that the issuing process has the necessary privileges.
8111 	 */
8112 	if (check_cred &&
8113 	    (uuid_compare(euuid, so->last_uuid) != 0 ||
8114 	    uuid_compare(euuid, uuid) != 0)) {
8115 		if ((error = priv_check_cred(kauth_cred_get(),
8116 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
8117 			error = EACCES;
8118 			goto done;
8119 		}
8120 	}
8121 
8122 	/*
8123 	 * If a process tries to delegate the socket to itself, then
8124 	 * there's really nothing to do; treat it as a way for the
8125 	 * delegate association to be cleared.  Note that we check
8126 	 * the uuid of the passed-in proc rather than that of the
8127 	 * current process, as we need to check the process issuing
8128 	 * the socket option which could be kernproc itself.  Given
8129 	 * that we don't allow 0 for effective uuid, it means that
8130 	 * a delegated in-kernel socket stays delegated during its
8131 	 * lifetime (which is okay.)
8132 	 */
8133 	if (uuid_compare(euuid, uuid) == 0) {
8134 		so->so_flags &= ~SOF_DELEGATED;
8135 		so->e_upid = 0;
8136 		so->e_pid = 0;
8137 		uuid_clear(so->e_uuid);
8138 	} else {
8139 		so->so_flags |= SOF_DELEGATED;
8140 		/*
8141 		 * Unlike so_set_effective_pid(), we only have the UUID
8142 		 * here and the process ID is not known.  Inherit the
8143 		 * real {pid,upid} of the socket.
8144 		 */
8145 		so->e_upid = so->last_upid;
8146 		so->e_pid = so->last_pid;
8147 		uuid_copy(so->e_uuid, euuid);
8148 	}
8149 	/*
8150 	 * The following will clear the effective process name as it's the same
8151 	 * as the real process
8152 	 */
8153 	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8154 		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
8155 	}
8156 done:
8157 	if (error == 0 && net_io_policy_log) {
8158 		uuid_unparse(so->e_uuid, buf);
8159 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
8160 		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
8161 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8162 		    SOCK_TYPE(so), so->e_pid, buf,
8163 		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8164 	} else if (error != 0 && net_io_policy_log) {
8165 		uuid_unparse(euuid, buf);
8166 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
8167 		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8168 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8169 		    SOCK_TYPE(so), buf, error);
8170 	}
8171 
8172 	/* Update this socket's policy upon success */
8173 	if (error == 0) {
8174 		so->so_policy_gencnt *= -1;
8175 		so_update_policy(so);
8176 #if NECP
8177 		so_update_necp_policy(so, NULL, NULL);
8178 #endif /* NECP */
8179 	}
8180 
8181 	return error;
8182 }
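
/*
 * Usage sketch (illustrative; assumes the private SO_DELEGATED_UUID
 * option as the entry point; the UUID string is a made-up example):
 *
 *	uuid_t euuid;
 *	uuid_parse("6ba7b810-9dad-11d1-80b4-00c04fd430c8", euuid);
 *	setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID, euuid, sizeof(uuid_t));
 *
 * Since only the UUID is known here, e_pid/e_upid inherit the socket's
 * real owner, as noted in the comment above.
 */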
8183 
8184 void
8185 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8186     uint32_t ev_datalen)
8187 {
8188 	struct kev_msg ev_msg;
8189 
8190 	/*
8191 	 * A netpolicy event always starts with a netpolicy_event_data
8192 	 * structure, but the caller can provide for a longer event
8193 	 * structure to post, depending on the event code.
8194 	 */
8195 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8196 
8197 	bzero(&ev_msg, sizeof(ev_msg));
8198 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8199 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8200 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8201 	ev_msg.event_code       = ev_code;
8202 
8203 	ev_msg.dv[0].data_ptr   = ev_data;
8204 	ev_msg.dv[0].data_length = ev_datalen;
8205 
8206 	kev_post_msg(&ev_msg);
8207 }
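
/*
 * Caller sketch (illustrative; modeled on the KEV_NETPOLICY_IFDENIED
 * posting in bsd/netinet/in_pcb.c, with field names assumed from
 * struct kev_netpolicy_ifdenied):
 *
 *	struct kev_netpolicy_ifdenied ev_ifdenied = {};
 *	ev_ifdenied.ev_data.epid = epid;	// plus eupid, euuid, ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
 *	    sizeof(ev_ifdenied));
 */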
8208 
8209 void
8210 socket_post_kev_msg(uint32_t ev_code,
8211     struct kev_socket_event_data *ev_data,
8212     uint32_t ev_datalen)
8213 {
8214 	struct kev_msg ev_msg;
8215 
8216 	bzero(&ev_msg, sizeof(ev_msg));
8217 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8218 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8219 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8220 	ev_msg.event_code = ev_code;
8221 
8222 	ev_msg.dv[0].data_ptr = ev_data;
8223 	ev_msg.dv[0].data_length = ev_datalen;
8224 
8225 	kev_post_msg(&ev_msg);
8226 }
8227 
8228 void
8229 socket_post_kev_msg_closed(struct socket *so)
8230 {
8231 	struct kev_socket_closed ev = {};
8232 	struct sockaddr *__single socksa = NULL, *__single peersa = NULL;
8233 	int err;
8234 
8235 	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8236 		return;
8237 	}
8238 	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8239 	if (err == 0) {
8240 		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8241 		    &peersa);
8242 		if (err == 0) {
8243 			SOCKADDR_COPY(socksa, &ev.ev_data.kev_sockname,
8244 			    min(socksa->sa_len,
8245 			    sizeof(ev.ev_data.kev_sockname)));
8246 			SOCKADDR_COPY(peersa, &ev.ev_data.kev_peername,
8247 			    min(peersa->sa_len,
8248 			    sizeof(ev.ev_data.kev_peername)));
8249 			socket_post_kev_msg(KEV_SOCKET_CLOSED,
8250 			    &ev.ev_data, sizeof(ev));
8251 		}
8252 	}
8253 	free_sockaddr(socksa);
8254 	free_sockaddr(peersa);
8255 }
8256 
8257 __attribute__((noinline, cold, not_tail_called, noreturn))
8258 __private_extern__ int
8259 assfail(const char *a, const char *f, int l)
8260 {
8261 	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
8262 	/* NOTREACHED */
8263 	__builtin_unreachable();
8264 }
8265