xref: /xnu-8792.61.2/bsd/kern/uipc_socket.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120 
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125 
126 #include <os/log.h>
127 
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif /* MAC */
131 
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136 
137 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
138 
139 #if DEBUG || DEVELOPMENT
140 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
141 #else
142 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 #endif
144 
145 /* TODO: this should be in a header file somewhere */
146 extern char *proc_name_address(void *p);
147 
148 static u_int32_t        so_cache_hw;    /* High water mark for socache */
149 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
150 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
151 static u_int32_t        cached_sock_count = 0;
152 STAILQ_HEAD(, socket)   so_cache_head;
153 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
154 static uint64_t        so_cache_time;
155 static int              socketinit_done;
156 static struct zone      *so_cache_zone;
157 
158 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
159 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
160 
161 #include <machine/limits.h>
162 
163 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void     filt_sordetach(struct knote *kn);
165 static int      filt_soread(struct knote *kn, long hint);
166 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168 
169 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void     filt_sowdetach(struct knote *kn);
171 static int      filt_sowrite(struct knote *kn, long hint);
172 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174 
175 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void     filt_sockdetach(struct knote *kn);
177 static int      filt_sockev(struct knote *kn, long hint);
178 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180 
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183 
/* knote filter operations for EVFILT_READ on sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

/* knote filter operations for EVFILT_WRITE on sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

/* knote filter operations for EVFILT_SOCK (socket state events) */
SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/* knote filter operations for EVFILT_EXCEPT; reuses the read-side handlers */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
219 
220 SYSCTL_DECL(_kern_ipc);
221 
222 #define EVEN_MORE_LOCKING_DEBUG 0
223 
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227 
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230     &sodefunct_calls, "");
231 
232 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
233 so_gen_t        so_gencnt;      /* generation count for sockets */
234 
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236 
237 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246 
247 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
248 
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252 
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy  = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261 
262 /*
263  * Set to enable jumbo clusters (if available) for large writes when
264  * the socket is marked with SOF_MULTIPAGES; see below.
265  */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269 
270 /*
271  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272  * writes on the socket for all protocols on any network interfaces,
273  * depending upon sosendjcl above.  Be extra careful when setting this
274  * to 1, because sending down packets that cross physical pages down to
275  * broken drivers (those that falsely assume that the physical pages
276  * are contiguous) might lead to system panics or silent data corruption.
277  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279  * capable.  Set this to 1 only for testing/debugging purposes.
280  */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284 
285 /*
286  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287  * writes on the socket for all protocols on any network interfaces.
288  * Be extra careful when setting this to 1, because sending down packets with
289  * clusters larger that 2 KB might lead to system panics or data corruption.
290  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291  * on the outgoing interface
292  * Set this to 1  for testing/debugging purposes only.
293  */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297 
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300     &sodefunctlog, 0, "");
301 
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304     &sothrottlelog, 0, "");
305 
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308     &sorestrictrecv, 0, "Enable inbound interface restrictions");
309 
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312     &sorestrictsend, 0, "Enable outbound interface restrictions");
313 
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317 
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
321     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323 
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
327     &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329 
330 extern struct inpcbinfo tcbinfo;
331 
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335 
336 vm_size_t       so_cache_zone_element_size;
337 
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339     user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
341 static void cached_sock_free(struct socket *);
342 
343 /*
344  * Maximum of extended background idle sockets per process
345  * Set to zero to disable further setting of the option
346  */
347 
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
349 #define SO_IDLE_BK_IDLE_TIME            600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
351 
352 struct soextbkidlestat soextbkidlestat;
353 
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356     "Maximum of extended background idle sockets per process");
357 
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359     &soextbkidlestat.so_xbkidle_time, 0,
360     "Time in seconds to keep extended background idle sockets");
361 
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364     "High water mark for extended background idle sockets");
365 
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367     &soextbkidlestat, soextbkidlestat, "");
368 
369 int so_set_extended_bk_idle(struct socket *, int);
370 
371 
372 /*
373  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374  * setting the DSCP code on the packet based on the service class; see
375  * <rdar://problem/11277343> for details.
376  */
377 __private_extern__ u_int32_t sotcdb = 0;
378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379     &sotcdb, 0, "");
380 
/*
 * One-time initialization of the socket layer: compile-time layout
 * checks, the cached-socket freelist for PF_INET/SOCK_STREAM sockets,
 * and the extended-background-idle defaults.  A second call is a no-op
 * (logged via printf).
 */
void
socketinit(void)
{
	/* so_gencnt is bumped with 64-bit atomics; check its size/alignment */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	/*
	 * struct sa_endpoints must match the fixed-size user64/user32
	 * copyin representation field by field.
	 */
#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	STAILQ_INIT(&so_cache_head);

	/*
	 * Cached sockets carry the inpcb and tcpcb strings in the same
	 * allocation; the extra 4 bytes per structure leave room for the
	 * alignment padding applied in cached_sock_alloc().
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	/* defaults for extended background idle sockets (sysctl-tunable) */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
427 
/*
 * Allocate a socket from the cache of previously-used
 * PF_INET/SOCK_STREAM sockets.  On a cache hit the socket structure is
 * zeroed but its saved pcb pointer (so_saved_pcb) is preserved; on a
 * miss a fresh element is carved out of so_cache_zone, which holds the
 * socket, inpcb and tcpcb storage in one contiguous block.
 */
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		/* zero the socket but keep the attached pcb storage pointer */
		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		/*
		 * NOTE(review): with Z_NOWAIT this can return NULL, and the
		 * OSBitOrAtomic below would dereference it — callers appear
		 * to use Z_WAITOK for this path; confirm.
		 */
		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		/* the saved tcpcb follows the inpcb within the same block */
		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	/* mark the socket so sodealloc() returns it to this cache */
	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
477 
478 static void
cached_sock_free(struct socket * so)479 cached_sock_free(struct socket *so)
480 {
481 	lck_mtx_lock(&so_cache_mtx);
482 
483 	so_cache_time = net_uptime();
484 	if (++cached_sock_count > max_cached_sock_count) {
485 		--cached_sock_count;
486 		lck_mtx_unlock(&so_cache_mtx);
487 		zfree(so_cache_zone, so);
488 	} else {
489 		if (so_cache_hw < cached_sock_count) {
490 			so_cache_hw = cached_sock_count;
491 		}
492 
493 		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
494 
495 		so->cache_timestamp = so_cache_time;
496 		lck_mtx_unlock(&so_cache_mtx);
497 	}
498 }
499 
/*
 * Refresh the "last owner" accounting on a socket (last_upid, last_pid,
 * last_uuid, so_vuuid) to reflect the given process, or the current
 * process when self is PROC_NULL.  Caller holds the socket lock.
 */
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		/* only rewrite the ownership info when the owner changed */
		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			/* give the protocol a chance to update its own copy */
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}
525 
526 void
so_update_policy(struct socket * so)527 so_update_policy(struct socket *so)
528 {
529 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
530 		(void) inp_update_policy(sotoinpcb(so));
531 	}
532 }
533 
#if NECP
/*
 * Re-evaluate the NECP policy for a socket, optionally overriding the
 * local and/or remote address used for matching.  Only IPv4/IPv6
 * sockets have an inpcb, so other domains are ignored.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	const int domain = SOCK_DOM(so);

	if (domain == PF_INET || domain == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */
545 
/*
 * Periodic reaper for the cached-socket freelist: frees entries that
 * have been idle longer than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation.  Returns TRUE when entries
 * remain so the caller should reschedule the timer.
 */
boolean_t
so_cache_timer(void)
{
	struct socket   *p;
	int             n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		/* entries are in insertion order, so the head is the oldest */
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		/* bound the work (and lock hold time) per timeout */
		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}
584 
585 /*
586  * Get a socket structure from our zone, and initialize it.
587  * We don't implement `waitok' yet (see comments in uipc_domain.c).
588  * Note that it would probably be better to allocate socket
589  * and PCB at the same time, but I'm not convinced that all
590  * the protocols can be easily modified to do this.
591  */
592 struct socket *
soalloc(int waitok,int dom,int type)593 soalloc(int waitok, int dom, int type)
594 {
595 	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
596 	struct socket *so;
597 
598 	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
599 		cached_sock_alloc(&so, how);
600 	} else {
601 		so = zalloc_flags(socket_zone, how | Z_ZERO);
602 	}
603 	if (so != NULL) {
604 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
605 
606 		/*
607 		 * Increment the socket allocation statistics
608 		 */
609 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
610 	}
611 
612 	return so;
613 }
614 
/*
 * Create a socket of the given domain/type/protocol on behalf of
 * process p, optionally delegated from process ep (ep != p marks the
 * socket SOF_DELEGATED).  Looks up the protocol switch entry, allocates
 * the socket, initializes ownership/accounting state and calls the
 * protocol's pru_attach.  On success, *aso holds the new socket with a
 * single use count; on failure the socket is released and *aso is NULL.
 *
 * flags: SOCF_MPTCP puts the socket in non-blocking mode and skips the
 * background traffic-class marking.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;        /* "responsible" pid, if different from owner */
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	/* proto == 0 means "default protocol for this type" */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	/*
	 * Distinguish the failure modes for a precise errno: unknown
	 * domain, known protocol under a different type, or no support.
	 */
	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* per-domain allocation statistics */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflow sockets are always non-blocking */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;

	/* record the owning process as the socket's "last owner" */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	/* record the effective (delegated) process, when different */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	/* record the "responsible" process identity, when one exists */
	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
836 
837 /*
838  * Returns:	0			Success
839  *		EAFNOSUPPORT
840  *		EPROTOTYPE
841  *		EPROTONOSUPPORT
842  *		ENOBUFS
843  *	<pru_attach>:ENOBUFS[AF_UNIX]
844  *	<pru_attach>:ENOBUFS[TCP]
845  *	<pru_attach>:ENOMEM[TCP]
846  *	<pru_attach>:???		[other protocol families, IPSEC]
847  */
848 int
socreate(int dom,struct socket ** aso,int type,int proto)849 socreate(int dom, struct socket **aso, int type, int proto)
850 {
851 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
852 	           PROC_NULL);
853 }
854 
855 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)856 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
857 {
858 	int error = 0;
859 	struct proc *ep = PROC_NULL;
860 
861 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
862 		error = ESRCH;
863 		goto done;
864 	}
865 
866 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
867 
868 	/*
869 	 * It might not be wise to hold the proc reference when calling
870 	 * socreate_internal since it calls soalloc with M_WAITOK
871 	 */
872 done:
873 	if (ep != PROC_NULL) {
874 		proc_rele(ep);
875 	}
876 
877 	return error;
878 }
879 
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
/*
 * Bind 'nam' as the local address of socket 'so'.  When 'dolock' is
 * set, the socket lock is taken and released here; otherwise the
 * caller must already hold it.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Record the current process as the socket's last owner */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	/* Refresh NECP policy using the local address being bound */
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	/* Only invoke the protocol's bind if no filter rejected it */
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means it handled the bind: success */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
949 
/*
 * Free a socket's storage and release its associated resources.
 */
void
sodealloc(struct socket *so)
{
	/* Release the credential reference held by the socket */
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* Bump the generation count so stale references can be detected */
	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	/* Return the memory to the cache or the zone it came from */
	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}
966 
/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * Tentatively mark the socket as accepting connections;
	 * every error path below clears the flag again.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* Listening only makes sense for connection-oriented protocols */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Inbound connections disallowed for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Socket filters first, then the protocol's listen handler */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled": success */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
1078 
/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its clients sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see the accept list lock protects the consistent of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
/*
 * Acquire the accept list "lock" (the SOF1_ACCEPT_LIST_HELD flag) on
 * listener 'head'.  'so' is an optional client socket whose lock must
 * be dropped while we sleep and re-acquired (client before listener)
 * afterwards.  Called with the head's lock held.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* Nothing to do for protocols sharing the domain lock */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: the accept list is free, claim it and return */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/* Drop the client's lock before sleeping on the head */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/*
	 * Re-lock the client first, then the head, to respect the
	 * client-before-listener lock ordering described above.
	 */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1133 
1134 void
so_release_accept_list(struct socket * head)1135 so_release_accept_list(struct socket *head)
1136 {
1137 	if (head->so_proto->pr_getlock != NULL) {
1138 		lck_mtx_t *mutex_held;
1139 
1140 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1141 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1142 
1143 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1144 		wakeup((caddr_t)&head->so_incomp);
1145 	}
1146 }
1147 
/*
 * Release the last reference on socket 'so'.  Called with the socket
 * locked.  Detaches flow-divert/content-filter/flow state, unlinks the
 * socket from its listener's accept queues if queued, flushes both
 * socket buffers and, when 'dealloc' is set, frees the socket.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready for teardown yet: clear select/upcall state and keep
	 * the socket around.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1236 
/*
 * Wait for outstanding socket upcalls to complete before teardown.
 * Called and returns with the socket lock held; the lock is released
 * while sleeping in msleep().  Only waits when the socket has
 * SOF_UPCALLCLOSEWAIT set and an upcall is in flight.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Prevent new upcalls from being scheduled while we wait */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/* Sleep until the last upcall wakes us */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1265 
1266 /*
1267  * Close a socket on last file table reference removal.
1268  * Initiate disconnect if connected.
1269  * Free socket when disconnect complete.
1270  */
1271 int
soclose_locked(struct socket * so)1272 soclose_locked(struct socket *so)
1273 {
1274 	int error = 0;
1275 	struct timespec ts;
1276 
1277 	if (so->so_usecount == 0) {
1278 		panic("soclose: so=%p refcount=0", so);
1279 		/* NOTREACHED */
1280 	}
1281 
1282 	sflt_notify(so, sock_evt_closing, NULL);
1283 
1284 	if (so->so_upcallusecount) {
1285 		soclose_wait_locked(so);
1286 	}
1287 
1288 #if CONTENT_FILTER
1289 	/*
1290 	 * We have to wait until the content filters are done
1291 	 */
1292 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1293 		cfil_sock_close_wait(so);
1294 		cfil_sock_is_closed(so);
1295 		cfil_sock_detach(so);
1296 	}
1297 #endif /* CONTENT_FILTER */
1298 
1299 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1300 		soflow_detach(so);
1301 	}
1302 
1303 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1304 		soresume(current_proc(), so, 1);
1305 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1306 	}
1307 
1308 	if ((so->so_options & SO_ACCEPTCONN)) {
1309 		struct socket *sp, *sonext;
1310 		int persocklock = 0;
1311 		int incomp_overflow_only;
1312 
1313 		/*
1314 		 * We do not want new connection to be added
1315 		 * to the connection queues
1316 		 */
1317 		so->so_options &= ~SO_ACCEPTCONN;
1318 
1319 		/*
1320 		 * We can drop the lock on the listener once
1321 		 * we've acquired the incoming list
1322 		 */
1323 		if (so->so_proto->pr_getlock != NULL) {
1324 			persocklock = 1;
1325 			so_acquire_accept_list(so, NULL);
1326 			socket_unlock(so, 0);
1327 		}
1328 again:
1329 		incomp_overflow_only = 1;
1330 
1331 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1332 			/*
1333 			 * Radar 5350314
1334 			 * skip sockets thrown away by tcpdropdropblreq
1335 			 * they will get cleanup by the garbage collection.
1336 			 * otherwise, remove the incomp socket from the queue
1337 			 * and let soabort trigger the appropriate cleanup.
1338 			 */
1339 			if (sp->so_flags & SOF_OVERFLOW) {
1340 				continue;
1341 			}
1342 
1343 			if (persocklock != 0) {
1344 				socket_lock(sp, 1);
1345 			}
1346 
1347 			/*
1348 			 * Radar 27945981
1349 			 * The extra reference for the list insure the
1350 			 * validity of the socket pointer when we perform the
1351 			 * unlock of the head above
1352 			 */
1353 			if (sp->so_state & SS_INCOMP) {
1354 				sp->so_state &= ~SS_INCOMP;
1355 				sp->so_head = NULL;
1356 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1357 				so->so_incqlen--;
1358 				so->so_qlen--;
1359 
1360 				(void) soabort(sp);
1361 			} else {
1362 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1363 				    __func__, sp);
1364 			}
1365 
1366 			if (persocklock != 0) {
1367 				socket_unlock(sp, 1);
1368 			}
1369 		}
1370 
1371 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1372 			/* Dequeue from so_comp since sofree() won't do it */
1373 			if (persocklock != 0) {
1374 				socket_lock(sp, 1);
1375 			}
1376 
1377 			if (sp->so_state & SS_COMP) {
1378 				sp->so_state &= ~SS_COMP;
1379 				sp->so_head = NULL;
1380 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1381 				so->so_qlen--;
1382 
1383 				(void) soabort(sp);
1384 			} else {
1385 				panic("%s sp %p in so_comp but !SS_COMP",
1386 				    __func__, sp);
1387 			}
1388 
1389 			if (persocklock) {
1390 				socket_unlock(sp, 1);
1391 			}
1392 		}
1393 
1394 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1395 #if (DEBUG | DEVELOPMENT)
1396 			panic("%s head %p so_comp not empty", __func__, so);
1397 #endif /* (DEVELOPMENT || DEBUG) */
1398 
1399 			goto again;
1400 		}
1401 
1402 		if (!TAILQ_EMPTY(&so->so_comp)) {
1403 #if (DEBUG | DEVELOPMENT)
1404 			panic("%s head %p so_comp not empty", __func__, so);
1405 #endif /* (DEVELOPMENT || DEBUG) */
1406 
1407 			goto again;
1408 		}
1409 
1410 		if (persocklock) {
1411 			socket_lock(so, 0);
1412 			so_release_accept_list(so);
1413 		}
1414 	}
1415 	if (so->so_pcb == NULL) {
1416 		/* 3915887: mark the socket as ready for dealloc */
1417 		so->so_flags |= SOF_PCBCLEARING;
1418 		goto discard;
1419 	}
1420 
1421 	if (so->so_state & SS_ISCONNECTED) {
1422 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1423 			error = sodisconnectlocked(so);
1424 			if (error) {
1425 				goto drop;
1426 			}
1427 		}
1428 		if (so->so_options & SO_LINGER) {
1429 			if ((so->so_state & SS_ISDISCONNECTING) &&
1430 			    (so->so_state & SS_NBIO)) {
1431 				goto drop;
1432 			}
1433 			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1434 				lck_mtx_t *mutex_held;
1435 
1436 				if (so->so_proto->pr_getlock != NULL) {
1437 					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1438 				} else {
1439 					mutex_held = so->so_proto->pr_domain->dom_mtx;
1440 				}
1441 				ts.tv_sec = (so->so_linger / 100);
1442 				ts.tv_nsec = (so->so_linger % 100) *
1443 				    NSEC_PER_USEC * 1000 * 10;
1444 				error = msleep((caddr_t)&so->so_timeo,
1445 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1446 				if (error) {
1447 					/*
1448 					 * It's OK when the time fires,
1449 					 * don't report an error
1450 					 */
1451 					if (error == EWOULDBLOCK) {
1452 						error = 0;
1453 					}
1454 					break;
1455 				}
1456 			}
1457 		}
1458 	}
1459 drop:
1460 	if (so->so_usecount == 0) {
1461 		panic("soclose: usecount is zero so=%p", so);
1462 		/* NOTREACHED */
1463 	}
1464 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1465 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1466 		if (error == 0) {
1467 			error = error2;
1468 		}
1469 	}
1470 	if (so->so_usecount <= 0) {
1471 		panic("soclose: usecount is zero so=%p", so);
1472 		/* NOTREACHED */
1473 	}
1474 discard:
1475 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1476 	    (so->so_state & SS_NOFDREF)) {
1477 		panic("soclose: NOFDREF");
1478 		/* NOTREACHED */
1479 	}
1480 	so->so_state |= SS_NOFDREF;
1481 
1482 	if ((so->so_flags & SOF_KNOTE) != 0) {
1483 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1484 	}
1485 
1486 	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1487 
1488 	VERIFY(so->so_usecount > 0);
1489 	so->so_usecount--;
1490 	sofree(so);
1491 	return error;
1492 }
1493 
1494 int
soclose(struct socket * so)1495 soclose(struct socket *so)
1496 {
1497 	int error = 0;
1498 	socket_lock(so, 1);
1499 
1500 	if (so->so_retaincnt == 0) {
1501 		error = soclose_locked(so);
1502 	} else {
1503 		/*
1504 		 * if the FD is going away, but socket is
1505 		 * retained in kernel remove its reference
1506 		 */
1507 		so->so_usecount--;
1508 		if (so->so_usecount < 2) {
1509 			panic("soclose: retaincnt non null and so=%p "
1510 			    "usecount=%d\n", so, so->so_usecount);
1511 		}
1512 	}
1513 	socket_unlock(so, 1);
1514 	return error;
1515 }
1516 
/*
 * Must be called at splnet...
 */
/* Should already be locked */
/*
 * Abort socket 'so' via the protocol's pru_abort handler.  Idempotent:
 * SOF_ABORTED guards against aborting the same socket twice.
 */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			/* pru_abort failed: drop the reference here instead */
			sofree(so);
			return error;
		}
	}
	return 0;
}
1547 
/*
 * Accept a connection on socket 'so', returning the peer address via
 * 'nam'.  When 'dolock' is set, the socket lock is taken and released
 * here; otherwise the caller must already hold it.
 */
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/* A socket being accepted must still carry the no-fd-ref flag */
	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	/* The socket is now owned by a file descriptor */
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1574 
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	/* Convenience wrapper: accept with locking handled internally */
	return soacceptlock(so, nam, 1);
}
1580 
/*
 * Give the socket filters attached to listener 'head' a chance to
 * veto newly accepted socket 'so'.  On failure the new socket is
 * closed and an error is returned so the caller drops it; on
 * EJUSTRETURN the socket is kept but marked defunct.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1636 
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
/*
 * Connect socket 'so' to address 'nam'.  When 'dolock' is set, the
 * socket lock is taken and released here; otherwise the caller must
 * already hold it.
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	/* Refresh NECP policy using the remote address */
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound connections disallowed for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN means a filter handled the connect */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				/* Connect attempt failed; clear the in-progress state */
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1745 
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	/* Convenience wrapper: connect with locking handled internally */
	return soconnectlock(so, nam, 1);
}
1751 
/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	/* Lock so2 separately only when its protocol uses per-socket locks */
	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1778 
/*
 * Extended connect (connectx(2)) on socket 'so' with the socket lock
 * already held by the caller.  Optionally carries data ('auio') and
 * idempotent-data / resume-on-read-write flags; the number of bytes
 * sent is returned via 'bytes_written'.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound connections disallowed for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connectx() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect-data pending */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1901 
1902 int
sodisconnectlocked(struct socket * so)1903 sodisconnectlocked(struct socket *so)
1904 {
1905 	int error;
1906 
1907 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1908 		error = ENOTCONN;
1909 		goto bad;
1910 	}
1911 	if (so->so_state & SS_ISDISCONNECTING) {
1912 		error = EALREADY;
1913 		goto bad;
1914 	}
1915 
1916 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1917 	if (error == 0) {
1918 		sflt_notify(so, sock_evt_disconnected, NULL);
1919 	}
1920 
1921 bad:
1922 	return error;
1923 }
1924 
/*
 * Locking wrapper: take the socket lock, disconnect, drop the lock.
 */
int
sodisconnect(struct socket *so)
{
	int rc;

	socket_lock(so, 1);
	rc = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return rc;
}
1936 
1937 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1938 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1939 {
1940 	int error;
1941 
1942 	/*
1943 	 * Call the protocol disconnectx handler; let it handle all
1944 	 * matters related to the connection state of this session.
1945 	 */
1946 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1947 	if (error == 0) {
1948 		/*
1949 		 * The event applies only for the session, not for
1950 		 * the disconnection of individual subflows.
1951 		 */
1952 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1953 			sflt_notify(so, sock_evt_disconnected, NULL);
1954 		}
1955 	}
1956 	return error;
1957 }
1958 
1959 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1960 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1961 {
1962 	int error;
1963 
1964 	socket_lock(so, 1);
1965 	error = sodisconnectxlocked(so, aid, cid);
1966 	socket_unlock(so, 1);
1967 	return error;
1968 }
1969 
1970 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1971 
1972 /*
1973  * sosendcheck will lock the socket buffer if it isn't locked and
1974  * verify that there is space for the data being inserted.
1975  *
1976  * Returns:	0			Success
1977  *		EPIPE
1978  *	sblock:EWOULDBLOCK
1979  *	sblock:EINTR
1980  *	sbwait:EBADF
1981  *	sbwait:EINTR
1982  *	[so_error]:???
1983  */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int     error = 0;
	int32_t space;
	/* Set when a filter thread already owns SB_LOCK; never unlock then. */
	int     assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			/* May sleep unless MSG_DONTWAIT (see SBLOCKWAIT). */
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			/* Tell the caller it must sbunlock() on exit. */
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		/* Also reached via goto when sblock/sbwait fail on a defunct socket. */
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		/* Note: when CONTENT_FILTER is on, this is the dangling else body. */
		return EPIPE;
	}
	if (so->so_error) {
		/* Return and clear the pending asynchronous error. */
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Connection-oriented protocol: allow data-less
			 * control sends, confirming sockets, and sockets
			 * carrying pre-connect (TFO-style) data.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless send needs a destination address. */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	if (flags & MSG_OOB) {
		/* Out-of-band data gets a little extra slack. */
		space += 1024;
	}
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		/* An atomic send can never fit; fail rather than block forever. */
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			/* Non-blocking socket/call, or lock borrowed from a filter. */
			return EWOULDBLOCK;
		}
		/* Drop SB_LOCK (not the socket lock) while waiting for space. */
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		/* Re-acquire SB_LOCK and re-validate everything from the top. */
		goto restart;
	}
	return 0;
}
2107 
2108 /*
2109  * Send on a socket.
2110  * If send must go all at once and message is larger than
2111  * send buffering, then hard error.
2112  * Lock against other senders.
2113  * If must go all at once and not enough room now, then
2114  * inform user that this would block and do nothing.
2115  * Otherwise, if nonblocking, send as much as possible.
2116  * The data to be sent is described by "uio" if nonzero,
2117  * otherwise by the mbuf chain "top" (which must be null
2118  * if uio is not).  Data provided in mbuf chain must be small
2119  * enough to send all at once.
2120  *
2121  * Returns nonzero on error, timeout or signal; callers
2122  * must check for short counts if EINTR/ERESTART are returned.
2123  * Data and control buffers are freed on return.
2124  *
2125  * Returns:	0			Success
2126  *		EOPNOTSUPP
2127  *		EINVAL
2128  *		ENOBUFS
2129  *	uiomove:EFAULT
2130  *	sosendcheck:EPIPE
2131  *	sosendcheck:EWOULDBLOCK
2132  *	sosendcheck:EINTR
2133  *	sosendcheck:EBADF
2134  *	sosendcheck:EINTR
2135  *	sosendcheck:???			[value from so_error]
2136  *	<pru_send>:ECONNRESET[TCP]
2137  *	<pru_send>:EINVAL[TCP]
2138  *	<pru_send>:ENOBUFS[TCP]
2139  *	<pru_send>:EADDRINUSE[TCP]
2140  *	<pru_send>:EADDRNOTAVAIL[TCP]
2141  *	<pru_send>:EAFNOSUPPORT[TCP]
2142  *	<pru_send>:EACCES[TCP]
2143  *	<pru_send>:EAGAIN[TCP]
2144  *	<pru_send>:EPERM[TCP]
2145  *	<pru_send>:EMSGSIZE[TCP]
2146  *	<pru_send>:EHOSTUNREACH[TCP]
2147  *	<pru_send>:ENETUNREACH[TCP]
2148  *	<pru_send>:ENETDOWN[TCP]
2149  *	<pru_send>:ENOMEM[TCP]
2150  *	<pru_send>:ENOBUFS[TCP]
2151  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2152  *	<pru_send>:EINVAL[AF_UNIX]
2153  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2154  *	<pru_send>:EPIPE[AF_UNIX]
2155  *	<pru_send>:ENOTCONN[AF_UNIX]
2156  *	<pru_send>:EISCONN[AF_UNIX]
2157  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2158  *	<sf_data_out>:???		[whatever a filter author chooses]
2159  *
2160  * Notes:	Other <pru_send> returns depend on the protocol family; all
2161  *		<sf_data_out> returns depend on what the filter author causes
2162  *		their filter to return.
2163  */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	/* "atomic": the whole message must be handed to the protocol at once. */
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/* Data comes either from "uio" or prepackaged in "top", never both. */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
			/* orig_resid is only initialized (and later read) when en_tracing. */
			orig_resid = resid;
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful for stream sockets. */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	/* MSG_DONTROUTE is honored per-send by toggling SO_DONTROUTE below. */
	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	/* Reserve per-socket packet headroom in the first mbuf if configured. */
	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/* Outer loop: one pass per chunk handed to the protocol. */
	do {
		/* Acquires SB_LOCK and waits for send-buffer space as needed. */
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				/*
				 * Drop the socket lock while copying user
				 * data and allocating mbufs; socket state
				 * must be re-validated after re-locking.
				 */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					/* Allocation ladder: 16K jumbo -> 4K big -> 2K -> plain mbuf. */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Pop the next mbuf off the local freelist. */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data into the mbuf; uio is advanced. */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					/* Link into the chain even on error so it gets freed. */
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				/* EJUSTRETURN means a filter swallowed the packet. */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* The protocol takes ownership of top/control, even on error. */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/* top/control were consumed above; don't free them at out_locked. */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);

out_locked:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	/* Free anything not handed off to the protocol or filters. */
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	/* NOTE(review): "space" may be unset if we error before the copy loop — trace-only. */
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2619 
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header.
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	/* Only scan when top itself is not already the data packet header. */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				// First packet-header mbuf marks the start of data.
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		// Detach the control run from the data chain before sending.
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	/* Hand the (possibly split) chain to the protocol; it consumes top/control. */
	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());

	return error;
}
2665 
2666 /*
2667  * Supported only connected sockets (no address) without ancillary data
2668  * (control mbuf) for atomic protocols
2669  */
int
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
{
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t len, resid;
	int error, dontroute;
	int atomic = sosendallatonce(so);
	int sblocked = 0;
	struct proc *p = current_proc();
	/* Window [uiofirst, uiolast) of uios handled in the current batch. */
	u_int uiofirst = 0;
	u_int uiolast = 0;
	struct mbuf *top = NULL;
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t bigcl;

	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	/* Only connected, atomic (datagram) sockets are supported. */
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (atomic == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
		error = EINVAL;
		goto out;
	}
	resid = uio_array_resid(uioarray, uiocnt);

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	/* Acquires SB_LOCK (sblocked set) and checks socket/send-buffer state. */
	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
	if (error) {
		goto release;
	}

	/*
	 * Use big 4 KB clusters when the outgoing interface does not prefer
	 * 2 KB clusters
	 */
	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/* Outer loop: one pass per batch of uios handed to pru_send_list. */
	do {
		int i;
		int num_needed = 0;
		int chainlength;
		size_t maxpktlen = 0;
		int bytes_to_alloc;

		if (sosendminchain > 0) {
			chainlength = 0;
		} else {
			chainlength = sosendmaxchain;
		}

		/* Drop the socket lock while sizing, allocating and copying. */
		socket_unlock(so, 0);

		/*
		 * Find a set of uio that fit in a reasonable number
		 * of mbuf packets
		 */
		for (i = uiofirst; i < uiocnt; i++) {
			struct uio *auio = uioarray[i];

			len = uio_resid(auio);

			/* Do nothing for empty messages */
			if (len == 0) {
				continue;
			}

			num_needed += 1;
			uiolast += 1;

			if (len > maxpktlen) {
				maxpktlen = len;
			}

			chainlength += len;
			if (chainlength > sosendmaxchain) {
				break;
			}
		}
		/*
		 * Nothing left to send
		 */
		if (num_needed == 0) {
			socket_lock(so, 0);
			break;
		}
		/*
		 * Allocate buffer large enough to include headroom space for
		 * network and link header
		 *
		 */
		bytes_to_alloc = (int) maxpktlen + headroom;

		/*
		 * Allocate a single contiguous buffer of the smallest available
		 * size when possible
		 */
		if (bytes_to_alloc > MCLBYTES &&
		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MBIGCLBYTES);
		} else if (bytes_to_alloc > _MHLEN &&
		    bytes_to_alloc <= MCLBYTES) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MCLBYTES);
		} else {
			freelist = m_allocpacket_internal(
				(unsigned int *)&num_needed,
				bytes_to_alloc, NULL, M_WAIT, 1, 0);
		}

		if (freelist == NULL) {
			socket_lock(so, 0);
			error = ENOMEM;
			goto release;
		}
		/*
		 * Copy each uio of the set into its own mbuf packet
		 */
		for (i = uiofirst, m = freelist;
		    i < uiolast && m != NULL;
		    i++) {
			int bytes_to_copy;
			struct mbuf *n;
			struct uio *auio = uioarray[i];

			bytes_to_copy = (int)uio_resid(auio);

			/* Do nothing for empty messages */
			if (bytes_to_copy == 0) {
				continue;
			}
			/*
			 * Leave headroom for protocol headers
			 * in the first mbuf of the chain
			 */
			m->m_data += headroom;

			for (n = m; n != NULL; n = n->m_next) {
				/*
				 * NOTE(review): mlen is derived from the first
				 * mbuf "m" while data is copied into "n"; this
				 * presumes every mbuf in the packet shares the
				 * same geometry — verify for mixed chains.
				 */
				if ((m->m_flags & M_EXT)) {
					mlen = m->m_ext.ext_size -
					    M_LEADINGSPACE(m);
				} else if ((m->m_flags & M_PKTHDR)) {
					mlen =
					    MHLEN - M_LEADINGSPACE(m);
				} else {
					mlen = MLEN - M_LEADINGSPACE(m);
				}
				len = imin((int)mlen, bytes_to_copy);

				/*
				 * Note: uiomove() decrements the iovec
				 * length
				 */
				error = uiomove(mtod(n, caddr_t),
				    (int)len, auio);
				if (error != 0) {
					break;
				}
				n->m_len = (int32_t)len;
				m->m_pkthdr.len += len;

				VERIFY(m->m_pkthdr.len <= maxpktlen);

				bytes_to_copy -= len;
				resid -= len;
			}
			if (m->m_pkthdr.len == 0) {
				printf(
					"%s:%d so %llx pkt %llx type %u len null\n",
					__func__, __LINE__,
					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
					m->m_type);
			}
			if (error != 0) {
				break;
			}
			/* Advance to the next packet in the allocated list. */
			m = m->m_nextpkt;
		}

		socket_lock(so, 0);

		if (error) {
			goto release;
		}
		/* Hand the whole batch over; freelist is now owned by "top". */
		top = freelist;
		freelist = NULL;

		if (dontroute) {
			so->so_options |= SO_DONTROUTE;
		}

		if ((flags & MSG_SKIPCFIL) == 0) {
			struct mbuf **prevnextp = NULL;

			for (i = uiofirst, m = top;
			    i < uiolast && m != NULL;
			    i++) {
				struct mbuf *nextpkt = m->m_nextpkt;

				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, NULL, &m,
				    NULL, 0);
				if (error != 0 && error != EJUSTRETURN) {
					goto release;
				}

#if CONTENT_FILTER
				if (error == 0) {
					/*
					 * Content filter processing
					 */
					error = cfil_sock_data_out(so, NULL, m,
					    NULL, 0, dgram_flow_entry);
					if (error != 0 && error != EJUSTRETURN) {
						goto release;
					}
				}
#endif /* CONTENT_FILTER */
				/*
				 * Remove packet from the list when
				 * swallowed by a filter
				 */
				if (error == EJUSTRETURN) {
					error = 0;
					if (prevnextp != NULL) {
						*prevnextp = nextpkt;
					} else {
						top = nextpkt;
					}
				}

				m = nextpkt;
				if (m != NULL) {
					prevnextp = &m->m_nextpkt;
				}
			}
		}
		if (top != NULL) {
			/* Protocol consumes the packet list, even on error. */
			error = (*so->so_proto->pr_usrreqs->pru_send_list)
			    (so, 0, top, NULL, NULL, p);
		}

		if (dontroute) {
			so->so_options &= ~SO_DONTROUTE;
		}

		top = NULL;
		/* Next batch starts where this one stopped. */
		uiofirst = uiolast;
	} while (resid > 0 && error == 0);
release:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
out:
	/* Free anything not consumed by filters or the protocol. */
	if (top != NULL) {
		m_freem(top);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, 0, error);

	return error;
}
2997 
/*
 * Consume the leading MT_SONAME mbuf of the record at the head of the
 * socket's receive buffer and optionally hand a copy of the sender's
 * address back to the caller.
 *
 * Parameters:
 *	p		calling process (kernproc for in-kernel callers)
 *	so		socket; locked on entry and exit
 *	psa		if non-NULL, receives a dup_sockaddr() copy of the
 *			address (may be set to NULL on allocation failure)
 *	flags		MSG_* flags; MSG_PEEK leaves the mbuf queued,
 *			MSG_NEEDSA turns an address-copy failure into
 *			EWOULDBLOCK
 *	mp		in/out: current mbuf of the record being received
 *	nextrecordp	in/out: next record in the receive buffer
 *	canwait		sleep/no-sleep indication passed to dup_sockaddr()
 *
 * May return ERESTART when packet is dropped by MAC policy check
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	/* The first mbuf of a PR_ADDR record must carry the source address. */
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);	/* adjusts sb_cc/sb_mbcnt */
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		/* Policy check runs unlocked; the record is ours alone here. */
		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	/* Hand a copy of the source address to the caller, if requested. */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf queued, just step past it. */
		m = m->m_next;
	} else {
		/* Consuming: unlink and free the MT_SONAME mbuf. */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3119 
3120 /*
3121  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3122  * so clear the data portion in order not to leak the file pointers
3123  */
3124 static void
sopeek_scm_rights(struct mbuf * rights)3125 sopeek_scm_rights(struct mbuf *rights)
3126 {
3127 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3128 
3129 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3130 		VERIFY(cm->cmsg_len <= rights->m_len);
3131 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3132 	}
3133 }
3134 
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 *
 * Parameters:
 *	so		socket; locked on entry and exit (the lock may be
 *			dropped transiently around dom_externalize)
 *	controlp	if non-NULL, receives the chain of control mbufs
 *	flags		MSG_* flags; only MSG_PEEK is examined here
 *	mp		in/out: current mbuf position within the record
 *	nextrecordp	in/out: next record in the receive buffer
 *
 * Returns:	0	Success
 *		ENOBUFS	m_copy() failed while peeking
 *	<pr_domain->dom_externalize>:???	(protocol-specific)
 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;	/* tail pointer of the unlinked chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;	/* head of mbufs copied under MSG_PEEK */
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/*
				 * Peeked SCM_RIGHTS have no fds created
				 * yet; scrub the copy (see
				 * sopeek_scm_rights).
				 */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Unlink this control mbuf from the sockbuf ... */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			/* ... and append it to the private chain. */
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Re-link the remainder of the record into the sockbuf. */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3275 
3276 /*
3277  * If we have less data than requested, block awaiting more
3278  * (subject to any timeout) if:
3279  *   1. the current count is less than the low water mark, or
3280  *   2. MSG_WAITALL is set, and it is possible to do the entire
3281  *	receive operation at once if we block (resid <= hiwat).
3282  *   3. MSG_DONTWAIT is not set
3283  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3284  * we have to do the receive in sections, and thus risk returning
3285  * a short count if a timeout or signal occurs after we start.
3286  */
3287 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3288 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3289 {
3290 	struct protosw *pr = so->so_proto;
3291 
3292 	/* No mbufs in the receive-queue? Wait! */
3293 	if (m == NULL) {
3294 		return true;
3295 	}
3296 
3297 	/* Not enough data in the receive socket-buffer - we may have to wait */
3298 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3299 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3300 		/*
3301 		 * Application did set the lowater-mark, so we should wait for
3302 		 * this data to be present.
3303 		 */
3304 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3305 			return true;
3306 		}
3307 
3308 		/*
3309 		 * Application wants all the data - so let's try to do the
3310 		 * receive-operation at once by waiting for everything to
3311 		 * be there.
3312 		 */
3313 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3314 			return true;
3315 		}
3316 	}
3317 
3318 	return false;
3319 }
3320 
3321 /*
3322  * Implement receive operations on a socket.
3323  * We depend on the way that records are added to the sockbuf
3324  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3325  * must begin with an address if the protocol so specifies,
3326  * followed by an optional mbuf or mbufs containing ancillary data,
3327  * and then zero or more mbufs of data.
3328  * In order to avoid blocking network interrupts for the entire time here,
3329  * we splx() while doing the actual copy to user space.
3330  * Although the sockbuf is locked, new data may still be appended,
3331  * and thus we must maintain consistency of the sockbuf during that time.
3332  *
3333  * The caller may receive the data as a single mbuf chain by supplying
3334  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3335  * only for the count in uio_resid.
3336  *
3337  * Returns:	0			Success
3338  *		ENOBUFS
3339  *		ENOTCONN
3340  *		EWOULDBLOCK
3341  *	uiomove:EFAULT
3342  *	sblock:EWOULDBLOCK
3343  *	sblock:EINTR
3344  *	sbwait:EBADF
3345  *	sbwait:EINTR
3346  *	sodelayed_copy:EFAULT
3347  *	<pru_rcvoob>:EINVAL[TCP]
3348  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3349  *	<pru_rcvoob>:???
3350  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3351  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3352  *	<pr_domain->dom_externalize>:???
3353  *
3354  * Notes:	Additional return values from calls through <pru_rcvoob> and
3355  *		<pr_domain->dom_externalize> depend on protocols other than
3356  *		TCP or AF_UNIX, which are documented above.
3357  */
3358 int
soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)3359 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3360     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3361 {
3362 	struct mbuf *m, **mp, *ml = NULL;
3363 	struct mbuf *nextrecord, *free_list;
3364 	int flags, error, offset;
3365 	user_ssize_t len;
3366 	struct protosw *pr = so->so_proto;
3367 	int moff, type = 0;
3368 	user_ssize_t orig_resid = uio_resid(uio);
3369 	user_ssize_t delayed_copy_len;
3370 	int can_delay;
3371 	struct proc *p = current_proc();
3372 	boolean_t en_tracing = FALSE;
3373 
3374 	/*
3375 	 * Sanity check on the length passed by caller as we are making 'int'
3376 	 * comparisons
3377 	 */
3378 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3379 		return EINVAL;
3380 	}
3381 
3382 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3383 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3384 	    so->so_rcv.sb_hiwat);
3385 
3386 	socket_lock(so, 1);
3387 	so_update_last_owner_locked(so, p);
3388 	so_update_policy(so);
3389 
3390 #ifdef MORE_LOCKING_DEBUG
3391 	if (so->so_usecount == 1) {
3392 		panic("%s: so=%x no other reference on socket", __func__, so);
3393 		/* NOTREACHED */
3394 	}
3395 #endif
3396 	mp = mp0;
3397 	if (psa != NULL) {
3398 		*psa = NULL;
3399 	}
3400 	if (controlp != NULL) {
3401 		*controlp = NULL;
3402 	}
3403 	if (flagsp != NULL) {
3404 		flags = *flagsp & ~MSG_EOR;
3405 	} else {
3406 		flags = 0;
3407 	}
3408 
3409 	/*
3410 	 * If a recv attempt is made on a previously-accepted socket
3411 	 * that has been marked as inactive (disconnected), reject
3412 	 * the request.
3413 	 */
3414 	if (so->so_flags & SOF_DEFUNCT) {
3415 		struct sockbuf *sb = &so->so_rcv;
3416 
3417 		error = ENOTCONN;
3418 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
3419 		    __func__, proc_pid(p), proc_best_name(p),
3420 		    so->so_gencnt,
3421 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3422 		/*
3423 		 * This socket should have been disconnected and flushed
3424 		 * prior to being returned from sodefunct(); there should
3425 		 * be no data on its receive list, so panic otherwise.
3426 		 */
3427 		if (so->so_state & SS_DEFUNCT) {
3428 			sb_empty_assert(sb, __func__);
3429 		}
3430 		socket_unlock(so, 1);
3431 		return error;
3432 	}
3433 
3434 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3435 	    pr->pr_usrreqs->pru_preconnect) {
3436 		/*
3437 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3438 		 * calling write() right after this. *If* the app calls a read
3439 		 * we do not want to block this read indefinetely. Thus,
3440 		 * we trigger a connect so that the session gets initiated.
3441 		 */
3442 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3443 
3444 		if (error) {
3445 			socket_unlock(so, 1);
3446 			return error;
3447 		}
3448 	}
3449 
3450 	if (ENTR_SHOULDTRACE &&
3451 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3452 		/*
3453 		 * enable energy tracing for inet sockets that go over
3454 		 * non-loopback interfaces only.
3455 		 */
3456 		struct inpcb *inp = sotoinpcb(so);
3457 		if (inp->inp_last_outifp != NULL &&
3458 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3459 			en_tracing = TRUE;
3460 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3461 			    VM_KERNEL_ADDRPERM(so),
3462 			    ((so->so_state & SS_NBIO) ?
3463 			    kEnTrFlagNonBlocking : 0),
3464 			    (int64_t)orig_resid);
3465 		}
3466 	}
3467 
3468 	/*
3469 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3470 	 * regardless of the flags argument. Here is the case were
3471 	 * out-of-band data is not inline.
3472 	 */
3473 	if ((flags & MSG_OOB) ||
3474 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3475 	    (so->so_options & SO_OOBINLINE) == 0 &&
3476 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3477 		m = m_get(M_WAIT, MT_DATA);
3478 		if (m == NULL) {
3479 			socket_unlock(so, 1);
3480 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3481 			    ENOBUFS, 0, 0, 0, 0);
3482 			return ENOBUFS;
3483 		}
3484 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3485 		if (error) {
3486 			goto bad;
3487 		}
3488 		socket_unlock(so, 0);
3489 		do {
3490 			error = uiomove(mtod(m, caddr_t),
3491 			    imin((int)uio_resid(uio), m->m_len), uio);
3492 			m = m_free(m);
3493 		} while (uio_resid(uio) && error == 0 && m != NULL);
3494 		socket_lock(so, 0);
3495 bad:
3496 		if (m != NULL) {
3497 			m_freem(m);
3498 		}
3499 
3500 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3501 			if (error == EWOULDBLOCK || error == EINVAL) {
3502 				/*
3503 				 * Let's try to get normal data:
3504 				 * EWOULDBLOCK: out-of-band data not
3505 				 * receive yet. EINVAL: out-of-band data
3506 				 * already read.
3507 				 */
3508 				error = 0;
3509 				goto nooob;
3510 			} else if (error == 0 && flagsp != NULL) {
3511 				*flagsp |= MSG_OOB;
3512 			}
3513 		}
3514 		socket_unlock(so, 1);
3515 		if (en_tracing) {
3516 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3517 			    VM_KERNEL_ADDRPERM(so), 0,
3518 			    (int64_t)(orig_resid - uio_resid(uio)));
3519 		}
3520 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3521 		    0, 0, 0, 0);
3522 
3523 		return error;
3524 	}
3525 nooob:
3526 	if (mp != NULL) {
3527 		*mp = NULL;
3528 	}
3529 
3530 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3531 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3532 	}
3533 
3534 	free_list = NULL;
3535 	delayed_copy_len = 0;
3536 restart:
3537 #ifdef MORE_LOCKING_DEBUG
3538 	if (so->so_usecount <= 1) {
3539 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3540 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3541 	}
3542 #endif
3543 	/*
3544 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3545 	 * and if so just return to the caller.  This could happen when
3546 	 * soreceive() is called by a socket upcall function during the
3547 	 * time the socket is freed.  The socket buffer would have been
3548 	 * locked across the upcall, therefore we cannot put this thread
3549 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3550 	 * we may livelock), because the lock on the socket buffer will
3551 	 * only be released when the upcall routine returns to its caller.
3552 	 * Because the socket has been officially closed, there can be
3553 	 * no further read on it.
3554 	 *
3555 	 * A multipath subflow socket would have its SS_NOFDREF set by
3556 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3557 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3558 	 */
3559 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3560 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3561 		socket_unlock(so, 1);
3562 		return 0;
3563 	}
3564 
3565 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3566 	if (error) {
3567 		socket_unlock(so, 1);
3568 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3569 		    0, 0, 0, 0);
3570 		if (en_tracing) {
3571 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3572 			    VM_KERNEL_ADDRPERM(so), 0,
3573 			    (int64_t)(orig_resid - uio_resid(uio)));
3574 		}
3575 		return error;
3576 	}
3577 
3578 	m = so->so_rcv.sb_mb;
3579 	if (so_should_wait(so, uio, m, flags)) {
3580 		/*
3581 		 * Panic if we notice inconsistencies in the socket's
3582 		 * receive list; both sb_mb and sb_cc should correctly
3583 		 * reflect the contents of the list, otherwise we may
3584 		 * end up with false positives during select() or poll()
3585 		 * which could put the application in a bad state.
3586 		 */
3587 		SB_MB_CHECK(&so->so_rcv);
3588 
3589 		if (so->so_error) {
3590 			if (m != NULL) {
3591 				goto dontblock;
3592 			}
3593 			error = so->so_error;
3594 			if ((flags & MSG_PEEK) == 0) {
3595 				so->so_error = 0;
3596 			}
3597 			goto release;
3598 		}
3599 		if (so->so_state & SS_CANTRCVMORE) {
3600 #if CONTENT_FILTER
3601 			/*
3602 			 * Deal with half closed connections
3603 			 */
3604 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3605 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3606 				CFIL_LOG(LOG_INFO,
3607 				    "so %llx ignore SS_CANTRCVMORE",
3608 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3609 			} else
3610 #endif /* CONTENT_FILTER */
3611 			if (m != NULL) {
3612 				goto dontblock;
3613 			} else {
3614 				goto release;
3615 			}
3616 		}
3617 		for (; m != NULL; m = m->m_next) {
3618 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3619 				m = so->so_rcv.sb_mb;
3620 				goto dontblock;
3621 			}
3622 		}
3623 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3624 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3625 			error = ENOTCONN;
3626 			goto release;
3627 		}
3628 		if (uio_resid(uio) == 0) {
3629 			goto release;
3630 		}
3631 
3632 		if ((so->so_state & SS_NBIO) ||
3633 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3634 			error = EWOULDBLOCK;
3635 			goto release;
3636 		}
3637 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3638 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3639 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3640 #if EVEN_MORE_LOCKING_DEBUG
3641 		if (socket_debug) {
3642 			printf("Waiting for socket data\n");
3643 		}
3644 #endif
3645 
3646 		/*
3647 		 * Depending on the protocol (e.g. TCP), the following
3648 		 * might cause the socket lock to be dropped and later
3649 		 * be reacquired, and more data could have arrived and
3650 		 * have been appended to the receive socket buffer by
3651 		 * the time it returns.  Therefore, we only sleep in
3652 		 * sbwait() below if and only if the wait-condition is still
3653 		 * true.
3654 		 */
3655 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3656 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3657 		}
3658 
3659 		error = 0;
3660 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3661 			error = sbwait(&so->so_rcv);
3662 		}
3663 
3664 #if EVEN_MORE_LOCKING_DEBUG
3665 		if (socket_debug) {
3666 			printf("SORECEIVE - sbwait returned %d\n", error);
3667 		}
3668 #endif
3669 		if (so->so_usecount < 1) {
3670 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3671 			    __func__, so, so->so_usecount);
3672 			/* NOTREACHED */
3673 		}
3674 		if (error) {
3675 			socket_unlock(so, 1);
3676 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3677 			    0, 0, 0, 0);
3678 			if (en_tracing) {
3679 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3680 				    VM_KERNEL_ADDRPERM(so), 0,
3681 				    (int64_t)(orig_resid - uio_resid(uio)));
3682 			}
3683 			return error;
3684 		}
3685 		goto restart;
3686 	}
3687 dontblock:
3688 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3689 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3690 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3691 	nextrecord = m->m_nextpkt;
3692 
3693 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3694 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3695 		    mp0 == NULL);
3696 		if (error == ERESTART) {
3697 			goto restart;
3698 		} else if (error != 0) {
3699 			goto release;
3700 		}
3701 		orig_resid = 0;
3702 	}
3703 
3704 	/*
3705 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3706 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3707 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3708 	 * perform externalization.
3709 	 */
3710 	if (m != NULL && m->m_type == MT_CONTROL) {
3711 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3712 		if (error != 0) {
3713 			goto release;
3714 		}
3715 		orig_resid = 0;
3716 	}
3717 
3718 	if (m != NULL) {
3719 		if (!(flags & MSG_PEEK)) {
3720 			/*
3721 			 * We get here because m points to an mbuf following
3722 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3723 			 * processed above.  In any case, m should be pointing
3724 			 * to the head of the mbuf chain, and the nextrecord
3725 			 * should be either NULL or equal to m->m_nextpkt.
3726 			 * See comments above about SB_LOCK.
3727 			 */
3728 			if (m != so->so_rcv.sb_mb ||
3729 			    m->m_nextpkt != nextrecord) {
3730 				panic("%s: post-control !sync so=%p m=%p "
3731 				    "nextrecord=%p\n", __func__, so, m,
3732 				    nextrecord);
3733 				/* NOTREACHED */
3734 			}
3735 			if (nextrecord == NULL) {
3736 				so->so_rcv.sb_lastrecord = m;
3737 			}
3738 		}
3739 		type = m->m_type;
3740 		if (type == MT_OOBDATA) {
3741 			flags |= MSG_OOB;
3742 		}
3743 	} else {
3744 		if (!(flags & MSG_PEEK)) {
3745 			SB_EMPTY_FIXUP(&so->so_rcv);
3746 		}
3747 	}
3748 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3749 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3750 
3751 	moff = 0;
3752 	offset = 0;
3753 
3754 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3755 		can_delay = 1;
3756 	} else {
3757 		can_delay = 0;
3758 	}
3759 
3760 	while (m != NULL &&
3761 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3762 		if (m->m_type == MT_OOBDATA) {
3763 			if (type != MT_OOBDATA) {
3764 				break;
3765 			}
3766 		} else if (type == MT_OOBDATA) {
3767 			break;
3768 		}
3769 
3770 		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3771 		    m->m_type != MT_HEADER) {
3772 			break;
3773 		}
3774 		/*
3775 		 * Make sure to allways set MSG_OOB event when getting
3776 		 * out of band data inline.
3777 		 */
3778 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3779 		    (so->so_options & SO_OOBINLINE) != 0 &&
3780 		    (so->so_state & SS_RCVATMARK) != 0) {
3781 			flags |= MSG_OOB;
3782 		}
3783 		so->so_state &= ~SS_RCVATMARK;
3784 		len = uio_resid(uio) - delayed_copy_len;
3785 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3786 			len = so->so_oobmark - offset;
3787 		}
3788 		if (len > m->m_len - moff) {
3789 			len = m->m_len - moff;
3790 		}
3791 		/*
3792 		 * If mp is set, just pass back the mbufs.
3793 		 * Otherwise copy them out via the uio, then free.
3794 		 * Sockbuf must be consistent here (points to current mbuf,
3795 		 * it points to next record) when we drop priority;
3796 		 * we must note any additions to the sockbuf when we
3797 		 * block interrupts again.
3798 		 */
3799 		if (mp == NULL) {
3800 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3801 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3802 			if (can_delay && len == m->m_len) {
3803 				/*
3804 				 * only delay the copy if we're consuming the
3805 				 * mbuf and we're NOT in MSG_PEEK mode
3806 				 * and we have enough data to make it worthwile
3807 				 * to drop and retake the lock... can_delay
3808 				 * reflects the state of the 2 latter
3809 				 * constraints moff should always be zero
3810 				 * in these cases
3811 				 */
3812 				delayed_copy_len += len;
3813 			} else {
3814 				if (delayed_copy_len) {
3815 					error = sodelayed_copy(so, uio,
3816 					    &free_list, &delayed_copy_len);
3817 
3818 					if (error) {
3819 						goto release;
3820 					}
3821 					/*
3822 					 * can only get here if MSG_PEEK is not
3823 					 * set therefore, m should point at the
3824 					 * head of the rcv queue; if it doesn't,
3825 					 * it means something drastically
3826 					 * changed while we were out from behind
3827 					 * the lock in sodelayed_copy. perhaps
3828 					 * a RST on the stream. in any event,
3829 					 * the stream has been interrupted. it's
3830 					 * probably best just to return whatever
3831 					 * data we've moved and let the caller
3832 					 * sort it out...
3833 					 */
3834 					if (m != so->so_rcv.sb_mb) {
3835 						break;
3836 					}
3837 				}
3838 				socket_unlock(so, 0);
3839 				error = uiomove(mtod(m, caddr_t) + moff,
3840 				    (int)len, uio);
3841 				socket_lock(so, 0);
3842 
3843 				if (error) {
3844 					goto release;
3845 				}
3846 			}
3847 		} else {
3848 			uio_setresid(uio, (uio_resid(uio) - len));
3849 		}
3850 		if (len == m->m_len - moff) {
3851 			if (m->m_flags & M_EOR) {
3852 				flags |= MSG_EOR;
3853 			}
3854 			if (flags & MSG_PEEK) {
3855 				m = m->m_next;
3856 				moff = 0;
3857 			} else {
3858 				nextrecord = m->m_nextpkt;
3859 				sbfree(&so->so_rcv, m);
3860 				m->m_nextpkt = NULL;
3861 
3862 				if (mp != NULL) {
3863 					*mp = m;
3864 					mp = &m->m_next;
3865 					so->so_rcv.sb_mb = m = m->m_next;
3866 					*mp = NULL;
3867 				} else {
3868 					if (free_list == NULL) {
3869 						free_list = m;
3870 					} else {
3871 						ml->m_next = m;
3872 					}
3873 					ml = m;
3874 					so->so_rcv.sb_mb = m = m->m_next;
3875 					ml->m_next = NULL;
3876 				}
3877 				if (m != NULL) {
3878 					m->m_nextpkt = nextrecord;
3879 					if (nextrecord == NULL) {
3880 						so->so_rcv.sb_lastrecord = m;
3881 					}
3882 				} else {
3883 					so->so_rcv.sb_mb = nextrecord;
3884 					SB_EMPTY_FIXUP(&so->so_rcv);
3885 				}
3886 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3887 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3888 			}
3889 		} else {
3890 			if (flags & MSG_PEEK) {
3891 				moff += len;
3892 			} else {
3893 				if (mp != NULL) {
3894 					int copy_flag;
3895 
3896 					if (flags & MSG_DONTWAIT) {
3897 						copy_flag = M_DONTWAIT;
3898 					} else {
3899 						copy_flag = M_WAIT;
3900 					}
3901 					*mp = m_copym(m, 0, (int)len, copy_flag);
3902 					/*
3903 					 * Failed to allocate an mbuf?
3904 					 * Adjust uio_resid back, it was
3905 					 * adjusted down by len bytes which
3906 					 * we didn't copy over.
3907 					 */
3908 					if (*mp == NULL) {
3909 						uio_setresid(uio,
3910 						    (uio_resid(uio) + len));
3911 						break;
3912 					}
3913 				}
3914 				m->m_data += len;
3915 				m->m_len -= len;
3916 				so->so_rcv.sb_cc -= len;
3917 			}
3918 		}
3919 		if (so->so_oobmark) {
3920 			if ((flags & MSG_PEEK) == 0) {
3921 				so->so_oobmark -= len;
3922 				if (so->so_oobmark == 0) {
3923 					so->so_state |= SS_RCVATMARK;
3924 					break;
3925 				}
3926 			} else {
3927 				offset += len;
3928 				if (offset == so->so_oobmark) {
3929 					break;
3930 				}
3931 			}
3932 		}
3933 		if (flags & MSG_EOR) {
3934 			break;
3935 		}
3936 		/*
3937 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3938 		 * (for non-atomic socket), we must not quit until
3939 		 * "uio->uio_resid == 0" or an error termination.
3940 		 * If a signal/timeout occurs, return with a short
3941 		 * count but without error.  Keep sockbuf locked
3942 		 * against other readers.
3943 		 */
3944 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3945 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3946 		    !sosendallatonce(so) && !nextrecord) {
3947 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3948 #if CONTENT_FILTER
3949 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3950 #endif /* CONTENT_FILTER */
3951 			    )) {
3952 				goto release;
3953 			}
3954 
3955 			/*
3956 			 * Depending on the protocol (e.g. TCP), the following
3957 			 * might cause the socket lock to be dropped and later
3958 			 * be reacquired, and more data could have arrived and
3959 			 * have been appended to the receive socket buffer by
3960 			 * the time it returns.  Therefore, we only sleep in
3961 			 * sbwait() below if and only if the socket buffer is
3962 			 * empty, in order to avoid a false sleep.
3963 			 */
3964 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3965 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3966 			}
3967 
3968 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3969 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3970 
3971 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3972 				error = 0;
3973 				goto release;
3974 			}
3975 			/*
3976 			 * have to wait until after we get back from the sbwait
3977 			 * to do the copy because we will drop the lock if we
3978 			 * have enough data that has been delayed... by dropping
3979 			 * the lock we open up a window allowing the netisr
3980 			 * thread to process the incoming packets and to change
3981 			 * the state of this socket... we're issuing the sbwait
3982 			 * because the socket is empty and we're expecting the
3983 			 * netisr thread to wake us up when more packets arrive;
3984 			 * if we allow that processing to happen and then sbwait
3985 			 * we could stall forever with packets sitting in the
3986 			 * socket if no further packets arrive from the remote
3987 			 * side.
3988 			 *
3989 			 * we want to copy before we've collected all the data
3990 			 * to satisfy this request to allow the copy to overlap
3991 			 * the incoming packet processing on an MP system
3992 			 */
3993 			if (delayed_copy_len > sorecvmincopy &&
3994 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3995 				error = sodelayed_copy(so, uio,
3996 				    &free_list, &delayed_copy_len);
3997 
3998 				if (error) {
3999 					goto release;
4000 				}
4001 			}
4002 			m = so->so_rcv.sb_mb;
4003 			if (m != NULL) {
4004 				nextrecord = m->m_nextpkt;
4005 			}
4006 			SB_MB_CHECK(&so->so_rcv);
4007 		}
4008 	}
4009 #ifdef MORE_LOCKING_DEBUG
4010 	if (so->so_usecount <= 1) {
4011 		panic("%s: after big while so=%p ref=%d on socket",
4012 		    __func__, so, so->so_usecount);
4013 		/* NOTREACHED */
4014 	}
4015 #endif
4016 
4017 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4018 		if (so->so_options & SO_DONTTRUNC) {
4019 			flags |= MSG_RCVMORE;
4020 		} else {
4021 			flags |= MSG_TRUNC;
4022 			if ((flags & MSG_PEEK) == 0) {
4023 				(void) sbdroprecord(&so->so_rcv);
4024 			}
4025 		}
4026 	}
4027 
4028 	/*
4029 	 * pru_rcvd below (for TCP) may cause more data to be received
4030 	 * if the socket lock is dropped prior to sending the ACK; some
4031 	 * legacy OpenTransport applications don't handle this well
4032 	 * (if it receives less data than requested while MSG_HAVEMORE
4033 	 * is set), and so we set the flag now based on what we know
4034 	 * prior to calling pru_rcvd.
4035 	 */
4036 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4037 		flags |= MSG_HAVEMORE;
4038 	}
4039 
4040 	if ((flags & MSG_PEEK) == 0) {
4041 		if (m == NULL) {
4042 			so->so_rcv.sb_mb = nextrecord;
4043 			/*
4044 			 * First part is an inline SB_EMPTY_FIXUP().  Second
4045 			 * part makes sure sb_lastrecord is up-to-date if
4046 			 * there is still data in the socket buffer.
4047 			 */
4048 			if (so->so_rcv.sb_mb == NULL) {
4049 				so->so_rcv.sb_mbtail = NULL;
4050 				so->so_rcv.sb_lastrecord = NULL;
4051 			} else if (nextrecord->m_nextpkt == NULL) {
4052 				so->so_rcv.sb_lastrecord = nextrecord;
4053 			}
4054 			SB_MB_CHECK(&so->so_rcv);
4055 		}
4056 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4057 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4058 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4059 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4060 		}
4061 	}
4062 
4063 	if (delayed_copy_len) {
4064 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4065 		if (error) {
4066 			goto release;
4067 		}
4068 	}
4069 	if (free_list != NULL) {
4070 		m_freem_list(free_list);
4071 		free_list = NULL;
4072 	}
4073 
4074 	if (orig_resid == uio_resid(uio) && orig_resid &&
4075 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4076 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4077 		goto restart;
4078 	}
4079 
4080 	if (flagsp != NULL) {
4081 		*flagsp |= flags;
4082 	}
4083 release:
4084 #ifdef MORE_LOCKING_DEBUG
4085 	if (so->so_usecount <= 1) {
4086 		panic("%s: release so=%p ref=%d on socket", __func__,
4087 		    so, so->so_usecount);
4088 		/* NOTREACHED */
4089 	}
4090 #endif
4091 	if (delayed_copy_len) {
4092 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4093 	}
4094 
4095 	if (free_list != NULL) {
4096 		m_freem_list(free_list);
4097 	}
4098 
4099 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4100 
4101 	if (en_tracing) {
4102 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4103 		    VM_KERNEL_ADDRPERM(so),
4104 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4105 		    (int64_t)(orig_resid - uio_resid(uio)));
4106 	}
4107 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4108 	    so->so_rcv.sb_cc, 0, error);
4109 
4110 	return error;
4111 }
4112 
4113 /*
4114  * Returns:	0			Success
4115  *	uiomove:EFAULT
4116  */
4117 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4118 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4119     user_ssize_t *resid)
4120 {
4121 	int error = 0;
4122 	struct mbuf *m;
4123 
4124 	m = *free_list;
4125 
4126 	socket_unlock(so, 0);
4127 
4128 	while (m != NULL && error == 0) {
4129 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4130 		m = m->m_next;
4131 	}
4132 	m_freem_list(*free_list);
4133 
4134 	*free_list = NULL;
4135 	*resid = 0;
4136 
4137 	socket_lock(so, 0);
4138 
4139 	return error;
4140 }
4141 
4142 static int
sodelayed_copy_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,struct mbuf ** free_list,user_ssize_t * resid)4143 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4144     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4145 {
4146 #pragma unused(so)
4147 	int error = 0;
4148 	struct mbuf *ml, *m;
4149 	int i = 0;
4150 	struct uio *auio;
4151 
4152 	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4153 	    ml = ml->m_nextpkt, i++) {
4154 		auio = msgarray[i].uio;
4155 		for (m = ml; m != NULL; m = m->m_next) {
4156 			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4157 			if (error != 0) {
4158 				goto out;
4159 			}
4160 		}
4161 	}
4162 out:
4163 	m_freem_list(*free_list);
4164 
4165 	*free_list = NULL;
4166 	*resid = 0;
4167 
4168 	return error;
4169 }
4170 
/*
 * soreceive_list: receive multiple datagrams in one call, one per entry
 * of the msgarray (up to uiocnt entries).
 *
 * Only SOCK_DGRAM sockets with atomic-send protocols are supported.
 * Each received packet fills one msgarray slot: its data via the slot's
 * uio, and optionally the source address (SOCK_MSG_SA) and control data
 * (SOCK_MSG_CONTROL) when requested by the slot's "which" flags.
 *
 * When MSG_PEEK is not set and sorecvmincopy > 0, data copyout is
 * delayed and batched (see sodelayed_copy_list) so the socket lock is
 * dropped less often.
 *
 * Returns 0 on success (npkts slots may still be 0 on a closed socket),
 * or EINVAL/EPROTONOSUPPORT/ENOTCONN/EWOULDBLOCK or a copy error.
 */
int
soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
    int *flagsp)
{
	struct mbuf *m;
	struct mbuf *nextrecord;
	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
	int error;
	user_ssize_t len, pktlen, delayed_copy_len = 0;
	struct protosw *pr = so->so_proto;
	user_ssize_t resid;
	struct proc *p = current_proc();
	struct uio *auio = NULL;
	int npkts = 0;
	int sblocked = 0;
	struct sockaddr **psa = NULL;
	struct mbuf **controlp = NULL;
	int can_delay;
	int flags;
	struct mbuf *free_others = NULL;

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
	    so, uiocnt,
	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

	/*
	 * Sanity checks:
	 * - Only supports don't wait flags
	 * - Only support datagram sockets (could be extended to raw)
	 * - Must be atomic
	 * - Protocol must support packet chains
	 * - The uio array is NULL (should we panic?)
	 */
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}
	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
	    MSG_NBIO)) {
		printf("%s invalid flags 0x%x\n", __func__, flags);
		error = EINVAL;
		goto out;
	}
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (sosendallatonce(so) == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (msgarray == NULL) {
		printf("%s uioarray is NULL\n", __func__);
		error = EINVAL;
		goto out;
	}
	if (uiocnt == 0) {
		printf("%s uiocnt is 0\n", __func__);
		error = EINVAL;
		goto out;
	}
	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	resid = recv_msg_array_resid(msgarray, uiocnt);
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	/* Delayed (batched) copyout only makes sense when we dequeue. */
	if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

next:
	/*
	 * The uio may be empty
	 */
	if (npkts >= uiocnt) {
		error = 0;
		goto release;
	}
restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		/*
		 * Do not block if we got some data
		 */
		if (free_list != NULL) {
			error = 0;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error) {
			goto release;
		}
		goto restart;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");

	/*
	 * Consume the current uio index as we have a datagram
	 */
	auio = msgarray[npkts].uio;
	resid = uio_resid(auio);
	msgarray[npkts].which |= SOCK_MSG_DATA;
	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
	    &msgarray[npkts].psa : NULL;
	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
	    &msgarray[npkts].controlp : NULL;
	npkts += 1;
	nextrecord = m->m_nextpkt;

	/* Peel off the source-address mbuf first, if the protocol has one. */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
	}

	/* Then any control (ancillary) mbufs ahead of the data. */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
	}

	if (m->m_pkthdr.len == 0) {
		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
		    __func__, __LINE__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
		    m->m_type);
	}

	/*
	 * Loop to copy the mbufs of the current record
	 * Support zero length packets
	 */
	ml = NULL;
	pktlen = 0;
	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
		if (m->m_len == 0) {
			panic("%p m_len zero", m);
		}
		if (m->m_type == 0) {
			panic("%p m_type zero", m);
		}
		/*
		 * Clip to the residual length
		 */
		if (len > m->m_len) {
			len = m->m_len;
		}
		pktlen += len;
		/*
		 * Copy the mbufs via the uio or delay the copy
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (len > 0 && can_delay == 0) {
			socket_unlock(so, 0);
			error = uiomove(mtod(m, caddr_t), (int)len, auio);
			socket_lock(so, 0);
			if (error) {
				goto release;
			}
		} else {
			delayed_copy_len += len;
		}

		if (len == m->m_len) {
			/*
			 * m was entirely copied
			 */
			sbfree(&so->so_rcv, m);
			nextrecord = m->m_nextpkt;
			m->m_nextpkt = NULL;

			/*
			 * Set the first packet to the head of the free list
			 */
			if (free_list == NULL) {
				free_list = m;
			}
			/*
			 * Link current packet to tail of free list
			 */
			if (ml == NULL) {
				if (free_tail != NULL) {
					free_tail->m_nextpkt = m;
				}
				free_tail = m;
			}
			/*
			 * Link current mbuf to last mbuf of current packet
			 */
			if (ml != NULL) {
				ml->m_next = m;
			}
			ml = m;

			/*
			 * Move next buf to head of socket buffer
			 */
			so->so_rcv.sb_mb = m = ml->m_next;
			ml->m_next = NULL;

			if (m != NULL) {
				m->m_nextpkt = nextrecord;
				if (nextrecord == NULL) {
					so->so_rcv.sb_lastrecord = m;
				}
			} else {
				so->so_rcv.sb_mb = nextrecord;
				SB_EMPTY_FIXUP(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
		} else {
			/*
			 * Stop the loop on partial copy
			 */
			break;
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%llx ref=%d on socket",
		    __func__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
		/* NOTREACHED */
	}
#endif
	/*
	 * Tell the caller we made a partial copy
	 */
	if (m != NULL) {
		if (so->so_options & SO_DONTTRUNC) {
			/*
			 * Copyout first the freelist then the partial mbuf
			 */
			socket_unlock(so, 0);
			if (delayed_copy_len) {
				error = sodelayed_copy_list(so, msgarray,
				    uiocnt, &free_list, &delayed_copy_len);
			}

			if (error == 0) {
				error = uiomove(mtod(m, caddr_t), (int)len,
				    auio);
			}
			socket_lock(so, 0);
			if (error) {
				goto release;
			}

			/* Trim the partially-consumed mbuf in place. */
			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
			flags |= MSG_RCVMORE;
		} else {
			(void) sbdroprecord(&so->so_rcv);
			nextrecord = so->so_rcv.sb_mb;
			m = NULL;
			flags |= MSG_TRUNC;
		}
	}

	if (m == NULL) {
		so->so_rcv.sb_mb = nextrecord;
		/*
		 * First part is an inline SB_EMPTY_FIXUP().  Second
		 * part makes sure sb_lastrecord is up-to-date if
		 * there is still data in the socket buffer.
		 */
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_mbtail = NULL;
			so->so_rcv.sb_lastrecord = NULL;
		} else if (nextrecord->m_nextpkt == NULL) {
			so->so_rcv.sb_lastrecord = nextrecord;
		}
		SB_MB_CHECK(&so->so_rcv);
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	/*
	 * We can continue to the next packet as long as:
	 * - We haven't exhausted the uio array
	 * - There was no error
	 * - A packet was not truncated
	 * - We can still receive more data
	 */
	if (npkts < uiocnt && error == 0 &&
	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		goto next;
	}
	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	/* Flush any remaining batched data (socket is unlocked here). */
	if (delayed_copy_len) {
		error = sodelayed_copy_list(so, msgarray, uiocnt,
		    &free_list, &delayed_copy_len);
	}
out:
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}
	if (free_others != NULL) {
		m_freem_list(free_others);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4629 
4630 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4631 so_statistics_event_to_nstat_event(int64_t *input_options,
4632     uint64_t *nstat_event)
4633 {
4634 	int error = 0;
4635 	switch (*input_options) {
4636 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4637 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4638 		break;
4639 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4640 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4641 		break;
4642 #if (DEBUG || DEVELOPMENT)
4643 	case SO_STATISTICS_EVENT_RESERVED_1:
4644 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4645 		break;
4646 	case SO_STATISTICS_EVENT_RESERVED_2:
4647 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4648 		break;
4649 #endif /* (DEBUG || DEVELOPMENT) */
4650 	default:
4651 		error = EINVAL;
4652 		break;
4653 	}
4654 	return error;
4655 }
4656 
4657 /*
4658  * Returns:	0			Success
4659  *		EINVAL
4660  *		ENOTCONN
4661  *	<pru_shutdown>:EINVAL
4662  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4663  *	<pru_shutdown>:ENOBUFS[TCP]
4664  *	<pru_shutdown>:EMSGSIZE[TCP]
4665  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4666  *	<pru_shutdown>:ENETUNREACH[TCP]
4667  *	<pru_shutdown>:ENETDOWN[TCP]
4668  *	<pru_shutdown>:ENOMEM[TCP]
4669  *	<pru_shutdown>:EACCES[TCP]
4670  *	<pru_shutdown>:EMSGSIZE[TCP]
4671  *	<pru_shutdown>:ENOBUFS[TCP]
4672  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4673  *	<pru_shutdown>:???		[other protocol families]
4674  */
4675 int
soshutdown(struct socket * so,int how)4676 soshutdown(struct socket *so, int how)
4677 {
4678 	int error;
4679 
4680 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4681 
4682 	switch (how) {
4683 	case SHUT_RD:
4684 	case SHUT_WR:
4685 	case SHUT_RDWR:
4686 		socket_lock(so, 1);
4687 		if ((so->so_state &
4688 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4689 			error = ENOTCONN;
4690 		} else {
4691 			error = soshutdownlock(so, how);
4692 		}
4693 		socket_unlock(so, 1);
4694 		break;
4695 	default:
4696 		error = EINVAL;
4697 		break;
4698 	}
4699 
4700 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4701 
4702 	return error;
4703 }
4704 
4705 int
soshutdownlock_final(struct socket * so,int how)4706 soshutdownlock_final(struct socket *so, int how)
4707 {
4708 	struct protosw *pr = so->so_proto;
4709 	int error = 0;
4710 
4711 	sflt_notify(so, sock_evt_shutdown, &how);
4712 
4713 	if (how != SHUT_WR) {
4714 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4715 			/* read already shut down */
4716 			error = ENOTCONN;
4717 			goto done;
4718 		}
4719 		sorflush(so);
4720 	}
4721 	if (how != SHUT_RD) {
4722 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4723 			/* write already shut down */
4724 			error = ENOTCONN;
4725 			goto done;
4726 		}
4727 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4728 	}
4729 done:
4730 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4731 	return error;
4732 }
4733 
/*
 * Shutdown with the socket already locked.  A content filter may defer
 * the actual shutdown until it has processed pending data, in which
 * case we report success now and the filter completes it later.
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int rc = cfil_sock_shutdown(so, &how);

		if (rc == EJUSTRETURN) {
			/* Filter took ownership; shutdown happens later */
			return 0;
		}
		if (rc != 0) {
			return rc;
		}
		/* Filter may have narrowed "how"; fall through with it */
	}
#endif /* CONTENT_FILTER */

	return soshutdownlock_final(so, how);
}
4760 
4761 void
sowflush(struct socket * so)4762 sowflush(struct socket *so)
4763 {
4764 	struct sockbuf *sb = &so->so_snd;
4765 
4766 	/*
4767 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4768 	 * to prevent the socket buffer from being unexpectedly altered
4769 	 * while it is used by another thread in socket send/receive.
4770 	 *
4771 	 * sblock() must not fail here, hence the assertion.
4772 	 */
4773 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4774 	VERIFY(sb->sb_flags & SB_LOCK);
4775 
4776 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4777 	sb->sb_flags            |= SB_DROP;
4778 	sb->sb_upcall           = NULL;
4779 	sb->sb_upcallarg        = NULL;
4780 
4781 	sbunlock(sb, TRUE);     /* keep socket locked */
4782 
4783 	selthreadclear(&sb->sb_sel);
4784 	sbrelease(sb);
4785 }
4786 
/*
 * Flush and tear down the receive side of a socket.
 *
 * Marks the socket as unable to receive more data, then moves the
 * receive sockbuf's contents into a local copy ("asb") so the original
 * can be reset under SB_LOCK while the queued mbufs (and any rights
 * they carry, via dom_dispose) are released afterwards from the copy.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/* Let socket filters observe the read-side flush */
	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc               = sb->sb_cc;
	asb.sb_hiwat            = sb->sb_hiwat;
	asb.sb_mbcnt            = sb->sb_mbcnt;
	asb.sb_mbmax            = sb->sb_mbmax;
	asb.sb_ctl              = sb->sb_ctl;
	asb.sb_lowat            = sb->sb_lowat;
	asb.sb_mb               = sb->sb_mb;
	asb.sb_mbtail           = sb->sb_mbtail;
	asb.sb_lastrecord       = sb->sb_lastrecord;
	asb.sb_so               = sb->sb_so;
	asb.sb_flags            = sb->sb_flags;
	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags            |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc               = 0;
	sb->sb_hiwat            = 0;
	sb->sb_mbcnt            = 0;
	sb->sb_mbmax            = 0;
	sb->sb_ctl              = 0;
	sb->sb_lowat            = 0;
	sb->sb_mb               = NULL;
	sb->sb_mbtail           = NULL;
	sb->sb_lastrecord       = NULL;
	sb->sb_timeo.tv_sec     = 0;
	sb->sb_timeo.tv_usec    = 0;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Dispose of any access rights (e.g. passed fds) in queued data */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4887 
4888 /*
4889  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4890  * an additional variant to handle the case where the option value needs
4891  * to be some kind of integer, but not a specific size.
4892  * In addition to their use here, these functions are also called by the
4893  * protocol-level pr_ctloutput() routines.
4894  *
4895  * Returns:	0			Success
4896  *		EINVAL
4897  *	copyin:EFAULT
4898  */
4899 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4900 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4901 {
4902 	size_t  valsize;
4903 
4904 	/*
4905 	 * If the user gives us more than we wanted, we ignore it,
4906 	 * but if we don't get the minimum length the caller
4907 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4908 	 * is set to however much we actually retrieved.
4909 	 */
4910 	if ((valsize = sopt->sopt_valsize) < minlen) {
4911 		return EINVAL;
4912 	}
4913 	if (valsize > len) {
4914 		sopt->sopt_valsize = valsize = len;
4915 	}
4916 
4917 	if (sopt->sopt_p != kernproc) {
4918 		return copyin(sopt->sopt_val, buf, valsize);
4919 	}
4920 
4921 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4922 	return 0;
4923 }
4924 
4925 /*
4926  * sooptcopyin_timeval
4927  *   Copy in a timeval value into tv_p, and take into account whether the
4928  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4929  *   code here so that we can verify the 64-bit tv_sec value before we lose
4930  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4931  */
4932 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4933 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4934 {
4935 	int                     error;
4936 
4937 	if (proc_is64bit(sopt->sopt_p)) {
4938 		struct user64_timeval   tv64;
4939 
4940 		if (sopt->sopt_valsize < sizeof(tv64)) {
4941 			return EINVAL;
4942 		}
4943 
4944 		sopt->sopt_valsize = sizeof(tv64);
4945 		if (sopt->sopt_p != kernproc) {
4946 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4947 			if (error != 0) {
4948 				return error;
4949 			}
4950 		} else {
4951 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4952 			    sizeof(tv64));
4953 		}
4954 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4955 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4956 			return EDOM;
4957 		}
4958 
4959 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4960 		tv_p->tv_usec = tv64.tv_usec;
4961 	} else {
4962 		struct user32_timeval   tv32;
4963 
4964 		if (sopt->sopt_valsize < sizeof(tv32)) {
4965 			return EINVAL;
4966 		}
4967 
4968 		sopt->sopt_valsize = sizeof(tv32);
4969 		if (sopt->sopt_p != kernproc) {
4970 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4971 			if (error != 0) {
4972 				return error;
4973 			}
4974 		} else {
4975 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4976 			    sizeof(tv32));
4977 		}
4978 #ifndef __LP64__
4979 		/*
4980 		 * K64todo "comparison is always false due to
4981 		 * limited range of data type"
4982 		 */
4983 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4984 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4985 			return EDOM;
4986 		}
4987 #endif
4988 		tv_p->tv_sec = tv32.tv_sec;
4989 		tv_p->tv_usec = tv32.tv_usec;
4990 	}
4991 	return 0;
4992 }
4993 
4994 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4995 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4996     boolean_t ignore_delegate)
4997 {
4998 	kauth_cred_t cred =  NULL;
4999 	proc_t ep = PROC_NULL;
5000 	uid_t uid;
5001 	int error = 0;
5002 
5003 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
5004 		ep = proc_find(so->e_pid);
5005 		if (ep) {
5006 			cred = kauth_cred_proc_ref(ep);
5007 		}
5008 	}
5009 
5010 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5011 
5012 	/* uid is 0 for root */
5013 	if (uid != 0 || !allow_root) {
5014 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5015 	}
5016 	if (cred) {
5017 		kauth_cred_unref(&cred);
5018 	}
5019 	if (ep != PROC_NULL) {
5020 		proc_rele(ep);
5021 	}
5022 
5023 	return error;
5024 }
5025 
5026 /*
5027  * Returns:	0			Success
5028  *		EINVAL
5029  *		ENOPROTOOPT
5030  *		ENOBUFS
5031  *		EDOM
5032  *	sooptcopyin:EINVAL
5033  *	sooptcopyin:EFAULT
5034  *	sooptcopyin_timeval:EINVAL
5035  *	sooptcopyin_timeval:EFAULT
5036  *	sooptcopyin_timeval:EDOM
5037  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5038  *	<pr_ctloutput>:???w
5039  *	sflt_attach_private:???		[whatever a filter author chooses]
5040  *	<sf_setoption>:???		[whatever a filter author chooses]
5041  *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
 *		their filter to return.
5045  */
/*
 * Handle setsockopt(2) processing for a socket.  When dolock is non-zero,
 * this routine acquires and releases the socket lock itself; otherwise the
 * caller must already hold it.
 */
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int     error, optval;
	int64_t long_optval;
	struct  linger l;
	struct  timeval tv;

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	/*
	 * Attached socket filters get first crack at the option; a filter
	 * returning EJUSTRETURN means it handled the option successfully.
	 */
	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		/* Non-socket-level options are delegated to the protocol */
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC: {
			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
			if (error != 0) {
				goto out;
			}
			/* Make sure to use sane values */
			if (sopt->sopt_name == SO_LINGER) {
				so->so_linger = (short)l.l_linger;
			} else {
				/* SO_LINGER_SEC is in seconds; convert to ticks */
				so->so_linger = (short)((long)l.l_linger * hz);
			}
			if (l.l_onoff != 0) {
				so->so_options |= SO_LINGER;
			} else {
				so->so_options &= ~SO_LINGER;
			}
			break;
		}
		/*
		 * Simple boolean options whose bit value is mirrored
		 * directly into so_options.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval) {
				so->so_options |= sopt->sopt_name;
			} else {
				so->so_options &= ~sopt->sopt_name;
			}
#if SKYWALK
			inp_update_netns_flags(so);
#endif /* SKYWALK */
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				/* A user-set size disables buffer auto-sizing */
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				uint32_t hiwat = so->so_snd.sb_hiwat;

				/* For AF_UNIX, account for the connected peer's receive side */
				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						struct socket *so2 = unp->unp_conn->unp_socket;
						hiwat += unp->unp_conn->unp_cc;
						space = sbspace(&so2->so_rcv);
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ?
				    hiwat : optval;

				/* Writers may already be unblocked by the new low-water mark */
				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				if (so->so_rcv.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						struct socket *so2 = unp->unp_conn->unp_socket;
						data_len = so2->so_snd.sb_cc
						    - so2->so_snd.sb_ctl;
					} else {
						data_len = so->so_rcv.sb_cc
						    - so->so_rcv.sb_ctl;
					}
				} else {
					data_len = so->so_rcv.sb_cc
					    - so->so_rcv.sb_ctl;
				}

				/* Readers may already be satisfiable under the new mark */
				if (data_len >= so->so_rcv.sb_lowat) {
					sorwakeup(so);
				}
				break;
			}
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0) {
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			/* Attach a socket filter (NKE) by handle */
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof(nke),
			    sizeof(nke));
			if (error != 0) {
				goto out;
			}

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}

		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOSIGPIPE;
			} else {
				so->so_flags &= ~SOF_NOSIGPIPE;
			}
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOADDRAVAIL;
			} else {
				so->so_flags &= ~SOF_NOADDRAVAIL;
			}
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_REUSESHAREUID;
			} else {
				so->so_flags &= ~SOF_REUSESHAREUID;
			}
			break;

		case SO_NOTIFYCONFLICT:
			/* Restricted to superuser */
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOTIFYCONFLICT;
			} else {
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			}
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			error = so_set_restrictions(so, optval);
			break;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) != PF_INET &&
			    SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			/* Enabling requires the restricted-AWDL privilege; disabling does not */
			if (optval != 0) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_AWDL, false, false);
				if (error == 0) {
					inp_set_awdl_unrestricted(
						sotoinpcb(so));
				}
			} else {
				inp_clear_awdl_unrestricted(sotoinpcb(so));
			}
			break;
		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			/* Privilege check only when transitioning from disallowed to allowed */
			if (optval != 0 &&
			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
				if (error == 0) {
					inp_set_intcoproc_allowed(
						sotoinpcb(so));
				}
			} else if (optval == 0) {
				inp_clear_intcoproc_allowed(sotoinpcb(so));
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			} else {
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			}
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_BINDRANDOMPORT;
			} else {
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			}
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
			    sizeof(sonpx));
			if (error != 0) {
				goto out;
			}
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				} else {
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
				}
			}
			break;
		}

		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			/* Values >= SO_TC_NET_SERVICE_OFFSET encode a net service type */
			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
				error = so_set_net_service_type(so, netsvc);
				goto out;
			}
			error = so_set_traffic_class(so, optval);
			if (error != 0) {
				goto out;
			}
			/* An explicit traffic class clears any net service type */
			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			} else {
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			}
			break;
		}

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
			if (error != 0) {
				goto out;
			}
			error = so_set_tcdbg(so, &so_tcdbg);
			if (error != 0) {
				goto out;
			}
			break;
		}
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0) {
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			} else {
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			}
			break;

#if (DEVELOPMENT || DEBUG)
		case SO_DEFUNCTIT:
			/* Test hook: defunct the socket immediately */
			error = sosetdefunct(current_proc(), so, 0, FALSE);
			if (error == 0) {
				error = sodefunct(current_proc(), so, 0);
			}

			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0) {
					error = EBADF;
				}
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval) {
				so->so_flags &= ~SOF_NODEFUNCT;
			} else {
				so->so_flags |= SOF_NODEFUNCT;
			}

			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				/*
				 * NOTE(review): the in6p_lport/in6p_fport
				 * fields appear to be used for both IPv4 and
				 * IPv6 here — presumably they alias the inp_*
				 * port fields; confirm against struct inpcb.
				 */
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
				    "[%s %s:%d -> %s:%d] is now marked "
				    "as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    so->so_gencnt,
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof(s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof(d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			} else {
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
				    "is now marked as %seligible for "
				    "defunct\n",
				    __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			}
			break;

		case SO_ISDEFUNCT:
			/* This option is not settable */
			error = EINVAL;
			break;

		case SO_OPPORTUNISTIC:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_opportunistic(so, optval);
			}
			break;

		case SO_FLUSH:
			/* This option is handled by lower layer(s) */
			error = 0;
			break;

		case SO_RECV_ANYIF:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_recv_anyif(so, optval);
			}
			break;

		case SO_TRAFFIC_MGT_BACKGROUND: {
			/* This option is handled by lower layer(s) */
			error = 0;
			break;
		}

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_set(so, sopt);
			break;
#endif  /* FLOW_DIVERT */


		case SO_DELEGATED:
			/* Delegate the socket's effective identity by pid */
			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval))) != 0) {
				break;
			}

			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
			break;

		case SO_DELEGATED_UUID: {
			/* Delegate the socket's effective identity by UUID */
			uuid_t euuid;

			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
			    sizeof(euuid))) != 0) {
				break;
			}

			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
			break;
		}

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				// Clear out the old client UUID if present
				necp_inpcb_remove_cb(inp);
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			pid_t current_pid = proc_pid(current_proc());
			error = necp_client_register_socket_flow(current_pid,
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				// Registration failed; do not keep a stale UUID
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			if (inp->inp_lport != 0) {
				// There is a bound local port, so this is not
				// a fresh socket. Assign to the client.
				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
			}

			break;
		}
		case SO_NECP_LISTENUUID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = necp_client_register_socket_listener(proc_pid(current_proc()),
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			// Mark that the port registration is held by NECP
			inp->inp_flags2 |= INP2_EXTERNAL_PORT;

			break;
		}

		case SO_RESOLVER_SIGNATURE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
			break;
		}
#endif /* NECP */

		case SO_EXTENDED_BK_IDLE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_extended_bk_idle(so, optval);
			}
			break;

		case SO_MARK_CELLFALLBACK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_CELLFALLBACK;
			} else {
				so->so_flags1 |= SOF1_CELLFALLBACK;
			}
			break;

		case SO_MARK_CELLFALLBACK_UUID:
		{
			/* Per-flow cell-fallback override, keyed by flow UUID */
			struct so_mark_cellfallback_uuid_args args;

			error = sooptcopyin(sopt, &args, sizeof(args),
			    sizeof(args));
			if (error != 0) {
				goto out;
			}
			error = nstat_userland_mark_rnf_override(args.flow_uuid,
			    args.flow_cellfallback);
			break;
		}

		case SO_FALLBACK_MODE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			/* Valid values are bounded by the NONE..PREFER range */
			if (optval < SO_FALLBACK_MODE_NONE ||
			    optval > SO_FALLBACK_MODE_PREFER) {
				error = EINVAL;
				goto out;
			}
			so->so_fallback_mode = (u_int8_t)optval;
			break;

		case SO_MARK_KNOWN_TRACKER: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
			} else {
				so->so_flags1 |= SOF1_KNOWN_TRACKER;
			}
			break;
		}

		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
			} else {
				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
			}
			break;
		}

		case SO_MARK_APPROVED_APP_DOMAIN: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
			} else {
				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
			}
			break;
		}

		case SO_STATISTICS_EVENT:
			error = sooptcopyin(sopt, &long_optval,
			    sizeof(long_optval), sizeof(long_optval));
			if (error != 0) {
				goto out;
			}
			/* Translate the userland event to an nstat event before dispatch */
			u_int64_t nstat_event = 0;
			error = so_statistics_event_to_nstat_event(
				&long_optval, &nstat_event);
			if (error != 0) {
				goto out;
			}
			nstat_pcb_event(sotoinpcb(so), nstat_event);
			break;

		case SO_NET_SERVICE_TYPE: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			error = so_set_net_service_type(so, optval);
			break;
		}

		case SO_QOSMARKING_POLICY_OVERRIDE:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
			if (error != 0) {
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
			} else {
				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
			}
			break;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			error = sooptcopyin(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
			if (error != 0) {
				goto out;
			}
			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;

			/* A null UUID plus zero proto means the info is cleared */
			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
			} else {
				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
			}
			break;
		}
		case SO_WANT_KEV_SOCKET_CLOSED: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
			} else {
				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
			}
			break;
		}
		case SO_MARK_WAKE_PKT: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_MARK_WAKE_PKT;
			} else {
				so->so_flags |= SOF_MARK_WAKE_PKT;
			}
			break;
		}
		case SO_RECV_WAKE_PKT: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_RECV_WAKE_PKT;
			} else {
				so->so_flags |= SOF_RECV_WAKE_PKT;
			}
			break;
		}
		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * Let the protocol observe successfully-set socket-level
		 * options; its return value is intentionally ignored.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
5967 
5968 /* Helper routines for getsockopt */
5969 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5970 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5971 {
5972 	int     error;
5973 	size_t  valsize;
5974 
5975 	error = 0;
5976 
5977 	/*
5978 	 * Documented get behavior is that we always return a value,
5979 	 * possibly truncated to fit in the user's buffer.
5980 	 * Traditional behavior is that we always tell the user
5981 	 * precisely how much we copied, rather than something useful
5982 	 * like the total amount we had available for her.
5983 	 * Note that this interface is not idempotent; the entire answer must
5984 	 * generated ahead of time.
5985 	 */
5986 	valsize = MIN(len, sopt->sopt_valsize);
5987 	sopt->sopt_valsize = valsize;
5988 	if (sopt->sopt_val != USER_ADDR_NULL) {
5989 		if (sopt->sopt_p != kernproc) {
5990 			error = copyout(buf, sopt->sopt_val, valsize);
5991 		} else {
5992 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5993 		}
5994 	}
5995 	return error;
5996 }
5997 
5998 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5999 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
6000 {
6001 	int                     error;
6002 	size_t                  len;
6003 	struct user64_timeval   tv64 = {};
6004 	struct user32_timeval   tv32 = {};
6005 	const void *            val;
6006 	size_t                  valsize;
6007 
6008 	error = 0;
6009 	if (proc_is64bit(sopt->sopt_p)) {
6010 		len = sizeof(tv64);
6011 		tv64.tv_sec = tv_p->tv_sec;
6012 		tv64.tv_usec = tv_p->tv_usec;
6013 		val = &tv64;
6014 	} else {
6015 		len = sizeof(tv32);
6016 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
6017 		tv32.tv_usec = tv_p->tv_usec;
6018 		val = &tv32;
6019 	}
6020 	valsize = MIN(len, sopt->sopt_valsize);
6021 	sopt->sopt_valsize = valsize;
6022 	if (sopt->sopt_val != USER_ADDR_NULL) {
6023 		if (sopt->sopt_p != kernproc) {
6024 			error = copyout(val, sopt->sopt_val, valsize);
6025 		} else {
6026 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6027 		}
6028 	}
6029 	return error;
6030 }
6031 
6032 /*
6033  * Return:	0			Success
6034  *		ENOPROTOOPT
6035  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
6036  *	<pr_ctloutput>:???
6037  *	<sf_getoption>:???
6038  */
6039 int
sogetoptlock(struct socket * so,struct sockopt * sopt,int dolock)6040 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
6041 {
6042 	int     error, optval;
6043 	struct  linger l;
6044 	struct  timeval tv;
6045 
6046 	if (sopt->sopt_dir != SOPT_GET) {
6047 		sopt->sopt_dir = SOPT_GET;
6048 	}
6049 
6050 	if (dolock) {
6051 		socket_lock(so, 1);
6052 	}
6053 
6054 	error = sflt_getsockopt(so, sopt);
6055 	if (error != 0) {
6056 		if (error == EJUSTRETURN) {
6057 			error = 0;
6058 		}
6059 		goto out;
6060 	}
6061 
6062 	if (sopt->sopt_level != SOL_SOCKET) {
6063 		if (so->so_proto != NULL &&
6064 		    so->so_proto->pr_ctloutput != NULL) {
6065 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
6066 			goto out;
6067 		}
6068 		error = ENOPROTOOPT;
6069 	} else {
6070 		/*
6071 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
6072 		 * the protocol layer, if needed.  A zero value returned from
6073 		 * the handler means use default socket-level processing as
6074 		 * done by the rest of this routine.  Otherwise, any other
6075 		 * return value indicates that the option is unsupported.
6076 		 */
6077 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6078 		    pru_socheckopt(so, sopt)) != 0) {
6079 			goto out;
6080 		}
6081 
6082 		error = 0;
6083 		switch (sopt->sopt_name) {
6084 		case SO_LINGER:
6085 		case SO_LINGER_SEC:
6086 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6087 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6088 			    so->so_linger : so->so_linger / hz;
6089 			error = sooptcopyout(sopt, &l, sizeof(l));
6090 			break;
6091 
6092 		case SO_USELOOPBACK:
6093 		case SO_DONTROUTE:
6094 		case SO_DEBUG:
6095 		case SO_KEEPALIVE:
6096 		case SO_REUSEADDR:
6097 		case SO_REUSEPORT:
6098 		case SO_BROADCAST:
6099 		case SO_OOBINLINE:
6100 		case SO_TIMESTAMP:
6101 		case SO_TIMESTAMP_MONOTONIC:
6102 		case SO_TIMESTAMP_CONTINUOUS:
6103 		case SO_DONTTRUNC:
6104 		case SO_WANTMORE:
6105 		case SO_WANTOOBFLAG:
6106 		case SO_NOWAKEFROMSLEEP:
6107 		case SO_NOAPNFALLBK:
6108 			optval = so->so_options & sopt->sopt_name;
6109 integer:
6110 			error = sooptcopyout(sopt, &optval, sizeof(optval));
6111 			break;
6112 
6113 		case SO_TYPE:
6114 			optval = so->so_type;
6115 			goto integer;
6116 
6117 		case SO_NREAD:
6118 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6119 				int pkt_total;
6120 				struct mbuf *m1;
6121 
6122 				pkt_total = 0;
6123 				m1 = so->so_rcv.sb_mb;
6124 				while (m1 != NULL) {
6125 					if (m1->m_type == MT_DATA ||
6126 					    m1->m_type == MT_HEADER ||
6127 					    m1->m_type == MT_OOBDATA) {
6128 						pkt_total += m1->m_len;
6129 					}
6130 					m1 = m1->m_next;
6131 				}
6132 				optval = pkt_total;
6133 			} else {
6134 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6135 			}
6136 			goto integer;
6137 
6138 		case SO_NUMRCVPKT:
6139 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6140 				int cnt = 0;
6141 				struct mbuf *m1;
6142 
6143 				m1 = so->so_rcv.sb_mb;
6144 				while (m1 != NULL) {
6145 					cnt += 1;
6146 					m1 = m1->m_nextpkt;
6147 				}
6148 				optval = cnt;
6149 				goto integer;
6150 			} else {
6151 				error = ENOPROTOOPT;
6152 				break;
6153 			}
6154 
6155 		case SO_NWRITE:
6156 			optval = so->so_snd.sb_cc;
6157 			goto integer;
6158 
6159 		case SO_ERROR:
6160 			optval = so->so_error;
6161 			so->so_error = 0;
6162 			goto integer;
6163 
6164 		case SO_SNDBUF: {
6165 			u_int32_t hiwat = so->so_snd.sb_hiwat;
6166 
6167 			if (so->so_snd.sb_flags & SB_UNIX) {
6168 				struct unpcb *unp =
6169 				    (struct unpcb *)(so->so_pcb);
6170 				if (unp != NULL && unp->unp_conn != NULL) {
6171 					hiwat += unp->unp_conn->unp_cc;
6172 				}
6173 			}
6174 
6175 			optval = hiwat;
6176 			goto integer;
6177 		}
6178 		case SO_RCVBUF:
6179 			optval = so->so_rcv.sb_hiwat;
6180 			goto integer;
6181 
6182 		case SO_SNDLOWAT:
6183 			optval = so->so_snd.sb_lowat;
6184 			goto integer;
6185 
6186 		case SO_RCVLOWAT:
6187 			optval = so->so_rcv.sb_lowat;
6188 			goto integer;
6189 
6190 		case SO_SNDTIMEO:
6191 		case SO_RCVTIMEO:
6192 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
6193 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6194 
6195 			error = sooptcopyout_timeval(sopt, &tv);
6196 			break;
6197 
6198 		case SO_NOSIGPIPE:
6199 			optval = (so->so_flags & SOF_NOSIGPIPE);
6200 			goto integer;
6201 
6202 		case SO_NOADDRERR:
6203 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6204 			goto integer;
6205 
6206 		case SO_REUSESHAREUID:
6207 			optval = (so->so_flags & SOF_REUSESHAREUID);
6208 			goto integer;
6209 
6210 
6211 		case SO_NOTIFYCONFLICT:
6212 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6213 			goto integer;
6214 
6215 		case SO_RESTRICTIONS:
6216 			optval = so_get_restrictions(so);
6217 			goto integer;
6218 
6219 		case SO_AWDL_UNRESTRICTED:
6220 			if (SOCK_DOM(so) == PF_INET ||
6221 			    SOCK_DOM(so) == PF_INET6) {
6222 				optval = inp_get_awdl_unrestricted(
6223 					sotoinpcb(so));
6224 				goto integer;
6225 			} else {
6226 				error = EOPNOTSUPP;
6227 			}
6228 			break;
6229 
6230 		case SO_INTCOPROC_ALLOW:
6231 			if (SOCK_DOM(so) == PF_INET6) {
6232 				optval = inp_get_intcoproc_allowed(
6233 					sotoinpcb(so));
6234 				goto integer;
6235 			} else {
6236 				error = EOPNOTSUPP;
6237 			}
6238 			break;
6239 
6240 		case SO_LABEL:
6241 			error = EOPNOTSUPP;
6242 			break;
6243 
6244 		case SO_PEERLABEL:
6245 			error = EOPNOTSUPP;
6246 			break;
6247 
6248 #ifdef __APPLE_API_PRIVATE
6249 		case SO_UPCALLCLOSEWAIT:
6250 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6251 			goto integer;
6252 #endif
6253 		case SO_RANDOMPORT:
6254 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6255 			goto integer;
6256 
6257 		case SO_NP_EXTENSIONS: {
6258 			struct so_np_extensions sonpx = {};
6259 
6260 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6261 			    SONPX_SETOPTSHUT : 0;
6262 			sonpx.npx_mask = SONPX_MASK_VALID;
6263 
6264 			error = sooptcopyout(sopt, &sonpx,
6265 			    sizeof(struct so_np_extensions));
6266 			break;
6267 		}
6268 
6269 		case SO_TRAFFIC_CLASS:
6270 			optval = so->so_traffic_class;
6271 			goto integer;
6272 
6273 		case SO_RECV_TRAFFIC_CLASS:
6274 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6275 			goto integer;
6276 
6277 #if (DEVELOPMENT || DEBUG)
6278 		case SO_TRAFFIC_CLASS_DBG:
6279 			error = sogetopt_tcdbg(so, sopt);
6280 			break;
6281 #endif /* (DEVELOPMENT || DEBUG) */
6282 
6283 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6284 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6285 			goto integer;
6286 
6287 		case SO_DEFUNCTOK:
6288 			optval = !(so->so_flags & SOF_NODEFUNCT);
6289 			goto integer;
6290 
6291 		case SO_ISDEFUNCT:
6292 			optval = (so->so_flags & SOF_DEFUNCT);
6293 			goto integer;
6294 
6295 		case SO_OPPORTUNISTIC:
6296 			optval = so_get_opportunistic(so);
6297 			goto integer;
6298 
6299 		case SO_FLUSH:
6300 			/* This option is not gettable */
6301 			error = EINVAL;
6302 			break;
6303 
6304 		case SO_RECV_ANYIF:
6305 			optval = so_get_recv_anyif(so);
6306 			goto integer;
6307 
6308 		case SO_TRAFFIC_MGT_BACKGROUND:
6309 			/* This option is handled by lower layer(s) */
6310 			if (so->so_proto != NULL &&
6311 			    so->so_proto->pr_ctloutput != NULL) {
6312 				(void) so->so_proto->pr_ctloutput(so, sopt);
6313 			}
6314 			break;
6315 
6316 #if FLOW_DIVERT
6317 		case SO_FLOW_DIVERT_TOKEN:
6318 			error = flow_divert_token_get(so, sopt);
6319 			break;
6320 #endif  /* FLOW_DIVERT */
6321 
6322 #if NECP
6323 		case SO_NECP_ATTRIBUTES:
6324 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6325 				/* Handled by MPTCP itself */
6326 				break;
6327 			}
6328 
6329 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6330 				error = EINVAL;
6331 				goto out;
6332 			}
6333 
6334 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6335 			break;
6336 
6337 		case SO_NECP_CLIENTUUID: {
6338 			uuid_t *ncu;
6339 
6340 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6341 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6342 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6343 				ncu = &sotoinpcb(so)->necp_client_uuid;
6344 			} else {
6345 				error = EINVAL;
6346 				goto out;
6347 			}
6348 
6349 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6350 			break;
6351 		}
6352 
6353 		case SO_NECP_LISTENUUID: {
6354 			uuid_t *nlu;
6355 
6356 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6357 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6358 					nlu = &sotoinpcb(so)->necp_client_uuid;
6359 				} else {
6360 					error = ENOENT;
6361 					goto out;
6362 				}
6363 			} else {
6364 				error = EINVAL;
6365 				goto out;
6366 			}
6367 
6368 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6369 			break;
6370 		}
6371 
6372 		case SO_RESOLVER_SIGNATURE: {
6373 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6374 				error = EINVAL;
6375 				goto out;
6376 			}
6377 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6378 			break;
6379 		}
6380 
6381 #endif /* NECP */
6382 
6383 #if CONTENT_FILTER
6384 		case SO_CFIL_SOCK_ID: {
6385 			cfil_sock_id_t sock_id;
6386 
6387 			sock_id = cfil_sock_id_from_socket(so);
6388 
6389 			error = sooptcopyout(sopt, &sock_id,
6390 			    sizeof(cfil_sock_id_t));
6391 			break;
6392 		}
6393 #endif  /* CONTENT_FILTER */
6394 
6395 		case SO_EXTENDED_BK_IDLE:
6396 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6397 			goto integer;
6398 		case SO_MARK_CELLFALLBACK:
6399 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6400 			    ? 1 : 0;
6401 			goto integer;
6402 		case SO_FALLBACK_MODE:
6403 			optval = so->so_fallback_mode;
6404 			goto integer;
6405 		case SO_MARK_KNOWN_TRACKER: {
6406 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6407 			    ? 1 : 0;
6408 			goto integer;
6409 		}
6410 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6411 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6412 			    ? 1 : 0;
6413 			goto integer;
6414 		}
6415 		case SO_MARK_APPROVED_APP_DOMAIN: {
6416 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6417 			    ? 1 : 0;
6418 			goto integer;
6419 		}
6420 		case SO_NET_SERVICE_TYPE: {
6421 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6422 				optval = so->so_netsvctype;
6423 			} else {
6424 				optval = NET_SERVICE_TYPE_BE;
6425 			}
6426 			goto integer;
6427 		}
6428 		case SO_NETSVC_MARKING_LEVEL:
6429 			optval = so_get_netsvc_marking_level(so);
6430 			goto integer;
6431 
6432 		case SO_MPKL_SEND_INFO: {
6433 			struct so_mpkl_send_info so_mpkl_send_info;
6434 
6435 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6436 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6437 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6438 			    sizeof(struct so_mpkl_send_info));
6439 			break;
6440 		}
6441 		case SO_MARK_WAKE_PKT:
6442 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6443 			goto integer;
6444 		case SO_RECV_WAKE_PKT:
6445 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6446 			goto integer;
6447 		default:
6448 			error = ENOPROTOOPT;
6449 			break;
6450 		}
6451 	}
6452 out:
6453 	if (dolock) {
6454 		socket_unlock(so, 1);
6455 	}
6456 	return error;
6457 }
6458 
6459 /*
6460  * The size limits on our soopt_getm is different from that on FreeBSD.
6461  * We limit the size of options to MCLBYTES. This will have to change
6462  * if we need to define options that need more space than MCLBYTES.
6463  */
6464 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6465 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6466 {
6467 	struct mbuf *m, *m_prev;
6468 	int sopt_size = (int)sopt->sopt_valsize;
6469 	int how;
6470 
6471 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6472 		return EMSGSIZE;
6473 	}
6474 
6475 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6476 	MGET(m, how, MT_DATA);
6477 	if (m == NULL) {
6478 		return ENOBUFS;
6479 	}
6480 	if (sopt_size > MLEN) {
6481 		MCLGET(m, how);
6482 		if ((m->m_flags & M_EXT) == 0) {
6483 			m_free(m);
6484 			return ENOBUFS;
6485 		}
6486 		m->m_len = min(MCLBYTES, sopt_size);
6487 	} else {
6488 		m->m_len = min(MLEN, sopt_size);
6489 	}
6490 	sopt_size -= m->m_len;
6491 	*mp = m;
6492 	m_prev = m;
6493 
6494 	while (sopt_size > 0) {
6495 		MGET(m, how, MT_DATA);
6496 		if (m == NULL) {
6497 			m_freem(*mp);
6498 			return ENOBUFS;
6499 		}
6500 		if (sopt_size > MLEN) {
6501 			MCLGET(m, how);
6502 			if ((m->m_flags & M_EXT) == 0) {
6503 				m_freem(*mp);
6504 				m_freem(m);
6505 				return ENOBUFS;
6506 			}
6507 			m->m_len = min(MCLBYTES, sopt_size);
6508 		} else {
6509 			m->m_len = min(MLEN, sopt_size);
6510 		}
6511 		sopt_size -= m->m_len;
6512 		m_prev->m_next = m;
6513 		m_prev = m;
6514 	}
6515 	return 0;
6516 }
6517 
6518 /* copyin sopt data into mbuf chain */
6519 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6520 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6521 {
6522 	struct mbuf *m0 = m;
6523 
6524 	if (sopt->sopt_val == USER_ADDR_NULL) {
6525 		return 0;
6526 	}
6527 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6528 		if (sopt->sopt_p != kernproc) {
6529 			int error;
6530 
6531 			error = copyin(sopt->sopt_val, mtod(m, char *),
6532 			    m->m_len);
6533 			if (error != 0) {
6534 				m_freem(m0);
6535 				return error;
6536 			}
6537 		} else {
6538 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6539 			    mtod(m, char *), m->m_len);
6540 		}
6541 		sopt->sopt_valsize -= m->m_len;
6542 		sopt->sopt_val += m->m_len;
6543 		m = m->m_next;
6544 	}
6545 	/* should be allocated enoughly at ip6_sooptmcopyin() */
6546 	if (m != NULL) {
6547 		panic("soopt_mcopyin");
6548 		/* NOTREACHED */
6549 	}
6550 	return 0;
6551 }
6552 
6553 /* copyout mbuf chain data into soopt */
6554 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6555 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6556 {
6557 	struct mbuf *m0 = m;
6558 	size_t valsize = 0;
6559 
6560 	if (sopt->sopt_val == USER_ADDR_NULL) {
6561 		return 0;
6562 	}
6563 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6564 		if (sopt->sopt_p != kernproc) {
6565 			int error;
6566 
6567 			error = copyout(mtod(m, char *), sopt->sopt_val,
6568 			    m->m_len);
6569 			if (error != 0) {
6570 				m_freem(m0);
6571 				return error;
6572 			}
6573 		} else {
6574 			bcopy(mtod(m, char *),
6575 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6576 		}
6577 		sopt->sopt_valsize -= m->m_len;
6578 		sopt->sopt_val += m->m_len;
6579 		valsize += m->m_len;
6580 		m = m->m_next;
6581 	}
6582 	if (m != NULL) {
6583 		/* enough soopt buffer should be given from user-land */
6584 		m_freem(m0);
6585 		return EINVAL;
6586 	}
6587 	sopt->sopt_valsize = valsize;
6588 	return 0;
6589 }
6590 
6591 void
sohasoutofband(struct socket * so)6592 sohasoutofband(struct socket *so)
6593 {
6594 	if (so->so_pgid < 0) {
6595 		gsignal(-so->so_pgid, SIGURG);
6596 	} else if (so->so_pgid > 0) {
6597 		proc_signal(so->so_pgid, SIGURG);
6598 	}
6599 	selwakeup(&so->so_rcv.sb_sel);
6600 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6601 		KNOTE(&so->so_rcv.sb_sel.si_note,
6602 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6603 	}
6604 }
6605 
6606 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6607 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6608 {
6609 #pragma unused(cred)
6610 	struct proc *p = current_proc();
6611 	int revents = 0;
6612 
6613 	socket_lock(so, 1);
6614 	so_update_last_owner_locked(so, PROC_NULL);
6615 	so_update_policy(so);
6616 
6617 	if (events & (POLLIN | POLLRDNORM)) {
6618 		if (soreadable(so)) {
6619 			revents |= events & (POLLIN | POLLRDNORM);
6620 		}
6621 	}
6622 
6623 	if (events & (POLLOUT | POLLWRNORM)) {
6624 		if (sowriteable(so)) {
6625 			revents |= events & (POLLOUT | POLLWRNORM);
6626 		}
6627 	}
6628 
6629 	if (events & (POLLPRI | POLLRDBAND)) {
6630 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6631 			revents |= events & (POLLPRI | POLLRDBAND);
6632 		}
6633 	}
6634 
6635 	if (revents == 0) {
6636 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6637 			/*
6638 			 * Darwin sets the flag first,
6639 			 * BSD calls selrecord first
6640 			 */
6641 			so->so_rcv.sb_flags |= SB_SEL;
6642 			selrecord(p, &so->so_rcv.sb_sel, wql);
6643 		}
6644 
6645 		if (events & (POLLOUT | POLLWRNORM)) {
6646 			/*
6647 			 * Darwin sets the flag first,
6648 			 * BSD calls selrecord first
6649 			 */
6650 			so->so_snd.sb_flags |= SB_SEL;
6651 			selrecord(p, &so->so_snd.sb_sel, wql);
6652 		}
6653 	}
6654 
6655 	socket_unlock(so, 1);
6656 	return revents;
6657 }
6658 
6659 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6660 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6661 {
6662 	struct socket *so = (struct socket *)fp_get_data(fp);
6663 	int result;
6664 
6665 	socket_lock(so, 1);
6666 	so_update_last_owner_locked(so, PROC_NULL);
6667 	so_update_policy(so);
6668 
6669 	switch (kn->kn_filter) {
6670 	case EVFILT_READ:
6671 		kn->kn_filtid = EVFILTID_SOREAD;
6672 		break;
6673 	case EVFILT_WRITE:
6674 		kn->kn_filtid = EVFILTID_SOWRITE;
6675 		break;
6676 	case EVFILT_SOCK:
6677 		kn->kn_filtid = EVFILTID_SCK;
6678 		break;
6679 	case EVFILT_EXCEPT:
6680 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6681 		break;
6682 	default:
6683 		socket_unlock(so, 1);
6684 		knote_set_error(kn, EINVAL);
6685 		return 0;
6686 	}
6687 
6688 	/*
6689 	 * call the appropriate sub-filter attach
6690 	 * with the socket still locked
6691 	 */
6692 	result = knote_fops(kn)->f_attach(kn, kev);
6693 
6694 	socket_unlock(so, 1);
6695 
6696 	return result;
6697 }
6698 
/*
 * Common EVFILT_READ evaluation, called with the socket locked.
 *
 * Returns non-zero when the read event has fired.  When `kev' is
 * non-NULL the event is also delivered via knote_fill_kevent(), with
 * `data' carrying the bytes readable (or, for listeners, the length
 * of the completed-connection queue).
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	/* Out-of-band data pending (or at the mark) fires immediately. */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	/*
	 * EOF: no more data can arrive (and, with content filters, none
	 * is still buffered in the filter).  kn_fflags carries so_error.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6777 
6778 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6779 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6780 {
6781 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6782 
6783 	/* socket locked */
6784 
6785 	/*
6786 	 * If the caller explicitly asked for OOB results (e.g. poll())
6787 	 * from EVFILT_READ, then save that off in the hookid field
6788 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6789 	 */
6790 	if (kn->kn_filter == EVFILT_READ &&
6791 	    kn->kn_flags & EV_OOBAND) {
6792 		kn->kn_flags &= ~EV_OOBAND;
6793 		kn->kn_hook32 = EV_OOBAND;
6794 	} else {
6795 		kn->kn_hook32 = 0;
6796 	}
6797 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6798 		so->so_rcv.sb_flags |= SB_KNOTE;
6799 	}
6800 
6801 	/* indicate if event is already fired */
6802 	return filt_soread_common(kn, NULL, so);
6803 }
6804 
6805 static void
filt_sordetach(struct knote * kn)6806 filt_sordetach(struct knote *kn)
6807 {
6808 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6809 
6810 	socket_lock(so, 1);
6811 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6812 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6813 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6814 		}
6815 	}
6816 	socket_unlock(so, 1);
6817 }
6818 
6819 /*ARGSUSED*/
6820 static int
filt_soread(struct knote * kn,long hint)6821 filt_soread(struct knote *kn, long hint)
6822 {
6823 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6824 	int retval;
6825 
6826 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6827 		socket_lock(so, 1);
6828 	}
6829 
6830 	retval = filt_soread_common(kn, NULL, so);
6831 
6832 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6833 		socket_unlock(so, 1);
6834 	}
6835 
6836 	return retval;
6837 }
6838 
6839 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6840 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6841 {
6842 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6843 	int retval;
6844 
6845 	socket_lock(so, 1);
6846 
6847 	/* save off the new input fflags and data */
6848 	kn->kn_sfflags = kev->fflags;
6849 	kn->kn_sdata = kev->data;
6850 
6851 	/* determine if changes result in fired events */
6852 	retval = filt_soread_common(kn, NULL, so);
6853 
6854 	socket_unlock(so, 1);
6855 
6856 	return retval;
6857 }
6858 
6859 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6860 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6861 {
6862 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6863 	int retval;
6864 
6865 	socket_lock(so, 1);
6866 	retval = filt_soread_common(kn, kev, so);
6867 	socket_unlock(so, 1);
6868 
6869 	return retval;
6870 }
6871 
6872 int
so_wait_for_if_feedback(struct socket * so)6873 so_wait_for_if_feedback(struct socket *so)
6874 {
6875 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6876 	    (so->so_state & SS_ISCONNECTED)) {
6877 		struct inpcb *inp = sotoinpcb(so);
6878 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6879 			return 1;
6880 		}
6881 	}
6882 	return 0;
6883 }
6884 
/*
 * Common EVFILT_WRITE evaluation, called with the socket locked.
 *
 * Returns non-zero when the write event has fired.  When `kev' is
 * non-NULL the event is also delivered via knote_fill_kevent(), with
 * `data' carrying the available send-buffer space.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	/* EOF on the send side: kn_fflags carries so_error. */
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	/* Preconnect data can always be queued before the connect. */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* Bound a NOTE_LOWAT request by the send buffer's water marks. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		/*
		 * With SOF_NOTSENT_LOWAT set, TCP/MPTCP additionally gate
		 * the event on the amount of unsent data in flight.
		 */
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress the event while waiting for interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6996 
6997 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6998 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6999 {
7000 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7001 
7002 	/* socket locked */
7003 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
7004 		so->so_snd.sb_flags |= SB_KNOTE;
7005 	}
7006 
7007 	/* determine if its already fired */
7008 	return filt_sowrite_common(kn, NULL, so);
7009 }
7010 
7011 static void
filt_sowdetach(struct knote * kn)7012 filt_sowdetach(struct knote *kn)
7013 {
7014 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7015 	socket_lock(so, 1);
7016 
7017 	if (so->so_snd.sb_flags & SB_KNOTE) {
7018 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
7019 			so->so_snd.sb_flags &= ~SB_KNOTE;
7020 		}
7021 	}
7022 	socket_unlock(so, 1);
7023 }
7024 
7025 /*ARGSUSED*/
7026 static int
filt_sowrite(struct knote * kn,long hint)7027 filt_sowrite(struct knote *kn, long hint)
7028 {
7029 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7030 	int ret;
7031 
7032 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7033 		socket_lock(so, 1);
7034 	}
7035 
7036 	ret = filt_sowrite_common(kn, NULL, so);
7037 
7038 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7039 		socket_unlock(so, 1);
7040 	}
7041 
7042 	return ret;
7043 }
7044 
7045 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)7046 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
7047 {
7048 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7049 	int ret;
7050 
7051 	socket_lock(so, 1);
7052 
7053 	/*save off the new input fflags and data */
7054 	kn->kn_sfflags = kev->fflags;
7055 	kn->kn_sdata = kev->data;
7056 
7057 	/* determine if these changes result in a triggered event */
7058 	ret = filt_sowrite_common(kn, NULL, so);
7059 
7060 	socket_unlock(so, 1);
7061 
7062 	return ret;
7063 }
7064 
7065 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)7066 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
7067 {
7068 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7069 	int ret;
7070 
7071 	socket_lock(so, 1);
7072 	ret = filt_sowrite_common(kn, kev, so);
7073 	socket_unlock(so, 1);
7074 
7075 	return ret;
7076 }
7077 
/*
 * Common EVFILT_SOCK evaluation, called with the socket locked.
 *
 * `ev_hint' carries edge-triggered event bits from the protocol layer;
 * additional level-triggered conditions (connected, disconnected,
 * read/write closed, suspend/resume) are derived from socket state.
 * kn_hook32 records which level-triggered events have already been
 * delivered so they are not redelivered while still active.
 *
 * Returns non-zero when there is something to deliver; when `kev' is
 * non-NULL the event is also copied out via knote_fill_kevent().
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered events, taken directly from the hint bits. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* Level-triggered events, also derived from current state. */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	/*
	 * Read side closed: with content filters, only once no filtered
	 * data remains buffered.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/*
	 * SUSPEND/RESUME are mutually exclusive: raising one clears the
	 * other, both in the pending flags and the delivered-state mask.
	 */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* A pending socket error always fires, with the errno as data. */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level-triggered events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level-triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7218 
7219 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7220 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7221 {
7222 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7223 
7224 	/* socket locked */
7225 	kn->kn_hook32 = 0;
7226 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7227 		so->so_flags |= SOF_KNOTE;
7228 	}
7229 
7230 	/* determine if event already fired */
7231 	return filt_sockev_common(kn, NULL, so, 0);
7232 }
7233 
7234 static void
filt_sockdetach(struct knote * kn)7235 filt_sockdetach(struct knote *kn)
7236 {
7237 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7238 	socket_lock(so, 1);
7239 
7240 	if ((so->so_flags & SOF_KNOTE) != 0) {
7241 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7242 			so->so_flags &= ~SOF_KNOTE;
7243 		}
7244 	}
7245 	socket_unlock(so, 1);
7246 }
7247 
7248 static int
filt_sockev(struct knote * kn,long hint)7249 filt_sockev(struct knote *kn, long hint)
7250 {
7251 	int ret = 0, locked = 0;
7252 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7253 	long ev_hint = (hint & SO_FILT_HINT_EV);
7254 
7255 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7256 		socket_lock(so, 1);
7257 		locked = 1;
7258 	}
7259 
7260 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7261 
7262 	if (locked) {
7263 		socket_unlock(so, 1);
7264 	}
7265 
7266 	return ret;
7267 }
7268 
7269 
7270 
7271 /*
7272  *	filt_socktouch - update event state
7273  */
7274 static int
filt_socktouch(struct knote * kn,struct kevent_qos_s * kev)7275 filt_socktouch(
7276 	struct knote *kn,
7277 	struct kevent_qos_s *kev)
7278 {
7279 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7280 	uint32_t changed_flags;
7281 	int ret;
7282 
7283 	socket_lock(so, 1);
7284 
7285 	/* save off the [result] data and fflags */
7286 	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7287 
7288 	/* save off the new input fflags and data */
7289 	kn->kn_sfflags = kev->fflags;
7290 	kn->kn_sdata = kev->data;
7291 
7292 	/* restrict the current results to the (smaller?) set of new interest */
7293 	/*
7294 	 * For compatibility with previous implementations, we leave kn_fflags
7295 	 * as they were before.
7296 	 */
7297 	//kn->kn_fflags &= kev->fflags;
7298 
7299 	/*
7300 	 * Since we keep track of events that are already
7301 	 * delivered, if any of those events are not requested
7302 	 * anymore the state related to them can be reset
7303 	 */
7304 	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7305 
7306 	/* determine if we have events to deliver */
7307 	ret = filt_sockev_common(kn, NULL, so, 0);
7308 
7309 	socket_unlock(so, 1);
7310 
7311 	return ret;
7312 }
7313 
7314 /*
7315  *	filt_sockprocess - query event fired state and return data
7316  */
7317 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7318 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7319 {
7320 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7321 	int ret = 0;
7322 
7323 	socket_lock(so, 1);
7324 
7325 	ret = filt_sockev_common(kn, kev, so, 0);
7326 
7327 	socket_unlock(so, 1);
7328 
7329 	return ret;
7330 }
7331 
7332 void
get_sockev_state(struct socket * so,u_int32_t * statep)7333 get_sockev_state(struct socket *so, u_int32_t *statep)
7334 {
7335 	u_int32_t state = *(statep);
7336 
7337 	/*
7338 	 * If the state variable is already used by a previous event,
7339 	 * reset it.
7340 	 */
7341 	if (state != 0) {
7342 		return;
7343 	}
7344 
7345 	if (so->so_state & SS_ISCONNECTED) {
7346 		state |= SOCKEV_CONNECTED;
7347 	} else {
7348 		state &= ~(SOCKEV_CONNECTED);
7349 	}
7350 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7351 	*(statep) = state;
7352 }
7353 
7354 #define SO_LOCK_HISTORY_STR_LEN \
7355 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7356 
/*
 * solockhistory_nr - format the socket's recorded lock/unlock return
 * addresses as a "lock:unlock " pair list (most recently recorded pair
 * first).  Returns a pointer to a static buffer shared by all callers,
 * so the result is only meaningful for debugging/panic output.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	/* walk the circular lock_lr/unlock_lr rings relative to the next slot */
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return lock_history_str;
}
7373 
7374 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7375 socket_getlock(struct socket *so, int flags)
7376 {
7377 	if (so->so_proto->pr_getlock != NULL) {
7378 		return (*so->so_proto->pr_getlock)(so, flags);
7379 	} else {
7380 		return so->so_proto->pr_domain->dom_mtx;
7381 	}
7382 }
7383 
/*
 * socket_lock - acquire the socket's lock, optionally taking a use-count
 * reference (refcount != 0).  Delegates to the protocol's pr_lock when
 * one is provided; otherwise locks the shared domain mutex and records
 * the caller's return address in the lock-debug history ring.
 */
void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	/* capture the caller's return address for lock debugging */
	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		/* log this acquisition in the circular lock history */
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7406 
7407 void
socket_lock_assert_owned(struct socket * so)7408 socket_lock_assert_owned(struct socket *so)
7409 {
7410 	lck_mtx_t *mutex_held;
7411 
7412 	if (so->so_proto->pr_getlock != NULL) {
7413 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7414 	} else {
7415 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7416 	}
7417 
7418 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7419 }
7420 
7421 int
socket_try_lock(struct socket * so)7422 socket_try_lock(struct socket *so)
7423 {
7424 	lck_mtx_t *mtx;
7425 
7426 	if (so->so_proto->pr_getlock != NULL) {
7427 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7428 	} else {
7429 		mtx = so->so_proto->pr_domain->dom_mtx;
7430 	}
7431 
7432 	return lck_mtx_try_lock(mtx);
7433 }
7434 
/*
 * socket_unlock - release the socket's lock, optionally dropping a
 * use-count reference (refcount != 0).  Delegates to the protocol's
 * pr_unlock when one is provided; otherwise records the caller in the
 * unlock-debug history, drops the reference (freeing the socket via
 * sofreelastref() when the count hits zero), and releases the domain
 * mutex.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	/* capture the caller's return address for lock debugging */
	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* log this release in the circular unlock history */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* an over-release indicates a refcounting bug */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				/* last reference: free before unlocking */
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7475 
7476 /* Called with socket locked, will unlock socket */
7477 void
sofree(struct socket * so)7478 sofree(struct socket *so)
7479 {
7480 	lck_mtx_t *mutex_held;
7481 
7482 	if (so->so_proto->pr_getlock != NULL) {
7483 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7484 	} else {
7485 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7486 	}
7487 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7488 
7489 	sofreelastref(so, 0);
7490 }
7491 
/*
 * soreference - take an additional use-count reference on the socket.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7498 
/*
 * sodereference - drop one use-count reference on the socket.  The
 * unlock with refcount != 0 may free the socket when this was the last
 * reference (see socket_unlock()).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7505 
7506 /*
7507  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7508  * possibility of using jumbo clusters.  Caller must ensure to hold
7509  * the socket lock.
7510  */
7511 void
somultipages(struct socket * so,boolean_t set)7512 somultipages(struct socket *so, boolean_t set)
7513 {
7514 	if (set) {
7515 		so->so_flags |= SOF_MULTIPAGES;
7516 	} else {
7517 		so->so_flags &= ~SOF_MULTIPAGES;
7518 	}
7519 }
7520 
7521 void
soif2kcl(struct socket * so,boolean_t set)7522 soif2kcl(struct socket *so, boolean_t set)
7523 {
7524 	if (set) {
7525 		so->so_flags1 |= SOF1_IF_2KCL;
7526 	} else {
7527 		so->so_flags1 &= ~SOF1_IF_2KCL;
7528 	}
7529 }
7530 
7531 int
so_isdstlocal(struct socket * so)7532 so_isdstlocal(struct socket *so)
7533 {
7534 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7535 
7536 	if (SOCK_DOM(so) == PF_INET) {
7537 		return inaddr_local(inp->inp_faddr);
7538 	} else if (SOCK_DOM(so) == PF_INET6) {
7539 		return in6addr_local(&inp->in6p_faddr);
7540 	}
7541 
7542 	return 0;
7543 }
7544 
/*
 * sosetdefunct - mark a socket as defunct-eligible: set SOF_DEFUNCT,
 * stop further data from entering the socket buffers (SB_DROP), and
 * flush any data already queued.
 *
 * When the socket carries SOF_NODEFUNCT, the request fails with
 * EOPNOTSUPP unless noforce is FALSE, in which case the protection is
 * stripped and the defunct is forced.  A socket that opted into
 * extended background idle (SOF1_EXTEND_BK_IDLE_WANTED) may instead be
 * granted an idle grace period (EOPNOTSUPP returned, lazy inpcb timer
 * scheduled) rather than being defuncted immediately.
 *
 * Returns 0 on success (or when already defunct), EOPNOTSUPP when the
 * socket is exempted as described above.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		/* already defunct: both buffers must already be dropping */
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* forced: strip the no-defunct protection and proceed */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		/* decide whether the extended-bk-idle grace applies */
		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			/* grant the grace period and schedule its expiry */
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7654 
/*
 * sodefunct - carry out the actual defunct of a socket previously
 * marked by sosetdefunct() (SOF_DEFUNCT must be set; panics otherwise).
 *
 * Tells the protocol the flow is defunct, wakes any threads blocked on
 * the socket buffers, shuts down both directions, disconnects, sets
 * so_error to EBADF, releases remaining buffered data and finally sets
 * SS_DEFUNCT.  Idempotent: returns immediately if SS_DEFUNCT is
 * already set.  Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void)  (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* release whatever data is still queued in either direction */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7767 
/*
 * soresume - take a socket out of the extended background idle state
 * (SOF1_EXTEND_BK_IDLE_INPROG), clearing its start time and the per-
 * process in-progress flag, and updating the global idle statistics.
 * Pass locked != 0 when the caller already holds the socket lock.
 * Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7797 
7798 /*
7799  * Does not attempt to account for sockets that are delegated from
7800  * the current process
7801  */
7802 int
so_set_extended_bk_idle(struct socket * so,int optval)7803 so_set_extended_bk_idle(struct socket *so, int optval)
7804 {
7805 	int error = 0;
7806 
7807 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7808 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7809 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7810 		error = EOPNOTSUPP;
7811 	} else if (optval == 0) {
7812 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7813 
7814 		soresume(current_proc(), so, 1);
7815 	} else {
7816 		struct proc *p = current_proc();
7817 		struct fileproc *fp;
7818 		int count = 0;
7819 
7820 		/*
7821 		 * Unlock socket to avoid lock ordering issue with
7822 		 * the proc fd table lock
7823 		 */
7824 		socket_unlock(so, 0);
7825 
7826 		proc_fdlock(p);
7827 		fdt_foreach(fp, p) {
7828 			struct socket *so2;
7829 
7830 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7831 				continue;
7832 			}
7833 
7834 			so2 = (struct socket *)fp_get_data(fp);
7835 			if (so != so2 &&
7836 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7837 				count++;
7838 			}
7839 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7840 				break;
7841 			}
7842 		}
7843 		proc_fdunlock(p);
7844 
7845 		socket_lock(so, 0);
7846 
7847 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7848 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7849 			error = EBUSY;
7850 		} else if (so->so_flags & SOF_DELEGATED) {
7851 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7852 			error = EBUSY;
7853 		} else {
7854 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7855 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7856 		}
7857 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7858 		    "%s marked for extended bk idle\n",
7859 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7860 		    so->so_gencnt,
7861 		    SOCK_DOM(so), SOCK_TYPE(so),
7862 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7863 		    "is" : "not");
7864 	}
7865 
7866 	return error;
7867 }
7868 
/*
 * so_stop_extended_bk_idle - terminate a socket's extended background
 * idle grace period and force it defunct.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7887 
7888 void
so_drain_extended_bk_idle(struct socket * so)7889 so_drain_extended_bk_idle(struct socket *so)
7890 {
7891 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7892 		/*
7893 		 * Only penalize sockets that have outstanding data
7894 		 */
7895 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7896 			so_stop_extended_bk_idle(so);
7897 
7898 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7899 		}
7900 	}
7901 }
7902 
7903 /*
7904  * Return values tells if socket is still in extended background idle
7905  */
7906 int
so_check_extended_bk_idle_time(struct socket * so)7907 so_check_extended_bk_idle_time(struct socket *so)
7908 {
7909 	int ret = 1;
7910 
7911 	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7912 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7913 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7914 		    so->so_gencnt,
7915 		    SOCK_DOM(so), SOCK_TYPE(so));
7916 		if (net_uptime() - so->so_extended_bk_start >
7917 		    soextbkidlestat.so_xbkidle_time) {
7918 			so_stop_extended_bk_idle(so);
7919 
7920 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7921 
7922 			ret = 0;
7923 		} else {
7924 			struct inpcb *inp = (struct inpcb *)so->so_pcb;
7925 
7926 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7927 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7928 		}
7929 	}
7930 
7931 	return ret;
7932 }
7933 
/*
 * resume_proc_sockets - resume every socket of a process that entered
 * the extended background idle state, then clear the process's
 * P_LXBKIDLEINPROG marker.  No-op unless the marker is set.
 */
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
7955 
7956 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7957 so_set_recv_anyif(struct socket *so, int optval)
7958 {
7959 	int ret = 0;
7960 
7961 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7962 		if (optval) {
7963 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7964 		} else {
7965 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7966 		}
7967 #if SKYWALK
7968 		inp_update_netns_flags(so);
7969 #endif /* SKYWALK */
7970 	}
7971 
7972 
7973 	return ret;
7974 }
7975 
7976 __private_extern__ int
so_get_recv_anyif(struct socket * so)7977 so_get_recv_anyif(struct socket *so)
7978 {
7979 	int ret = 0;
7980 
7981 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7982 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7983 	}
7984 
7985 	return ret;
7986 }
7987 
/*
 * so_set_restrictions - apply deny-type restriction bits to the socket
 * and propagate newly-set cellular/expensive/constrained denials to the
 * inpcb (INET/INET6) or MPTCP layer.  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
8046 
/*
 * so_get_restrictions - return the deny-type restriction bits set on
 * the socket.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED is accepted and stored by
 * so_set_restrictions() but is not included in this mask — confirm
 * whether the omission is intentional.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
8054 
/*
 * so_set_effective_pid - delegate the socket to the process identified
 * by epid (SO_DELEGATED socket option path).
 *
 * Rejects pid 0 (reserved for kernel) and changes to in-kernel sockets
 * not initiated by the kernel itself.  When check_cred is TRUE and the
 * caller is neither the recorded owner nor delegating to itself, the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege is required.
 * Delegating to one's own pid clears the delegate association instead
 * of setting it.  On success the socket's policy is refreshed.
 *
 * Returns 0 on success; EINVAL, EACCES or ESRCH on failure.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* also record the responsible process, when distinct */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* flip the generation count to force a policy re-evaluation */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
8170 
/*
 * so_set_effective_uuid - delegate the socket to the process identified
 * by executable UUID euuid (used when only the UUID is known).
 *
 * Rejects the null UUID (reserved for kernel) and changes to in-kernel
 * sockets not initiated by the kernel itself.  When check_cred is TRUE
 * and the caller is neither the recorded owner nor delegating to its
 * own UUID, the PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege is
 * required.  Delegating to the caller's own UUID clears the delegate
 * association.  On success the socket's policy is refreshed.
 *
 * Returns 0 on success; EINVAL or EACCES on failure.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* flip the generation count to force a policy re-evaluation */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8274 
8275 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8276 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8277     uint32_t ev_datalen)
8278 {
8279 	struct kev_msg ev_msg;
8280 
8281 	/*
8282 	 * A netpolicy event always starts with a netpolicy_event_data
8283 	 * structure, but the caller can provide for a longer event
8284 	 * structure to post, depending on the event code.
8285 	 */
8286 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8287 
8288 	bzero(&ev_msg, sizeof(ev_msg));
8289 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8290 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8291 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8292 	ev_msg.event_code       = ev_code;
8293 
8294 	ev_msg.dv[0].data_ptr   = ev_data;
8295 	ev_msg.dv[0].data_length = ev_datalen;
8296 
8297 	kev_post_msg(&ev_msg);
8298 }
8299 
8300 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8301 socket_post_kev_msg(uint32_t ev_code,
8302     struct kev_socket_event_data *ev_data,
8303     uint32_t ev_datalen)
8304 {
8305 	struct kev_msg ev_msg;
8306 
8307 	bzero(&ev_msg, sizeof(ev_msg));
8308 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8309 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8310 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8311 	ev_msg.event_code = ev_code;
8312 
8313 	ev_msg.dv[0].data_ptr = ev_data;
8314 	ev_msg.dv[0].data_length = ev_datalen;
8315 
8316 	kev_post_msg(&ev_msg);
8317 }
8318 
/*
 * socket_post_kev_msg_closed - post a KEV_SOCKET_CLOSED event carrying
 * the socket's local and peer addresses.  Only sockets that opted in
 * via SOF1_WANT_KEV_SOCK_CLOSED generate the event, and only when both
 * addresses can be retrieved from the protocol.
 */
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* copy at most what fits in the event buffers */
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* socksa/peersa may be NULL on the error paths above;
	 * NOTE(review): assumes free_sockaddr() tolerates NULL — confirm */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8347