xref: /xnu-8792.41.9/bsd/kern/uipc_socket.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120 
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125 
126 #include <os/log.h>
127 
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif /* MAC */
131 
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136 
137 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
138 
139 #if DEBUG || DEVELOPMENT
140 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
141 #else
142 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 #endif
144 
145 /* TODO: this should be in a header file somewhere */
146 extern char *proc_name_address(void *p);
147 
148 static u_int32_t        so_cache_hw;    /* High water mark for socache */
149 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
150 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
151 static u_int32_t        cached_sock_count = 0;
152 STAILQ_HEAD(, socket)   so_cache_head;
153 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
154 static uint64_t        so_cache_time;
155 static int              socketinit_done;
156 static struct zone      *so_cache_zone;
157 
158 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
159 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
160 
161 #include <machine/limits.h>
162 
163 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void     filt_sordetach(struct knote *kn);
165 static int      filt_soread(struct knote *kn, long hint);
166 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168 
169 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void     filt_sowdetach(struct knote *kn);
171 static int      filt_sowrite(struct knote *kn, long hint);
172 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174 
175 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void     filt_sockdetach(struct knote *kn);
177 static int      filt_sockev(struct knote *kn, long hint);
178 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180 
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183 
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 	.f_isfd = 1,
186 	.f_attach = filt_sorattach,
187 	.f_detach = filt_sordetach,
188 	.f_event = filt_soread,
189 	.f_touch = filt_sortouch,
190 	.f_process = filt_sorprocess,
191 };
192 
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 	.f_isfd = 1,
195 	.f_attach = filt_sowattach,
196 	.f_detach = filt_sowdetach,
197 	.f_event = filt_sowrite,
198 	.f_touch = filt_sowtouch,
199 	.f_process = filt_sowprocess,
200 };
201 
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 	.f_isfd = 1,
204 	.f_attach = filt_sockattach,
205 	.f_detach = filt_sockdetach,
206 	.f_event = filt_sockev,
207 	.f_touch = filt_socktouch,
208 	.f_process = filt_sockprocess,
209 };
210 
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 	.f_isfd = 1,
213 	.f_attach = filt_sorattach,
214 	.f_detach = filt_sordetach,
215 	.f_event = filt_soread,
216 	.f_touch = filt_sortouch,
217 	.f_process = filt_sorprocess,
218 };
219 
220 SYSCTL_DECL(_kern_ipc);
221 
222 #define EVEN_MORE_LOCKING_DEBUG 0
223 
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227 
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230     &sodefunct_calls, "");
231 
232 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
233 so_gen_t        so_gencnt;      /* generation count for sockets */
234 
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236 
237 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246 
247 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
248 
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252 
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy  = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261 
262 /*
263  * Set to enable jumbo clusters (if available) for large writes when
264  * the socket is marked with SOF_MULTIPAGES; see below.
265  */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269 
270 /*
271  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272  * writes on the socket for all protocols on any network interfaces,
273  * depending upon sosendjcl above.  Be extra careful when setting this
274  * to 1, because sending down packets that cross physical pages down to
275  * broken drivers (those that falsely assume that the physical pages
276  * are contiguous) might lead to system panics or silent data corruption.
277  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279  * capable.  Set this to 1 only for testing/debugging purposes.
280  */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284 
285 /*
286  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287  * writes on the socket for all protocols on any network interfaces.
288  * Be extra careful when setting this to 1, because sending down packets with
289  * clusters larger that 2 KB might lead to system panics or data corruption.
290  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291  * on the outgoing interface
292  * Set this to 1  for testing/debugging purposes only.
293  */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297 
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300     &sodefunctlog, 0, "");
301 
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304     &sothrottlelog, 0, "");
305 
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308     &sorestrictrecv, 0, "Enable inbound interface restrictions");
309 
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312     &sorestrictsend, 0, "Enable outbound interface restrictions");
313 
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317 
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
321     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323 
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
327     &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329 
330 extern struct inpcbinfo tcbinfo;
331 
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335 
336 vm_size_t       so_cache_zone_element_size;
337 
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339     user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
341 static void cached_sock_free(struct socket *);
342 
343 /*
344  * Maximum of extended background idle sockets per process
345  * Set to zero to disable further setting of the option
346  */
347 
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
349 #define SO_IDLE_BK_IDLE_TIME            600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
351 
352 struct soextbkidlestat soextbkidlestat;
353 
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356     "Maximum of extended background idle sockets per process");
357 
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359     &soextbkidlestat.so_xbkidle_time, 0,
360     "Time in seconds to keep extended background idle sockets");
361 
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364     "High water mark for extended background idle sockets");
365 
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367     &soextbkidlestat, soextbkidlestat, "");
368 
369 int so_set_extended_bk_idle(struct socket *, int);
370 
371 
372 /*
373  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374  * setting the DSCP code on the packet based on the service class; see
375  * <rdar://problem/11277343> for details.
376  */
377 __private_extern__ u_int32_t sotcdb = 0;
378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379     &sotcdb, 0, "");
380 
381 void
socketinit(void)382 socketinit(void)
383 {
384 	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
385 	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
386 
387 #ifdef __LP64__
388 	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
389 	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
390 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
391 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
392 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
393 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
394 #else
395 	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
396 	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
397 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
398 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
399 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
400 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
401 #endif
402 
403 	if (socketinit_done) {
404 		printf("socketinit: already called...\n");
405 		return;
406 	}
407 	socketinit_done = 1;
408 
409 	PE_parse_boot_argn("socket_debug", &socket_debug,
410 	    sizeof(socket_debug));
411 
412 	STAILQ_INIT(&so_cache_head);
413 
414 	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
415 	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
416 
417 	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
418 	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
419 
420 	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
421 	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
422 	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
423 	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
424 
425 	in_pcbinit();
426 }
427 
428 static void
cached_sock_alloc(struct socket ** so,zalloc_flags_t how)429 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
430 {
431 	caddr_t temp;
432 	uintptr_t offset;
433 
434 	lck_mtx_lock(&so_cache_mtx);
435 
436 	if (!STAILQ_EMPTY(&so_cache_head)) {
437 		VERIFY(cached_sock_count > 0);
438 
439 		*so = STAILQ_FIRST(&so_cache_head);
440 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
441 		STAILQ_NEXT((*so), so_cache_ent) = NULL;
442 
443 		cached_sock_count--;
444 		lck_mtx_unlock(&so_cache_mtx);
445 
446 		temp = (*so)->so_saved_pcb;
447 		bzero((caddr_t)*so, sizeof(struct socket));
448 
449 		(*so)->so_saved_pcb = temp;
450 	} else {
451 		lck_mtx_unlock(&so_cache_mtx);
452 
453 		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);
454 
455 		/*
456 		 * Define offsets for extra structures into our
457 		 * single block of memory. Align extra structures
458 		 * on longword boundaries.
459 		 */
460 
461 		offset = (uintptr_t)*so;
462 		offset += sizeof(struct socket);
463 
464 		offset = ALIGN(offset);
465 
466 		(*so)->so_saved_pcb = (caddr_t)offset;
467 		offset += get_inpcb_str_size();
468 
469 		offset = ALIGN(offset);
470 
471 		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
472 		    (caddr_t)offset;
473 	}
474 
475 	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
476 }
477 
478 static void
cached_sock_free(struct socket * so)479 cached_sock_free(struct socket *so)
480 {
481 	lck_mtx_lock(&so_cache_mtx);
482 
483 	so_cache_time = net_uptime();
484 	if (++cached_sock_count > max_cached_sock_count) {
485 		--cached_sock_count;
486 		lck_mtx_unlock(&so_cache_mtx);
487 		zfree(so_cache_zone, so);
488 	} else {
489 		if (so_cache_hw < cached_sock_count) {
490 			so_cache_hw = cached_sock_count;
491 		}
492 
493 		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
494 
495 		so->cache_timestamp = so_cache_time;
496 		lck_mtx_unlock(&so_cache_mtx);
497 	}
498 }
499 
500 void
so_update_last_owner_locked(struct socket * so,proc_t self)501 so_update_last_owner_locked(struct socket *so, proc_t self)
502 {
503 	if (so->last_pid != 0) {
504 		/*
505 		 * last_pid and last_upid should remain zero for sockets
506 		 * created using sock_socket. The check above achieves that
507 		 */
508 		if (self == PROC_NULL) {
509 			self = current_proc();
510 		}
511 
512 		if (so->last_upid != proc_uniqueid(self) ||
513 		    so->last_pid != proc_pid(self)) {
514 			so->last_upid = proc_uniqueid(self);
515 			so->last_pid = proc_pid(self);
516 			proc_getexecutableuuid(self, so->last_uuid,
517 			    sizeof(so->last_uuid));
518 			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
519 				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
520 			}
521 		}
522 		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
523 	}
524 }
525 
526 void
so_update_policy(struct socket * so)527 so_update_policy(struct socket *so)
528 {
529 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
530 		(void) inp_update_policy(sotoinpcb(so));
531 	}
532 }
533 
534 #if NECP
535 static void
so_update_necp_policy(struct socket * so,struct sockaddr * override_local_addr,struct sockaddr * override_remote_addr)536 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
537     struct sockaddr *override_remote_addr)
538 {
539 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
540 		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
541 		    override_remote_addr, 0);
542 	}
543 }
544 #endif /* NECP */
545 
546 boolean_t
so_cache_timer(void)547 so_cache_timer(void)
548 {
549 	struct socket   *p;
550 	int             n_freed = 0;
551 	boolean_t rc = FALSE;
552 
553 	lck_mtx_lock(&so_cache_mtx);
554 	so_cache_timeouts++;
555 	so_cache_time = net_uptime();
556 
557 	while (!STAILQ_EMPTY(&so_cache_head)) {
558 		VERIFY(cached_sock_count > 0);
559 		p = STAILQ_FIRST(&so_cache_head);
560 		if ((so_cache_time - p->cache_timestamp) <
561 		    SO_CACHE_TIME_LIMIT) {
562 			break;
563 		}
564 
565 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
566 		--cached_sock_count;
567 
568 		zfree(so_cache_zone, p);
569 
570 		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
571 			so_cache_max_freed++;
572 			break;
573 		}
574 	}
575 
576 	/* Schedule again if there is more to cleanup */
577 	if (!STAILQ_EMPTY(&so_cache_head)) {
578 		rc = TRUE;
579 	}
580 
581 	lck_mtx_unlock(&so_cache_mtx);
582 	return rc;
583 }
584 
585 /*
586  * Get a socket structure from our zone, and initialize it.
587  * We don't implement `waitok' yet (see comments in uipc_domain.c).
588  * Note that it would probably be better to allocate socket
589  * and PCB at the same time, but I'm not convinced that all
590  * the protocols can be easily modified to do this.
591  */
592 struct socket *
soalloc(int waitok,int dom,int type)593 soalloc(int waitok, int dom, int type)
594 {
595 	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
596 	struct socket *so;
597 
598 	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
599 		cached_sock_alloc(&so, how);
600 	} else {
601 		so = zalloc_flags(socket_zone, how | Z_ZERO);
602 	}
603 	if (so != NULL) {
604 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
605 
606 		/*
607 		 * Increment the socket allocation statistics
608 		 */
609 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
610 	}
611 
612 	return so;
613 }
614 
615 int
socreate_internal(int dom,struct socket ** aso,int type,int proto,struct proc * p,uint32_t flags,struct proc * ep)616 socreate_internal(int dom, struct socket **aso, int type, int proto,
617     struct proc *p, uint32_t flags, struct proc *ep)
618 {
619 	struct protosw *prp;
620 	struct socket *so;
621 	int error = 0;
622 #if defined(XNU_TARGET_OS_OSX)
623 	pid_t rpid = -1;
624 #endif
625 
626 #if TCPDEBUG
627 	extern int tcpconsdebug;
628 #endif
629 
630 	VERIFY(aso != NULL);
631 	*aso = NULL;
632 
633 	if (proto != 0) {
634 		prp = pffindproto(dom, proto, type);
635 	} else {
636 		prp = pffindtype(dom, type);
637 	}
638 
639 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
640 		if (pffinddomain(dom) == NULL) {
641 			return EAFNOSUPPORT;
642 		}
643 		if (proto != 0) {
644 			if (pffindprotonotype(dom, proto) != NULL) {
645 				return EPROTOTYPE;
646 			}
647 		}
648 		return EPROTONOSUPPORT;
649 	}
650 	if (prp->pr_type != type) {
651 		return EPROTOTYPE;
652 	}
653 	so = soalloc(1, dom, type);
654 	if (so == NULL) {
655 		return ENOBUFS;
656 	}
657 
658 	switch (dom) {
659 	case PF_LOCAL:
660 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
661 		break;
662 	case PF_INET:
663 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
664 		if (type == SOCK_STREAM) {
665 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
666 		} else {
667 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
668 		}
669 		break;
670 	case PF_ROUTE:
671 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
672 		break;
673 	case PF_NDRV:
674 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
675 		break;
676 	case PF_KEY:
677 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
678 		break;
679 	case PF_INET6:
680 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
681 		if (type == SOCK_STREAM) {
682 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
683 		} else {
684 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
685 		}
686 		break;
687 	case PF_SYSTEM:
688 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
689 		break;
690 	case PF_MULTIPATH:
691 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
692 		break;
693 	default:
694 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
695 		break;
696 	}
697 
698 	if (flags & SOCF_MPTCP) {
699 		so->so_state |= SS_NBIO;
700 	}
701 
702 	TAILQ_INIT(&so->so_incomp);
703 	TAILQ_INIT(&so->so_comp);
704 	so->so_type = (short)type;
705 	so->last_upid = proc_uniqueid(p);
706 	so->last_pid = proc_pid(p);
707 	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
708 	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
709 
710 	if (ep != PROC_NULL && ep != p) {
711 		so->e_upid = proc_uniqueid(ep);
712 		so->e_pid = proc_pid(ep);
713 		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
714 		so->so_flags |= SOF_DELEGATED;
715 #if defined(XNU_TARGET_OS_OSX)
716 		if (ep->p_responsible_pid != so->e_pid) {
717 			rpid = ep->p_responsible_pid;
718 		}
719 #endif
720 	}
721 
722 #if defined(XNU_TARGET_OS_OSX)
723 	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
724 		rpid = p->p_responsible_pid;
725 	}
726 
727 	so->so_rpid = -1;
728 	uuid_clear(so->so_ruuid);
729 	if (rpid >= 0) {
730 		proc_t rp = proc_find(rpid);
731 		if (rp != PROC_NULL) {
732 			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
733 			so->so_rpid = rpid;
734 			proc_rele(rp);
735 		}
736 	}
737 #endif
738 
739 	so->so_cred = kauth_cred_proc_ref(p);
740 	if (!suser(kauth_cred_get(), NULL)) {
741 		so->so_state |= SS_PRIV;
742 	}
743 
744 	so->so_proto = prp;
745 	so->so_rcv.sb_flags |= SB_RECV;
746 	so->so_rcv.sb_so = so->so_snd.sb_so = so;
747 	so->next_lock_lr = 0;
748 	so->next_unlock_lr = 0;
749 
750 	/*
751 	 * Attachment will create the per pcb lock if necessary and
752 	 * increase refcount for creation, make sure it's done before
753 	 * socket is inserted in lists.
754 	 */
755 	so->so_usecount++;
756 
757 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
758 	if (error != 0) {
759 		/*
760 		 * Warning:
761 		 * If so_pcb is not zero, the socket will be leaked,
762 		 * so protocol attachment handler must be coded carefuly
763 		 */
764 		if (so->so_pcb != NULL) {
765 			os_log_error(OS_LOG_DEFAULT,
766 			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
767 			    error, dom, proto, type);
768 		}
769 		/*
770 		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
771 		 */
772 		so->so_state |= SS_NOFDREF;
773 		so->so_flags |= SOF_PCBCLEARING;
774 		VERIFY(so->so_usecount > 0);
775 		so->so_usecount--;
776 		sofreelastref(so, 1);   /* will deallocate the socket */
777 		return error;
778 	}
779 
780 	/*
781 	 * Note: needs so_pcb to be set after pru_attach
782 	 */
783 	if (prp->pr_update_last_owner != NULL) {
784 		(*prp->pr_update_last_owner)(so, p, ep);
785 	}
786 
787 	atomic_add_32(&prp->pr_domain->dom_refs, 1);
788 
789 	/* Attach socket filters for this protocol */
790 	sflt_initsock(so);
791 #if TCPDEBUG
792 	if (tcpconsdebug == 2) {
793 		so->so_options |= SO_DEBUG;
794 	}
795 #endif
796 	so_set_default_traffic_class(so);
797 
798 	/*
799 	 * If this thread or task is marked to create backgrounded sockets,
800 	 * mark the socket as background.
801 	 */
802 	if (!(flags & SOCF_MPTCP) &&
803 	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
804 		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
805 		so->so_background_thread = current_thread();
806 	}
807 
808 	switch (dom) {
809 	/*
810 	 * Don't mark Unix domain or system
811 	 * eligible for defunct by default.
812 	 */
813 	case PF_LOCAL:
814 	case PF_SYSTEM:
815 		so->so_flags |= SOF_NODEFUNCT;
816 		break;
817 	default:
818 		break;
819 	}
820 
821 	/*
822 	 * Entitlements can't be checked at socket creation time except if the
823 	 * application requested a feature guarded by a privilege (c.f., socket
824 	 * delegation).
825 	 * The priv(9) and the Sandboxing APIs are designed with the idea that
826 	 * a privilege check should only be triggered by a userland request.
827 	 * A privilege check at socket creation time is time consuming and
828 	 * could trigger many authorisation error messages from the security
829 	 * APIs.
830 	 */
831 
832 	*aso = so;
833 
834 	return 0;
835 }
836 
837 /*
838  * Returns:	0			Success
839  *		EAFNOSUPPORT
840  *		EPROTOTYPE
841  *		EPROTONOSUPPORT
842  *		ENOBUFS
843  *	<pru_attach>:ENOBUFS[AF_UNIX]
844  *	<pru_attach>:ENOBUFS[TCP]
845  *	<pru_attach>:ENOMEM[TCP]
846  *	<pru_attach>:???		[other protocol families, IPSEC]
847  */
848 int
socreate(int dom,struct socket ** aso,int type,int proto)849 socreate(int dom, struct socket **aso, int type, int proto)
850 {
851 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
852 	           PROC_NULL);
853 }
854 
855 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)856 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
857 {
858 	int error = 0;
859 	struct proc *ep = PROC_NULL;
860 
861 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
862 		error = ESRCH;
863 		goto done;
864 	}
865 
866 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
867 
868 	/*
869 	 * It might not be wise to hold the proc reference when calling
870 	 * socreate_internal since it calls soalloc with M_WAITOK
871 	 */
872 done:
873 	if (ep != PROC_NULL) {
874 		proc_rele(ep);
875 	}
876 
877 	return error;
878 }
879 
880 /*
881  * Returns:	0			Success
882  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
883  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
884  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
885  *	<pru_bind>:EINVAL		Invalid argument
886  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
887  *	<pru_bind>:EACCES		Permission denied
888  *	<pru_bind>:EADDRINUSE		Address in use
889  *	<pru_bind>:EAGAIN		Resource unavailable, try again
890  *	<pru_bind>:EPERM		Operation not permitted
891  *	<pru_bind>:???
892  *	<sf_bind>:???
893  *
894  * Notes:	It's not possible to fully enumerate the return codes above,
895  *		since socket filter authors and protocol family authors may
896  *		not choose to limit their error returns to those listed, even
897  *		though this may result in some software operating incorrectly.
898  *
899  *		The error codes which are enumerated above are those known to
900  *		be returned by the tcp_usr_bind function supplied.
901  */
902 int
sobindlock(struct socket * so,struct sockaddr * nam,int dolock)903 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
904 {
905 	struct proc *p = current_proc();
906 	int error = 0;
907 
908 	if (dolock) {
909 		socket_lock(so, 1);
910 	}
911 
912 	so_update_last_owner_locked(so, p);
913 	so_update_policy(so);
914 
915 #if NECP
916 	so_update_necp_policy(so, nam, NULL);
917 #endif /* NECP */
918 
919 	/*
920 	 * If this is a bind request on a socket that has been marked
921 	 * as inactive, reject it now before we go any further.
922 	 */
923 	if (so->so_flags & SOF_DEFUNCT) {
924 		error = EINVAL;
925 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
926 		    __func__, proc_pid(p), proc_best_name(p),
927 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
928 		    SOCK_DOM(so), SOCK_TYPE(so), error);
929 		goto out;
930 	}
931 
932 	/* Socket filter */
933 	error = sflt_bind(so, nam);
934 
935 	if (error == 0) {
936 		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
937 	}
938 out:
939 	if (dolock) {
940 		socket_unlock(so, 1);
941 	}
942 
943 	if (error == EJUSTRETURN) {
944 		error = 0;
945 	}
946 
947 	return error;
948 }
949 
950 void
sodealloc(struct socket * so)951 sodealloc(struct socket *so)
952 {
953 	kauth_cred_unref(&so->so_cred);
954 
955 	/* Remove any filters */
956 	sflt_termsock(so);
957 
958 	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
959 
960 	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
961 		cached_sock_free(so);
962 	} else {
963 		zfree(socket_zone, so);
964 	}
965 }
966 
967 /*
968  * Returns:	0			Success
969  *		EINVAL
970  *		EOPNOTSUPP
971  *	<pru_listen>:EINVAL[AF_UNIX]
972  *	<pru_listen>:EINVAL[TCP]
973  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
974  *	<pru_listen>:EINVAL[TCP]	Invalid argument
975  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
976  *	<pru_listen>:EACCES[TCP]	Permission denied
977  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
978  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
979  *	<pru_listen>:EPERM[TCP]		Operation not permitted
980  *	<sf_listen>:???
981  *
982  * Notes:	Other <pru_listen> returns depend on the protocol family; all
983  *		<sf_listen> returns depend on what the filter author causes
984  *		their filter to return.
985  */
986 int
solisten(struct socket * so,int backlog)987 solisten(struct socket *so, int backlog)
988 {
989 	struct proc *p = current_proc();
990 	int error = 0;
991 
992 	socket_lock(so, 1);
993 
994 	so_update_last_owner_locked(so, p);
995 	so_update_policy(so);
996 
997 #if NECP
998 	so_update_necp_policy(so, NULL, NULL);
999 #endif /* NECP */
1000 
1001 	if (so->so_proto == NULL) {
1002 		error = EINVAL;
1003 		goto out;
1004 	}
1005 	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1006 		error = EOPNOTSUPP;
1007 		goto out;
1008 	}
1009 
1010 	/*
1011 	 * If the listen request is made on a socket that is not fully
1012 	 * disconnected, or on a socket that has been marked as inactive,
1013 	 * reject the request now.
1014 	 */
1015 	if ((so->so_state &
1016 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1017 	    (so->so_flags & SOF_DEFUNCT)) {
1018 		error = EINVAL;
1019 		if (so->so_flags & SOF_DEFUNCT) {
1020 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1021 			    "(%d)\n", __func__, proc_pid(p),
1022 			    proc_best_name(p),
1023 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1024 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1025 		}
1026 		goto out;
1027 	}
1028 
1029 	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1030 		error = EPERM;
1031 		goto out;
1032 	}
1033 
1034 	error = sflt_listen(so);
1035 	if (error == 0) {
1036 		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1037 	}
1038 
1039 	if (error) {
1040 		if (error == EJUSTRETURN) {
1041 			error = 0;
1042 		}
1043 		goto out;
1044 	}
1045 
1046 	if (TAILQ_EMPTY(&so->so_comp)) {
1047 		so->so_options |= SO_ACCEPTCONN;
1048 	}
1049 	/*
1050 	 * POSIX: The implementation may have an upper limit on the length of
1051 	 * the listen queue-either global or per accepting socket. If backlog
1052 	 * exceeds this limit, the length of the listen queue is set to the
1053 	 * limit.
1054 	 *
1055 	 * If listen() is called with a backlog argument value that is less
1056 	 * than 0, the function behaves as if it had been called with a backlog
1057 	 * argument value of 0.
1058 	 *
1059 	 * A backlog argument of 0 may allow the socket to accept connections,
1060 	 * in which case the length of the listen queue may be set to an
1061 	 * implementation-defined minimum value.
1062 	 */
1063 	if (backlog <= 0 || backlog > somaxconn) {
1064 		backlog = somaxconn;
1065 	}
1066 
1067 	so->so_qlimit = (short)backlog;
1068 out:
1069 	socket_unlock(so, 1);
1070 	return error;
1071 }
1072 
1073 /*
1074  * The "accept list lock" protects the fields related to the listener queues
1075  * because we can unlock a socket to respect the lock ordering between
1076  * the listener socket and its clients sockets. The lock ordering is first to
1077  * acquire the client socket before the listener socket.
1078  *
1079  * The accept list lock serializes access to the following fields:
1080  * - of the listener socket:
1081  *   - so_comp
1082  *   - so_incomp
1083  *   - so_qlen
1084  *   - so_inqlen
1085  * - of client sockets that are in so_comp or so_incomp:
1086  *   - so_head
1087  *   - so_list
1088  *
1089  * As one can see the accept list lock protects the consistent of the
1090  * linkage of the client sockets.
1091  *
1092  * Note that those fields may be read without holding the accept list lock
1093  * for a preflight provided the accept list lock is taken when committing
1094  * to take an action based on the result of the preflight. The preflight
1095  * saves the cost of doing the unlock/lock dance.
1096  */
1097 void
so_acquire_accept_list(struct socket * head,struct socket * so)1098 so_acquire_accept_list(struct socket *head, struct socket *so)
1099 {
1100 	lck_mtx_t *mutex_held;
1101 
1102 	if (head->so_proto->pr_getlock == NULL) {
1103 		return;
1104 	}
1105 	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1106 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1107 
1108 	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1109 		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1110 		return;
1111 	}
1112 	if (so != NULL) {
1113 		socket_unlock(so, 0);
1114 	}
1115 	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1116 		so_accept_list_waits += 1;
1117 		msleep((caddr_t)&head->so_incomp, mutex_held,
1118 		    PSOCK | PCATCH, __func__, NULL);
1119 	}
1120 	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1121 	if (so != NULL) {
1122 		socket_unlock(head, 0);
1123 		socket_lock(so, 0);
1124 		socket_lock(head, 0);
1125 	}
1126 }
1127 
1128 void
so_release_accept_list(struct socket * head)1129 so_release_accept_list(struct socket *head)
1130 {
1131 	if (head->so_proto->pr_getlock != NULL) {
1132 		lck_mtx_t *mutex_held;
1133 
1134 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1135 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1136 
1137 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1138 		wakeup((caddr_t)&head->so_incomp);
1139 	}
1140 }
1141 
1142 void
sofreelastref(struct socket * so,int dealloc)1143 sofreelastref(struct socket *so, int dealloc)
1144 {
1145 	struct socket *head = so->so_head;
1146 
1147 	/* Assume socket is locked */
1148 
1149 #if FLOW_DIVERT
1150 	if (so->so_flags & SOF_FLOW_DIVERT) {
1151 		flow_divert_detach(so);
1152 	}
1153 #endif  /* FLOW_DIVERT */
1154 
1155 #if CONTENT_FILTER
1156 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1157 		cfil_sock_detach(so);
1158 	}
1159 #endif /* CONTENT_FILTER */
1160 
1161 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1162 		soflow_detach(so);
1163 	}
1164 
1165 	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1166 		selthreadclear(&so->so_snd.sb_sel);
1167 		selthreadclear(&so->so_rcv.sb_sel);
1168 		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1169 		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1170 		so->so_event = sonullevent;
1171 		return;
1172 	}
1173 	if (head != NULL) {
1174 		/*
1175 		 * Need to lock the listener when the protocol has
1176 		 * per socket locks
1177 		 */
1178 		if (head->so_proto->pr_getlock != NULL) {
1179 			socket_lock(head, 1);
1180 			so_acquire_accept_list(head, so);
1181 		}
1182 		if (so->so_state & SS_INCOMP) {
1183 			so->so_state &= ~SS_INCOMP;
1184 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
1185 			head->so_incqlen--;
1186 			head->so_qlen--;
1187 			so->so_head = NULL;
1188 
1189 			if (head->so_proto->pr_getlock != NULL) {
1190 				so_release_accept_list(head);
1191 				socket_unlock(head, 1);
1192 			}
1193 		} else if (so->so_state & SS_COMP) {
1194 			if (head->so_proto->pr_getlock != NULL) {
1195 				so_release_accept_list(head);
1196 				socket_unlock(head, 1);
1197 			}
1198 			/*
1199 			 * We must not decommission a socket that's
1200 			 * on the accept(2) queue.  If we do, then
1201 			 * accept(2) may hang after select(2) indicated
1202 			 * that the listening socket was ready.
1203 			 */
1204 			selthreadclear(&so->so_snd.sb_sel);
1205 			selthreadclear(&so->so_rcv.sb_sel);
1206 			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1207 			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1208 			so->so_event = sonullevent;
1209 			return;
1210 		} else {
1211 			if (head->so_proto->pr_getlock != NULL) {
1212 				so_release_accept_list(head);
1213 				socket_unlock(head, 1);
1214 			}
1215 			printf("sofree: not queued\n");
1216 		}
1217 	}
1218 	sowflush(so);
1219 	sorflush(so);
1220 
1221 	/* 3932268: disable upcall */
1222 	so->so_rcv.sb_flags &= ~SB_UPCALL;
1223 	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1224 	so->so_event = sonullevent;
1225 
1226 	if (dealloc) {
1227 		sodealloc(so);
1228 	}
1229 }
1230 
1231 void
soclose_wait_locked(struct socket * so)1232 soclose_wait_locked(struct socket *so)
1233 {
1234 	lck_mtx_t *mutex_held;
1235 
1236 	if (so->so_proto->pr_getlock != NULL) {
1237 		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1238 	} else {
1239 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1240 	}
1241 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1242 
1243 	/*
1244 	 * Double check here and return if there's no outstanding upcall;
1245 	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1246 	 */
1247 	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1248 		return;
1249 	}
1250 	so->so_rcv.sb_flags &= ~SB_UPCALL;
1251 	so->so_snd.sb_flags &= ~SB_UPCALL;
1252 	so->so_flags |= SOF_CLOSEWAIT;
1253 
1254 	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1255 	    "soclose_wait_locked", NULL);
1256 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1257 	so->so_flags &= ~SOF_CLOSEWAIT;
1258 }
1259 
1260 /*
1261  * Close a socket on last file table reference removal.
1262  * Initiate disconnect if connected.
1263  * Free socket when disconnect complete.
1264  */
1265 int
soclose_locked(struct socket * so)1266 soclose_locked(struct socket *so)
1267 {
1268 	int error = 0;
1269 	struct timespec ts;
1270 
1271 	if (so->so_usecount == 0) {
1272 		panic("soclose: so=%p refcount=0", so);
1273 		/* NOTREACHED */
1274 	}
1275 
1276 	sflt_notify(so, sock_evt_closing, NULL);
1277 
1278 	if (so->so_upcallusecount) {
1279 		soclose_wait_locked(so);
1280 	}
1281 
1282 #if CONTENT_FILTER
1283 	/*
1284 	 * We have to wait until the content filters are done
1285 	 */
1286 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1287 		cfil_sock_close_wait(so);
1288 		cfil_sock_is_closed(so);
1289 		cfil_sock_detach(so);
1290 	}
1291 #endif /* CONTENT_FILTER */
1292 
1293 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1294 		soflow_detach(so);
1295 	}
1296 
1297 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1298 		soresume(current_proc(), so, 1);
1299 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1300 	}
1301 
1302 	if ((so->so_options & SO_ACCEPTCONN)) {
1303 		struct socket *sp, *sonext;
1304 		int persocklock = 0;
1305 		int incomp_overflow_only;
1306 
1307 		/*
1308 		 * We do not want new connection to be added
1309 		 * to the connection queues
1310 		 */
1311 		so->so_options &= ~SO_ACCEPTCONN;
1312 
1313 		/*
1314 		 * We can drop the lock on the listener once
1315 		 * we've acquired the incoming list
1316 		 */
1317 		if (so->so_proto->pr_getlock != NULL) {
1318 			persocklock = 1;
1319 			so_acquire_accept_list(so, NULL);
1320 			socket_unlock(so, 0);
1321 		}
1322 again:
1323 		incomp_overflow_only = 1;
1324 
1325 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1326 			/*
1327 			 * Radar 5350314
1328 			 * skip sockets thrown away by tcpdropdropblreq
1329 			 * they will get cleanup by the garbage collection.
1330 			 * otherwise, remove the incomp socket from the queue
1331 			 * and let soabort trigger the appropriate cleanup.
1332 			 */
1333 			if (sp->so_flags & SOF_OVERFLOW) {
1334 				continue;
1335 			}
1336 
1337 			if (persocklock != 0) {
1338 				socket_lock(sp, 1);
1339 			}
1340 
1341 			/*
1342 			 * Radar 27945981
1343 			 * The extra reference for the list insure the
1344 			 * validity of the socket pointer when we perform the
1345 			 * unlock of the head above
1346 			 */
1347 			if (sp->so_state & SS_INCOMP) {
1348 				sp->so_state &= ~SS_INCOMP;
1349 				sp->so_head = NULL;
1350 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1351 				so->so_incqlen--;
1352 				so->so_qlen--;
1353 
1354 				(void) soabort(sp);
1355 			} else {
1356 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1357 				    __func__, sp);
1358 			}
1359 
1360 			if (persocklock != 0) {
1361 				socket_unlock(sp, 1);
1362 			}
1363 		}
1364 
1365 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1366 			/* Dequeue from so_comp since sofree() won't do it */
1367 			if (persocklock != 0) {
1368 				socket_lock(sp, 1);
1369 			}
1370 
1371 			if (sp->so_state & SS_COMP) {
1372 				sp->so_state &= ~SS_COMP;
1373 				sp->so_head = NULL;
1374 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1375 				so->so_qlen--;
1376 
1377 				(void) soabort(sp);
1378 			} else {
1379 				panic("%s sp %p in so_comp but !SS_COMP",
1380 				    __func__, sp);
1381 			}
1382 
1383 			if (persocklock) {
1384 				socket_unlock(sp, 1);
1385 			}
1386 		}
1387 
1388 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1389 #if (DEBUG | DEVELOPMENT)
1390 			panic("%s head %p so_comp not empty", __func__, so);
1391 #endif /* (DEVELOPMENT || DEBUG) */
1392 
1393 			goto again;
1394 		}
1395 
1396 		if (!TAILQ_EMPTY(&so->so_comp)) {
1397 #if (DEBUG | DEVELOPMENT)
1398 			panic("%s head %p so_comp not empty", __func__, so);
1399 #endif /* (DEVELOPMENT || DEBUG) */
1400 
1401 			goto again;
1402 		}
1403 
1404 		if (persocklock) {
1405 			socket_lock(so, 0);
1406 			so_release_accept_list(so);
1407 		}
1408 	}
1409 	if (so->so_pcb == NULL) {
1410 		/* 3915887: mark the socket as ready for dealloc */
1411 		so->so_flags |= SOF_PCBCLEARING;
1412 		goto discard;
1413 	}
1414 
1415 	if (so->so_state & SS_ISCONNECTED) {
1416 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1417 			error = sodisconnectlocked(so);
1418 			if (error) {
1419 				goto drop;
1420 			}
1421 		}
1422 		if (so->so_options & SO_LINGER) {
1423 			if ((so->so_state & SS_ISDISCONNECTING) &&
1424 			    (so->so_state & SS_NBIO)) {
1425 				goto drop;
1426 			}
1427 			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1428 				lck_mtx_t *mutex_held;
1429 
1430 				if (so->so_proto->pr_getlock != NULL) {
1431 					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1432 				} else {
1433 					mutex_held = so->so_proto->pr_domain->dom_mtx;
1434 				}
1435 				ts.tv_sec = (so->so_linger / 100);
1436 				ts.tv_nsec = (so->so_linger % 100) *
1437 				    NSEC_PER_USEC * 1000 * 10;
1438 				error = msleep((caddr_t)&so->so_timeo,
1439 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1440 				if (error) {
1441 					/*
1442 					 * It's OK when the time fires,
1443 					 * don't report an error
1444 					 */
1445 					if (error == EWOULDBLOCK) {
1446 						error = 0;
1447 					}
1448 					break;
1449 				}
1450 			}
1451 		}
1452 	}
1453 drop:
1454 	if (so->so_usecount == 0) {
1455 		panic("soclose: usecount is zero so=%p", so);
1456 		/* NOTREACHED */
1457 	}
1458 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1459 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1460 		if (error == 0) {
1461 			error = error2;
1462 		}
1463 	}
1464 	if (so->so_usecount <= 0) {
1465 		panic("soclose: usecount is zero so=%p", so);
1466 		/* NOTREACHED */
1467 	}
1468 discard:
1469 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1470 	    (so->so_state & SS_NOFDREF)) {
1471 		panic("soclose: NOFDREF");
1472 		/* NOTREACHED */
1473 	}
1474 	so->so_state |= SS_NOFDREF;
1475 
1476 	if ((so->so_flags & SOF_KNOTE) != 0) {
1477 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1478 	}
1479 
1480 	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1481 
1482 	VERIFY(so->so_usecount > 0);
1483 	so->so_usecount--;
1484 	sofree(so);
1485 	return error;
1486 }
1487 
1488 int
soclose(struct socket * so)1489 soclose(struct socket *so)
1490 {
1491 	int error = 0;
1492 	socket_lock(so, 1);
1493 
1494 	if (so->so_retaincnt == 0) {
1495 		error = soclose_locked(so);
1496 	} else {
1497 		/*
1498 		 * if the FD is going away, but socket is
1499 		 * retained in kernel remove its reference
1500 		 */
1501 		so->so_usecount--;
1502 		if (so->so_usecount < 2) {
1503 			panic("soclose: retaincnt non null and so=%p "
1504 			    "usecount=%d\n", so, so->so_usecount);
1505 		}
1506 	}
1507 	socket_unlock(so, 1);
1508 	return error;
1509 }
1510 
1511 /*
1512  * Must be called at splnet...
1513  */
1514 /* Should already be locked */
1515 int
soabort(struct socket * so)1516 soabort(struct socket *so)
1517 {
1518 	int error;
1519 
1520 #ifdef MORE_LOCKING_DEBUG
1521 	lck_mtx_t *mutex_held;
1522 
1523 	if (so->so_proto->pr_getlock != NULL) {
1524 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1525 	} else {
1526 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1527 	}
1528 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1529 #endif
1530 
1531 	if ((so->so_flags & SOF_ABORTED) == 0) {
1532 		so->so_flags |= SOF_ABORTED;
1533 		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1534 		if (error) {
1535 			sofree(so);
1536 			return error;
1537 		}
1538 	}
1539 	return 0;
1540 }
1541 
1542 int
soacceptlock(struct socket * so,struct sockaddr ** nam,int dolock)1543 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1544 {
1545 	int error;
1546 
1547 	if (dolock) {
1548 		socket_lock(so, 1);
1549 	}
1550 
1551 	so_update_last_owner_locked(so, PROC_NULL);
1552 	so_update_policy(so);
1553 #if NECP
1554 	so_update_necp_policy(so, NULL, NULL);
1555 #endif /* NECP */
1556 
1557 	if ((so->so_state & SS_NOFDREF) == 0) {
1558 		panic("soaccept: !NOFDREF");
1559 	}
1560 	so->so_state &= ~SS_NOFDREF;
1561 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1562 
1563 	if (dolock) {
1564 		socket_unlock(so, 1);
1565 	}
1566 	return error;
1567 }
1568 
1569 int
soaccept(struct socket * so,struct sockaddr ** nam)1570 soaccept(struct socket *so, struct sockaddr **nam)
1571 {
1572 	return soacceptlock(so, nam, 1);
1573 }
1574 
1575 int
soacceptfilter(struct socket * so,struct socket * head)1576 soacceptfilter(struct socket *so, struct socket *head)
1577 {
1578 	struct sockaddr *local = NULL, *remote = NULL;
1579 	int error = 0;
1580 
1581 	/*
1582 	 * Hold the lock even if this socket has not been made visible
1583 	 * to the filter(s).  For sockets with global locks, this protects
1584 	 * against the head or peer going away
1585 	 */
1586 	socket_lock(so, 1);
1587 	if (sogetaddr_locked(so, &remote, 1) != 0 ||
1588 	    sogetaddr_locked(so, &local, 0) != 0) {
1589 		so->so_state &= ~SS_NOFDREF;
1590 		socket_unlock(so, 1);
1591 		soclose(so);
1592 		/* Out of resources; try it again next time */
1593 		error = ECONNABORTED;
1594 		goto done;
1595 	}
1596 
1597 	error = sflt_accept(head, so, local, remote);
1598 
1599 	/*
1600 	 * If we get EJUSTRETURN from one of the filters, mark this socket
1601 	 * as inactive and return it anyway.  This newly accepted socket
1602 	 * will be disconnected later before we hand it off to the caller.
1603 	 */
1604 	if (error == EJUSTRETURN) {
1605 		error = 0;
1606 		(void) sosetdefunct(current_proc(), so,
1607 		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1608 	}
1609 
1610 	if (error != 0) {
1611 		/*
1612 		 * This may seem like a duplication to the above error
1613 		 * handling part when we return ECONNABORTED, except
1614 		 * the following is done while holding the lock since
1615 		 * the socket has been exposed to the filter(s) earlier.
1616 		 */
1617 		so->so_state &= ~SS_NOFDREF;
1618 		socket_unlock(so, 1);
1619 		soclose(so);
1620 		/* Propagate socket filter's error code to the caller */
1621 	} else {
1622 		socket_unlock(so, 1);
1623 	}
1624 done:
1625 	/* Callee checks for NULL pointer */
1626 	sock_freeaddr(remote);
1627 	sock_freeaddr(local);
1628 	return error;
1629 }
1630 
1631 /*
1632  * Returns:	0			Success
1633  *		EOPNOTSUPP		Operation not supported on socket
1634  *		EISCONN			Socket is connected
1635  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1636  *	<pru_connect>:EINVAL		Invalid argument
1637  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1638  *	<pru_connect>:EACCES		Permission denied
1639  *	<pru_connect>:EADDRINUSE	Address in use
1640  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1641  *	<pru_connect>:EPERM		Operation not permitted
1642  *	<sf_connect_out>:???		[anything a filter writer might set]
1643  */
1644 int
soconnectlock(struct socket * so,struct sockaddr * nam,int dolock)1645 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1646 {
1647 	int error;
1648 	struct proc *p = current_proc();
1649 	tracker_metadata_t metadata = { };
1650 
1651 	if (dolock) {
1652 		socket_lock(so, 1);
1653 	}
1654 
1655 	so_update_last_owner_locked(so, p);
1656 	so_update_policy(so);
1657 
1658 #if NECP
1659 	so_update_necp_policy(so, NULL, nam);
1660 #endif /* NECP */
1661 
1662 	/*
1663 	 * If this is a listening socket or if this is a previously-accepted
1664 	 * socket that has been marked as inactive, reject the connect request.
1665 	 */
1666 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1667 		error = EOPNOTSUPP;
1668 		if (so->so_flags & SOF_DEFUNCT) {
1669 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1670 			    "(%d)\n", __func__, proc_pid(p),
1671 			    proc_best_name(p),
1672 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1673 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1674 		}
1675 		if (dolock) {
1676 			socket_unlock(so, 1);
1677 		}
1678 		return error;
1679 	}
1680 
1681 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1682 		if (dolock) {
1683 			socket_unlock(so, 1);
1684 		}
1685 		return EPERM;
1686 	}
1687 
1688 	/*
1689 	 * If protocol is connection-based, can only connect once.
1690 	 * Otherwise, if connected, try to disconnect first.
1691 	 * This allows user to disconnect by connecting to, e.g.,
1692 	 * a null address.
1693 	 */
1694 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1695 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1696 	    (error = sodisconnectlocked(so)))) {
1697 		error = EISCONN;
1698 	} else {
1699 		/*
1700 		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
1701 		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
1702 		 */
1703 		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1704 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1705 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1706 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1707 				}
1708 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1709 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1710 				}
1711 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1712 					printf("connect() - failed necp_set_socket_domain_attributes");
1713 				}
1714 			}
1715 		}
1716 
1717 		/*
1718 		 * Run connect filter before calling protocol:
1719 		 *  - non-blocking connect returns before completion;
1720 		 */
1721 		error = sflt_connectout(so, nam);
1722 		if (error != 0) {
1723 			if (error == EJUSTRETURN) {
1724 				error = 0;
1725 			}
1726 		} else {
1727 			error = (*so->so_proto->pr_usrreqs->pru_connect)
1728 			    (so, nam, p);
1729 			if (error != 0) {
1730 				so->so_state &= ~SS_ISCONNECTING;
1731 			}
1732 		}
1733 	}
1734 	if (dolock) {
1735 		socket_unlock(so, 1);
1736 	}
1737 	return error;
1738 }
1739 
1740 int
soconnect(struct socket * so,struct sockaddr * nam)1741 soconnect(struct socket *so, struct sockaddr *nam)
1742 {
1743 	return soconnectlock(so, nam, 1);
1744 }
1745 
1746 /*
1747  * Returns:	0			Success
1748  *	<pru_connect2>:EINVAL[AF_UNIX]
1749  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1750  *	<pru_connect2>:???		[other protocol families]
1751  *
1752  * Notes:	<pru_connect2> is not supported by [TCP].
1753  */
1754 int
soconnect2(struct socket * so1,struct socket * so2)1755 soconnect2(struct socket *so1, struct socket *so2)
1756 {
1757 	int error;
1758 
1759 	socket_lock(so1, 1);
1760 	if (so2->so_proto->pr_lock) {
1761 		socket_lock(so2, 1);
1762 	}
1763 
1764 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1765 
1766 	socket_unlock(so1, 1);
1767 	if (so2->so_proto->pr_lock) {
1768 		socket_unlock(so2, 1);
1769 	}
1770 	return error;
1771 }
1772 
1773 int
soconnectxlocked(struct socket * so,struct sockaddr * src,struct sockaddr * dst,struct proc * p,uint32_t ifscope,sae_associd_t aid,sae_connid_t * pcid,uint32_t flags,void * arg,uint32_t arglen,uio_t auio,user_ssize_t * bytes_written)1774 soconnectxlocked(struct socket *so, struct sockaddr *src,
1775     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1776     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1777     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1778 {
1779 	int error;
1780 	tracker_metadata_t metadata = { };
1781 
1782 	so_update_last_owner_locked(so, p);
1783 	so_update_policy(so);
1784 
1785 	/*
1786 	 * If this is a listening socket or if this is a previously-accepted
1787 	 * socket that has been marked as inactive, reject the connect request.
1788 	 */
1789 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1790 		error = EOPNOTSUPP;
1791 		if (so->so_flags & SOF_DEFUNCT) {
1792 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1793 			    "(%d)\n", __func__, proc_pid(p),
1794 			    proc_best_name(p),
1795 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1796 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1797 		}
1798 		return error;
1799 	}
1800 
1801 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1802 		return EPERM;
1803 	}
1804 
1805 	/*
1806 	 * If protocol is connection-based, can only connect once
1807 	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1808 	 * try to disconnect first.  This allows user to disconnect
1809 	 * by connecting to, e.g., a null address.
1810 	 */
1811 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1812 	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1813 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1814 	    (error = sodisconnectlocked(so)) != 0)) {
1815 		error = EISCONN;
1816 	} else {
1817 		/*
1818 		 * For TCP, check if destination address is a tracker and mark the socket accordingly
1819 		 * (only if it hasn't been marked yet).
1820 		 */
1821 		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1822 		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1823 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1824 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1825 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1826 				}
1827 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1828 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1829 				}
1830 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1831 					printf("connectx() - failed necp_set_socket_domain_attributes");
1832 				}
1833 			}
1834 		}
1835 
1836 		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1837 		    (flags & CONNECT_DATA_IDEMPOTENT)) {
1838 			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1839 
1840 			if (flags & CONNECT_DATA_AUTHENTICATED) {
1841 				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1842 			}
1843 		}
1844 
1845 		/*
1846 		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1847 		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1848 		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1849 		 * Case 3 allows user to combine write with connect even if they have
1850 		 * no use for TFO (such as regular TCP, and UDP).
1851 		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1852 		 */
1853 		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1854 		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1855 			so->so_flags1 |= SOF1_PRECONNECT_DATA;
1856 		}
1857 
1858 		/*
1859 		 * If a user sets data idempotent and does not pass an uio, or
1860 		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
1861 		 * SOF1_DATA_IDEMPOTENT.
1862 		 */
1863 		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1864 		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1865 			/* We should return EINVAL instead perhaps. */
1866 			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1867 		}
1868 
1869 		/*
1870 		 * Run connect filter before calling protocol:
1871 		 *  - non-blocking connect returns before completion;
1872 		 */
1873 		error = sflt_connectout(so, dst);
1874 		if (error != 0) {
1875 			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1876 			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1877 			if (error == EJUSTRETURN) {
1878 				error = 0;
1879 			}
1880 		} else {
1881 			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1882 			    (so, src, dst, p, ifscope, aid, pcid,
1883 			    flags, arg, arglen, auio, bytes_written);
1884 			if (error != 0) {
1885 				so->so_state &= ~SS_ISCONNECTING;
1886 				if (error != EINPROGRESS) {
1887 					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1888 				}
1889 			}
1890 		}
1891 	}
1892 
1893 	return error;
1894 }
1895 
1896 int
sodisconnectlocked(struct socket * so)1897 sodisconnectlocked(struct socket *so)
1898 {
1899 	int error;
1900 
1901 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1902 		error = ENOTCONN;
1903 		goto bad;
1904 	}
1905 	if (so->so_state & SS_ISDISCONNECTING) {
1906 		error = EALREADY;
1907 		goto bad;
1908 	}
1909 
1910 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1911 	if (error == 0) {
1912 		sflt_notify(so, sock_evt_disconnected, NULL);
1913 	}
1914 
1915 bad:
1916 	return error;
1917 }
1918 
1919 /* Locking version */
1920 int
sodisconnect(struct socket * so)1921 sodisconnect(struct socket *so)
1922 {
1923 	int error;
1924 
1925 	socket_lock(so, 1);
1926 	error = sodisconnectlocked(so);
1927 	socket_unlock(so, 1);
1928 	return error;
1929 }
1930 
1931 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1932 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1933 {
1934 	int error;
1935 
1936 	/*
1937 	 * Call the protocol disconnectx handler; let it handle all
1938 	 * matters related to the connection state of this session.
1939 	 */
1940 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1941 	if (error == 0) {
1942 		/*
1943 		 * The event applies only for the session, not for
1944 		 * the disconnection of individual subflows.
1945 		 */
1946 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1947 			sflt_notify(so, sock_evt_disconnected, NULL);
1948 		}
1949 	}
1950 	return error;
1951 }
1952 
1953 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1954 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1955 {
1956 	int error;
1957 
1958 	socket_lock(so, 1);
1959 	error = sodisconnectxlocked(so, aid, cid);
1960 	socket_unlock(so, 1);
1961 	return error;
1962 }
1963 
1964 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1965 
1966 /*
1967  * sosendcheck will lock the socket buffer if it isn't locked and
1968  * verify that there is space for the data being inserted.
1969  *
1970  * Returns:	0			Success
1971  *		EPIPE
1972  *	sblock:EWOULDBLOCK
1973  *	sblock:EINTR
1974  *	sbwait:EBADF
1975  *	sbwait:EINTR
1976  *	[so_error]:???
1977  */
1978 int
sosendcheck(struct socket * so,struct sockaddr * addr,user_ssize_t resid,int32_t clen,int32_t atomic,int flags,int * sblocked)1979 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1980     int32_t clen, int32_t atomic, int flags, int *sblocked)
1981 {
1982 	int     error = 0;
1983 	int32_t space;
1984 	int     assumelock = 0;
1985 
1986 restart:
1987 	if (*sblocked == 0) {
1988 		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1989 		    so->so_send_filt_thread != 0 &&
1990 		    so->so_send_filt_thread == current_thread()) {
1991 			/*
1992 			 * We're being called recursively from a filter,
1993 			 * allow this to continue. Radar 4150520.
1994 			 * Don't set sblocked because we don't want
1995 			 * to perform an unlock later.
1996 			 */
1997 			assumelock = 1;
1998 		} else {
1999 			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2000 			if (error) {
2001 				if (so->so_flags & SOF_DEFUNCT) {
2002 					goto defunct;
2003 				}
2004 				return error;
2005 			}
2006 			*sblocked = 1;
2007 		}
2008 	}
2009 
2010 	/*
2011 	 * If a send attempt is made on a socket that has been marked
2012 	 * as inactive (disconnected), reject the request.
2013 	 */
2014 	if (so->so_flags & SOF_DEFUNCT) {
2015 defunct:
2016 		error = EPIPE;
2017 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2018 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
2019 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2020 		    SOCK_DOM(so), SOCK_TYPE(so), error);
2021 		return error;
2022 	}
2023 
2024 	if (so->so_state & SS_CANTSENDMORE) {
2025 #if CONTENT_FILTER
2026 		/*
2027 		 * Can re-inject data of half closed connections
2028 		 */
2029 		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2030 		    so->so_snd.sb_cfil_thread == current_thread() &&
2031 		    cfil_sock_data_pending(&so->so_snd) != 0) {
2032 			CFIL_LOG(LOG_INFO,
2033 			    "so %llx ignore SS_CANTSENDMORE",
2034 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2035 		} else
2036 #endif /* CONTENT_FILTER */
2037 		return EPIPE;
2038 	}
2039 	if (so->so_error) {
2040 		error = so->so_error;
2041 		so->so_error = 0;
2042 		return error;
2043 	}
2044 
2045 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2046 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2047 			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2048 			    (resid != 0 || clen == 0) &&
2049 			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2050 				return ENOTCONN;
2051 			}
2052 		} else if (addr == 0) {
2053 			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2054 			       ENOTCONN : EDESTADDRREQ;
2055 		}
2056 	}
2057 
2058 	space = sbspace(&so->so_snd);
2059 
2060 	if (flags & MSG_OOB) {
2061 		space += 1024;
2062 	}
2063 	if ((atomic && resid > so->so_snd.sb_hiwat) ||
2064 	    clen > so->so_snd.sb_hiwat) {
2065 		return EMSGSIZE;
2066 	}
2067 
2068 	if ((space < resid + clen &&
2069 	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2070 	    space < clen)) ||
2071 	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2072 		/*
2073 		 * don't block the connectx call when there's more data
2074 		 * than can be copied.
2075 		 */
2076 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2077 			if (space == 0) {
2078 				return EWOULDBLOCK;
2079 			}
2080 			if (space < (int32_t)so->so_snd.sb_lowat) {
2081 				return 0;
2082 			}
2083 		}
2084 		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2085 		    assumelock) {
2086 			return EWOULDBLOCK;
2087 		}
2088 		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2089 		*sblocked = 0;
2090 		error = sbwait(&so->so_snd);
2091 		if (error) {
2092 			if (so->so_flags & SOF_DEFUNCT) {
2093 				goto defunct;
2094 			}
2095 			return error;
2096 		}
2097 		goto restart;
2098 	}
2099 	return 0;
2100 }
2101 
2102 /*
2103  * Send on a socket.
2104  * If send must go all at once and message is larger than
2105  * send buffering, then hard error.
2106  * Lock against other senders.
2107  * If must go all at once and not enough room now, then
2108  * inform user that this would block and do nothing.
2109  * Otherwise, if nonblocking, send as much as possible.
2110  * The data to be sent is described by "uio" if nonzero,
2111  * otherwise by the mbuf chain "top" (which must be null
2112  * if uio is not).  Data provided in mbuf chain must be small
2113  * enough to send all at once.
2114  *
2115  * Returns nonzero on error, timeout or signal; callers
2116  * must check for short counts if EINTR/ERESTART are returned.
2117  * Data and control buffers are freed on return.
2118  *
2119  * Returns:	0			Success
2120  *		EOPNOTSUPP
2121  *		EINVAL
2122  *		ENOBUFS
2123  *	uiomove:EFAULT
2124  *	sosendcheck:EPIPE
2125  *	sosendcheck:EWOULDBLOCK
2126  *	sosendcheck:EINTR
2127  *	sosendcheck:EBADF
2128  *	sosendcheck:EINTR
2129  *	sosendcheck:???			[value from so_error]
2130  *	<pru_send>:ECONNRESET[TCP]
2131  *	<pru_send>:EINVAL[TCP]
2132  *	<pru_send>:ENOBUFS[TCP]
2133  *	<pru_send>:EADDRINUSE[TCP]
2134  *	<pru_send>:EADDRNOTAVAIL[TCP]
2135  *	<pru_send>:EAFNOSUPPORT[TCP]
2136  *	<pru_send>:EACCES[TCP]
2137  *	<pru_send>:EAGAIN[TCP]
2138  *	<pru_send>:EPERM[TCP]
2139  *	<pru_send>:EMSGSIZE[TCP]
2140  *	<pru_send>:EHOSTUNREACH[TCP]
2141  *	<pru_send>:ENETUNREACH[TCP]
2142  *	<pru_send>:ENETDOWN[TCP]
2143  *	<pru_send>:ENOMEM[TCP]
2144  *	<pru_send>:ENOBUFS[TCP]
2145  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2146  *	<pru_send>:EINVAL[AF_UNIX]
2147  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2148  *	<pru_send>:EPIPE[AF_UNIX]
2149  *	<pru_send>:ENOTCONN[AF_UNIX]
2150  *	<pru_send>:EISCONN[AF_UNIX]
2151  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2152  *	<sf_data_out>:???		[whatever a filter author chooses]
2153  *
2154  * Notes:	Other <pru_send> returns depend on the protocol family; all
2155  *		<sf_data_out> returns depend on what the filter author causes
2156  *		their filter to return.
2157  */
2158 int
sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)2159 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2160     struct mbuf *top, struct mbuf *control, int flags)
2161 {
2162 	struct mbuf **mp;
2163 	struct mbuf *m, *freelist = NULL;
2164 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2165 	user_ssize_t space, len, resid, orig_resid;
2166 	int clen = 0, error, dontroute, sendflags;
2167 	int atomic = sosendallatonce(so) || top;
2168 	int sblocked = 0;
2169 	struct proc *p = current_proc();
2170 	uint16_t headroom = 0;
2171 	ssize_t mlen;
2172 	boolean_t en_tracing = FALSE;
2173 
2174 	if (uio != NULL) {
2175 		resid = uio_resid(uio);
2176 	} else {
2177 		resid = top->m_pkthdr.len;
2178 	}
2179 
2180 	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2181 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2182 
2183 	socket_lock(so, 1);
2184 
2185 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2186 		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2187 	}
2188 
2189 	/*
2190 	 * trace if tracing & network (vs. unix) sockets & and
2191 	 * non-loopback
2192 	 */
2193 	if (ENTR_SHOULDTRACE &&
2194 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2195 		struct inpcb *inp = sotoinpcb(so);
2196 		if (inp->inp_last_outifp != NULL &&
2197 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2198 			en_tracing = TRUE;
2199 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2200 			    VM_KERNEL_ADDRPERM(so),
2201 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2202 			    (int64_t)resid);
2203 			orig_resid = resid;
2204 		}
2205 	}
2206 
2207 	/*
2208 	 * Re-injection should not affect process accounting
2209 	 */
2210 	if ((flags & MSG_SKIPCFIL) == 0) {
2211 		so_update_last_owner_locked(so, p);
2212 		so_update_policy(so);
2213 
2214 #if NECP
2215 		so_update_necp_policy(so, NULL, addr);
2216 #endif /* NECP */
2217 	}
2218 
2219 	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2220 		error = EOPNOTSUPP;
2221 		goto out_locked;
2222 	}
2223 
2224 	/*
2225 	 * In theory resid should be unsigned.
2226 	 * However, space must be signed, as it might be less than 0
2227 	 * if we over-committed, and we must use a signed comparison
2228 	 * of space and resid.  On the other hand, a negative resid
2229 	 * causes us to loop sending 0-length segments to the protocol.
2230 	 *
2231 	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2232 	 *
2233 	 * Note: We limit resid to be a positive int value as we use
2234 	 * imin() to set bytes_to_copy -- radr://14558484
2235 	 */
2236 	if (resid < 0 || resid > INT_MAX ||
2237 	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2238 		error = EINVAL;
2239 		goto out_locked;
2240 	}
2241 
2242 	dontroute = (flags & MSG_DONTROUTE) &&
2243 	    (so->so_options & SO_DONTROUTE) == 0 &&
2244 	    (so->so_proto->pr_flags & PR_ATOMIC);
2245 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2246 
2247 	if (control != NULL) {
2248 		clen = control->m_len;
2249 	}
2250 
2251 	if (soreserveheadroom != 0) {
2252 		headroom = so->so_pktheadroom;
2253 	}
2254 
2255 	do {
2256 		error = sosendcheck(so, addr, resid, clen, atomic, flags,
2257 		    &sblocked);
2258 		if (error) {
2259 			goto out_locked;
2260 		}
2261 
2262 		mp = &top;
2263 		space = sbspace(&so->so_snd) - clen;
2264 		space += ((flags & MSG_OOB) ? 1024 : 0);
2265 
2266 		do {
2267 			if (uio == NULL) {
2268 				/*
2269 				 * Data is prepackaged in "top".
2270 				 */
2271 				resid = 0;
2272 				if (flags & MSG_EOR) {
2273 					top->m_flags |= M_EOR;
2274 				}
2275 			} else {
2276 				int chainlength;
2277 				int bytes_to_copy;
2278 				boolean_t jumbocl;
2279 				boolean_t bigcl;
2280 				int bytes_to_alloc;
2281 
2282 				bytes_to_copy = imin((int)resid, (int)space);
2283 
2284 				bytes_to_alloc = bytes_to_copy;
2285 				if (top == NULL) {
2286 					bytes_to_alloc += headroom;
2287 				}
2288 
2289 				if (sosendminchain > 0) {
2290 					chainlength = 0;
2291 				} else {
2292 					chainlength = sosendmaxchain;
2293 				}
2294 
2295 				/*
2296 				 * Use big 4 KB cluster when the outgoing interface
2297 				 * does not prefer 2 KB clusters
2298 				 */
2299 				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2300 				    sosendbigcl_ignore_capab;
2301 
2302 				/*
2303 				 * Attempt to use larger than system page-size
2304 				 * clusters for large writes only if there is
2305 				 * a jumbo cluster pool and if the socket is
2306 				 * marked accordingly.
2307 				 */
2308 				jumbocl = sosendjcl && njcl > 0 &&
2309 				    ((so->so_flags & SOF_MULTIPAGES) ||
2310 				    sosendjcl_ignore_capab) &&
2311 				    bigcl;
2312 
2313 				socket_unlock(so, 0);
2314 
2315 				do {
2316 					int num_needed;
2317 					int hdrs_needed = (top == NULL) ? 1 : 0;
2318 
2319 					/*
2320 					 * try to maintain a local cache of mbuf
2321 					 * clusters needed to complete this
2322 					 * write the list is further limited to
2323 					 * the number that are currently needed
2324 					 * to fill the socket this mechanism
2325 					 * allows a large number of mbufs/
2326 					 * clusters to be grabbed under a single
2327 					 * mbuf lock... if we can't get any
2328 					 * clusters, than fall back to trying
2329 					 * for mbufs if we fail early (or
2330 					 * miscalcluate the number needed) make
2331 					 * sure to release any clusters we
2332 					 * haven't yet consumed.
2333 					 */
2334 					if (freelist == NULL &&
2335 					    bytes_to_alloc > MBIGCLBYTES &&
2336 					    jumbocl) {
2337 						num_needed =
2338 						    bytes_to_alloc / M16KCLBYTES;
2339 
2340 						if ((bytes_to_alloc -
2341 						    (num_needed * M16KCLBYTES))
2342 						    >= MINCLSIZE) {
2343 							num_needed++;
2344 						}
2345 
2346 						freelist =
2347 						    m_getpackets_internal(
2348 							(unsigned int *)&num_needed,
2349 							hdrs_needed, M_WAIT, 0,
2350 							M16KCLBYTES);
2351 						/*
2352 						 * Fall back to 4K cluster size
2353 						 * if allocation failed
2354 						 */
2355 					}
2356 
2357 					if (freelist == NULL &&
2358 					    bytes_to_alloc > MCLBYTES &&
2359 					    bigcl) {
2360 						num_needed =
2361 						    bytes_to_alloc / MBIGCLBYTES;
2362 
2363 						if ((bytes_to_alloc -
2364 						    (num_needed * MBIGCLBYTES)) >=
2365 						    MINCLSIZE) {
2366 							num_needed++;
2367 						}
2368 
2369 						freelist =
2370 						    m_getpackets_internal(
2371 							(unsigned int *)&num_needed,
2372 							hdrs_needed, M_WAIT, 0,
2373 							MBIGCLBYTES);
2374 						/*
2375 						 * Fall back to cluster size
2376 						 * if allocation failed
2377 						 */
2378 					}
2379 
2380 					/*
2381 					 * Allocate a cluster as we want to
2382 					 * avoid to split the data in more
2383 					 * that one segment and using MINCLSIZE
2384 					 * would lead us to allocate two mbufs
2385 					 */
2386 					if (soreserveheadroom != 0 &&
2387 					    freelist == NULL &&
2388 					    ((top == NULL &&
2389 					    bytes_to_alloc > _MHLEN) ||
2390 					    bytes_to_alloc > _MLEN)) {
2391 						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2392 						    MCLBYTES;
2393 						freelist =
2394 						    m_getpackets_internal(
2395 							(unsigned int *)&num_needed,
2396 							hdrs_needed, M_WAIT, 0,
2397 							MCLBYTES);
2398 						/*
2399 						 * Fall back to a single mbuf
2400 						 * if allocation failed
2401 						 */
2402 					} else if (freelist == NULL &&
2403 					    bytes_to_alloc > MINCLSIZE) {
2404 						num_needed =
2405 						    bytes_to_alloc / MCLBYTES;
2406 
2407 						if ((bytes_to_alloc -
2408 						    (num_needed * MCLBYTES)) >=
2409 						    MINCLSIZE) {
2410 							num_needed++;
2411 						}
2412 
2413 						freelist =
2414 						    m_getpackets_internal(
2415 							(unsigned int *)&num_needed,
2416 							hdrs_needed, M_WAIT, 0,
2417 							MCLBYTES);
2418 						/*
2419 						 * Fall back to a single mbuf
2420 						 * if allocation failed
2421 						 */
2422 					}
2423 					/*
2424 					 * For datagram protocols, leave
2425 					 * headroom for protocol headers
2426 					 * in the first cluster of the chain
2427 					 */
2428 					if (freelist != NULL && atomic &&
2429 					    top == NULL && headroom > 0) {
2430 						freelist->m_data += headroom;
2431 					}
2432 
2433 					/*
2434 					 * Fall back to regular mbufs without
2435 					 * reserving the socket headroom
2436 					 */
2437 					if (freelist == NULL) {
2438 						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2439 							if (top == NULL) {
2440 								MGETHDR(freelist,
2441 								    M_WAIT, MT_DATA);
2442 							} else {
2443 								MGET(freelist,
2444 								    M_WAIT, MT_DATA);
2445 							}
2446 						}
2447 
2448 						if (freelist == NULL) {
2449 							error = ENOBUFS;
2450 							socket_lock(so, 0);
2451 							goto out_locked;
2452 						}
2453 						/*
2454 						 * For datagram protocols,
2455 						 * leave room for protocol
2456 						 * headers in first mbuf.
2457 						 */
2458 						if (atomic && top == NULL &&
2459 						    bytes_to_copy > 0 &&
2460 						    bytes_to_copy < MHLEN) {
2461 							MH_ALIGN(freelist,
2462 							    bytes_to_copy);
2463 						}
2464 					}
2465 					m = freelist;
2466 					freelist = m->m_next;
2467 					m->m_next = NULL;
2468 
2469 					if ((m->m_flags & M_EXT)) {
2470 						mlen = m->m_ext.ext_size -
2471 						    M_LEADINGSPACE(m);
2472 					} else if ((m->m_flags & M_PKTHDR)) {
2473 						mlen = MHLEN - M_LEADINGSPACE(m);
2474 						m_add_crumb(m, PKT_CRUMB_SOSEND);
2475 					} else {
2476 						mlen = MLEN - M_LEADINGSPACE(m);
2477 					}
2478 					len = imin((int)mlen, bytes_to_copy);
2479 
2480 					chainlength += len;
2481 
2482 					space -= len;
2483 
2484 					error = uiomove(mtod(m, caddr_t),
2485 					    (int)len, uio);
2486 
2487 					resid = uio_resid(uio);
2488 
2489 					m->m_len = (int32_t)len;
2490 					*mp = m;
2491 					top->m_pkthdr.len += len;
2492 					if (error) {
2493 						break;
2494 					}
2495 					mp = &m->m_next;
2496 					if (resid <= 0) {
2497 						if (flags & MSG_EOR) {
2498 							top->m_flags |= M_EOR;
2499 						}
2500 						break;
2501 					}
2502 					bytes_to_copy = imin((int)resid, (int)space);
2503 				} while (space > 0 &&
2504 				    (chainlength < sosendmaxchain || atomic ||
2505 				    resid < MINCLSIZE));
2506 
2507 				socket_lock(so, 0);
2508 
2509 				if (error) {
2510 					goto out_locked;
2511 				}
2512 			}
2513 
2514 			if (dontroute) {
2515 				so->so_options |= SO_DONTROUTE;
2516 			}
2517 
2518 			/*
2519 			 * Compute flags here, for pru_send and NKEs
2520 			 *
2521 			 * If the user set MSG_EOF, the protocol
2522 			 * understands this flag and nothing left to
2523 			 * send then use PRU_SEND_EOF instead of PRU_SEND.
2524 			 */
2525 			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2526 			    ((flags & MSG_EOF) &&
2527 			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2528 			    (resid <= 0)) ? PRUS_EOF :
2529 			    /* If there is more to send set PRUS_MORETOCOME */
2530 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2531 
2532 			if ((flags & MSG_SKIPCFIL) == 0) {
2533 				/*
2534 				 * Socket filter processing
2535 				 */
2536 				error = sflt_data_out(so, addr, &top,
2537 				    &control, (sendflags & MSG_OOB) ?
2538 				    sock_data_filt_flag_oob : 0);
2539 				if (error) {
2540 					if (error == EJUSTRETURN) {
2541 						error = 0;
2542 						goto packet_consumed;
2543 					}
2544 					goto out_locked;
2545 				}
2546 #if CONTENT_FILTER
2547 				/*
2548 				 * Content filter processing
2549 				 */
2550 				error = cfil_sock_data_out(so, addr, top,
2551 				    control, sendflags, dgram_flow_entry);
2552 				if (error) {
2553 					if (error == EJUSTRETURN) {
2554 						error = 0;
2555 						goto packet_consumed;
2556 					}
2557 					goto out_locked;
2558 				}
2559 #endif /* CONTENT_FILTER */
2560 			}
2561 			error = (*so->so_proto->pr_usrreqs->pru_send)
2562 			    (so, sendflags, top, addr, control, p);
2563 
2564 packet_consumed:
2565 			if (dontroute) {
2566 				so->so_options &= ~SO_DONTROUTE;
2567 			}
2568 
2569 			clen = 0;
2570 			control = NULL;
2571 			top = NULL;
2572 			mp = &top;
2573 			if (error) {
2574 				goto out_locked;
2575 			}
2576 		} while (resid && space > 0);
2577 	} while (resid);
2578 
2579 out_locked:
2580 	if (sblocked) {
2581 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2582 	} else {
2583 		socket_unlock(so, 1);
2584 	}
2585 	if (top != NULL) {
2586 		m_freem(top);
2587 	}
2588 	if (control != NULL) {
2589 		m_freem(control);
2590 	}
2591 	if (freelist != NULL) {
2592 		m_freem_list(freelist);
2593 	}
2594 
2595 	if (dgram_flow_entry != NULL) {
2596 		soflow_free_flow(dgram_flow_entry);
2597 	}
2598 
2599 	soclearfastopen(so);
2600 
2601 	if (en_tracing) {
2602 		/* resid passed here is the bytes left in uio */
2603 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2604 		    VM_KERNEL_ADDRPERM(so),
2605 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2606 		    (int64_t)(orig_resid - resid));
2607 	}
2608 	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2609 	    so->so_snd.sb_cc, space, error);
2610 
2611 	return error;
2612 }
2613 
2614 int
sosend_reinject(struct socket * so,struct sockaddr * addr,struct mbuf * top,struct mbuf * control,uint32_t sendflags)2615 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2616 {
2617 	struct mbuf *m0 = NULL, *control_end = NULL;
2618 
2619 	socket_lock_assert_owned(so);
2620 
2621 	/*
2622 	 * top must points to mbuf chain to be sent.
2623 	 * If control is not NULL, top must be packet header
2624 	 */
2625 	VERIFY(top != NULL &&
2626 	    (control == NULL || top->m_flags & M_PKTHDR));
2627 
2628 	/*
2629 	 * If control is not passed in, see if we can get it
2630 	 * from top.
2631 	 */
2632 	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2633 		// Locate start of control if present and start of data
2634 		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2635 			if (m0->m_flags & M_PKTHDR) {
2636 				top = m0;
2637 				break;
2638 			} else if (m0->m_type == MT_CONTROL) {
2639 				if (control == NULL) {
2640 					// Found start of control
2641 					control = m0;
2642 				}
2643 				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2644 					// Found end of control
2645 					control_end = m0;
2646 				}
2647 			}
2648 		}
2649 		if (control_end != NULL) {
2650 			control_end->m_next = NULL;
2651 		}
2652 	}
2653 
2654 	int error = (*so->so_proto->pr_usrreqs->pru_send)
2655 	    (so, sendflags, top, addr, control, current_proc());
2656 
2657 	return error;
2658 }
2659 
2660 /*
2661  * Supported only connected sockets (no address) without ancillary data
2662  * (control mbuf) for atomic protocols
2663  */
2664 int
sosend_list(struct socket * so,struct uio ** uioarray,u_int uiocnt,int flags)2665 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2666 {
2667 	struct mbuf *m, *freelist = NULL;
2668 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2669 	user_ssize_t len, resid;
2670 	int error, dontroute;
2671 	int atomic = sosendallatonce(so);
2672 	int sblocked = 0;
2673 	struct proc *p = current_proc();
2674 	u_int uiofirst = 0;
2675 	u_int uiolast = 0;
2676 	struct mbuf *top = NULL;
2677 	uint16_t headroom = 0;
2678 	ssize_t mlen;
2679 	boolean_t bigcl;
2680 
2681 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2682 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2683 
2684 	if (so->so_type != SOCK_DGRAM) {
2685 		error = EINVAL;
2686 		goto out;
2687 	}
2688 	if (atomic == 0) {
2689 		error = EINVAL;
2690 		goto out;
2691 	}
2692 	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2693 		error = EPROTONOSUPPORT;
2694 		goto out;
2695 	}
2696 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2697 		error = EINVAL;
2698 		goto out;
2699 	}
2700 	resid = uio_array_resid(uioarray, uiocnt);
2701 
2702 	/*
2703 	 * In theory resid should be unsigned.
2704 	 * However, space must be signed, as it might be less than 0
2705 	 * if we over-committed, and we must use a signed comparison
2706 	 * of space and resid.  On the other hand, a negative resid
2707 	 * causes us to loop sending 0-length segments to the protocol.
2708 	 *
2709 	 * Note: We limit resid to be a positive int value as we use
2710 	 * imin() to set bytes_to_copy -- radr://14558484
2711 	 */
2712 	if (resid < 0 || resid > INT_MAX) {
2713 		error = EINVAL;
2714 		goto out;
2715 	}
2716 
2717 	socket_lock(so, 1);
2718 	so_update_last_owner_locked(so, p);
2719 	so_update_policy(so);
2720 
2721 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2722 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2723 	}
2724 
2725 #if NECP
2726 	so_update_necp_policy(so, NULL, NULL);
2727 #endif /* NECP */
2728 
2729 	dontroute = (flags & MSG_DONTROUTE) &&
2730 	    (so->so_options & SO_DONTROUTE) == 0 &&
2731 	    (so->so_proto->pr_flags & PR_ATOMIC);
2732 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2733 
2734 	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2735 	if (error) {
2736 		goto release;
2737 	}
2738 
2739 	/*
2740 	 * Use big 4 KB clusters when the outgoing interface does not prefer
2741 	 * 2 KB clusters
2742 	 */
2743 	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2744 
2745 	if (soreserveheadroom != 0) {
2746 		headroom = so->so_pktheadroom;
2747 	}
2748 
2749 	do {
2750 		int i;
2751 		int num_needed = 0;
2752 		int chainlength;
2753 		size_t maxpktlen = 0;
2754 		int bytes_to_alloc;
2755 
2756 		if (sosendminchain > 0) {
2757 			chainlength = 0;
2758 		} else {
2759 			chainlength = sosendmaxchain;
2760 		}
2761 
2762 		socket_unlock(so, 0);
2763 
2764 		/*
2765 		 * Find a set of uio that fit in a reasonable number
2766 		 * of mbuf packets
2767 		 */
2768 		for (i = uiofirst; i < uiocnt; i++) {
2769 			struct uio *auio = uioarray[i];
2770 
2771 			len = uio_resid(auio);
2772 
2773 			/* Do nothing for empty messages */
2774 			if (len == 0) {
2775 				continue;
2776 			}
2777 
2778 			num_needed += 1;
2779 			uiolast += 1;
2780 
2781 			if (len > maxpktlen) {
2782 				maxpktlen = len;
2783 			}
2784 
2785 			chainlength += len;
2786 			if (chainlength > sosendmaxchain) {
2787 				break;
2788 			}
2789 		}
2790 		/*
2791 		 * Nothing left to send
2792 		 */
2793 		if (num_needed == 0) {
2794 			socket_lock(so, 0);
2795 			break;
2796 		}
2797 		/*
2798 		 * Allocate buffer large enough to include headroom space for
2799 		 * network and link header
2800 		 *
2801 		 */
2802 		bytes_to_alloc = (int) maxpktlen + headroom;
2803 
2804 		/*
2805 		 * Allocate a single contiguous buffer of the smallest available
2806 		 * size when possible
2807 		 */
2808 		if (bytes_to_alloc > MCLBYTES &&
2809 		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2810 			freelist = m_getpackets_internal(
2811 				(unsigned int *)&num_needed,
2812 				num_needed, M_WAIT, 1,
2813 				MBIGCLBYTES);
2814 		} else if (bytes_to_alloc > _MHLEN &&
2815 		    bytes_to_alloc <= MCLBYTES) {
2816 			freelist = m_getpackets_internal(
2817 				(unsigned int *)&num_needed,
2818 				num_needed, M_WAIT, 1,
2819 				MCLBYTES);
2820 		} else {
2821 			freelist = m_allocpacket_internal(
2822 				(unsigned int *)&num_needed,
2823 				bytes_to_alloc, NULL, M_WAIT, 1, 0);
2824 		}
2825 
2826 		if (freelist == NULL) {
2827 			socket_lock(so, 0);
2828 			error = ENOMEM;
2829 			goto release;
2830 		}
2831 		/*
2832 		 * Copy each uio of the set into its own mbuf packet
2833 		 */
2834 		for (i = uiofirst, m = freelist;
2835 		    i < uiolast && m != NULL;
2836 		    i++) {
2837 			int bytes_to_copy;
2838 			struct mbuf *n;
2839 			struct uio *auio = uioarray[i];
2840 
2841 			bytes_to_copy = (int)uio_resid(auio);
2842 
2843 			/* Do nothing for empty messages */
2844 			if (bytes_to_copy == 0) {
2845 				continue;
2846 			}
2847 			/*
2848 			 * Leave headroom for protocol headers
2849 			 * in the first mbuf of the chain
2850 			 */
2851 			m->m_data += headroom;
2852 
2853 			for (n = m; n != NULL; n = n->m_next) {
2854 				if ((m->m_flags & M_EXT)) {
2855 					mlen = m->m_ext.ext_size -
2856 					    M_LEADINGSPACE(m);
2857 				} else if ((m->m_flags & M_PKTHDR)) {
2858 					mlen =
2859 					    MHLEN - M_LEADINGSPACE(m);
2860 				} else {
2861 					mlen = MLEN - M_LEADINGSPACE(m);
2862 				}
2863 				len = imin((int)mlen, bytes_to_copy);
2864 
2865 				/*
2866 				 * Note: uiomove() decrements the iovec
2867 				 * length
2868 				 */
2869 				error = uiomove(mtod(n, caddr_t),
2870 				    (int)len, auio);
2871 				if (error != 0) {
2872 					break;
2873 				}
2874 				n->m_len = (int32_t)len;
2875 				m->m_pkthdr.len += len;
2876 
2877 				VERIFY(m->m_pkthdr.len <= maxpktlen);
2878 
2879 				bytes_to_copy -= len;
2880 				resid -= len;
2881 			}
2882 			if (m->m_pkthdr.len == 0) {
2883 				printf(
2884 					"%s:%d so %llx pkt %llx type %u len null\n",
2885 					__func__, __LINE__,
2886 					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2887 					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2888 					m->m_type);
2889 			}
2890 			if (error != 0) {
2891 				break;
2892 			}
2893 			m = m->m_nextpkt;
2894 		}
2895 
2896 		socket_lock(so, 0);
2897 
2898 		if (error) {
2899 			goto release;
2900 		}
2901 		top = freelist;
2902 		freelist = NULL;
2903 
2904 		if (dontroute) {
2905 			so->so_options |= SO_DONTROUTE;
2906 		}
2907 
2908 		if ((flags & MSG_SKIPCFIL) == 0) {
2909 			struct mbuf **prevnextp = NULL;
2910 
2911 			for (i = uiofirst, m = top;
2912 			    i < uiolast && m != NULL;
2913 			    i++) {
2914 				struct mbuf *nextpkt = m->m_nextpkt;
2915 
2916 				/*
2917 				 * Socket filter processing
2918 				 */
2919 				error = sflt_data_out(so, NULL, &m,
2920 				    NULL, 0);
2921 				if (error != 0 && error != EJUSTRETURN) {
2922 					goto release;
2923 				}
2924 
2925 #if CONTENT_FILTER
2926 				if (error == 0) {
2927 					/*
2928 					 * Content filter processing
2929 					 */
2930 					error = cfil_sock_data_out(so, NULL, m,
2931 					    NULL, 0, dgram_flow_entry);
2932 					if (error != 0 && error != EJUSTRETURN) {
2933 						goto release;
2934 					}
2935 				}
2936 #endif /* CONTENT_FILTER */
2937 				/*
2938 				 * Remove packet from the list when
2939 				 * swallowed by a filter
2940 				 */
2941 				if (error == EJUSTRETURN) {
2942 					error = 0;
2943 					if (prevnextp != NULL) {
2944 						*prevnextp = nextpkt;
2945 					} else {
2946 						top = nextpkt;
2947 					}
2948 				}
2949 
2950 				m = nextpkt;
2951 				if (m != NULL) {
2952 					prevnextp = &m->m_nextpkt;
2953 				}
2954 			}
2955 		}
2956 		if (top != NULL) {
2957 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2958 			    (so, 0, top, NULL, NULL, p);
2959 		}
2960 
2961 		if (dontroute) {
2962 			so->so_options &= ~SO_DONTROUTE;
2963 		}
2964 
2965 		top = NULL;
2966 		uiofirst = uiolast;
2967 	} while (resid > 0 && error == 0);
2968 release:
2969 	if (sblocked) {
2970 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2971 	} else {
2972 		socket_unlock(so, 1);
2973 	}
2974 out:
2975 	if (top != NULL) {
2976 		m_freem(top);
2977 	}
2978 	if (freelist != NULL) {
2979 		m_freem_list(freelist);
2980 	}
2981 
2982 	if (dgram_flow_entry != NULL) {
2983 		soflow_free_flow(dgram_flow_entry);
2984 	}
2985 
2986 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2987 	    so->so_snd.sb_cc, 0, error);
2988 
2989 	return error;
2990 }
2991 
2992 /*
2993  * May return ERESTART when packet is dropped by MAC policy check
2994  */
2995 static int
soreceive_addr(struct proc * p,struct socket * so,struct sockaddr ** psa,int flags,struct mbuf ** mp,struct mbuf ** nextrecordp,int canwait)2996 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2997     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2998 {
2999 	int error = 0;
3000 	struct mbuf *m = *mp;
3001 	struct mbuf *nextrecord = *nextrecordp;
3002 
3003 	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3004 #if CONFIG_MACF_SOCKET_SUBSET
3005 	/*
3006 	 * Call the MAC framework for policy checking if we're in
3007 	 * the user process context and the socket isn't connected.
3008 	 */
3009 	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3010 		struct mbuf *m0 = m;
3011 		/*
3012 		 * Dequeue this record (temporarily) from the receive
3013 		 * list since we're about to drop the socket's lock
3014 		 * where a new record may arrive and be appended to
3015 		 * the list.  Upon MAC policy failure, the record
3016 		 * will be freed.  Otherwise, we'll add it back to
3017 		 * the head of the list.  We cannot rely on SB_LOCK
3018 		 * because append operation uses the socket's lock.
3019 		 */
3020 		do {
3021 			m->m_nextpkt = NULL;
3022 			sbfree(&so->so_rcv, m);
3023 			m = m->m_next;
3024 		} while (m != NULL);
3025 		m = m0;
3026 		so->so_rcv.sb_mb = nextrecord;
3027 		SB_EMPTY_FIXUP(&so->so_rcv);
3028 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3029 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3030 		socket_unlock(so, 0);
3031 
3032 		error = mac_socket_check_received(kauth_cred_get(), so,
3033 		    mtod(m, struct sockaddr *));
3034 
3035 		if (error != 0) {
3036 			/*
3037 			 * MAC policy failure; free this record and
3038 			 * process the next record (or block until
3039 			 * one is available).  We have adjusted sb_cc
3040 			 * and sb_mbcnt above so there is no need to
3041 			 * call sbfree() again.
3042 			 */
3043 			m_freem(m);
3044 			/*
3045 			 * Clear SB_LOCK but don't unlock the socket.
3046 			 * Process the next record or wait for one.
3047 			 */
3048 			socket_lock(so, 0);
3049 			sbunlock(&so->so_rcv, TRUE); /* stay locked */
3050 			error = ERESTART;
3051 			goto done;
3052 		}
3053 		socket_lock(so, 0);
3054 		/*
3055 		 * If the socket has been defunct'd, drop it.
3056 		 */
3057 		if (so->so_flags & SOF_DEFUNCT) {
3058 			m_freem(m);
3059 			error = ENOTCONN;
3060 			goto done;
3061 		}
3062 		/*
3063 		 * Re-adjust the socket receive list and re-enqueue
3064 		 * the record in front of any packets which may have
3065 		 * been appended while we dropped the lock.
3066 		 */
3067 		for (m = m0; m->m_next != NULL; m = m->m_next) {
3068 			sballoc(&so->so_rcv, m);
3069 		}
3070 		sballoc(&so->so_rcv, m);
3071 		if (so->so_rcv.sb_mb == NULL) {
3072 			so->so_rcv.sb_lastrecord = m0;
3073 			so->so_rcv.sb_mbtail = m;
3074 		}
3075 		m = m0;
3076 		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3077 		so->so_rcv.sb_mb = m;
3078 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3079 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3080 	}
3081 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3082 	if (psa != NULL) {
3083 		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3084 		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3085 			error = EWOULDBLOCK;
3086 			goto done;
3087 		}
3088 	}
3089 	if (flags & MSG_PEEK) {
3090 		m = m->m_next;
3091 	} else {
3092 		sbfree(&so->so_rcv, m);
3093 		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3094 			panic("%s: about to create invalid socketbuf",
3095 			    __func__);
3096 			/* NOTREACHED */
3097 		}
3098 		MFREE(m, so->so_rcv.sb_mb);
3099 		m = so->so_rcv.sb_mb;
3100 		if (m != NULL) {
3101 			m->m_nextpkt = nextrecord;
3102 		} else {
3103 			so->so_rcv.sb_mb = nextrecord;
3104 			SB_EMPTY_FIXUP(&so->so_rcv);
3105 		}
3106 	}
3107 done:
3108 	*mp = m;
3109 	*nextrecordp = nextrecord;
3110 
3111 	return error;
3112 }
3113 
3114 /*
3115  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3116  * so clear the data portion in order not to leak the file pointers
3117  */
3118 static void
sopeek_scm_rights(struct mbuf * rights)3119 sopeek_scm_rights(struct mbuf *rights)
3120 {
3121 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3122 
3123 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3124 		VERIFY(cm->cmsg_len <= rights->m_len);
3125 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3126 	}
3127 }
3128 
3129 /*
3130  * Process one or more MT_CONTROL mbufs present before any data mbufs
3131  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3132  * just copy the data; if !MSG_PEEK, we call into the protocol to
3133  * perform externalization.
3134  */
3135 static int
soreceive_ctl(struct socket * so,struct mbuf ** controlp,int flags,struct mbuf ** mp,struct mbuf ** nextrecordp)3136 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3137     struct mbuf **mp, struct mbuf **nextrecordp)
3138 {
3139 	int error = 0;
3140 	struct mbuf *cm = NULL, *cmn;
3141 	struct mbuf **cme = &cm;
3142 	struct sockbuf *sb_rcv = &so->so_rcv;
3143 	struct mbuf **msgpcm = NULL;
3144 	struct mbuf *m = *mp;
3145 	struct mbuf *nextrecord = *nextrecordp;
3146 	struct protosw *pr = so->so_proto;
3147 
3148 	/*
3149 	 * Externalizing the control messages would require us to
3150 	 * drop the socket's lock below.  Once we re-acquire the
3151 	 * lock, the mbuf chain might change.  In order to preserve
3152 	 * consistency, we unlink all control messages from the
3153 	 * first mbuf chain in one shot and link them separately
3154 	 * onto a different chain.
3155 	 */
3156 	do {
3157 		if (flags & MSG_PEEK) {
3158 			if (controlp != NULL) {
3159 				if (*controlp == NULL) {
3160 					msgpcm = controlp;
3161 				}
3162 				*controlp = m_copy(m, 0, m->m_len);
3163 
3164 				/*
3165 				 * If we failed to allocate an mbuf,
3166 				 * release any previously allocated
3167 				 * mbufs for control data. Return
3168 				 * an error. Keep the mbufs in the
3169 				 * socket as this is using
3170 				 * MSG_PEEK flag.
3171 				 */
3172 				if (*controlp == NULL) {
3173 					m_freem(*msgpcm);
3174 					error = ENOBUFS;
3175 					goto done;
3176 				}
3177 
3178 				if (pr->pr_domain->dom_externalize != NULL) {
3179 					sopeek_scm_rights(*controlp);
3180 				}
3181 
3182 				controlp = &(*controlp)->m_next;
3183 			}
3184 			m = m->m_next;
3185 		} else {
3186 			m->m_nextpkt = NULL;
3187 			sbfree(sb_rcv, m);
3188 			sb_rcv->sb_mb = m->m_next;
3189 			m->m_next = NULL;
3190 			*cme = m;
3191 			cme = &(*cme)->m_next;
3192 			m = sb_rcv->sb_mb;
3193 		}
3194 	} while (m != NULL && m->m_type == MT_CONTROL);
3195 
3196 	if (!(flags & MSG_PEEK)) {
3197 		if (sb_rcv->sb_mb != NULL) {
3198 			sb_rcv->sb_mb->m_nextpkt = nextrecord;
3199 		} else {
3200 			sb_rcv->sb_mb = nextrecord;
3201 			SB_EMPTY_FIXUP(sb_rcv);
3202 		}
3203 		if (nextrecord == NULL) {
3204 			sb_rcv->sb_lastrecord = m;
3205 		}
3206 	}
3207 
3208 	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3209 	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3210 
3211 	while (cm != NULL) {
3212 		int cmsg_level;
3213 		int cmsg_type;
3214 
3215 		cmn = cm->m_next;
3216 		cm->m_next = NULL;
3217 		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3218 		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3219 
3220 		/*
3221 		 * Call the protocol to externalize SCM_RIGHTS message
3222 		 * and return the modified message to the caller upon
3223 		 * success.  Otherwise, all other control messages are
3224 		 * returned unmodified to the caller.  Note that we
3225 		 * only get into this loop if MSG_PEEK is not set.
3226 		 */
3227 		if (pr->pr_domain->dom_externalize != NULL &&
3228 		    cmsg_level == SOL_SOCKET &&
3229 		    cmsg_type == SCM_RIGHTS) {
3230 			/*
3231 			 * Release socket lock: see 3903171.  This
3232 			 * would also allow more records to be appended
3233 			 * to the socket buffer.  We still have SB_LOCK
3234 			 * set on it, so we can be sure that the head
3235 			 * of the mbuf chain won't change.
3236 			 */
3237 			socket_unlock(so, 0);
3238 			error = (*pr->pr_domain->dom_externalize)(cm);
3239 			socket_lock(so, 0);
3240 		} else {
3241 			error = 0;
3242 		}
3243 
3244 		if (controlp != NULL && error == 0) {
3245 			*controlp = cm;
3246 			controlp = &(*controlp)->m_next;
3247 		} else {
3248 			(void) m_free(cm);
3249 		}
3250 		cm = cmn;
3251 	}
3252 	/*
3253 	 * Update the value of nextrecord in case we received new
3254 	 * records when the socket was unlocked above for
3255 	 * externalizing SCM_RIGHTS.
3256 	 */
3257 	if (m != NULL) {
3258 		nextrecord = sb_rcv->sb_mb->m_nextpkt;
3259 	} else {
3260 		nextrecord = sb_rcv->sb_mb;
3261 	}
3262 
3263 done:
3264 	*mp = m;
3265 	*nextrecordp = nextrecord;
3266 
3267 	return error;
3268 }
3269 
3270 /*
3271  * If we have less data than requested, block awaiting more
3272  * (subject to any timeout) if:
3273  *   1. the current count is less than the low water mark, or
3274  *   2. MSG_WAITALL is set, and it is possible to do the entire
3275  *	receive operation at once if we block (resid <= hiwat).
3276  *   3. MSG_DONTWAIT is not set
3277  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3278  * we have to do the receive in sections, and thus risk returning
3279  * a short count if a timeout or signal occurs after we start.
3280  */
3281 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3282 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3283 {
3284 	struct protosw *pr = so->so_proto;
3285 
3286 	/* No mbufs in the receive-queue? Wait! */
3287 	if (m == NULL) {
3288 		return true;
3289 	}
3290 
3291 	/* Not enough data in the receive socket-buffer - we may have to wait */
3292 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3293 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3294 		/*
3295 		 * Application did set the lowater-mark, so we should wait for
3296 		 * this data to be present.
3297 		 */
3298 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3299 			return true;
3300 		}
3301 
3302 		/*
3303 		 * Application wants all the data - so let's try to do the
3304 		 * receive-operation at once by waiting for everything to
3305 		 * be there.
3306 		 */
3307 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3308 			return true;
3309 		}
3310 	}
3311 
3312 	return false;
3313 }
3314 
3315 /*
3316  * Implement receive operations on a socket.
3317  * We depend on the way that records are added to the sockbuf
3318  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3319  * must begin with an address if the protocol so specifies,
3320  * followed by an optional mbuf or mbufs containing ancillary data,
3321  * and then zero or more mbufs of data.
3322  * In order to avoid blocking network interrupts for the entire time here,
3323  * we splx() while doing the actual copy to user space.
3324  * Although the sockbuf is locked, new data may still be appended,
3325  * and thus we must maintain consistency of the sockbuf during that time.
3326  *
3327  * The caller may receive the data as a single mbuf chain by supplying
3328  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3329  * only for the count in uio_resid.
3330  *
3331  * Returns:	0			Success
3332  *		ENOBUFS
3333  *		ENOTCONN
3334  *		EWOULDBLOCK
3335  *	uiomove:EFAULT
3336  *	sblock:EWOULDBLOCK
3337  *	sblock:EINTR
3338  *	sbwait:EBADF
3339  *	sbwait:EINTR
3340  *	sodelayed_copy:EFAULT
3341  *	<pru_rcvoob>:EINVAL[TCP]
3342  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3343  *	<pru_rcvoob>:???
3344  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3345  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3346  *	<pr_domain->dom_externalize>:???
3347  *
3348  * Notes:	Additional return values from calls through <pru_rcvoob> and
3349  *		<pr_domain->dom_externalize> depend on protocols other than
3350  *		TCP or AF_UNIX, which are documented above.
3351  */
3352 int
soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)3353 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3354     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3355 {
3356 	struct mbuf *m, **mp, *ml = NULL;
3357 	struct mbuf *nextrecord, *free_list;
3358 	int flags, error, offset;
3359 	user_ssize_t len;
3360 	struct protosw *pr = so->so_proto;
3361 	int moff, type = 0;
3362 	user_ssize_t orig_resid = uio_resid(uio);
3363 	user_ssize_t delayed_copy_len;
3364 	int can_delay;
3365 	struct proc *p = current_proc();
3366 	boolean_t en_tracing = FALSE;
3367 
3368 	/*
3369 	 * Sanity check on the length passed by caller as we are making 'int'
3370 	 * comparisons
3371 	 */
3372 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3373 		return EINVAL;
3374 	}
3375 
3376 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3377 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3378 	    so->so_rcv.sb_hiwat);
3379 
3380 	socket_lock(so, 1);
3381 	so_update_last_owner_locked(so, p);
3382 	so_update_policy(so);
3383 
3384 #ifdef MORE_LOCKING_DEBUG
3385 	if (so->so_usecount == 1) {
3386 		panic("%s: so=%x no other reference on socket", __func__, so);
3387 		/* NOTREACHED */
3388 	}
3389 #endif
3390 	mp = mp0;
3391 	if (psa != NULL) {
3392 		*psa = NULL;
3393 	}
3394 	if (controlp != NULL) {
3395 		*controlp = NULL;
3396 	}
3397 	if (flagsp != NULL) {
3398 		flags = *flagsp & ~MSG_EOR;
3399 	} else {
3400 		flags = 0;
3401 	}
3402 
3403 	/*
3404 	 * If a recv attempt is made on a previously-accepted socket
3405 	 * that has been marked as inactive (disconnected), reject
3406 	 * the request.
3407 	 */
3408 	if (so->so_flags & SOF_DEFUNCT) {
3409 		struct sockbuf *sb = &so->so_rcv;
3410 
3411 		error = ENOTCONN;
3412 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3413 		    __func__, proc_pid(p), proc_best_name(p),
3414 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3415 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3416 		/*
3417 		 * This socket should have been disconnected and flushed
3418 		 * prior to being returned from sodefunct(); there should
3419 		 * be no data on its receive list, so panic otherwise.
3420 		 */
3421 		if (so->so_state & SS_DEFUNCT) {
3422 			sb_empty_assert(sb, __func__);
3423 		}
3424 		socket_unlock(so, 1);
3425 		return error;
3426 	}
3427 
3428 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3429 	    pr->pr_usrreqs->pru_preconnect) {
3430 		/*
3431 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3432 		 * calling write() right after this. *If* the app calls a read
3433 		 * we do not want to block this read indefinetely. Thus,
3434 		 * we trigger a connect so that the session gets initiated.
3435 		 */
3436 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3437 
3438 		if (error) {
3439 			socket_unlock(so, 1);
3440 			return error;
3441 		}
3442 	}
3443 
3444 	if (ENTR_SHOULDTRACE &&
3445 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3446 		/*
3447 		 * enable energy tracing for inet sockets that go over
3448 		 * non-loopback interfaces only.
3449 		 */
3450 		struct inpcb *inp = sotoinpcb(so);
3451 		if (inp->inp_last_outifp != NULL &&
3452 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3453 			en_tracing = TRUE;
3454 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3455 			    VM_KERNEL_ADDRPERM(so),
3456 			    ((so->so_state & SS_NBIO) ?
3457 			    kEnTrFlagNonBlocking : 0),
3458 			    (int64_t)orig_resid);
3459 		}
3460 	}
3461 
3462 	/*
3463 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3464 	 * regardless of the flags argument. Here is the case were
3465 	 * out-of-band data is not inline.
3466 	 */
3467 	if ((flags & MSG_OOB) ||
3468 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3469 	    (so->so_options & SO_OOBINLINE) == 0 &&
3470 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3471 		m = m_get(M_WAIT, MT_DATA);
3472 		if (m == NULL) {
3473 			socket_unlock(so, 1);
3474 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3475 			    ENOBUFS, 0, 0, 0, 0);
3476 			return ENOBUFS;
3477 		}
3478 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3479 		if (error) {
3480 			goto bad;
3481 		}
3482 		socket_unlock(so, 0);
3483 		do {
3484 			error = uiomove(mtod(m, caddr_t),
3485 			    imin((int)uio_resid(uio), m->m_len), uio);
3486 			m = m_free(m);
3487 		} while (uio_resid(uio) && error == 0 && m != NULL);
3488 		socket_lock(so, 0);
3489 bad:
3490 		if (m != NULL) {
3491 			m_freem(m);
3492 		}
3493 
3494 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3495 			if (error == EWOULDBLOCK || error == EINVAL) {
3496 				/*
3497 				 * Let's try to get normal data:
3498 				 * EWOULDBLOCK: out-of-band data not
3499 				 * receive yet. EINVAL: out-of-band data
3500 				 * already read.
3501 				 */
3502 				error = 0;
3503 				goto nooob;
3504 			} else if (error == 0 && flagsp != NULL) {
3505 				*flagsp |= MSG_OOB;
3506 			}
3507 		}
3508 		socket_unlock(so, 1);
3509 		if (en_tracing) {
3510 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3511 			    VM_KERNEL_ADDRPERM(so), 0,
3512 			    (int64_t)(orig_resid - uio_resid(uio)));
3513 		}
3514 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3515 		    0, 0, 0, 0);
3516 
3517 		return error;
3518 	}
3519 nooob:
3520 	if (mp != NULL) {
3521 		*mp = NULL;
3522 	}
3523 
3524 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3525 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3526 	}
3527 
3528 	free_list = NULL;
3529 	delayed_copy_len = 0;
3530 restart:
3531 #ifdef MORE_LOCKING_DEBUG
3532 	if (so->so_usecount <= 1) {
3533 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3534 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3535 	}
3536 #endif
3537 	/*
3538 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3539 	 * and if so just return to the caller.  This could happen when
3540 	 * soreceive() is called by a socket upcall function during the
3541 	 * time the socket is freed.  The socket buffer would have been
3542 	 * locked across the upcall, therefore we cannot put this thread
3543 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3544 	 * we may livelock), because the lock on the socket buffer will
3545 	 * only be released when the upcall routine returns to its caller.
3546 	 * Because the socket has been officially closed, there can be
3547 	 * no further read on it.
3548 	 *
3549 	 * A multipath subflow socket would have its SS_NOFDREF set by
3550 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3551 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3552 	 */
3553 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3554 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3555 		socket_unlock(so, 1);
3556 		return 0;
3557 	}
3558 
3559 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3560 	if (error) {
3561 		socket_unlock(so, 1);
3562 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3563 		    0, 0, 0, 0);
3564 		if (en_tracing) {
3565 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3566 			    VM_KERNEL_ADDRPERM(so), 0,
3567 			    (int64_t)(orig_resid - uio_resid(uio)));
3568 		}
3569 		return error;
3570 	}
3571 
3572 	m = so->so_rcv.sb_mb;
3573 	if (so_should_wait(so, uio, m, flags)) {
3574 		/*
3575 		 * Panic if we notice inconsistencies in the socket's
3576 		 * receive list; both sb_mb and sb_cc should correctly
3577 		 * reflect the contents of the list, otherwise we may
3578 		 * end up with false positives during select() or poll()
3579 		 * which could put the application in a bad state.
3580 		 */
3581 		SB_MB_CHECK(&so->so_rcv);
3582 
3583 		if (so->so_error) {
3584 			if (m != NULL) {
3585 				goto dontblock;
3586 			}
3587 			error = so->so_error;
3588 			if ((flags & MSG_PEEK) == 0) {
3589 				so->so_error = 0;
3590 			}
3591 			goto release;
3592 		}
3593 		if (so->so_state & SS_CANTRCVMORE) {
3594 #if CONTENT_FILTER
3595 			/*
3596 			 * Deal with half closed connections
3597 			 */
3598 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3599 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3600 				CFIL_LOG(LOG_INFO,
3601 				    "so %llx ignore SS_CANTRCVMORE",
3602 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3603 			} else
3604 #endif /* CONTENT_FILTER */
3605 			if (m != NULL) {
3606 				goto dontblock;
3607 			} else {
3608 				goto release;
3609 			}
3610 		}
3611 		for (; m != NULL; m = m->m_next) {
3612 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3613 				m = so->so_rcv.sb_mb;
3614 				goto dontblock;
3615 			}
3616 		}
3617 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3618 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3619 			error = ENOTCONN;
3620 			goto release;
3621 		}
3622 		if (uio_resid(uio) == 0) {
3623 			goto release;
3624 		}
3625 
3626 		if ((so->so_state & SS_NBIO) ||
3627 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3628 			error = EWOULDBLOCK;
3629 			goto release;
3630 		}
3631 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3632 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3633 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3634 #if EVEN_MORE_LOCKING_DEBUG
3635 		if (socket_debug) {
3636 			printf("Waiting for socket data\n");
3637 		}
3638 #endif
3639 
3640 		/*
3641 		 * Depending on the protocol (e.g. TCP), the following
3642 		 * might cause the socket lock to be dropped and later
3643 		 * be reacquired, and more data could have arrived and
3644 		 * have been appended to the receive socket buffer by
3645 		 * the time it returns.  Therefore, we only sleep in
3646 		 * sbwait() below if and only if the wait-condition is still
3647 		 * true.
3648 		 */
3649 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3650 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3651 		}
3652 
3653 		error = 0;
3654 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3655 			error = sbwait(&so->so_rcv);
3656 		}
3657 
3658 #if EVEN_MORE_LOCKING_DEBUG
3659 		if (socket_debug) {
3660 			printf("SORECEIVE - sbwait returned %d\n", error);
3661 		}
3662 #endif
3663 		if (so->so_usecount < 1) {
3664 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3665 			    __func__, so, so->so_usecount);
3666 			/* NOTREACHED */
3667 		}
3668 		if (error) {
3669 			socket_unlock(so, 1);
3670 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3671 			    0, 0, 0, 0);
3672 			if (en_tracing) {
3673 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3674 				    VM_KERNEL_ADDRPERM(so), 0,
3675 				    (int64_t)(orig_resid - uio_resid(uio)));
3676 			}
3677 			return error;
3678 		}
3679 		goto restart;
3680 	}
3681 dontblock:
3682 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3683 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3684 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3685 	nextrecord = m->m_nextpkt;
3686 
3687 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3688 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3689 		    mp0 == NULL);
3690 		if (error == ERESTART) {
3691 			goto restart;
3692 		} else if (error != 0) {
3693 			goto release;
3694 		}
3695 		orig_resid = 0;
3696 	}
3697 
3698 	/*
3699 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3700 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3701 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3702 	 * perform externalization.
3703 	 */
3704 	if (m != NULL && m->m_type == MT_CONTROL) {
3705 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3706 		if (error != 0) {
3707 			goto release;
3708 		}
3709 		orig_resid = 0;
3710 	}
3711 
3712 	if (m != NULL) {
3713 		if (!(flags & MSG_PEEK)) {
3714 			/*
3715 			 * We get here because m points to an mbuf following
3716 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3717 			 * processed above.  In any case, m should be pointing
3718 			 * to the head of the mbuf chain, and the nextrecord
3719 			 * should be either NULL or equal to m->m_nextpkt.
3720 			 * See comments above about SB_LOCK.
3721 			 */
3722 			if (m != so->so_rcv.sb_mb ||
3723 			    m->m_nextpkt != nextrecord) {
3724 				panic("%s: post-control !sync so=%p m=%p "
3725 				    "nextrecord=%p\n", __func__, so, m,
3726 				    nextrecord);
3727 				/* NOTREACHED */
3728 			}
3729 			if (nextrecord == NULL) {
3730 				so->so_rcv.sb_lastrecord = m;
3731 			}
3732 		}
3733 		type = m->m_type;
3734 		if (type == MT_OOBDATA) {
3735 			flags |= MSG_OOB;
3736 		}
3737 	} else {
3738 		if (!(flags & MSG_PEEK)) {
3739 			SB_EMPTY_FIXUP(&so->so_rcv);
3740 		}
3741 	}
3742 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3743 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3744 
3745 	moff = 0;
3746 	offset = 0;
3747 
3748 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3749 		can_delay = 1;
3750 	} else {
3751 		can_delay = 0;
3752 	}
3753 
3754 	while (m != NULL &&
3755 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3756 		if (m->m_type == MT_OOBDATA) {
3757 			if (type != MT_OOBDATA) {
3758 				break;
3759 			}
3760 		} else if (type == MT_OOBDATA) {
3761 			break;
3762 		}
3763 
3764 		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3765 		    m->m_type != MT_HEADER) {
3766 			break;
3767 		}
3768 		/*
3769 		 * Make sure to allways set MSG_OOB event when getting
3770 		 * out of band data inline.
3771 		 */
3772 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3773 		    (so->so_options & SO_OOBINLINE) != 0 &&
3774 		    (so->so_state & SS_RCVATMARK) != 0) {
3775 			flags |= MSG_OOB;
3776 		}
3777 		so->so_state &= ~SS_RCVATMARK;
3778 		len = uio_resid(uio) - delayed_copy_len;
3779 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3780 			len = so->so_oobmark - offset;
3781 		}
3782 		if (len > m->m_len - moff) {
3783 			len = m->m_len - moff;
3784 		}
3785 		/*
3786 		 * If mp is set, just pass back the mbufs.
3787 		 * Otherwise copy them out via the uio, then free.
3788 		 * Sockbuf must be consistent here (points to current mbuf,
3789 		 * it points to next record) when we drop priority;
3790 		 * we must note any additions to the sockbuf when we
3791 		 * block interrupts again.
3792 		 */
3793 		if (mp == NULL) {
3794 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3795 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3796 			if (can_delay && len == m->m_len) {
3797 				/*
3798 				 * only delay the copy if we're consuming the
3799 				 * mbuf and we're NOT in MSG_PEEK mode
3800 				 * and we have enough data to make it worthwile
3801 				 * to drop and retake the lock... can_delay
3802 				 * reflects the state of the 2 latter
3803 				 * constraints moff should always be zero
3804 				 * in these cases
3805 				 */
3806 				delayed_copy_len += len;
3807 			} else {
3808 				if (delayed_copy_len) {
3809 					error = sodelayed_copy(so, uio,
3810 					    &free_list, &delayed_copy_len);
3811 
3812 					if (error) {
3813 						goto release;
3814 					}
3815 					/*
3816 					 * can only get here if MSG_PEEK is not
3817 					 * set therefore, m should point at the
3818 					 * head of the rcv queue; if it doesn't,
3819 					 * it means something drastically
3820 					 * changed while we were out from behind
3821 					 * the lock in sodelayed_copy. perhaps
3822 					 * a RST on the stream. in any event,
3823 					 * the stream has been interrupted. it's
3824 					 * probably best just to return whatever
3825 					 * data we've moved and let the caller
3826 					 * sort it out...
3827 					 */
3828 					if (m != so->so_rcv.sb_mb) {
3829 						break;
3830 					}
3831 				}
3832 				socket_unlock(so, 0);
3833 				error = uiomove(mtod(m, caddr_t) + moff,
3834 				    (int)len, uio);
3835 				socket_lock(so, 0);
3836 
3837 				if (error) {
3838 					goto release;
3839 				}
3840 			}
3841 		} else {
3842 			uio_setresid(uio, (uio_resid(uio) - len));
3843 		}
3844 		if (len == m->m_len - moff) {
3845 			if (m->m_flags & M_EOR) {
3846 				flags |= MSG_EOR;
3847 			}
3848 			if (flags & MSG_PEEK) {
3849 				m = m->m_next;
3850 				moff = 0;
3851 			} else {
3852 				nextrecord = m->m_nextpkt;
3853 				sbfree(&so->so_rcv, m);
3854 				m->m_nextpkt = NULL;
3855 
3856 				if (mp != NULL) {
3857 					*mp = m;
3858 					mp = &m->m_next;
3859 					so->so_rcv.sb_mb = m = m->m_next;
3860 					*mp = NULL;
3861 				} else {
3862 					if (free_list == NULL) {
3863 						free_list = m;
3864 					} else {
3865 						ml->m_next = m;
3866 					}
3867 					ml = m;
3868 					so->so_rcv.sb_mb = m = m->m_next;
3869 					ml->m_next = NULL;
3870 				}
3871 				if (m != NULL) {
3872 					m->m_nextpkt = nextrecord;
3873 					if (nextrecord == NULL) {
3874 						so->so_rcv.sb_lastrecord = m;
3875 					}
3876 				} else {
3877 					so->so_rcv.sb_mb = nextrecord;
3878 					SB_EMPTY_FIXUP(&so->so_rcv);
3879 				}
3880 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3881 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3882 			}
3883 		} else {
3884 			if (flags & MSG_PEEK) {
3885 				moff += len;
3886 			} else {
3887 				if (mp != NULL) {
3888 					int copy_flag;
3889 
3890 					if (flags & MSG_DONTWAIT) {
3891 						copy_flag = M_DONTWAIT;
3892 					} else {
3893 						copy_flag = M_WAIT;
3894 					}
3895 					*mp = m_copym(m, 0, (int)len, copy_flag);
3896 					/*
3897 					 * Failed to allocate an mbuf?
3898 					 * Adjust uio_resid back, it was
3899 					 * adjusted down by len bytes which
3900 					 * we didn't copy over.
3901 					 */
3902 					if (*mp == NULL) {
3903 						uio_setresid(uio,
3904 						    (uio_resid(uio) + len));
3905 						break;
3906 					}
3907 				}
3908 				m->m_data += len;
3909 				m->m_len -= len;
3910 				so->so_rcv.sb_cc -= len;
3911 			}
3912 		}
3913 		if (so->so_oobmark) {
3914 			if ((flags & MSG_PEEK) == 0) {
3915 				so->so_oobmark -= len;
3916 				if (so->so_oobmark == 0) {
3917 					so->so_state |= SS_RCVATMARK;
3918 					break;
3919 				}
3920 			} else {
3921 				offset += len;
3922 				if (offset == so->so_oobmark) {
3923 					break;
3924 				}
3925 			}
3926 		}
3927 		if (flags & MSG_EOR) {
3928 			break;
3929 		}
3930 		/*
3931 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3932 		 * (for non-atomic socket), we must not quit until
3933 		 * "uio->uio_resid == 0" or an error termination.
3934 		 * If a signal/timeout occurs, return with a short
3935 		 * count but without error.  Keep sockbuf locked
3936 		 * against other readers.
3937 		 */
3938 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3939 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3940 		    !sosendallatonce(so) && !nextrecord) {
3941 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3942 #if CONTENT_FILTER
3943 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3944 #endif /* CONTENT_FILTER */
3945 			    )) {
3946 				goto release;
3947 			}
3948 
3949 			/*
3950 			 * Depending on the protocol (e.g. TCP), the following
3951 			 * might cause the socket lock to be dropped and later
3952 			 * be reacquired, and more data could have arrived and
3953 			 * have been appended to the receive socket buffer by
3954 			 * the time it returns.  Therefore, we only sleep in
3955 			 * sbwait() below if and only if the socket buffer is
3956 			 * empty, in order to avoid a false sleep.
3957 			 */
3958 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3959 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3960 			}
3961 
3962 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3963 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3964 
3965 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3966 				error = 0;
3967 				goto release;
3968 			}
3969 			/*
3970 			 * have to wait until after we get back from the sbwait
3971 			 * to do the copy because we will drop the lock if we
3972 			 * have enough data that has been delayed... by dropping
3973 			 * the lock we open up a window allowing the netisr
3974 			 * thread to process the incoming packets and to change
3975 			 * the state of this socket... we're issuing the sbwait
3976 			 * because the socket is empty and we're expecting the
3977 			 * netisr thread to wake us up when more packets arrive;
3978 			 * if we allow that processing to happen and then sbwait
3979 			 * we could stall forever with packets sitting in the
3980 			 * socket if no further packets arrive from the remote
3981 			 * side.
3982 			 *
3983 			 * we want to copy before we've collected all the data
3984 			 * to satisfy this request to allow the copy to overlap
3985 			 * the incoming packet processing on an MP system
3986 			 */
3987 			if (delayed_copy_len > sorecvmincopy &&
3988 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3989 				error = sodelayed_copy(so, uio,
3990 				    &free_list, &delayed_copy_len);
3991 
3992 				if (error) {
3993 					goto release;
3994 				}
3995 			}
3996 			m = so->so_rcv.sb_mb;
3997 			if (m != NULL) {
3998 				nextrecord = m->m_nextpkt;
3999 			}
4000 			SB_MB_CHECK(&so->so_rcv);
4001 		}
4002 	}
4003 #ifdef MORE_LOCKING_DEBUG
4004 	if (so->so_usecount <= 1) {
4005 		panic("%s: after big while so=%p ref=%d on socket",
4006 		    __func__, so, so->so_usecount);
4007 		/* NOTREACHED */
4008 	}
4009 #endif
4010 
4011 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4012 		if (so->so_options & SO_DONTTRUNC) {
4013 			flags |= MSG_RCVMORE;
4014 		} else {
4015 			flags |= MSG_TRUNC;
4016 			if ((flags & MSG_PEEK) == 0) {
4017 				(void) sbdroprecord(&so->so_rcv);
4018 			}
4019 		}
4020 	}
4021 
4022 	/*
4023 	 * pru_rcvd below (for TCP) may cause more data to be received
4024 	 * if the socket lock is dropped prior to sending the ACK; some
4025 	 * legacy OpenTransport applications don't handle this well
4026 	 * (if it receives less data than requested while MSG_HAVEMORE
4027 	 * is set), and so we set the flag now based on what we know
4028 	 * prior to calling pru_rcvd.
4029 	 */
4030 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4031 		flags |= MSG_HAVEMORE;
4032 	}
4033 
4034 	if ((flags & MSG_PEEK) == 0) {
4035 		if (m == NULL) {
4036 			so->so_rcv.sb_mb = nextrecord;
4037 			/*
4038 			 * First part is an inline SB_EMPTY_FIXUP().  Second
4039 			 * part makes sure sb_lastrecord is up-to-date if
4040 			 * there is still data in the socket buffer.
4041 			 */
4042 			if (so->so_rcv.sb_mb == NULL) {
4043 				so->so_rcv.sb_mbtail = NULL;
4044 				so->so_rcv.sb_lastrecord = NULL;
4045 			} else if (nextrecord->m_nextpkt == NULL) {
4046 				so->so_rcv.sb_lastrecord = nextrecord;
4047 			}
4048 			SB_MB_CHECK(&so->so_rcv);
4049 		}
4050 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4051 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4052 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4053 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4054 		}
4055 	}
4056 
4057 	if (delayed_copy_len) {
4058 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4059 		if (error) {
4060 			goto release;
4061 		}
4062 	}
4063 	if (free_list != NULL) {
4064 		m_freem_list(free_list);
4065 		free_list = NULL;
4066 	}
4067 
4068 	if (orig_resid == uio_resid(uio) && orig_resid &&
4069 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4070 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4071 		goto restart;
4072 	}
4073 
4074 	if (flagsp != NULL) {
4075 		*flagsp |= flags;
4076 	}
4077 release:
4078 #ifdef MORE_LOCKING_DEBUG
4079 	if (so->so_usecount <= 1) {
4080 		panic("%s: release so=%p ref=%d on socket", __func__,
4081 		    so, so->so_usecount);
4082 		/* NOTREACHED */
4083 	}
4084 #endif
4085 	if (delayed_copy_len) {
4086 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4087 	}
4088 
4089 	if (free_list != NULL) {
4090 		m_freem_list(free_list);
4091 	}
4092 
4093 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4094 
4095 	if (en_tracing) {
4096 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4097 		    VM_KERNEL_ADDRPERM(so),
4098 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4099 		    (int64_t)(orig_resid - uio_resid(uio)));
4100 	}
4101 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4102 	    so->so_rcv.sb_cc, 0, error);
4103 
4104 	return error;
4105 }
4106 
4107 /*
4108  * Returns:	0			Success
4109  *	uiomove:EFAULT
4110  */
4111 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4112 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4113     user_ssize_t *resid)
4114 {
4115 	int error = 0;
4116 	struct mbuf *m;
4117 
4118 	m = *free_list;
4119 
4120 	socket_unlock(so, 0);
4121 
4122 	while (m != NULL && error == 0) {
4123 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4124 		m = m->m_next;
4125 	}
4126 	m_freem_list(*free_list);
4127 
4128 	*free_list = NULL;
4129 	*resid = 0;
4130 
4131 	socket_lock(so, 0);
4132 
4133 	return error;
4134 }
4135 
4136 static int
sodelayed_copy_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,struct mbuf ** free_list,user_ssize_t * resid)4137 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4138     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4139 {
4140 #pragma unused(so)
4141 	int error = 0;
4142 	struct mbuf *ml, *m;
4143 	int i = 0;
4144 	struct uio *auio;
4145 
4146 	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4147 	    ml = ml->m_nextpkt, i++) {
4148 		auio = msgarray[i].uio;
4149 		for (m = ml; m != NULL; m = m->m_next) {
4150 			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4151 			if (error != 0) {
4152 				goto out;
4153 			}
4154 		}
4155 	}
4156 out:
4157 	m_freem_list(*free_list);
4158 
4159 	*free_list = NULL;
4160 	*resid = 0;
4161 
4162 	return error;
4163 }
4164 
4165 int
soreceive_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,int * flagsp)4166 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4167     int *flagsp)
4168 {
4169 	struct mbuf *m;
4170 	struct mbuf *nextrecord;
4171 	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4172 	int error;
4173 	user_ssize_t len, pktlen, delayed_copy_len = 0;
4174 	struct protosw *pr = so->so_proto;
4175 	user_ssize_t resid;
4176 	struct proc *p = current_proc();
4177 	struct uio *auio = NULL;
4178 	int npkts = 0;
4179 	int sblocked = 0;
4180 	struct sockaddr **psa = NULL;
4181 	struct mbuf **controlp = NULL;
4182 	int can_delay;
4183 	int flags;
4184 	struct mbuf *free_others = NULL;
4185 
4186 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4187 	    so, uiocnt,
4188 	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4189 
4190 	/*
4191 	 * Sanity checks:
4192 	 * - Only supports don't wait flags
4193 	 * - Only support datagram sockets (could be extended to raw)
4194 	 * - Must be atomic
4195 	 * - Protocol must support packet chains
4196 	 * - The uio array is NULL (should we panic?)
4197 	 */
4198 	if (flagsp != NULL) {
4199 		flags = *flagsp;
4200 	} else {
4201 		flags = 0;
4202 	}
4203 	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4204 	    MSG_NBIO)) {
4205 		printf("%s invalid flags 0x%x\n", __func__, flags);
4206 		error = EINVAL;
4207 		goto out;
4208 	}
4209 	if (so->so_type != SOCK_DGRAM) {
4210 		error = EINVAL;
4211 		goto out;
4212 	}
4213 	if (sosendallatonce(so) == 0) {
4214 		error = EINVAL;
4215 		goto out;
4216 	}
4217 	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4218 		error = EPROTONOSUPPORT;
4219 		goto out;
4220 	}
4221 	if (msgarray == NULL) {
4222 		printf("%s uioarray is NULL\n", __func__);
4223 		error = EINVAL;
4224 		goto out;
4225 	}
4226 	if (uiocnt == 0) {
4227 		printf("%s uiocnt is 0\n", __func__);
4228 		error = EINVAL;
4229 		goto out;
4230 	}
4231 	/*
4232 	 * Sanity check on the length passed by caller as we are making 'int'
4233 	 * comparisons
4234 	 */
4235 	resid = recv_msg_array_resid(msgarray, uiocnt);
4236 	if (resid < 0 || resid > INT_MAX) {
4237 		error = EINVAL;
4238 		goto out;
4239 	}
4240 
4241 	if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4242 		can_delay = 1;
4243 	} else {
4244 		can_delay = 0;
4245 	}
4246 
4247 	socket_lock(so, 1);
4248 	so_update_last_owner_locked(so, p);
4249 	so_update_policy(so);
4250 
4251 #if NECP
4252 	so_update_necp_policy(so, NULL, NULL);
4253 #endif /* NECP */
4254 
4255 	/*
4256 	 * If a recv attempt is made on a previously-accepted socket
4257 	 * that has been marked as inactive (disconnected), reject
4258 	 * the request.
4259 	 */
4260 	if (so->so_flags & SOF_DEFUNCT) {
4261 		struct sockbuf *sb = &so->so_rcv;
4262 
4263 		error = ENOTCONN;
4264 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4265 		    __func__, proc_pid(p), proc_best_name(p),
4266 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4267 		    SOCK_DOM(so), SOCK_TYPE(so), error);
4268 		/*
4269 		 * This socket should have been disconnected and flushed
4270 		 * prior to being returned from sodefunct(); there should
4271 		 * be no data on its receive list, so panic otherwise.
4272 		 */
4273 		if (so->so_state & SS_DEFUNCT) {
4274 			sb_empty_assert(sb, __func__);
4275 		}
4276 		goto release;
4277 	}
4278 
4279 next:
4280 	/*
4281 	 * The uio may be empty
4282 	 */
4283 	if (npkts >= uiocnt) {
4284 		error = 0;
4285 		goto release;
4286 	}
4287 restart:
4288 	/*
4289 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4290 	 * and if so just return to the caller.  This could happen when
4291 	 * soreceive() is called by a socket upcall function during the
4292 	 * time the socket is freed.  The socket buffer would have been
4293 	 * locked across the upcall, therefore we cannot put this thread
4294 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4295 	 * we may livelock), because the lock on the socket buffer will
4296 	 * only be released when the upcall routine returns to its caller.
4297 	 * Because the socket has been officially closed, there can be
4298 	 * no further read on it.
4299 	 */
4300 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4301 	    (SS_NOFDREF | SS_CANTRCVMORE)) {
4302 		error = 0;
4303 		goto release;
4304 	}
4305 
4306 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4307 	if (error) {
4308 		goto release;
4309 	}
4310 	sblocked = 1;
4311 
4312 	m = so->so_rcv.sb_mb;
4313 	/*
4314 	 * Block awaiting more datagram if needed
4315 	 */
4316 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4317 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4318 	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4319 		/*
4320 		 * Panic if we notice inconsistencies in the socket's
4321 		 * receive list; both sb_mb and sb_cc should correctly
4322 		 * reflect the contents of the list, otherwise we may
4323 		 * end up with false positives during select() or poll()
4324 		 * which could put the application in a bad state.
4325 		 */
4326 		SB_MB_CHECK(&so->so_rcv);
4327 
4328 		if (so->so_error) {
4329 			error = so->so_error;
4330 			if ((flags & MSG_PEEK) == 0) {
4331 				so->so_error = 0;
4332 			}
4333 			goto release;
4334 		}
4335 		if (so->so_state & SS_CANTRCVMORE) {
4336 			goto release;
4337 		}
4338 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4339 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4340 			error = ENOTCONN;
4341 			goto release;
4342 		}
4343 		if ((so->so_state & SS_NBIO) ||
4344 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4345 			error = EWOULDBLOCK;
4346 			goto release;
4347 		}
4348 		/*
4349 		 * Do not block if we got some data
4350 		 */
4351 		if (free_list != NULL) {
4352 			error = 0;
4353 			goto release;
4354 		}
4355 
4356 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4357 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4358 
4359 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4360 		sblocked = 0;
4361 
4362 		error = sbwait(&so->so_rcv);
4363 		if (error) {
4364 			goto release;
4365 		}
4366 		goto restart;
4367 	}
4368 
4369 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4370 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4371 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4372 
4373 	/*
4374 	 * Consume the current uio index as we have a datagram
4375 	 */
4376 	auio = msgarray[npkts].uio;
4377 	resid = uio_resid(auio);
4378 	msgarray[npkts].which |= SOCK_MSG_DATA;
4379 	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4380 	    &msgarray[npkts].psa : NULL;
4381 	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4382 	    &msgarray[npkts].controlp : NULL;
4383 	npkts += 1;
4384 	nextrecord = m->m_nextpkt;
4385 
4386 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4387 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4388 		if (error == ERESTART) {
4389 			goto restart;
4390 		} else if (error != 0) {
4391 			goto release;
4392 		}
4393 	}
4394 
4395 	if (m != NULL && m->m_type == MT_CONTROL) {
4396 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4397 		if (error != 0) {
4398 			goto release;
4399 		}
4400 	}
4401 
4402 	if (m->m_pkthdr.len == 0) {
4403 		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4404 		    __func__, __LINE__,
4405 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4406 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4407 		    m->m_type);
4408 	}
4409 
4410 	/*
4411 	 * Loop to copy the mbufs of the current record
4412 	 * Support zero length packets
4413 	 */
4414 	ml = NULL;
4415 	pktlen = 0;
4416 	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4417 		if (m->m_len == 0) {
4418 			panic("%p m_len zero", m);
4419 		}
4420 		if (m->m_type == 0) {
4421 			panic("%p m_type zero", m);
4422 		}
4423 		/*
4424 		 * Clip to the residual length
4425 		 */
4426 		if (len > m->m_len) {
4427 			len = m->m_len;
4428 		}
4429 		pktlen += len;
4430 		/*
4431 		 * Copy the mbufs via the uio or delay the copy
4432 		 * Sockbuf must be consistent here (points to current mbuf,
4433 		 * it points to next record) when we drop priority;
4434 		 * we must note any additions to the sockbuf when we
4435 		 * block interrupts again.
4436 		 */
4437 		if (len > 0 && can_delay == 0) {
4438 			socket_unlock(so, 0);
4439 			error = uiomove(mtod(m, caddr_t), (int)len, auio);
4440 			socket_lock(so, 0);
4441 			if (error) {
4442 				goto release;
4443 			}
4444 		} else {
4445 			delayed_copy_len += len;
4446 		}
4447 
4448 		if (len == m->m_len) {
4449 			/*
4450 			 * m was entirely copied
4451 			 */
4452 			sbfree(&so->so_rcv, m);
4453 			nextrecord = m->m_nextpkt;
4454 			m->m_nextpkt = NULL;
4455 
4456 			/*
4457 			 * Set the first packet to the head of the free list
4458 			 */
4459 			if (free_list == NULL) {
4460 				free_list = m;
4461 			}
4462 			/*
4463 			 * Link current packet to tail of free list
4464 			 */
4465 			if (ml == NULL) {
4466 				if (free_tail != NULL) {
4467 					free_tail->m_nextpkt = m;
4468 				}
4469 				free_tail = m;
4470 			}
4471 			/*
4472 			 * Link current mbuf to last mbuf of current packet
4473 			 */
4474 			if (ml != NULL) {
4475 				ml->m_next = m;
4476 			}
4477 			ml = m;
4478 
4479 			/*
4480 			 * Move next buf to head of socket buffer
4481 			 */
4482 			so->so_rcv.sb_mb = m = ml->m_next;
4483 			ml->m_next = NULL;
4484 
4485 			if (m != NULL) {
4486 				m->m_nextpkt = nextrecord;
4487 				if (nextrecord == NULL) {
4488 					so->so_rcv.sb_lastrecord = m;
4489 				}
4490 			} else {
4491 				so->so_rcv.sb_mb = nextrecord;
4492 				SB_EMPTY_FIXUP(&so->so_rcv);
4493 			}
4494 			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4495 			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4496 		} else {
4497 			/*
4498 			 * Stop the loop on partial copy
4499 			 */
4500 			break;
4501 		}
4502 	}
4503 #ifdef MORE_LOCKING_DEBUG
4504 	if (so->so_usecount <= 1) {
4505 		panic("%s: after big while so=%llx ref=%d on socket",
4506 		    __func__,
4507 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4508 		/* NOTREACHED */
4509 	}
4510 #endif
4511 	/*
4512 	 * Tell the caller we made a partial copy
4513 	 */
4514 	if (m != NULL) {
4515 		if (so->so_options & SO_DONTTRUNC) {
4516 			/*
4517 			 * Copyout first the freelist then the partial mbuf
4518 			 */
4519 			socket_unlock(so, 0);
4520 			if (delayed_copy_len) {
4521 				error = sodelayed_copy_list(so, msgarray,
4522 				    uiocnt, &free_list, &delayed_copy_len);
4523 			}
4524 
4525 			if (error == 0) {
4526 				error = uiomove(mtod(m, caddr_t), (int)len,
4527 				    auio);
4528 			}
4529 			socket_lock(so, 0);
4530 			if (error) {
4531 				goto release;
4532 			}
4533 
4534 			m->m_data += len;
4535 			m->m_len -= len;
4536 			so->so_rcv.sb_cc -= len;
4537 			flags |= MSG_RCVMORE;
4538 		} else {
4539 			(void) sbdroprecord(&so->so_rcv);
4540 			nextrecord = so->so_rcv.sb_mb;
4541 			m = NULL;
4542 			flags |= MSG_TRUNC;
4543 		}
4544 	}
4545 
4546 	if (m == NULL) {
4547 		so->so_rcv.sb_mb = nextrecord;
4548 		/*
4549 		 * First part is an inline SB_EMPTY_FIXUP().  Second
4550 		 * part makes sure sb_lastrecord is up-to-date if
4551 		 * there is still data in the socket buffer.
4552 		 */
4553 		if (so->so_rcv.sb_mb == NULL) {
4554 			so->so_rcv.sb_mbtail = NULL;
4555 			so->so_rcv.sb_lastrecord = NULL;
4556 		} else if (nextrecord->m_nextpkt == NULL) {
4557 			so->so_rcv.sb_lastrecord = nextrecord;
4558 		}
4559 		SB_MB_CHECK(&so->so_rcv);
4560 	}
4561 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4562 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4563 
4564 	/*
4565 	 * We can continue to the next packet as long as:
4566 	 * - We haven't exhausted the uio array
4567 	 * - There was no error
4568 	 * - A packet was not truncated
4569 	 * - We can still receive more data
4570 	 */
4571 	if (npkts < uiocnt && error == 0 &&
4572 	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4573 	    (so->so_state & SS_CANTRCVMORE) == 0) {
4574 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4575 		sblocked = 0;
4576 
4577 		goto next;
4578 	}
4579 	if (flagsp != NULL) {
4580 		*flagsp |= flags;
4581 	}
4582 
4583 release:
4584 	/*
4585 	 * pru_rcvd may cause more data to be received if the socket lock
4586 	 * is dropped so we set MSG_HAVEMORE now based on what we know.
4587 	 * That way the caller won't be surprised if it receives less data
4588 	 * than requested.
4589 	 */
4590 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4591 		flags |= MSG_HAVEMORE;
4592 	}
4593 
4594 	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4595 		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4596 	}
4597 
4598 	if (sblocked) {
4599 		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4600 	} else {
4601 		socket_unlock(so, 1);
4602 	}
4603 
4604 	if (delayed_copy_len) {
4605 		error = sodelayed_copy_list(so, msgarray, uiocnt,
4606 		    &free_list, &delayed_copy_len);
4607 	}
4608 out:
4609 	/*
4610 	 * Amortize the cost of freeing the mbufs
4611 	 */
4612 	if (free_list != NULL) {
4613 		m_freem_list(free_list);
4614 	}
4615 	if (free_others != NULL) {
4616 		m_freem_list(free_others);
4617 	}
4618 
4619 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4620 	    0, 0, 0, 0);
4621 	return error;
4622 }
4623 
4624 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4625 so_statistics_event_to_nstat_event(int64_t *input_options,
4626     uint64_t *nstat_event)
4627 {
4628 	int error = 0;
4629 	switch (*input_options) {
4630 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4631 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4632 		break;
4633 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4634 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4635 		break;
4636 #if (DEBUG || DEVELOPMENT)
4637 	case SO_STATISTICS_EVENT_RESERVED_1:
4638 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4639 		break;
4640 	case SO_STATISTICS_EVENT_RESERVED_2:
4641 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4642 		break;
4643 #endif /* (DEBUG || DEVELOPMENT) */
4644 	default:
4645 		error = EINVAL;
4646 		break;
4647 	}
4648 	return error;
4649 }
4650 
4651 /*
4652  * Returns:	0			Success
4653  *		EINVAL
4654  *		ENOTCONN
4655  *	<pru_shutdown>:EINVAL
4656  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4657  *	<pru_shutdown>:ENOBUFS[TCP]
4658  *	<pru_shutdown>:EMSGSIZE[TCP]
4659  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4660  *	<pru_shutdown>:ENETUNREACH[TCP]
4661  *	<pru_shutdown>:ENETDOWN[TCP]
4662  *	<pru_shutdown>:ENOMEM[TCP]
4663  *	<pru_shutdown>:EACCES[TCP]
4664  *	<pru_shutdown>:EMSGSIZE[TCP]
4665  *	<pru_shutdown>:ENOBUFS[TCP]
4666  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4667  *	<pru_shutdown>:???		[other protocol families]
4668  */
4669 int
soshutdown(struct socket * so,int how)4670 soshutdown(struct socket *so, int how)
4671 {
4672 	int error;
4673 
4674 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4675 
4676 	switch (how) {
4677 	case SHUT_RD:
4678 	case SHUT_WR:
4679 	case SHUT_RDWR:
4680 		socket_lock(so, 1);
4681 		if ((so->so_state &
4682 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4683 			error = ENOTCONN;
4684 		} else {
4685 			error = soshutdownlock(so, how);
4686 		}
4687 		socket_unlock(so, 1);
4688 		break;
4689 	default:
4690 		error = EINVAL;
4691 		break;
4692 	}
4693 
4694 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4695 
4696 	return error;
4697 }
4698 
4699 int
soshutdownlock_final(struct socket * so,int how)4700 soshutdownlock_final(struct socket *so, int how)
4701 {
4702 	struct protosw *pr = so->so_proto;
4703 	int error = 0;
4704 
4705 	sflt_notify(so, sock_evt_shutdown, &how);
4706 
4707 	if (how != SHUT_WR) {
4708 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4709 			/* read already shut down */
4710 			error = ENOTCONN;
4711 			goto done;
4712 		}
4713 		sorflush(so);
4714 	}
4715 	if (how != SHUT_RD) {
4716 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4717 			/* write already shut down */
4718 			error = ENOTCONN;
4719 			goto done;
4720 		}
4721 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4722 	}
4723 done:
4724 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4725 	return error;
4726 }
4727 
4728 int
soshutdownlock(struct socket * so,int how)4729 soshutdownlock(struct socket *so, int how)
4730 {
4731 	int error = 0;
4732 
4733 #if CONTENT_FILTER
4734 	/*
4735 	 * A content filter may delay the actual shutdown until it
4736 	 * has processed the pending data
4737 	 */
4738 	if (so->so_flags & SOF_CONTENT_FILTER) {
4739 		error = cfil_sock_shutdown(so, &how);
4740 		if (error == EJUSTRETURN) {
4741 			error = 0;
4742 			goto done;
4743 		} else if (error != 0) {
4744 			goto done;
4745 		}
4746 	}
4747 #endif /* CONTENT_FILTER */
4748 
4749 	error = soshutdownlock_final(so, how);
4750 
4751 done:
4752 	return error;
4753 }
4754 
4755 void
sowflush(struct socket * so)4756 sowflush(struct socket *so)
4757 {
4758 	struct sockbuf *sb = &so->so_snd;
4759 
4760 	/*
4761 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4762 	 * to prevent the socket buffer from being unexpectedly altered
4763 	 * while it is used by another thread in socket send/receive.
4764 	 *
4765 	 * sblock() must not fail here, hence the assertion.
4766 	 */
4767 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4768 	VERIFY(sb->sb_flags & SB_LOCK);
4769 
4770 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4771 	sb->sb_flags            |= SB_DROP;
4772 	sb->sb_upcall           = NULL;
4773 	sb->sb_upcallarg        = NULL;
4774 
4775 	sbunlock(sb, TRUE);     /* keep socket locked */
4776 
4777 	selthreadclear(&sb->sb_sel);
4778 	sbrelease(sb);
4779 }
4780 
4781 void
sorflush(struct socket * so)4782 sorflush(struct socket *so)
4783 {
4784 	struct sockbuf *sb = &so->so_rcv;
4785 	struct protosw *pr = so->so_proto;
4786 	struct sockbuf asb;
4787 #ifdef notyet
4788 	lck_mtx_t *mutex_held;
4789 	/*
4790 	 * XXX: This code is currently commented out, because we may get here
4791 	 * as part of sofreelastref(), and at that time, pr_getlock() may no
4792 	 * longer be able to return us the lock; this will be fixed in future.
4793 	 */
4794 	if (so->so_proto->pr_getlock != NULL) {
4795 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4796 	} else {
4797 		mutex_held = so->so_proto->pr_domain->dom_mtx;
4798 	}
4799 
4800 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4801 #endif /* notyet */
4802 
4803 	sflt_notify(so, sock_evt_flush_read, NULL);
4804 
4805 	socantrcvmore(so);
4806 
4807 	/*
4808 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4809 	 * to prevent the socket buffer from being unexpectedly altered
4810 	 * while it is used by another thread in socket send/receive.
4811 	 *
4812 	 * sblock() must not fail here, hence the assertion.
4813 	 */
4814 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4815 	VERIFY(sb->sb_flags & SB_LOCK);
4816 
4817 	/*
4818 	 * Copy only the relevant fields from "sb" to "asb" which we
4819 	 * need for sbrelease() to function.  In particular, skip
4820 	 * sb_sel as it contains the wait queue linkage, which would
4821 	 * wreak havoc if we were to issue selthreadclear() on "asb".
4822 	 * Make sure to not carry over SB_LOCK in "asb", as we need
4823 	 * to acquire it later as part of sbrelease().
4824 	 */
4825 	bzero(&asb, sizeof(asb));
4826 	asb.sb_cc               = sb->sb_cc;
4827 	asb.sb_hiwat            = sb->sb_hiwat;
4828 	asb.sb_mbcnt            = sb->sb_mbcnt;
4829 	asb.sb_mbmax            = sb->sb_mbmax;
4830 	asb.sb_ctl              = sb->sb_ctl;
4831 	asb.sb_lowat            = sb->sb_lowat;
4832 	asb.sb_mb               = sb->sb_mb;
4833 	asb.sb_mbtail           = sb->sb_mbtail;
4834 	asb.sb_lastrecord       = sb->sb_lastrecord;
4835 	asb.sb_so               = sb->sb_so;
4836 	asb.sb_flags            = sb->sb_flags;
4837 	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4838 	asb.sb_flags            |= SB_DROP;
4839 
4840 	/*
4841 	 * Ideally we'd bzero() these and preserve the ones we need;
4842 	 * but to do that we'd need to shuffle things around in the
4843 	 * sockbuf, and we can't do it now because there are KEXTS
4844 	 * that are directly referring to the socket structure.
4845 	 *
4846 	 * Setting SB_DROP acts as a barrier to prevent further appends.
4847 	 * Clearing SB_SEL is done for selthreadclear() below.
4848 	 */
4849 	sb->sb_cc               = 0;
4850 	sb->sb_hiwat            = 0;
4851 	sb->sb_mbcnt            = 0;
4852 	sb->sb_mbmax            = 0;
4853 	sb->sb_ctl              = 0;
4854 	sb->sb_lowat            = 0;
4855 	sb->sb_mb               = NULL;
4856 	sb->sb_mbtail           = NULL;
4857 	sb->sb_lastrecord       = NULL;
4858 	sb->sb_timeo.tv_sec     = 0;
4859 	sb->sb_timeo.tv_usec    = 0;
4860 	sb->sb_upcall           = NULL;
4861 	sb->sb_upcallarg        = NULL;
4862 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4863 	sb->sb_flags            |= SB_DROP;
4864 
4865 	sbunlock(sb, TRUE);     /* keep socket locked */
4866 
4867 	/*
4868 	 * Note that selthreadclear() is called on the original "sb" and
4869 	 * not the local "asb" because of the way wait queue linkage is
4870 	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
4871 	 * should no longer be set (cleared above.)
4872 	 */
4873 	selthreadclear(&sb->sb_sel);
4874 
4875 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4876 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4877 	}
4878 
4879 	sbrelease(&asb);
4880 }
4881 
4882 /*
4883  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4884  * an additional variant to handle the case where the option value needs
4885  * to be some kind of integer, but not a specific size.
4886  * In addition to their use here, these functions are also called by the
4887  * protocol-level pr_ctloutput() routines.
4888  *
4889  * Returns:	0			Success
4890  *		EINVAL
4891  *	copyin:EFAULT
4892  */
4893 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4894 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4895 {
4896 	size_t  valsize;
4897 
4898 	/*
4899 	 * If the user gives us more than we wanted, we ignore it,
4900 	 * but if we don't get the minimum length the caller
4901 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4902 	 * is set to however much we actually retrieved.
4903 	 */
4904 	if ((valsize = sopt->sopt_valsize) < minlen) {
4905 		return EINVAL;
4906 	}
4907 	if (valsize > len) {
4908 		sopt->sopt_valsize = valsize = len;
4909 	}
4910 
4911 	if (sopt->sopt_p != kernproc) {
4912 		return copyin(sopt->sopt_val, buf, valsize);
4913 	}
4914 
4915 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4916 	return 0;
4917 }
4918 
4919 /*
4920  * sooptcopyin_timeval
4921  *   Copy in a timeval value into tv_p, and take into account whether the
4922  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4923  *   code here so that we can verify the 64-bit tv_sec value before we lose
4924  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4925  */
4926 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4927 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4928 {
4929 	int                     error;
4930 
4931 	if (proc_is64bit(sopt->sopt_p)) {
4932 		struct user64_timeval   tv64;
4933 
4934 		if (sopt->sopt_valsize < sizeof(tv64)) {
4935 			return EINVAL;
4936 		}
4937 
4938 		sopt->sopt_valsize = sizeof(tv64);
4939 		if (sopt->sopt_p != kernproc) {
4940 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4941 			if (error != 0) {
4942 				return error;
4943 			}
4944 		} else {
4945 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4946 			    sizeof(tv64));
4947 		}
4948 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4949 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4950 			return EDOM;
4951 		}
4952 
4953 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4954 		tv_p->tv_usec = tv64.tv_usec;
4955 	} else {
4956 		struct user32_timeval   tv32;
4957 
4958 		if (sopt->sopt_valsize < sizeof(tv32)) {
4959 			return EINVAL;
4960 		}
4961 
4962 		sopt->sopt_valsize = sizeof(tv32);
4963 		if (sopt->sopt_p != kernproc) {
4964 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4965 			if (error != 0) {
4966 				return error;
4967 			}
4968 		} else {
4969 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4970 			    sizeof(tv32));
4971 		}
4972 #ifndef __LP64__
4973 		/*
4974 		 * K64todo "comparison is always false due to
4975 		 * limited range of data type"
4976 		 */
4977 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4978 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4979 			return EDOM;
4980 		}
4981 #endif
4982 		tv_p->tv_sec = tv32.tv_sec;
4983 		tv_p->tv_usec = tv32.tv_usec;
4984 	}
4985 	return 0;
4986 }
4987 
4988 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4989 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4990     boolean_t ignore_delegate)
4991 {
4992 	kauth_cred_t cred =  NULL;
4993 	proc_t ep = PROC_NULL;
4994 	uid_t uid;
4995 	int error = 0;
4996 
4997 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4998 		ep = proc_find(so->e_pid);
4999 		if (ep) {
5000 			cred = kauth_cred_proc_ref(ep);
5001 		}
5002 	}
5003 
5004 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5005 
5006 	/* uid is 0 for root */
5007 	if (uid != 0 || !allow_root) {
5008 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5009 	}
5010 	if (cred) {
5011 		kauth_cred_unref(&cred);
5012 	}
5013 	if (ep != PROC_NULL) {
5014 		proc_rele(ep);
5015 	}
5016 
5017 	return error;
5018 }
5019 
5020 /*
5021  * Returns:	0			Success
5022  *		EINVAL
5023  *		ENOPROTOOPT
5024  *		ENOBUFS
5025  *		EDOM
5026  *	sooptcopyin:EINVAL
5027  *	sooptcopyin:EFAULT
5028  *	sooptcopyin_timeval:EINVAL
5029  *	sooptcopyin_timeval:EFAULT
5030  *	sooptcopyin_timeval:EDOM
5031  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5032  *	<pr_ctloutput>:???w
5033  *	sflt_attach_private:???		[whatever a filter author chooses]
5034  *	<sf_setoption>:???		[whatever a filter author chooses]
5035  *
5036  * Notes:	Other <pru_listen> returns depend on the protocol family; all
5037  *		<sf_listen> returns depend on what the filter author causes
5038  *		their filter to return.
5039  */
5040 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5041 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5042 {
5043 	int     error, optval;
5044 	int64_t long_optval;
5045 	struct  linger l;
5046 	struct  timeval tv;
5047 
5048 	if (sopt->sopt_dir != SOPT_SET) {
5049 		sopt->sopt_dir = SOPT_SET;
5050 	}
5051 
5052 	if (dolock) {
5053 		socket_lock(so, 1);
5054 	}
5055 
5056 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5057 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5058 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5059 		/* the socket has been shutdown, no more sockopt's */
5060 		error = EINVAL;
5061 		goto out;
5062 	}
5063 
5064 	error = sflt_setsockopt(so, sopt);
5065 	if (error != 0) {
5066 		if (error == EJUSTRETURN) {
5067 			error = 0;
5068 		}
5069 		goto out;
5070 	}
5071 
5072 	if (sopt->sopt_level != SOL_SOCKET) {
5073 		if (so->so_proto != NULL &&
5074 		    so->so_proto->pr_ctloutput != NULL) {
5075 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5076 			goto out;
5077 		}
5078 		error = ENOPROTOOPT;
5079 	} else {
5080 		/*
5081 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5082 		 * the protocol layer, if needed.  A zero value returned from
5083 		 * the handler means use default socket-level processing as
5084 		 * done by the rest of this routine.  Otherwise, any other
5085 		 * return value indicates that the option is unsupported.
5086 		 */
5087 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5088 		    pru_socheckopt(so, sopt)) != 0) {
5089 			goto out;
5090 		}
5091 
5092 		error = 0;
5093 		switch (sopt->sopt_name) {
5094 		case SO_LINGER:
5095 		case SO_LINGER_SEC: {
5096 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5097 			if (error != 0) {
5098 				goto out;
5099 			}
5100 			/* Make sure to use sane values */
5101 			if (sopt->sopt_name == SO_LINGER) {
5102 				so->so_linger = (short)l.l_linger;
5103 			} else {
5104 				so->so_linger = (short)((long)l.l_linger * hz);
5105 			}
5106 			if (l.l_onoff != 0) {
5107 				so->so_options |= SO_LINGER;
5108 			} else {
5109 				so->so_options &= ~SO_LINGER;
5110 			}
5111 			break;
5112 		}
5113 		case SO_DEBUG:
5114 		case SO_KEEPALIVE:
5115 		case SO_DONTROUTE:
5116 		case SO_USELOOPBACK:
5117 		case SO_BROADCAST:
5118 		case SO_REUSEADDR:
5119 		case SO_REUSEPORT:
5120 		case SO_OOBINLINE:
5121 		case SO_TIMESTAMP:
5122 		case SO_TIMESTAMP_MONOTONIC:
5123 		case SO_TIMESTAMP_CONTINUOUS:
5124 		case SO_DONTTRUNC:
5125 		case SO_WANTMORE:
5126 		case SO_WANTOOBFLAG:
5127 		case SO_NOWAKEFROMSLEEP:
5128 		case SO_NOAPNFALLBK:
5129 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5130 			    sizeof(optval));
5131 			if (error != 0) {
5132 				goto out;
5133 			}
5134 			if (optval) {
5135 				so->so_options |= sopt->sopt_name;
5136 			} else {
5137 				so->so_options &= ~sopt->sopt_name;
5138 			}
5139 #if SKYWALK
5140 			inp_update_netns_flags(so);
5141 #endif /* SKYWALK */
5142 			break;
5143 
5144 		case SO_SNDBUF:
5145 		case SO_RCVBUF:
5146 		case SO_SNDLOWAT:
5147 		case SO_RCVLOWAT:
5148 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5149 			    sizeof(optval));
5150 			if (error != 0) {
5151 				goto out;
5152 			}
5153 
5154 			/*
5155 			 * Values < 1 make no sense for any of these
5156 			 * options, so disallow them.
5157 			 */
5158 			if (optval < 1) {
5159 				error = EINVAL;
5160 				goto out;
5161 			}
5162 
5163 			switch (sopt->sopt_name) {
5164 			case SO_SNDBUF:
5165 			case SO_RCVBUF: {
5166 				struct sockbuf *sb =
5167 				    (sopt->sopt_name == SO_SNDBUF) ?
5168 				    &so->so_snd : &so->so_rcv;
5169 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
5170 					error = ENOBUFS;
5171 					goto out;
5172 				}
5173 				sb->sb_flags |= SB_USRSIZE;
5174 				sb->sb_flags &= ~SB_AUTOSIZE;
5175 				sb->sb_idealsize = (u_int32_t)optval;
5176 				break;
5177 			}
5178 			/*
5179 			 * Make sure the low-water is never greater than
5180 			 * the high-water.
5181 			 */
5182 			case SO_SNDLOWAT: {
5183 				int space = sbspace(&so->so_snd);
5184 				uint32_t hiwat = so->so_snd.sb_hiwat;
5185 
5186 				if (so->so_snd.sb_flags & SB_UNIX) {
5187 					struct unpcb *unp =
5188 					    (struct unpcb *)(so->so_pcb);
5189 					if (unp != NULL &&
5190 					    unp->unp_conn != NULL) {
5191 						struct socket *so2 = unp->unp_conn->unp_socket;
5192 						hiwat += unp->unp_conn->unp_cc;
5193 						space = sbspace(&so2->so_rcv);
5194 					}
5195 				}
5196 
5197 				so->so_snd.sb_lowat =
5198 				    (optval > hiwat) ?
5199 				    hiwat : optval;
5200 
5201 				if (space >= so->so_snd.sb_lowat) {
5202 					sowwakeup(so);
5203 				}
5204 				break;
5205 			}
5206 			case SO_RCVLOWAT: {
5207 				int64_t data_len;
5208 				so->so_rcv.sb_lowat =
5209 				    (optval > so->so_rcv.sb_hiwat) ?
5210 				    so->so_rcv.sb_hiwat : optval;
5211 				if (so->so_rcv.sb_flags & SB_UNIX) {
5212 					struct unpcb *unp =
5213 					    (struct unpcb *)(so->so_pcb);
5214 					if (unp != NULL &&
5215 					    unp->unp_conn != NULL) {
5216 						struct socket *so2 = unp->unp_conn->unp_socket;
5217 						data_len = so2->so_snd.sb_cc
5218 						    - so2->so_snd.sb_ctl;
5219 					} else {
5220 						data_len = so->so_rcv.sb_cc
5221 						    - so->so_rcv.sb_ctl;
5222 					}
5223 				} else {
5224 					data_len = so->so_rcv.sb_cc
5225 					    - so->so_rcv.sb_ctl;
5226 				}
5227 
5228 				if (data_len >= so->so_rcv.sb_lowat) {
5229 					sorwakeup(so);
5230 				}
5231 				break;
5232 			}
5233 			}
5234 			break;
5235 
5236 		case SO_SNDTIMEO:
5237 		case SO_RCVTIMEO:
5238 			error = sooptcopyin_timeval(sopt, &tv);
5239 			if (error != 0) {
5240 				goto out;
5241 			}
5242 
5243 			switch (sopt->sopt_name) {
5244 			case SO_SNDTIMEO:
5245 				so->so_snd.sb_timeo = tv;
5246 				break;
5247 			case SO_RCVTIMEO:
5248 				so->so_rcv.sb_timeo = tv;
5249 				break;
5250 			}
5251 			break;
5252 
5253 		case SO_NKE: {
5254 			struct so_nke nke;
5255 
5256 			error = sooptcopyin(sopt, &nke, sizeof(nke),
5257 			    sizeof(nke));
5258 			if (error != 0) {
5259 				goto out;
5260 			}
5261 
5262 			error = sflt_attach_internal(so, nke.nke_handle);
5263 			break;
5264 		}
5265 
5266 		case SO_NOSIGPIPE:
5267 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5268 			    sizeof(optval));
5269 			if (error != 0) {
5270 				goto out;
5271 			}
5272 			if (optval != 0) {
5273 				so->so_flags |= SOF_NOSIGPIPE;
5274 			} else {
5275 				so->so_flags &= ~SOF_NOSIGPIPE;
5276 			}
5277 			break;
5278 
5279 		case SO_NOADDRERR:
5280 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5281 			    sizeof(optval));
5282 			if (error != 0) {
5283 				goto out;
5284 			}
5285 			if (optval != 0) {
5286 				so->so_flags |= SOF_NOADDRAVAIL;
5287 			} else {
5288 				so->so_flags &= ~SOF_NOADDRAVAIL;
5289 			}
5290 			break;
5291 
5292 		case SO_REUSESHAREUID:
5293 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5294 			    sizeof(optval));
5295 			if (error != 0) {
5296 				goto out;
5297 			}
5298 			if (optval != 0) {
5299 				so->so_flags |= SOF_REUSESHAREUID;
5300 			} else {
5301 				so->so_flags &= ~SOF_REUSESHAREUID;
5302 			}
5303 			break;
5304 
5305 		case SO_NOTIFYCONFLICT:
5306 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5307 				error = EPERM;
5308 				goto out;
5309 			}
5310 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5311 			    sizeof(optval));
5312 			if (error != 0) {
5313 				goto out;
5314 			}
5315 			if (optval != 0) {
5316 				so->so_flags |= SOF_NOTIFYCONFLICT;
5317 			} else {
5318 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5319 			}
5320 			break;
5321 
5322 		case SO_RESTRICTIONS:
5323 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5324 			    sizeof(optval));
5325 			if (error != 0) {
5326 				goto out;
5327 			}
5328 
5329 			error = so_set_restrictions(so, optval);
5330 			break;
5331 
5332 		case SO_AWDL_UNRESTRICTED:
5333 			if (SOCK_DOM(so) != PF_INET &&
5334 			    SOCK_DOM(so) != PF_INET6) {
5335 				error = EOPNOTSUPP;
5336 				goto out;
5337 			}
5338 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5339 			    sizeof(optval));
5340 			if (error != 0) {
5341 				goto out;
5342 			}
5343 			if (optval != 0) {
5344 				error = soopt_cred_check(so,
5345 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5346 				if (error == 0) {
5347 					inp_set_awdl_unrestricted(
5348 						sotoinpcb(so));
5349 				}
5350 			} else {
5351 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5352 			}
5353 			break;
5354 		case SO_INTCOPROC_ALLOW:
5355 			if (SOCK_DOM(so) != PF_INET6) {
5356 				error = EOPNOTSUPP;
5357 				goto out;
5358 			}
5359 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5360 			    sizeof(optval));
5361 			if (error != 0) {
5362 				goto out;
5363 			}
5364 			if (optval != 0 &&
5365 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5366 				error = soopt_cred_check(so,
5367 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5368 				if (error == 0) {
5369 					inp_set_intcoproc_allowed(
5370 						sotoinpcb(so));
5371 				}
5372 			} else if (optval == 0) {
5373 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5374 			}
5375 			break;
5376 
5377 		case SO_LABEL:
5378 			error = EOPNOTSUPP;
5379 			break;
5380 
5381 		case SO_UPCALLCLOSEWAIT:
5382 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5383 			    sizeof(optval));
5384 			if (error != 0) {
5385 				goto out;
5386 			}
5387 			if (optval != 0) {
5388 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5389 			} else {
5390 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5391 			}
5392 			break;
5393 
5394 		case SO_RANDOMPORT:
5395 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5396 			    sizeof(optval));
5397 			if (error != 0) {
5398 				goto out;
5399 			}
5400 			if (optval != 0) {
5401 				so->so_flags |= SOF_BINDRANDOMPORT;
5402 			} else {
5403 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5404 			}
5405 			break;
5406 
5407 		case SO_NP_EXTENSIONS: {
5408 			struct so_np_extensions sonpx;
5409 
5410 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5411 			    sizeof(sonpx));
5412 			if (error != 0) {
5413 				goto out;
5414 			}
5415 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5416 				error = EINVAL;
5417 				goto out;
5418 			}
5419 			/*
5420 			 * Only one bit defined for now
5421 			 */
5422 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5423 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5424 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5425 				} else {
5426 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5427 				}
5428 			}
5429 			break;
5430 		}
5431 
5432 		case SO_TRAFFIC_CLASS: {
5433 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5434 			    sizeof(optval));
5435 			if (error != 0) {
5436 				goto out;
5437 			}
5438 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5439 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5440 				error = so_set_net_service_type(so, netsvc);
5441 				goto out;
5442 			}
5443 			error = so_set_traffic_class(so, optval);
5444 			if (error != 0) {
5445 				goto out;
5446 			}
5447 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5448 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5449 			break;
5450 		}
5451 
5452 		case SO_RECV_TRAFFIC_CLASS: {
5453 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5454 			    sizeof(optval));
5455 			if (error != 0) {
5456 				goto out;
5457 			}
5458 			if (optval == 0) {
5459 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5460 			} else {
5461 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5462 			}
5463 			break;
5464 		}
5465 
5466 #if (DEVELOPMENT || DEBUG)
5467 		case SO_TRAFFIC_CLASS_DBG: {
5468 			struct so_tcdbg so_tcdbg;
5469 
5470 			error = sooptcopyin(sopt, &so_tcdbg,
5471 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5472 			if (error != 0) {
5473 				goto out;
5474 			}
5475 			error = so_set_tcdbg(so, &so_tcdbg);
5476 			if (error != 0) {
5477 				goto out;
5478 			}
5479 			break;
5480 		}
5481 #endif /* (DEVELOPMENT || DEBUG) */
5482 
5483 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5484 			error = priv_check_cred(kauth_cred_get(),
5485 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5486 			if (error != 0) {
5487 				goto out;
5488 			}
5489 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5490 			    sizeof(optval));
5491 			if (error != 0) {
5492 				goto out;
5493 			}
5494 			if (optval == 0) {
5495 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5496 			} else {
5497 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5498 			}
5499 			break;
5500 
5501 #if (DEVELOPMENT || DEBUG)
5502 		case SO_DEFUNCTIT:
5503 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5504 			if (error == 0) {
5505 				error = sodefunct(current_proc(), so, 0);
5506 			}
5507 
5508 			break;
5509 #endif /* (DEVELOPMENT || DEBUG) */
5510 
5511 		case SO_DEFUNCTOK:
5512 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5513 			    sizeof(optval));
5514 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5515 				if (error == 0) {
5516 					error = EBADF;
5517 				}
5518 				goto out;
5519 			}
5520 			/*
5521 			 * Any process can set SO_DEFUNCTOK (clear
5522 			 * SOF_NODEFUNCT), but only root can clear
5523 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5524 			 */
5525 			if (optval == 0 &&
5526 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5527 				error = EPERM;
5528 				goto out;
5529 			}
5530 			if (optval) {
5531 				so->so_flags &= ~SOF_NODEFUNCT;
5532 			} else {
5533 				so->so_flags |= SOF_NODEFUNCT;
5534 			}
5535 
5536 			if (SOCK_DOM(so) == PF_INET ||
5537 			    SOCK_DOM(so) == PF_INET6) {
5538 				char s[MAX_IPv6_STR_LEN];
5539 				char d[MAX_IPv6_STR_LEN];
5540 				struct inpcb *inp = sotoinpcb(so);
5541 
5542 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5543 				    "[%s %s:%d -> %s:%d] is now marked "
5544 				    "as %seligible for "
5545 				    "defunct\n", __func__, proc_selfpid(),
5546 				    proc_best_name(current_proc()),
5547 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5548 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5549 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5550 				    ((SOCK_DOM(so) == PF_INET) ?
5551 				    (void *)&inp->inp_laddr.s_addr :
5552 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5553 				    ntohs(inp->in6p_lport),
5554 				    inet_ntop(SOCK_DOM(so),
5555 				    (SOCK_DOM(so) == PF_INET) ?
5556 				    (void *)&inp->inp_faddr.s_addr :
5557 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5558 				    ntohs(inp->in6p_fport),
5559 				    (so->so_flags & SOF_NODEFUNCT) ?
5560 				    "not " : "");
5561 			} else {
5562 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5563 				    "is now marked as %seligible for "
5564 				    "defunct\n",
5565 				    __func__, proc_selfpid(),
5566 				    proc_best_name(current_proc()),
5567 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5568 				    SOCK_DOM(so), SOCK_TYPE(so),
5569 				    (so->so_flags & SOF_NODEFUNCT) ?
5570 				    "not " : "");
5571 			}
5572 			break;
5573 
5574 		case SO_ISDEFUNCT:
5575 			/* This option is not settable */
5576 			error = EINVAL;
5577 			break;
5578 
5579 		case SO_OPPORTUNISTIC:
5580 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5581 			    sizeof(optval));
5582 			if (error == 0) {
5583 				error = so_set_opportunistic(so, optval);
5584 			}
5585 			break;
5586 
5587 		case SO_FLUSH:
5588 			/* This option is handled by lower layer(s) */
5589 			error = 0;
5590 			break;
5591 
5592 		case SO_RECV_ANYIF:
5593 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5594 			    sizeof(optval));
5595 			if (error == 0) {
5596 				error = so_set_recv_anyif(so, optval);
5597 			}
5598 			break;
5599 
5600 		case SO_TRAFFIC_MGT_BACKGROUND: {
5601 			/* This option is handled by lower layer(s) */
5602 			error = 0;
5603 			break;
5604 		}
5605 
5606 #if FLOW_DIVERT
5607 		case SO_FLOW_DIVERT_TOKEN:
5608 			error = flow_divert_token_set(so, sopt);
5609 			break;
5610 #endif  /* FLOW_DIVERT */
5611 
5612 
5613 		case SO_DELEGATED:
5614 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 			    sizeof(optval))) != 0) {
5616 				break;
5617 			}
5618 
5619 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5620 			break;
5621 
5622 		case SO_DELEGATED_UUID: {
5623 			uuid_t euuid;
5624 
5625 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5626 			    sizeof(euuid))) != 0) {
5627 				break;
5628 			}
5629 
5630 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5631 			break;
5632 		}
5633 
5634 #if NECP
5635 		case SO_NECP_ATTRIBUTES:
5636 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5637 				/* Handled by MPTCP itself */
5638 				break;
5639 			}
5640 
5641 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5642 				error = EINVAL;
5643 				goto out;
5644 			}
5645 
5646 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5647 			break;
5648 
5649 		case SO_NECP_CLIENTUUID: {
5650 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5651 				/* Handled by MPTCP itself */
5652 				break;
5653 			}
5654 
5655 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5656 				error = EINVAL;
5657 				goto out;
5658 			}
5659 
5660 			struct inpcb *inp = sotoinpcb(so);
5661 			if (!uuid_is_null(inp->necp_client_uuid)) {
5662 				// Clear out the old client UUID if present
5663 				necp_inpcb_remove_cb(inp);
5664 			}
5665 
5666 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5667 			    sizeof(uuid_t), sizeof(uuid_t));
5668 			if (error != 0) {
5669 				goto out;
5670 			}
5671 
5672 			if (uuid_is_null(inp->necp_client_uuid)) {
5673 				error = EINVAL;
5674 				goto out;
5675 			}
5676 
5677 			pid_t current_pid = proc_pid(current_proc());
5678 			error = necp_client_register_socket_flow(current_pid,
5679 			    inp->necp_client_uuid, inp);
5680 			if (error != 0) {
5681 				uuid_clear(inp->necp_client_uuid);
5682 				goto out;
5683 			}
5684 
5685 			if (inp->inp_lport != 0) {
5686 				// There is a bound local port, so this is not
5687 				// a fresh socket. Assign to the client.
5688 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5689 			}
5690 
5691 			break;
5692 		}
5693 		case SO_NECP_LISTENUUID: {
5694 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5695 				error = EINVAL;
5696 				goto out;
5697 			}
5698 
5699 			struct inpcb *inp = sotoinpcb(so);
5700 			if (!uuid_is_null(inp->necp_client_uuid)) {
5701 				error = EINVAL;
5702 				goto out;
5703 			}
5704 
5705 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5706 			    sizeof(uuid_t), sizeof(uuid_t));
5707 			if (error != 0) {
5708 				goto out;
5709 			}
5710 
5711 			if (uuid_is_null(inp->necp_client_uuid)) {
5712 				error = EINVAL;
5713 				goto out;
5714 			}
5715 
5716 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5717 			    inp->necp_client_uuid, inp);
5718 			if (error != 0) {
5719 				uuid_clear(inp->necp_client_uuid);
5720 				goto out;
5721 			}
5722 
5723 			// Mark that the port registration is held by NECP
5724 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5725 
5726 			break;
5727 		}
5728 
5729 		case SO_RESOLVER_SIGNATURE: {
5730 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5731 				error = EINVAL;
5732 				goto out;
5733 			}
5734 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5735 			break;
5736 		}
5737 #endif /* NECP */
5738 
5739 		case SO_EXTENDED_BK_IDLE:
5740 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5741 			    sizeof(optval));
5742 			if (error == 0) {
5743 				error = so_set_extended_bk_idle(so, optval);
5744 			}
5745 			break;
5746 
5747 		case SO_MARK_CELLFALLBACK:
5748 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5749 			    sizeof(optval));
5750 			if (error != 0) {
5751 				goto out;
5752 			}
5753 			if (optval < 0) {
5754 				error = EINVAL;
5755 				goto out;
5756 			}
5757 			if (optval == 0) {
5758 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5759 			} else {
5760 				so->so_flags1 |= SOF1_CELLFALLBACK;
5761 			}
5762 			break;
5763 
5764 		case SO_MARK_CELLFALLBACK_UUID:
5765 		{
5766 			struct so_mark_cellfallback_uuid_args args;
5767 
5768 			error = sooptcopyin(sopt, &args, sizeof(args),
5769 			    sizeof(args));
5770 			if (error != 0) {
5771 				goto out;
5772 			}
5773 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5774 			    args.flow_cellfallback);
5775 			break;
5776 		}
5777 
5778 		case SO_FALLBACK_MODE:
5779 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5780 			    sizeof(optval));
5781 			if (error != 0) {
5782 				goto out;
5783 			}
5784 			if (optval < SO_FALLBACK_MODE_NONE ||
5785 			    optval > SO_FALLBACK_MODE_PREFER) {
5786 				error = EINVAL;
5787 				goto out;
5788 			}
5789 			so->so_fallback_mode = (u_int8_t)optval;
5790 			break;
5791 
5792 		case SO_MARK_KNOWN_TRACKER: {
5793 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5794 			    sizeof(optval));
5795 			if (error != 0) {
5796 				goto out;
5797 			}
5798 			if (optval < 0) {
5799 				error = EINVAL;
5800 				goto out;
5801 			}
5802 			if (optval == 0) {
5803 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5804 			} else {
5805 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5806 			}
5807 			break;
5808 		}
5809 
5810 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5811 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5812 			    sizeof(optval));
5813 			if (error != 0) {
5814 				goto out;
5815 			}
5816 			if (optval < 0) {
5817 				error = EINVAL;
5818 				goto out;
5819 			}
5820 			if (optval == 0) {
5821 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5822 			} else {
5823 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5824 			}
5825 			break;
5826 		}
5827 
5828 		case SO_MARK_APPROVED_APP_DOMAIN: {
5829 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5830 			    sizeof(optval));
5831 			if (error != 0) {
5832 				goto out;
5833 			}
5834 			if (optval < 0) {
5835 				error = EINVAL;
5836 				goto out;
5837 			}
5838 			if (optval == 0) {
5839 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5840 			} else {
5841 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5842 			}
5843 			break;
5844 		}
5845 
5846 		case SO_STATISTICS_EVENT:
5847 			error = sooptcopyin(sopt, &long_optval,
5848 			    sizeof(long_optval), sizeof(long_optval));
5849 			if (error != 0) {
5850 				goto out;
5851 			}
5852 			u_int64_t nstat_event = 0;
5853 			error = so_statistics_event_to_nstat_event(
5854 				&long_optval, &nstat_event);
5855 			if (error != 0) {
5856 				goto out;
5857 			}
5858 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5859 			break;
5860 
5861 		case SO_NET_SERVICE_TYPE: {
5862 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5863 			    sizeof(optval));
5864 			if (error != 0) {
5865 				goto out;
5866 			}
5867 			error = so_set_net_service_type(so, optval);
5868 			break;
5869 		}
5870 
5871 		case SO_QOSMARKING_POLICY_OVERRIDE:
5872 			error = priv_check_cred(kauth_cred_get(),
5873 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5874 			if (error != 0) {
5875 				goto out;
5876 			}
5877 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5878 			    sizeof(optval));
5879 			if (error != 0) {
5880 				goto out;
5881 			}
5882 			if (optval == 0) {
5883 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5884 			} else {
5885 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5886 			}
5887 			break;
5888 
5889 		case SO_MPKL_SEND_INFO: {
5890 			struct so_mpkl_send_info so_mpkl_send_info;
5891 
5892 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5893 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5894 			if (error != 0) {
5895 				goto out;
5896 			}
5897 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5898 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5899 
5900 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5901 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5902 			} else {
5903 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5904 			}
5905 			break;
5906 		}
5907 		case SO_WANT_KEV_SOCKET_CLOSED: {
5908 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5909 			    sizeof(optval));
5910 			if (error != 0) {
5911 				goto out;
5912 			}
5913 			if (optval == 0) {
5914 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5915 			} else {
5916 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5917 			}
5918 			break;
5919 		}
5920 		case SO_MARK_WAKE_PKT: {
5921 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5922 			    sizeof(optval));
5923 			if (error != 0) {
5924 				goto out;
5925 			}
5926 			if (optval == 0) {
5927 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5928 			} else {
5929 				so->so_flags |= SOF_MARK_WAKE_PKT;
5930 			}
5931 			break;
5932 		}
5933 		case SO_RECV_WAKE_PKT: {
5934 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5935 			    sizeof(optval));
5936 			if (error != 0) {
5937 				goto out;
5938 			}
5939 			if (optval == 0) {
5940 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5941 			} else {
5942 				so->so_flags |= SOF_RECV_WAKE_PKT;
5943 			}
5944 			break;
5945 		}
5946 		default:
5947 			error = ENOPROTOOPT;
5948 			break;
5949 		}
5950 		if (error == 0 && so->so_proto != NULL &&
5951 		    so->so_proto->pr_ctloutput != NULL) {
5952 			(void) so->so_proto->pr_ctloutput(so, sopt);
5953 		}
5954 	}
5955 out:
5956 	if (dolock) {
5957 		socket_unlock(so, 1);
5958 	}
5959 	return error;
5960 }
5961 
5962 /* Helper routines for getsockopt */
5963 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5964 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5965 {
5966 	int     error;
5967 	size_t  valsize;
5968 
5969 	error = 0;
5970 
5971 	/*
5972 	 * Documented get behavior is that we always return a value,
5973 	 * possibly truncated to fit in the user's buffer.
5974 	 * Traditional behavior is that we always tell the user
5975 	 * precisely how much we copied, rather than something useful
5976 	 * like the total amount we had available for her.
5977 	 * Note that this interface is not idempotent; the entire answer must
5978 	 * generated ahead of time.
5979 	 */
5980 	valsize = MIN(len, sopt->sopt_valsize);
5981 	sopt->sopt_valsize = valsize;
5982 	if (sopt->sopt_val != USER_ADDR_NULL) {
5983 		if (sopt->sopt_p != kernproc) {
5984 			error = copyout(buf, sopt->sopt_val, valsize);
5985 		} else {
5986 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5987 		}
5988 	}
5989 	return error;
5990 }
5991 
5992 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5993 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5994 {
5995 	int                     error;
5996 	size_t                  len;
5997 	struct user64_timeval   tv64 = {};
5998 	struct user32_timeval   tv32 = {};
5999 	const void *            val;
6000 	size_t                  valsize;
6001 
6002 	error = 0;
6003 	if (proc_is64bit(sopt->sopt_p)) {
6004 		len = sizeof(tv64);
6005 		tv64.tv_sec = tv_p->tv_sec;
6006 		tv64.tv_usec = tv_p->tv_usec;
6007 		val = &tv64;
6008 	} else {
6009 		len = sizeof(tv32);
6010 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
6011 		tv32.tv_usec = tv_p->tv_usec;
6012 		val = &tv32;
6013 	}
6014 	valsize = MIN(len, sopt->sopt_valsize);
6015 	sopt->sopt_valsize = valsize;
6016 	if (sopt->sopt_val != USER_ADDR_NULL) {
6017 		if (sopt->sopt_p != kernproc) {
6018 			error = copyout(val, sopt->sopt_val, valsize);
6019 		} else {
6020 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6021 		}
6022 	}
6023 	return error;
6024 }
6025 
6026 /*
6027  * Return:	0			Success
6028  *		ENOPROTOOPT
6029  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
6030  *	<pr_ctloutput>:???
6031  *	<sf_getoption>:???
6032  */
6033 int
sogetoptlock(struct socket * so,struct sockopt * sopt,int dolock)6034 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
6035 {
6036 	int     error, optval;
6037 	struct  linger l;
6038 	struct  timeval tv;
6039 
6040 	if (sopt->sopt_dir != SOPT_GET) {
6041 		sopt->sopt_dir = SOPT_GET;
6042 	}
6043 
6044 	if (dolock) {
6045 		socket_lock(so, 1);
6046 	}
6047 
6048 	error = sflt_getsockopt(so, sopt);
6049 	if (error != 0) {
6050 		if (error == EJUSTRETURN) {
6051 			error = 0;
6052 		}
6053 		goto out;
6054 	}
6055 
6056 	if (sopt->sopt_level != SOL_SOCKET) {
6057 		if (so->so_proto != NULL &&
6058 		    so->so_proto->pr_ctloutput != NULL) {
6059 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
6060 			goto out;
6061 		}
6062 		error = ENOPROTOOPT;
6063 	} else {
6064 		/*
6065 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
6066 		 * the protocol layer, if needed.  A zero value returned from
6067 		 * the handler means use default socket-level processing as
6068 		 * done by the rest of this routine.  Otherwise, any other
6069 		 * return value indicates that the option is unsupported.
6070 		 */
6071 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6072 		    pru_socheckopt(so, sopt)) != 0) {
6073 			goto out;
6074 		}
6075 
6076 		error = 0;
6077 		switch (sopt->sopt_name) {
6078 		case SO_LINGER:
6079 		case SO_LINGER_SEC:
6080 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6081 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6082 			    so->so_linger : so->so_linger / hz;
6083 			error = sooptcopyout(sopt, &l, sizeof(l));
6084 			break;
6085 
6086 		case SO_USELOOPBACK:
6087 		case SO_DONTROUTE:
6088 		case SO_DEBUG:
6089 		case SO_KEEPALIVE:
6090 		case SO_REUSEADDR:
6091 		case SO_REUSEPORT:
6092 		case SO_BROADCAST:
6093 		case SO_OOBINLINE:
6094 		case SO_TIMESTAMP:
6095 		case SO_TIMESTAMP_MONOTONIC:
6096 		case SO_TIMESTAMP_CONTINUOUS:
6097 		case SO_DONTTRUNC:
6098 		case SO_WANTMORE:
6099 		case SO_WANTOOBFLAG:
6100 		case SO_NOWAKEFROMSLEEP:
6101 		case SO_NOAPNFALLBK:
6102 			optval = so->so_options & sopt->sopt_name;
6103 integer:
6104 			error = sooptcopyout(sopt, &optval, sizeof(optval));
6105 			break;
6106 
6107 		case SO_TYPE:
6108 			optval = so->so_type;
6109 			goto integer;
6110 
6111 		case SO_NREAD:
6112 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6113 				int pkt_total;
6114 				struct mbuf *m1;
6115 
6116 				pkt_total = 0;
6117 				m1 = so->so_rcv.sb_mb;
6118 				while (m1 != NULL) {
6119 					if (m1->m_type == MT_DATA ||
6120 					    m1->m_type == MT_HEADER ||
6121 					    m1->m_type == MT_OOBDATA) {
6122 						pkt_total += m1->m_len;
6123 					}
6124 					m1 = m1->m_next;
6125 				}
6126 				optval = pkt_total;
6127 			} else {
6128 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6129 			}
6130 			goto integer;
6131 
6132 		case SO_NUMRCVPKT:
6133 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6134 				int cnt = 0;
6135 				struct mbuf *m1;
6136 
6137 				m1 = so->so_rcv.sb_mb;
6138 				while (m1 != NULL) {
6139 					cnt += 1;
6140 					m1 = m1->m_nextpkt;
6141 				}
6142 				optval = cnt;
6143 				goto integer;
6144 			} else {
6145 				error = ENOPROTOOPT;
6146 				break;
6147 			}
6148 
6149 		case SO_NWRITE:
6150 			optval = so->so_snd.sb_cc;
6151 			goto integer;
6152 
6153 		case SO_ERROR:
6154 			optval = so->so_error;
6155 			so->so_error = 0;
6156 			goto integer;
6157 
6158 		case SO_SNDBUF: {
6159 			u_int32_t hiwat = so->so_snd.sb_hiwat;
6160 
6161 			if (so->so_snd.sb_flags & SB_UNIX) {
6162 				struct unpcb *unp =
6163 				    (struct unpcb *)(so->so_pcb);
6164 				if (unp != NULL && unp->unp_conn != NULL) {
6165 					hiwat += unp->unp_conn->unp_cc;
6166 				}
6167 			}
6168 
6169 			optval = hiwat;
6170 			goto integer;
6171 		}
6172 		case SO_RCVBUF:
6173 			optval = so->so_rcv.sb_hiwat;
6174 			goto integer;
6175 
6176 		case SO_SNDLOWAT:
6177 			optval = so->so_snd.sb_lowat;
6178 			goto integer;
6179 
6180 		case SO_RCVLOWAT:
6181 			optval = so->so_rcv.sb_lowat;
6182 			goto integer;
6183 
6184 		case SO_SNDTIMEO:
6185 		case SO_RCVTIMEO:
6186 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
6187 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6188 
6189 			error = sooptcopyout_timeval(sopt, &tv);
6190 			break;
6191 
6192 		case SO_NOSIGPIPE:
6193 			optval = (so->so_flags & SOF_NOSIGPIPE);
6194 			goto integer;
6195 
6196 		case SO_NOADDRERR:
6197 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6198 			goto integer;
6199 
6200 		case SO_REUSESHAREUID:
6201 			optval = (so->so_flags & SOF_REUSESHAREUID);
6202 			goto integer;
6203 
6204 
6205 		case SO_NOTIFYCONFLICT:
6206 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6207 			goto integer;
6208 
6209 		case SO_RESTRICTIONS:
6210 			optval = so_get_restrictions(so);
6211 			goto integer;
6212 
6213 		case SO_AWDL_UNRESTRICTED:
6214 			if (SOCK_DOM(so) == PF_INET ||
6215 			    SOCK_DOM(so) == PF_INET6) {
6216 				optval = inp_get_awdl_unrestricted(
6217 					sotoinpcb(so));
6218 				goto integer;
6219 			} else {
6220 				error = EOPNOTSUPP;
6221 			}
6222 			break;
6223 
6224 		case SO_INTCOPROC_ALLOW:
6225 			if (SOCK_DOM(so) == PF_INET6) {
6226 				optval = inp_get_intcoproc_allowed(
6227 					sotoinpcb(so));
6228 				goto integer;
6229 			} else {
6230 				error = EOPNOTSUPP;
6231 			}
6232 			break;
6233 
6234 		case SO_LABEL:
6235 			error = EOPNOTSUPP;
6236 			break;
6237 
6238 		case SO_PEERLABEL:
6239 			error = EOPNOTSUPP;
6240 			break;
6241 
6242 #ifdef __APPLE_API_PRIVATE
6243 		case SO_UPCALLCLOSEWAIT:
6244 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6245 			goto integer;
6246 #endif
6247 		case SO_RANDOMPORT:
6248 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6249 			goto integer;
6250 
6251 		case SO_NP_EXTENSIONS: {
6252 			struct so_np_extensions sonpx = {};
6253 
6254 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6255 			    SONPX_SETOPTSHUT : 0;
6256 			sonpx.npx_mask = SONPX_MASK_VALID;
6257 
6258 			error = sooptcopyout(sopt, &sonpx,
6259 			    sizeof(struct so_np_extensions));
6260 			break;
6261 		}
6262 
6263 		case SO_TRAFFIC_CLASS:
6264 			optval = so->so_traffic_class;
6265 			goto integer;
6266 
6267 		case SO_RECV_TRAFFIC_CLASS:
6268 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6269 			goto integer;
6270 
6271 #if (DEVELOPMENT || DEBUG)
6272 		case SO_TRAFFIC_CLASS_DBG:
6273 			error = sogetopt_tcdbg(so, sopt);
6274 			break;
6275 #endif /* (DEVELOPMENT || DEBUG) */
6276 
6277 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6278 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6279 			goto integer;
6280 
6281 		case SO_DEFUNCTOK:
6282 			optval = !(so->so_flags & SOF_NODEFUNCT);
6283 			goto integer;
6284 
6285 		case SO_ISDEFUNCT:
6286 			optval = (so->so_flags & SOF_DEFUNCT);
6287 			goto integer;
6288 
6289 		case SO_OPPORTUNISTIC:
6290 			optval = so_get_opportunistic(so);
6291 			goto integer;
6292 
6293 		case SO_FLUSH:
6294 			/* This option is not gettable */
6295 			error = EINVAL;
6296 			break;
6297 
6298 		case SO_RECV_ANYIF:
6299 			optval = so_get_recv_anyif(so);
6300 			goto integer;
6301 
6302 		case SO_TRAFFIC_MGT_BACKGROUND:
6303 			/* This option is handled by lower layer(s) */
6304 			if (so->so_proto != NULL &&
6305 			    so->so_proto->pr_ctloutput != NULL) {
6306 				(void) so->so_proto->pr_ctloutput(so, sopt);
6307 			}
6308 			break;
6309 
6310 #if FLOW_DIVERT
6311 		case SO_FLOW_DIVERT_TOKEN:
6312 			error = flow_divert_token_get(so, sopt);
6313 			break;
6314 #endif  /* FLOW_DIVERT */
6315 
6316 #if NECP
6317 		case SO_NECP_ATTRIBUTES:
6318 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6319 				/* Handled by MPTCP itself */
6320 				break;
6321 			}
6322 
6323 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6324 				error = EINVAL;
6325 				goto out;
6326 			}
6327 
6328 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6329 			break;
6330 
6331 		case SO_NECP_CLIENTUUID: {
6332 			uuid_t *ncu;
6333 
6334 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6335 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6336 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6337 				ncu = &sotoinpcb(so)->necp_client_uuid;
6338 			} else {
6339 				error = EINVAL;
6340 				goto out;
6341 			}
6342 
6343 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6344 			break;
6345 		}
6346 
6347 		case SO_NECP_LISTENUUID: {
6348 			uuid_t *nlu;
6349 
6350 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6351 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6352 					nlu = &sotoinpcb(so)->necp_client_uuid;
6353 				} else {
6354 					error = ENOENT;
6355 					goto out;
6356 				}
6357 			} else {
6358 				error = EINVAL;
6359 				goto out;
6360 			}
6361 
6362 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6363 			break;
6364 		}
6365 
6366 		case SO_RESOLVER_SIGNATURE: {
6367 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6368 				error = EINVAL;
6369 				goto out;
6370 			}
6371 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6372 			break;
6373 		}
6374 
6375 #endif /* NECP */
6376 
6377 #if CONTENT_FILTER
6378 		case SO_CFIL_SOCK_ID: {
6379 			cfil_sock_id_t sock_id;
6380 
6381 			sock_id = cfil_sock_id_from_socket(so);
6382 
6383 			error = sooptcopyout(sopt, &sock_id,
6384 			    sizeof(cfil_sock_id_t));
6385 			break;
6386 		}
6387 #endif  /* CONTENT_FILTER */
6388 
6389 		case SO_EXTENDED_BK_IDLE:
6390 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6391 			goto integer;
6392 		case SO_MARK_CELLFALLBACK:
6393 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6394 			    ? 1 : 0;
6395 			goto integer;
6396 		case SO_FALLBACK_MODE:
6397 			optval = so->so_fallback_mode;
6398 			goto integer;
6399 		case SO_MARK_KNOWN_TRACKER: {
6400 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6401 			    ? 1 : 0;
6402 			goto integer;
6403 		}
6404 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6405 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6406 			    ? 1 : 0;
6407 			goto integer;
6408 		}
6409 		case SO_MARK_APPROVED_APP_DOMAIN: {
6410 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6411 			    ? 1 : 0;
6412 			goto integer;
6413 		}
6414 		case SO_NET_SERVICE_TYPE: {
6415 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6416 				optval = so->so_netsvctype;
6417 			} else {
6418 				optval = NET_SERVICE_TYPE_BE;
6419 			}
6420 			goto integer;
6421 		}
6422 		case SO_NETSVC_MARKING_LEVEL:
6423 			optval = so_get_netsvc_marking_level(so);
6424 			goto integer;
6425 
6426 		case SO_MPKL_SEND_INFO: {
6427 			struct so_mpkl_send_info so_mpkl_send_info;
6428 
6429 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6430 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6431 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6432 			    sizeof(struct so_mpkl_send_info));
6433 			break;
6434 		}
6435 		case SO_MARK_WAKE_PKT:
6436 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6437 			goto integer;
6438 		case SO_RECV_WAKE_PKT:
6439 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6440 			goto integer;
6441 		default:
6442 			error = ENOPROTOOPT;
6443 			break;
6444 		}
6445 	}
6446 out:
6447 	if (dolock) {
6448 		socket_unlock(so, 1);
6449 	}
6450 	return error;
6451 }
6452 
6453 /*
6454  * The size limits on our soopt_getm is different from that on FreeBSD.
6455  * We limit the size of options to MCLBYTES. This will have to change
6456  * if we need to define options that need more space than MCLBYTES.
6457  */
6458 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6459 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6460 {
6461 	struct mbuf *m, *m_prev;
6462 	int sopt_size = (int)sopt->sopt_valsize;
6463 	int how;
6464 
6465 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6466 		return EMSGSIZE;
6467 	}
6468 
6469 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6470 	MGET(m, how, MT_DATA);
6471 	if (m == NULL) {
6472 		return ENOBUFS;
6473 	}
6474 	if (sopt_size > MLEN) {
6475 		MCLGET(m, how);
6476 		if ((m->m_flags & M_EXT) == 0) {
6477 			m_free(m);
6478 			return ENOBUFS;
6479 		}
6480 		m->m_len = min(MCLBYTES, sopt_size);
6481 	} else {
6482 		m->m_len = min(MLEN, sopt_size);
6483 	}
6484 	sopt_size -= m->m_len;
6485 	*mp = m;
6486 	m_prev = m;
6487 
6488 	while (sopt_size > 0) {
6489 		MGET(m, how, MT_DATA);
6490 		if (m == NULL) {
6491 			m_freem(*mp);
6492 			return ENOBUFS;
6493 		}
6494 		if (sopt_size > MLEN) {
6495 			MCLGET(m, how);
6496 			if ((m->m_flags & M_EXT) == 0) {
6497 				m_freem(*mp);
6498 				m_freem(m);
6499 				return ENOBUFS;
6500 			}
6501 			m->m_len = min(MCLBYTES, sopt_size);
6502 		} else {
6503 			m->m_len = min(MLEN, sopt_size);
6504 		}
6505 		sopt_size -= m->m_len;
6506 		m_prev->m_next = m;
6507 		m_prev = m;
6508 	}
6509 	return 0;
6510 }
6511 
6512 /* copyin sopt data into mbuf chain */
6513 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6514 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6515 {
6516 	struct mbuf *m0 = m;
6517 
6518 	if (sopt->sopt_val == USER_ADDR_NULL) {
6519 		return 0;
6520 	}
6521 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6522 		if (sopt->sopt_p != kernproc) {
6523 			int error;
6524 
6525 			error = copyin(sopt->sopt_val, mtod(m, char *),
6526 			    m->m_len);
6527 			if (error != 0) {
6528 				m_freem(m0);
6529 				return error;
6530 			}
6531 		} else {
6532 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6533 			    mtod(m, char *), m->m_len);
6534 		}
6535 		sopt->sopt_valsize -= m->m_len;
6536 		sopt->sopt_val += m->m_len;
6537 		m = m->m_next;
6538 	}
6539 	/* should be allocated enoughly at ip6_sooptmcopyin() */
6540 	if (m != NULL) {
6541 		panic("soopt_mcopyin");
6542 		/* NOTREACHED */
6543 	}
6544 	return 0;
6545 }
6546 
6547 /* copyout mbuf chain data into soopt */
6548 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6549 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6550 {
6551 	struct mbuf *m0 = m;
6552 	size_t valsize = 0;
6553 
6554 	if (sopt->sopt_val == USER_ADDR_NULL) {
6555 		return 0;
6556 	}
6557 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6558 		if (sopt->sopt_p != kernproc) {
6559 			int error;
6560 
6561 			error = copyout(mtod(m, char *), sopt->sopt_val,
6562 			    m->m_len);
6563 			if (error != 0) {
6564 				m_freem(m0);
6565 				return error;
6566 			}
6567 		} else {
6568 			bcopy(mtod(m, char *),
6569 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6570 		}
6571 		sopt->sopt_valsize -= m->m_len;
6572 		sopt->sopt_val += m->m_len;
6573 		valsize += m->m_len;
6574 		m = m->m_next;
6575 	}
6576 	if (m != NULL) {
6577 		/* enough soopt buffer should be given from user-land */
6578 		m_freem(m0);
6579 		return EINVAL;
6580 	}
6581 	sopt->sopt_valsize = valsize;
6582 	return 0;
6583 }
6584 
6585 void
sohasoutofband(struct socket * so)6586 sohasoutofband(struct socket *so)
6587 {
6588 	if (so->so_pgid < 0) {
6589 		gsignal(-so->so_pgid, SIGURG);
6590 	} else if (so->so_pgid > 0) {
6591 		proc_signal(so->so_pgid, SIGURG);
6592 	}
6593 	selwakeup(&so->so_rcv.sb_sel);
6594 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6595 		KNOTE(&so->so_rcv.sb_sel.si_note,
6596 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6597 	}
6598 }
6599 
6600 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6601 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6602 {
6603 #pragma unused(cred)
6604 	struct proc *p = current_proc();
6605 	int revents = 0;
6606 
6607 	socket_lock(so, 1);
6608 	so_update_last_owner_locked(so, PROC_NULL);
6609 	so_update_policy(so);
6610 
6611 	if (events & (POLLIN | POLLRDNORM)) {
6612 		if (soreadable(so)) {
6613 			revents |= events & (POLLIN | POLLRDNORM);
6614 		}
6615 	}
6616 
6617 	if (events & (POLLOUT | POLLWRNORM)) {
6618 		if (sowriteable(so)) {
6619 			revents |= events & (POLLOUT | POLLWRNORM);
6620 		}
6621 	}
6622 
6623 	if (events & (POLLPRI | POLLRDBAND)) {
6624 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6625 			revents |= events & (POLLPRI | POLLRDBAND);
6626 		}
6627 	}
6628 
6629 	if (revents == 0) {
6630 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6631 			/*
6632 			 * Darwin sets the flag first,
6633 			 * BSD calls selrecord first
6634 			 */
6635 			so->so_rcv.sb_flags |= SB_SEL;
6636 			selrecord(p, &so->so_rcv.sb_sel, wql);
6637 		}
6638 
6639 		if (events & (POLLOUT | POLLWRNORM)) {
6640 			/*
6641 			 * Darwin sets the flag first,
6642 			 * BSD calls selrecord first
6643 			 */
6644 			so->so_snd.sb_flags |= SB_SEL;
6645 			selrecord(p, &so->so_snd.sb_sel, wql);
6646 		}
6647 	}
6648 
6649 	socket_unlock(so, 1);
6650 	return revents;
6651 }
6652 
6653 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6654 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6655 {
6656 	struct socket *so = (struct socket *)fp_get_data(fp);
6657 	int result;
6658 
6659 	socket_lock(so, 1);
6660 	so_update_last_owner_locked(so, PROC_NULL);
6661 	so_update_policy(so);
6662 
6663 	switch (kn->kn_filter) {
6664 	case EVFILT_READ:
6665 		kn->kn_filtid = EVFILTID_SOREAD;
6666 		break;
6667 	case EVFILT_WRITE:
6668 		kn->kn_filtid = EVFILTID_SOWRITE;
6669 		break;
6670 	case EVFILT_SOCK:
6671 		kn->kn_filtid = EVFILTID_SCK;
6672 		break;
6673 	case EVFILT_EXCEPT:
6674 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6675 		break;
6676 	default:
6677 		socket_unlock(so, 1);
6678 		knote_set_error(kn, EINVAL);
6679 		return 0;
6680 	}
6681 
6682 	/*
6683 	 * call the appropriate sub-filter attach
6684 	 * with the socket still locked
6685 	 */
6686 	result = knote_fops(kn)->f_attach(kn, kev);
6687 
6688 	socket_unlock(so, 1);
6689 
6690 	return result;
6691 }
6692 
6693 static int
filt_soread_common(struct knote * kn,struct kevent_qos_s * kev,struct socket * so)6694 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6695 {
6696 	int retval = 0;
6697 	int64_t data = 0;
6698 
6699 	if (so->so_options & SO_ACCEPTCONN) {
6700 		/*
6701 		 * Radar 6615193 handle the listen case dynamically
6702 		 * for kqueue read filter. This allows to call listen()
6703 		 * after registering the kqueue EVFILT_READ.
6704 		 */
6705 
6706 		retval = !TAILQ_EMPTY(&so->so_comp);
6707 		data = so->so_qlen;
6708 		goto out;
6709 	}
6710 
6711 	/* socket isn't a listener */
6712 	/*
6713 	 * NOTE_LOWAT specifies new low water mark in data, i.e.
6714 	 * the bytes of protocol data. We therefore exclude any
6715 	 * control bytes.
6716 	 */
6717 	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6718 
6719 	if (kn->kn_sfflags & NOTE_OOB) {
6720 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6721 			kn->kn_fflags |= NOTE_OOB;
6722 			data -= so->so_oobmark;
6723 			retval = 1;
6724 			goto out;
6725 		}
6726 	}
6727 
6728 	if ((so->so_state & SS_CANTRCVMORE)
6729 #if CONTENT_FILTER
6730 	    && cfil_sock_data_pending(&so->so_rcv) == 0
6731 #endif /* CONTENT_FILTER */
6732 	    ) {
6733 		kn->kn_flags |= EV_EOF;
6734 		kn->kn_fflags = so->so_error;
6735 		retval = 1;
6736 		goto out;
6737 	}
6738 
6739 	if (so->so_error) {     /* temporary udp error */
6740 		retval = 1;
6741 		goto out;
6742 	}
6743 
6744 	int64_t lowwat = so->so_rcv.sb_lowat;
6745 	/*
6746 	 * Ensure that when NOTE_LOWAT is used, the derived
6747 	 * low water mark is bounded by socket's rcv buf's
6748 	 * high and low water mark values.
6749 	 */
6750 	if (kn->kn_sfflags & NOTE_LOWAT) {
6751 		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6752 			lowwat = so->so_rcv.sb_hiwat;
6753 		} else if (kn->kn_sdata > lowwat) {
6754 			lowwat = kn->kn_sdata;
6755 		}
6756 	}
6757 
6758 	/*
6759 	 * While the `data` field is the amount of data to read,
6760 	 * 0-sized packets need to wake up the kqueue, see 58140856,
6761 	 * so we need to take control bytes into account too.
6762 	 */
6763 	retval = (so->so_rcv.sb_cc >= lowwat);
6764 
6765 out:
6766 	if (retval && kev) {
6767 		knote_fill_kevent(kn, kev, data);
6768 	}
6769 	return retval;
6770 }
6771 
6772 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6773 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6774 {
6775 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6776 
6777 	/* socket locked */
6778 
6779 	/*
6780 	 * If the caller explicitly asked for OOB results (e.g. poll())
6781 	 * from EVFILT_READ, then save that off in the hookid field
6782 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6783 	 */
6784 	if (kn->kn_filter == EVFILT_READ &&
6785 	    kn->kn_flags & EV_OOBAND) {
6786 		kn->kn_flags &= ~EV_OOBAND;
6787 		kn->kn_hook32 = EV_OOBAND;
6788 	} else {
6789 		kn->kn_hook32 = 0;
6790 	}
6791 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6792 		so->so_rcv.sb_flags |= SB_KNOTE;
6793 	}
6794 
6795 	/* indicate if event is already fired */
6796 	return filt_soread_common(kn, NULL, so);
6797 }
6798 
6799 static void
filt_sordetach(struct knote * kn)6800 filt_sordetach(struct knote *kn)
6801 {
6802 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6803 
6804 	socket_lock(so, 1);
6805 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6806 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6807 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6808 		}
6809 	}
6810 	socket_unlock(so, 1);
6811 }
6812 
6813 /*ARGSUSED*/
6814 static int
filt_soread(struct knote * kn,long hint)6815 filt_soread(struct knote *kn, long hint)
6816 {
6817 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6818 	int retval;
6819 
6820 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6821 		socket_lock(so, 1);
6822 	}
6823 
6824 	retval = filt_soread_common(kn, NULL, so);
6825 
6826 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6827 		socket_unlock(so, 1);
6828 	}
6829 
6830 	return retval;
6831 }
6832 
6833 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6834 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6835 {
6836 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6837 	int retval;
6838 
6839 	socket_lock(so, 1);
6840 
6841 	/* save off the new input fflags and data */
6842 	kn->kn_sfflags = kev->fflags;
6843 	kn->kn_sdata = kev->data;
6844 
6845 	/* determine if changes result in fired events */
6846 	retval = filt_soread_common(kn, NULL, so);
6847 
6848 	socket_unlock(so, 1);
6849 
6850 	return retval;
6851 }
6852 
6853 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6854 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6855 {
6856 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6857 	int retval;
6858 
6859 	socket_lock(so, 1);
6860 	retval = filt_soread_common(kn, kev, so);
6861 	socket_unlock(so, 1);
6862 
6863 	return retval;
6864 }
6865 
6866 int
so_wait_for_if_feedback(struct socket * so)6867 so_wait_for_if_feedback(struct socket *so)
6868 {
6869 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6870 	    (so->so_state & SS_ISCONNECTED)) {
6871 		struct inpcb *inp = sotoinpcb(so);
6872 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6873 			return 1;
6874 		}
6875 	}
6876 	return 0;
6877 }
6878 
6879 static int
filt_sowrite_common(struct knote * kn,struct kevent_qos_s * kev,struct socket * so)6880 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6881 {
6882 	int ret = 0;
6883 	int64_t data = sbspace(&so->so_snd);
6884 
6885 	if (so->so_state & SS_CANTSENDMORE) {
6886 		kn->kn_flags |= EV_EOF;
6887 		kn->kn_fflags = so->so_error;
6888 		ret = 1;
6889 		goto out;
6890 	}
6891 
6892 	if (so->so_error) {     /* temporary udp error */
6893 		ret = 1;
6894 		goto out;
6895 	}
6896 
6897 	if (!socanwrite(so)) {
6898 		ret = 0;
6899 		goto out;
6900 	}
6901 
6902 	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6903 		ret = 1;
6904 		goto out;
6905 	}
6906 
6907 	int64_t lowwat = so->so_snd.sb_lowat;
6908 	const int64_t hiwat = so->so_snd.sb_hiwat;
6909 	/*
6910 	 * Deal with connected UNIX domain sockets which
6911 	 * rely on the fact that the sender's socket buffer is
6912 	 * actually the receiver's socket buffer.
6913 	 */
6914 	if (SOCK_DOM(so) == PF_LOCAL) {
6915 		struct unpcb *unp = sotounpcb(so);
6916 		if (unp != NULL && unp->unp_conn != NULL &&
6917 		    unp->unp_conn->unp_socket != NULL) {
6918 			struct socket *so2 = unp->unp_conn->unp_socket;
6919 			/*
6920 			 * At this point we know that `so' is locked
6921 			 * and that `unp_conn` isn't going to change.
6922 			 * However, we don't lock `so2` because doing so
6923 			 * may require unlocking `so'
6924 			 * (see unp_get_locks_in_order()).
6925 			 *
6926 			 * Two cases can happen:
6927 			 *
6928 			 * 1) we return 1 and tell the application that
6929 			 *    it can write.  Meanwhile, another thread
6930 			 *    fills up the socket buffer.  This will either
6931 			 *    lead to a blocking send or EWOULDBLOCK
6932 			 *    which the application should deal with.
6933 			 * 2) we return 0 and tell the application that
6934 			 *    the socket is not writable.  Meanwhile,
6935 			 *    another thread depletes the receive socket
6936 			 *    buffer. In this case the application will
6937 			 *    be woken up by sb_notify().
6938 			 *
6939 			 * MIN() is required because otherwise sosendcheck()
6940 			 * may return EWOULDBLOCK since it only considers
6941 			 * so->so_snd.
6942 			 */
6943 			data = MIN(data, sbspace(&so2->so_rcv));
6944 		}
6945 	}
6946 
6947 	if (kn->kn_sfflags & NOTE_LOWAT) {
6948 		if (kn->kn_sdata > hiwat) {
6949 			lowwat = hiwat;
6950 		} else if (kn->kn_sdata > lowwat) {
6951 			lowwat = kn->kn_sdata;
6952 		}
6953 	}
6954 
6955 	if (data > 0 && data >= lowwat) {
6956 		if ((so->so_flags & SOF_NOTSENT_LOWAT)
6957 #if (DEBUG || DEVELOPMENT)
6958 		    && so_notsent_lowat_check == 1
6959 #endif /* DEBUG || DEVELOPMENT */
6960 		    ) {
6961 			if ((SOCK_DOM(so) == PF_INET ||
6962 			    SOCK_DOM(so) == PF_INET6) &&
6963 			    so->so_type == SOCK_STREAM) {
6964 				ret = tcp_notsent_lowat_check(so);
6965 			}
6966 #if MPTCP
6967 			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6968 			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
6969 				ret = mptcp_notsent_lowat_check(so);
6970 			}
6971 #endif
6972 			else {
6973 				ret = 1;
6974 				goto out;
6975 			}
6976 		} else {
6977 			ret = 1;
6978 		}
6979 	}
6980 	if (so_wait_for_if_feedback(so)) {
6981 		ret = 0;
6982 	}
6983 
6984 out:
6985 	if (ret && kev) {
6986 		knote_fill_kevent(kn, kev, data);
6987 	}
6988 	return ret;
6989 }
6990 
6991 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6992 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6993 {
6994 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6995 
6996 	/* socket locked */
6997 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6998 		so->so_snd.sb_flags |= SB_KNOTE;
6999 	}
7000 
7001 	/* determine if its already fired */
7002 	return filt_sowrite_common(kn, NULL, so);
7003 }
7004 
7005 static void
filt_sowdetach(struct knote * kn)7006 filt_sowdetach(struct knote *kn)
7007 {
7008 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7009 	socket_lock(so, 1);
7010 
7011 	if (so->so_snd.sb_flags & SB_KNOTE) {
7012 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
7013 			so->so_snd.sb_flags &= ~SB_KNOTE;
7014 		}
7015 	}
7016 	socket_unlock(so, 1);
7017 }
7018 
7019 /*ARGSUSED*/
7020 static int
filt_sowrite(struct knote * kn,long hint)7021 filt_sowrite(struct knote *kn, long hint)
7022 {
7023 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7024 	int ret;
7025 
7026 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7027 		socket_lock(so, 1);
7028 	}
7029 
7030 	ret = filt_sowrite_common(kn, NULL, so);
7031 
7032 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7033 		socket_unlock(so, 1);
7034 	}
7035 
7036 	return ret;
7037 }
7038 
7039 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)7040 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
7041 {
7042 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7043 	int ret;
7044 
7045 	socket_lock(so, 1);
7046 
7047 	/*save off the new input fflags and data */
7048 	kn->kn_sfflags = kev->fflags;
7049 	kn->kn_sdata = kev->data;
7050 
7051 	/* determine if these changes result in a triggered event */
7052 	ret = filt_sowrite_common(kn, NULL, so);
7053 
7054 	socket_unlock(so, 1);
7055 
7056 	return ret;
7057 }
7058 
7059 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)7060 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
7061 {
7062 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7063 	int ret;
7064 
7065 	socket_lock(so, 1);
7066 	ret = filt_sowrite_common(kn, kev, so);
7067 	socket_unlock(so, 1);
7068 
7069 	return ret;
7070 }
7071 
7072 static int
filt_sockev_common(struct knote * kn,struct kevent_qos_s * kev,struct socket * so,long ev_hint)7073 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
7074     struct socket *so, long ev_hint)
7075 {
7076 	int ret = 0;
7077 	int64_t data = 0;
7078 	uint32_t level_trigger = 0;
7079 
7080 	if (ev_hint & SO_FILT_HINT_CONNRESET) {
7081 		kn->kn_fflags |= NOTE_CONNRESET;
7082 	}
7083 	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
7084 		kn->kn_fflags |= NOTE_TIMEOUT;
7085 	}
7086 	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
7087 		kn->kn_fflags |= NOTE_NOSRCADDR;
7088 	}
7089 	if (ev_hint & SO_FILT_HINT_IFDENIED) {
7090 		kn->kn_fflags |= NOTE_IFDENIED;
7091 	}
7092 	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
7093 		kn->kn_fflags |= NOTE_KEEPALIVE;
7094 	}
7095 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7096 		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7097 	}
7098 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7099 		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7100 	}
7101 	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7102 	    (so->so_state & SS_ISCONNECTED)) {
7103 		kn->kn_fflags |= NOTE_CONNECTED;
7104 		level_trigger |= NOTE_CONNECTED;
7105 	}
7106 	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7107 	    (so->so_state & SS_ISDISCONNECTED)) {
7108 		kn->kn_fflags |= NOTE_DISCONNECTED;
7109 		level_trigger |= NOTE_DISCONNECTED;
7110 	}
7111 	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7112 		if (so->so_proto != NULL &&
7113 		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7114 			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7115 		}
7116 	}
7117 	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7118 	    tcp_notify_ack_active(so)) {
7119 		kn->kn_fflags |= NOTE_NOTIFY_ACK;
7120 	}
7121 	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
7122 		kn->kn_fflags |= NOTE_WAKE_PKT;
7123 	}
7124 
7125 	if ((so->so_state & SS_CANTRCVMORE)
7126 #if CONTENT_FILTER
7127 	    && cfil_sock_data_pending(&so->so_rcv) == 0
7128 #endif /* CONTENT_FILTER */
7129 	    ) {
7130 		kn->kn_fflags |= NOTE_READCLOSED;
7131 		level_trigger |= NOTE_READCLOSED;
7132 	}
7133 
7134 	if (so->so_state & SS_CANTSENDMORE) {
7135 		kn->kn_fflags |= NOTE_WRITECLOSED;
7136 		level_trigger |= NOTE_WRITECLOSED;
7137 	}
7138 
7139 	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7140 	    (so->so_flags & SOF_SUSPENDED)) {
7141 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7142 
7143 		/* If resume event was delivered before, reset it */
7144 		kn->kn_hook32 &= ~NOTE_RESUME;
7145 
7146 		kn->kn_fflags |= NOTE_SUSPEND;
7147 		level_trigger |= NOTE_SUSPEND;
7148 	}
7149 
7150 	if ((ev_hint & SO_FILT_HINT_RESUME) ||
7151 	    (so->so_flags & SOF_SUSPENDED) == 0) {
7152 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7153 
7154 		/* If suspend event was delivered before, reset it */
7155 		kn->kn_hook32 &= ~NOTE_SUSPEND;
7156 
7157 		kn->kn_fflags |= NOTE_RESUME;
7158 		level_trigger |= NOTE_RESUME;
7159 	}
7160 
7161 	if (so->so_error != 0) {
7162 		ret = 1;
7163 		data = so->so_error;
7164 		kn->kn_flags |= EV_EOF;
7165 	} else {
7166 		u_int32_t data32 = 0;
7167 		get_sockev_state(so, &data32);
7168 		data = data32;
7169 	}
7170 
7171 	/* Reset any events that are not requested on this knote */
7172 	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7173 	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7174 
7175 	/* Find the level triggerred events that are already delivered */
7176 	level_trigger &= kn->kn_hook32;
7177 	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7178 
7179 	/* Do not deliver level triggerred events more than once */
7180 	if ((kn->kn_fflags & ~level_trigger) != 0) {
7181 		ret = 1;
7182 	}
7183 
7184 	if (ret && kev) {
7185 		/*
7186 		 * Store the state of the events being delivered. This
7187 		 * state can be used to deliver level triggered events
7188 		 * ateast once and still avoid waking up the application
7189 		 * multiple times as long as the event is active.
7190 		 */
7191 		if (kn->kn_fflags != 0) {
7192 			kn->kn_hook32 |= (kn->kn_fflags &
7193 			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7194 		}
7195 
7196 		/*
7197 		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
7198 		 * only one of them and remember the last one that was
7199 		 * delivered last
7200 		 */
7201 		if (kn->kn_fflags & NOTE_SUSPEND) {
7202 			kn->kn_hook32 &= ~NOTE_RESUME;
7203 		}
7204 		if (kn->kn_fflags & NOTE_RESUME) {
7205 			kn->kn_hook32 &= ~NOTE_SUSPEND;
7206 		}
7207 
7208 		knote_fill_kevent(kn, kev, data);
7209 	}
7210 	return ret;
7211 }
7212 
7213 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7214 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7215 {
7216 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7217 
7218 	/* socket locked */
7219 	kn->kn_hook32 = 0;
7220 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7221 		so->so_flags |= SOF_KNOTE;
7222 	}
7223 
7224 	/* determine if event already fired */
7225 	return filt_sockev_common(kn, NULL, so, 0);
7226 }
7227 
7228 static void
filt_sockdetach(struct knote * kn)7229 filt_sockdetach(struct knote *kn)
7230 {
7231 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7232 	socket_lock(so, 1);
7233 
7234 	if ((so->so_flags & SOF_KNOTE) != 0) {
7235 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7236 			so->so_flags &= ~SOF_KNOTE;
7237 		}
7238 	}
7239 	socket_unlock(so, 1);
7240 }
7241 
7242 static int
filt_sockev(struct knote * kn,long hint)7243 filt_sockev(struct knote *kn, long hint)
7244 {
7245 	int ret = 0, locked = 0;
7246 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7247 	long ev_hint = (hint & SO_FILT_HINT_EV);
7248 
7249 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7250 		socket_lock(so, 1);
7251 		locked = 1;
7252 	}
7253 
7254 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7255 
7256 	if (locked) {
7257 		socket_unlock(so, 1);
7258 	}
7259 
7260 	return ret;
7261 }
7262 
7263 
7264 
7265 /*
7266  *	filt_socktouch - update event state
7267  */
7268 static int
filt_socktouch(struct knote * kn,struct kevent_qos_s * kev)7269 filt_socktouch(
7270 	struct knote *kn,
7271 	struct kevent_qos_s *kev)
7272 {
7273 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7274 	uint32_t changed_flags;
7275 	int ret;
7276 
7277 	socket_lock(so, 1);
7278 
7279 	/* save off the [result] data and fflags */
7280 	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7281 
7282 	/* save off the new input fflags and data */
7283 	kn->kn_sfflags = kev->fflags;
7284 	kn->kn_sdata = kev->data;
7285 
7286 	/* restrict the current results to the (smaller?) set of new interest */
7287 	/*
7288 	 * For compatibility with previous implementations, we leave kn_fflags
7289 	 * as they were before.
7290 	 */
7291 	//kn->kn_fflags &= kev->fflags;
7292 
7293 	/*
7294 	 * Since we keep track of events that are already
7295 	 * delivered, if any of those events are not requested
7296 	 * anymore the state related to them can be reset
7297 	 */
7298 	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7299 
7300 	/* determine if we have events to deliver */
7301 	ret = filt_sockev_common(kn, NULL, so, 0);
7302 
7303 	socket_unlock(so, 1);
7304 
7305 	return ret;
7306 }
7307 
7308 /*
7309  *	filt_sockprocess - query event fired state and return data
7310  */
7311 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7312 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7313 {
7314 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7315 	int ret = 0;
7316 
7317 	socket_lock(so, 1);
7318 
7319 	ret = filt_sockev_common(kn, kev, so, 0);
7320 
7321 	socket_unlock(so, 1);
7322 
7323 	return ret;
7324 }
7325 
7326 void
get_sockev_state(struct socket * so,u_int32_t * statep)7327 get_sockev_state(struct socket *so, u_int32_t *statep)
7328 {
7329 	u_int32_t state = *(statep);
7330 
7331 	/*
7332 	 * If the state variable is already used by a previous event,
7333 	 * reset it.
7334 	 */
7335 	if (state != 0) {
7336 		return;
7337 	}
7338 
7339 	if (so->so_state & SS_ISCONNECTED) {
7340 		state |= SOCKEV_CONNECTED;
7341 	} else {
7342 		state &= ~(SOCKEV_CONNECTED);
7343 	}
7344 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7345 	*(statep) = state;
7346 }
7347 
7348 #define SO_LOCK_HISTORY_STR_LEN \
7349 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7350 
7351 __private_extern__ const char *
solockhistory_nr(struct socket * so)7352 solockhistory_nr(struct socket *so)
7353 {
7354 	size_t n = 0;
7355 	int i;
7356 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7357 
7358 	bzero(lock_history_str, sizeof(lock_history_str));
7359 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7360 		n += scnprintf(lock_history_str + n,
7361 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7362 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7363 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7364 	}
7365 	return lock_history_str;
7366 }
7367 
7368 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7369 socket_getlock(struct socket *so, int flags)
7370 {
7371 	if (so->so_proto->pr_getlock != NULL) {
7372 		return (*so->so_proto->pr_getlock)(so, flags);
7373 	} else {
7374 		return so->so_proto->pr_domain->dom_mtx;
7375 	}
7376 }
7377 
7378 void
socket_lock(struct socket * so,int refcount)7379 socket_lock(struct socket *so, int refcount)
7380 {
7381 	void *lr_saved;
7382 
7383 	lr_saved = __builtin_return_address(0);
7384 
7385 	if (so->so_proto->pr_lock) {
7386 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7387 	} else {
7388 #ifdef MORE_LOCKING_DEBUG
7389 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7390 		    LCK_MTX_ASSERT_NOTOWNED);
7391 #endif
7392 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7393 		if (refcount) {
7394 			so->so_usecount++;
7395 		}
7396 		so->lock_lr[so->next_lock_lr] = lr_saved;
7397 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7398 	}
7399 }
7400 
7401 void
socket_lock_assert_owned(struct socket * so)7402 socket_lock_assert_owned(struct socket *so)
7403 {
7404 	lck_mtx_t *mutex_held;
7405 
7406 	if (so->so_proto->pr_getlock != NULL) {
7407 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7408 	} else {
7409 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7410 	}
7411 
7412 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7413 }
7414 
7415 int
socket_try_lock(struct socket * so)7416 socket_try_lock(struct socket *so)
7417 {
7418 	lck_mtx_t *mtx;
7419 
7420 	if (so->so_proto->pr_getlock != NULL) {
7421 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7422 	} else {
7423 		mtx = so->so_proto->pr_domain->dom_mtx;
7424 	}
7425 
7426 	return lck_mtx_try_lock(mtx);
7427 }
7428 
7429 void
socket_unlock(struct socket * so,int refcount)7430 socket_unlock(struct socket *so, int refcount)
7431 {
7432 	void *lr_saved;
7433 	lck_mtx_t *mutex_held;
7434 
7435 	lr_saved = __builtin_return_address(0);
7436 
7437 	if (so == NULL || so->so_proto == NULL) {
7438 		panic("%s: null so_proto so=%p", __func__, so);
7439 		/* NOTREACHED */
7440 	}
7441 
7442 	if (so->so_proto->pr_unlock) {
7443 		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7444 	} else {
7445 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7446 #ifdef MORE_LOCKING_DEBUG
7447 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7448 #endif
7449 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
7450 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7451 
7452 		if (refcount) {
7453 			if (so->so_usecount <= 0) {
7454 				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7455 				    "lrh=%s", __func__, so->so_usecount, so,
7456 				    SOCK_DOM(so), so->so_type,
7457 				    SOCK_PROTO(so), solockhistory_nr(so));
7458 				/* NOTREACHED */
7459 			}
7460 
7461 			so->so_usecount--;
7462 			if (so->so_usecount == 0) {
7463 				sofreelastref(so, 1);
7464 			}
7465 		}
7466 		lck_mtx_unlock(mutex_held);
7467 	}
7468 }
7469 
7470 /* Called with socket locked, will unlock socket */
7471 void
sofree(struct socket * so)7472 sofree(struct socket *so)
7473 {
7474 	lck_mtx_t *mutex_held;
7475 
7476 	if (so->so_proto->pr_getlock != NULL) {
7477 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7478 	} else {
7479 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7480 	}
7481 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7482 
7483 	sofreelastref(so, 0);
7484 }
7485 
7486 void
soreference(struct socket * so)7487 soreference(struct socket *so)
7488 {
7489 	socket_lock(so, 1);     /* locks & take one reference on socket */
7490 	socket_unlock(so, 0);   /* unlock only */
7491 }
7492 
7493 void
sodereference(struct socket * so)7494 sodereference(struct socket *so)
7495 {
7496 	socket_lock(so, 0);
7497 	socket_unlock(so, 1);
7498 }
7499 
7500 /*
7501  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7502  * possibility of using jumbo clusters.  Caller must ensure to hold
7503  * the socket lock.
7504  */
7505 void
somultipages(struct socket * so,boolean_t set)7506 somultipages(struct socket *so, boolean_t set)
7507 {
7508 	if (set) {
7509 		so->so_flags |= SOF_MULTIPAGES;
7510 	} else {
7511 		so->so_flags &= ~SOF_MULTIPAGES;
7512 	}
7513 }
7514 
7515 void
soif2kcl(struct socket * so,boolean_t set)7516 soif2kcl(struct socket *so, boolean_t set)
7517 {
7518 	if (set) {
7519 		so->so_flags1 |= SOF1_IF_2KCL;
7520 	} else {
7521 		so->so_flags1 &= ~SOF1_IF_2KCL;
7522 	}
7523 }
7524 
7525 int
so_isdstlocal(struct socket * so)7526 so_isdstlocal(struct socket *so)
7527 {
7528 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7529 
7530 	if (SOCK_DOM(so) == PF_INET) {
7531 		return inaddr_local(inp->inp_faddr);
7532 	} else if (SOCK_DOM(so) == PF_INET6) {
7533 		return in6addr_local(&inp->in6p_faddr);
7534 	}
7535 
7536 	return 0;
7537 }
7538 
7539 int
sosetdefunct(struct proc * p,struct socket * so,int level,boolean_t noforce)7540 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7541 {
7542 	struct sockbuf *rcv, *snd;
7543 	int err = 0, defunct;
7544 
7545 	rcv = &so->so_rcv;
7546 	snd = &so->so_snd;
7547 
7548 	defunct = (so->so_flags & SOF_DEFUNCT);
7549 	if (defunct) {
7550 		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7551 			panic("%s: SB_DROP not set", __func__);
7552 			/* NOTREACHED */
7553 		}
7554 		goto done;
7555 	}
7556 
7557 	if (so->so_flags & SOF_NODEFUNCT) {
7558 		if (noforce) {
7559 			err = EOPNOTSUPP;
7560 			if (p != PROC_NULL) {
7561 				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7562 				    "name %s level %d) so 0x%llx [%d,%d] "
7563 				    "is not eligible for defunct "
7564 				    "(%d)\n", __func__, proc_selfpid(),
7565 				    proc_best_name(current_proc()), proc_pid(p),
7566 				    proc_best_name(p), level,
7567 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7568 				    SOCK_DOM(so), SOCK_TYPE(so), err);
7569 			}
7570 			return err;
7571 		}
7572 		so->so_flags &= ~SOF_NODEFUNCT;
7573 		if (p != PROC_NULL) {
7574 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7575 			    "name %s level %d) so 0x%llx [%d,%d] "
7576 			    "defunct by force "
7577 			    "(%d)\n", __func__, proc_selfpid(),
7578 			    proc_best_name(current_proc()), proc_pid(p),
7579 			    proc_best_name(p), level,
7580 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7581 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7582 		}
7583 	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7584 		struct inpcb *inp = (struct inpcb *)so->so_pcb;
7585 		struct ifnet *ifp = inp->inp_last_outifp;
7586 
7587 		if (ifp && IFNET_IS_CELLULAR(ifp)) {
7588 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7589 		} else if (so->so_flags & SOF_DELEGATED) {
7590 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7591 		} else if (soextbkidlestat.so_xbkidle_time == 0) {
7592 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7593 		} else if (noforce && p != PROC_NULL) {
7594 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7595 
7596 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7597 			so->so_extended_bk_start = net_uptime();
7598 			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7599 
7600 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7601 
7602 			err = EOPNOTSUPP;
7603 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7604 			    "name %s level %d) so 0x%llx [%d,%d] "
7605 			    "extend bk idle "
7606 			    "(%d)\n", __func__, proc_selfpid(),
7607 			    proc_best_name(current_proc()), proc_pid(p),
7608 			    proc_best_name(p), level,
7609 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7610 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7611 			return err;
7612 		} else {
7613 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7614 		}
7615 	}
7616 
7617 	so->so_flags |= SOF_DEFUNCT;
7618 
7619 	/* Prevent further data from being appended to the socket buffers */
7620 	snd->sb_flags |= SB_DROP;
7621 	rcv->sb_flags |= SB_DROP;
7622 
7623 	/* Flush any existing data in the socket buffers */
7624 	if (rcv->sb_cc != 0) {
7625 		rcv->sb_flags &= ~SB_SEL;
7626 		selthreadclear(&rcv->sb_sel);
7627 		sbrelease(rcv);
7628 	}
7629 	if (snd->sb_cc != 0) {
7630 		snd->sb_flags &= ~SB_SEL;
7631 		selthreadclear(&snd->sb_sel);
7632 		sbrelease(snd);
7633 	}
7634 
7635 done:
7636 	if (p != PROC_NULL) {
7637 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7638 		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7639 		    proc_selfpid(), proc_best_name(current_proc()),
7640 		    proc_pid(p), proc_best_name(p), level,
7641 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7642 		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
7643 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7644 		    " extbkidle" : "");
7645 	}
7646 	return err;
7647 }
7648 
7649 int
sodefunct(struct proc * p,struct socket * so,int level)7650 sodefunct(struct proc *p, struct socket *so, int level)
7651 {
7652 	struct sockbuf *rcv, *snd;
7653 
7654 	if (!(so->so_flags & SOF_DEFUNCT)) {
7655 		panic("%s improperly called", __func__);
7656 		/* NOTREACHED */
7657 	}
7658 	if (so->so_state & SS_DEFUNCT) {
7659 		goto done;
7660 	}
7661 
7662 	rcv = &so->so_rcv;
7663 	snd = &so->so_snd;
7664 
7665 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7666 		char s[MAX_IPv6_STR_LEN];
7667 		char d[MAX_IPv6_STR_LEN];
7668 		struct inpcb *inp = sotoinpcb(so);
7669 
7670 		if (p != PROC_NULL) {
7671 			SODEFUNCTLOG(
7672 				"%s[%d, %s]: (target pid %d name %s level %d) "
7673 				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7674 				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7675 				" snd_fl 0x%x]\n", __func__,
7676 				proc_selfpid(), proc_best_name(current_proc()),
7677 				proc_pid(p), proc_best_name(p), level,
7678 				(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7679 				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7680 				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7681 				(void *)&inp->inp_laddr.s_addr :
7682 				(void *)&inp->in6p_laddr),
7683 				s, sizeof(s)), ntohs(inp->in6p_lport),
7684 				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7685 				(void *)&inp->inp_faddr.s_addr :
7686 				(void *)&inp->in6p_faddr,
7687 				d, sizeof(d)), ntohs(inp->in6p_fport),
7688 				(uint32_t)rcv->sb_sel.si_flags,
7689 				(uint32_t)snd->sb_sel.si_flags,
7690 				rcv->sb_flags, snd->sb_flags);
7691 		}
7692 	} else if (p != PROC_NULL) {
7693 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7694 		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7695 		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7696 		    proc_selfpid(), proc_best_name(current_proc()),
7697 		    proc_pid(p), proc_best_name(p), level,
7698 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7699 		    SOCK_DOM(so), SOCK_TYPE(so),
7700 		    (uint32_t)rcv->sb_sel.si_flags,
7701 		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7702 		    snd->sb_flags);
7703 	}
7704 
7705 	/*
7706 	 * Unwedge threads blocked on sbwait() and sb_lock().
7707 	 */
7708 	sbwakeup(rcv);
7709 	sbwakeup(snd);
7710 
7711 	so->so_flags1 |= SOF1_DEFUNCTINPROG;
7712 	if (rcv->sb_flags & SB_LOCK) {
7713 		sbunlock(rcv, TRUE);    /* keep socket locked */
7714 	}
7715 	if (snd->sb_flags & SB_LOCK) {
7716 		sbunlock(snd, TRUE);    /* keep socket locked */
7717 	}
7718 	/*
7719 	 * Flush the buffers and disconnect.  We explicitly call shutdown
7720 	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7721 	 * states are set for the socket.  This would also flush out data
7722 	 * hanging off the receive list of this socket.
7723 	 */
7724 	(void) soshutdownlock_final(so, SHUT_RD);
7725 	(void) soshutdownlock_final(so, SHUT_WR);
7726 	(void) sodisconnectlocked(so);
7727 
7728 	/*
7729 	 * Explicitly handle connectionless-protocol disconnection
7730 	 * and release any remaining data in the socket buffers.
7731 	 */
7732 	if (!(so->so_state & SS_ISDISCONNECTED)) {
7733 		(void) soisdisconnected(so);
7734 	}
7735 
7736 	if (so->so_error == 0) {
7737 		so->so_error = EBADF;
7738 	}
7739 
7740 	if (rcv->sb_cc != 0) {
7741 		rcv->sb_flags &= ~SB_SEL;
7742 		selthreadclear(&rcv->sb_sel);
7743 		sbrelease(rcv);
7744 	}
7745 	if (snd->sb_cc != 0) {
7746 		snd->sb_flags &= ~SB_SEL;
7747 		selthreadclear(&snd->sb_sel);
7748 		sbrelease(snd);
7749 	}
7750 	so->so_state |= SS_DEFUNCT;
7751 	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7752 
7753 done:
7754 	return 0;
7755 }
7756 
7757 int
soresume(struct proc * p,struct socket * so,int locked)7758 soresume(struct proc *p, struct socket *so, int locked)
7759 {
7760 	if (locked == 0) {
7761 		socket_lock(so, 1);
7762 	}
7763 
7764 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7765 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7766 		    "[%d,%d] resumed from bk idle\n",
7767 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7768 		    proc_pid(p), proc_best_name(p),
7769 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7770 		    SOCK_DOM(so), SOCK_TYPE(so));
7771 
7772 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7773 		so->so_extended_bk_start = 0;
7774 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7775 
7776 		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7777 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7778 		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7779 	}
7780 	if (locked == 0) {
7781 		socket_unlock(so, 1);
7782 	}
7783 
7784 	return 0;
7785 }
7786 
7787 /*
7788  * Does not attempt to account for sockets that are delegated from
7789  * the current process
7790  */
7791 int
so_set_extended_bk_idle(struct socket * so,int optval)7792 so_set_extended_bk_idle(struct socket *so, int optval)
7793 {
7794 	int error = 0;
7795 
7796 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7797 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7798 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7799 		error = EOPNOTSUPP;
7800 	} else if (optval == 0) {
7801 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7802 
7803 		soresume(current_proc(), so, 1);
7804 	} else {
7805 		struct proc *p = current_proc();
7806 		struct fileproc *fp;
7807 		int count = 0;
7808 
7809 		/*
7810 		 * Unlock socket to avoid lock ordering issue with
7811 		 * the proc fd table lock
7812 		 */
7813 		socket_unlock(so, 0);
7814 
7815 		proc_fdlock(p);
7816 		fdt_foreach(fp, p) {
7817 			struct socket *so2;
7818 
7819 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7820 				continue;
7821 			}
7822 
7823 			so2 = (struct socket *)fp_get_data(fp);
7824 			if (so != so2 &&
7825 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7826 				count++;
7827 			}
7828 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7829 				break;
7830 			}
7831 		}
7832 		proc_fdunlock(p);
7833 
7834 		socket_lock(so, 0);
7835 
7836 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7837 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7838 			error = EBUSY;
7839 		} else if (so->so_flags & SOF_DELEGATED) {
7840 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7841 			error = EBUSY;
7842 		} else {
7843 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7844 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7845 		}
7846 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7847 		    "%s marked for extended bk idle\n",
7848 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7849 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7850 		    SOCK_DOM(so), SOCK_TYPE(so),
7851 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7852 		    "is" : "not");
7853 	}
7854 
7855 	return error;
7856 }
7857 
7858 static void
so_stop_extended_bk_idle(struct socket * so)7859 so_stop_extended_bk_idle(struct socket *so)
7860 {
7861 	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7862 	so->so_extended_bk_start = 0;
7863 
7864 	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7865 	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7866 	/*
7867 	 * Force defunct
7868 	 */
7869 	sosetdefunct(current_proc(), so,
7870 	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7871 	if (so->so_flags & SOF_DEFUNCT) {
7872 		sodefunct(current_proc(), so,
7873 		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7874 	}
7875 }
7876 
7877 void
so_drain_extended_bk_idle(struct socket * so)7878 so_drain_extended_bk_idle(struct socket *so)
7879 {
7880 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7881 		/*
7882 		 * Only penalize sockets that have outstanding data
7883 		 */
7884 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7885 			so_stop_extended_bk_idle(so);
7886 
7887 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7888 		}
7889 	}
7890 }
7891 
7892 /*
7893  * Return values tells if socket is still in extended background idle
7894  */
7895 int
so_check_extended_bk_idle_time(struct socket * so)7896 so_check_extended_bk_idle_time(struct socket *so)
7897 {
7898 	int ret = 1;
7899 
7900 	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7901 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7902 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7903 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7904 		    SOCK_DOM(so), SOCK_TYPE(so));
7905 		if (net_uptime() - so->so_extended_bk_start >
7906 		    soextbkidlestat.so_xbkidle_time) {
7907 			so_stop_extended_bk_idle(so);
7908 
7909 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7910 
7911 			ret = 0;
7912 		} else {
7913 			struct inpcb *inp = (struct inpcb *)so->so_pcb;
7914 
7915 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7916 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7917 		}
7918 	}
7919 
7920 	return ret;
7921 }
7922 
7923 void
resume_proc_sockets(proc_t p)7924 resume_proc_sockets(proc_t p)
7925 {
7926 	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7927 		struct fileproc *fp;
7928 		struct socket *so;
7929 
7930 		proc_fdlock(p);
7931 		fdt_foreach(fp, p) {
7932 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7933 				continue;
7934 			}
7935 
7936 			so = (struct socket *)fp_get_data(fp);
7937 			(void) soresume(p, so, 0);
7938 		}
7939 		proc_fdunlock(p);
7940 
7941 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7942 	}
7943 }
7944 
7945 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7946 so_set_recv_anyif(struct socket *so, int optval)
7947 {
7948 	int ret = 0;
7949 
7950 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7951 		if (optval) {
7952 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7953 		} else {
7954 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7955 		}
7956 #if SKYWALK
7957 		inp_update_netns_flags(so);
7958 #endif /* SKYWALK */
7959 	}
7960 
7961 
7962 	return ret;
7963 }
7964 
7965 __private_extern__ int
so_get_recv_anyif(struct socket * so)7966 so_get_recv_anyif(struct socket *so)
7967 {
7968 	int ret = 0;
7969 
7970 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7971 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7972 	}
7973 
7974 	return ret;
7975 }
7976 
7977 int
so_set_restrictions(struct socket * so,uint32_t vals)7978 so_set_restrictions(struct socket *so, uint32_t vals)
7979 {
7980 	int nocell_old, nocell_new;
7981 	int noexpensive_old, noexpensive_new;
7982 	int noconstrained_old, noconstrained_new;
7983 
7984 	/*
7985 	 * Deny-type restrictions are trapdoors; once set they cannot be
7986 	 * unset for the lifetime of the socket.  This allows them to be
7987 	 * issued by a framework on behalf of the application without
7988 	 * having to worry that they can be undone.
7989 	 *
7990 	 * Note here that socket-level restrictions overrides any protocol
7991 	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7992 	 * socket restriction issued on the socket has a higher precendence
7993 	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7994 	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7995 	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7996 	 */
7997 	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7998 	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7999 	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
8000 	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
8001 	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
8002 	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
8003 	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
8004 	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
8005 	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
8006 
8007 	/* we can only set, not clear restrictions */
8008 	if ((nocell_new - nocell_old) == 0 &&
8009 	    (noexpensive_new - noexpensive_old) == 0 &&
8010 	    (noconstrained_new - noconstrained_old) == 0) {
8011 		return 0;
8012 	}
8013 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
8014 		if (nocell_new - nocell_old != 0) {
8015 			/*
8016 			 * if deny cellular is now set, do what's needed
8017 			 * for INPCB
8018 			 */
8019 			inp_set_nocellular(sotoinpcb(so));
8020 		}
8021 		if (noexpensive_new - noexpensive_old != 0) {
8022 			inp_set_noexpensive(sotoinpcb(so));
8023 		}
8024 		if (noconstrained_new - noconstrained_old != 0) {
8025 			inp_set_noconstrained(sotoinpcb(so));
8026 		}
8027 	}
8028 
8029 	if (SOCK_DOM(so) == PF_MULTIPATH) {
8030 		mptcp_set_restrictions(so);
8031 	}
8032 
8033 	return 0;
8034 }
8035 
8036 uint32_t
so_get_restrictions(struct socket * so)8037 so_get_restrictions(struct socket *so)
8038 {
8039 	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
8040 	       SO_RESTRICT_DENY_OUT |
8041 	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
8042 }
8043 
8044 int
so_set_effective_pid(struct socket * so,int epid,struct proc * p,boolean_t check_cred)8045 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
8046 {
8047 	struct proc *ep = PROC_NULL;
8048 	int error = 0;
8049 
8050 	/* pid 0 is reserved for kernel */
8051 	if (epid == 0) {
8052 		error = EINVAL;
8053 		goto done;
8054 	}
8055 
8056 	/*
8057 	 * If this is an in-kernel socket, prevent its delegate
8058 	 * association from changing unless the socket option is
8059 	 * coming from within the kernel itself.
8060 	 */
8061 	if (so->last_pid == 0 && p != kernproc) {
8062 		error = EACCES;
8063 		goto done;
8064 	}
8065 
8066 	/*
8067 	 * If this is issued by a process that's recorded as the
8068 	 * real owner of the socket, or if the pid is the same as
8069 	 * the process's own pid, then proceed.  Otherwise ensure
8070 	 * that the issuing process has the necessary privileges.
8071 	 */
8072 	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
8073 		if ((error = priv_check_cred(kauth_cred_get(),
8074 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
8075 			error = EACCES;
8076 			goto done;
8077 		}
8078 	}
8079 
8080 	/* Find the process that corresponds to the effective pid */
8081 	if ((ep = proc_find(epid)) == PROC_NULL) {
8082 		error = ESRCH;
8083 		goto done;
8084 	}
8085 
8086 	/*
8087 	 * If a process tries to delegate the socket to itself, then
8088 	 * there's really nothing to do; treat it as a way for the
8089 	 * delegate association to be cleared.  Note that we check
8090 	 * the passed-in proc rather than calling proc_selfpid(),
8091 	 * as we need to check the process issuing the socket option
8092 	 * which could be kernproc.  Given that we don't allow 0 for
8093 	 * effective pid, it means that a delegated in-kernel socket
8094 	 * stays delegated during its lifetime (which is probably OK.)
8095 	 */
8096 	if (epid == proc_pid(p)) {
8097 		so->so_flags &= ~SOF_DELEGATED;
8098 		so->e_upid = 0;
8099 		so->e_pid = 0;
8100 		uuid_clear(so->e_uuid);
8101 	} else {
8102 		so->so_flags |= SOF_DELEGATED;
8103 		so->e_upid = proc_uniqueid(ep);
8104 		so->e_pid = proc_pid(ep);
8105 		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
8106 
8107 #if defined(XNU_TARGET_OS_OSX)
8108 		if (ep->p_responsible_pid != so->e_pid) {
8109 			proc_t rp = proc_find(ep->p_responsible_pid);
8110 			if (rp != PROC_NULL) {
8111 				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
8112 				so->so_rpid = ep->p_responsible_pid;
8113 				proc_rele(rp);
8114 			} else {
8115 				uuid_clear(so->so_ruuid);
8116 				so->so_rpid = -1;
8117 			}
8118 		}
8119 #endif
8120 	}
8121 	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8122 		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
8123 	}
8124 done:
8125 	if (error == 0 && net_io_policy_log) {
8126 		uuid_string_t buf;
8127 
8128 		uuid_unparse(so->e_uuid, buf);
8129 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8130 		    "euuid %s%s\n", __func__, proc_name_address(p),
8131 		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8132 		    SOCK_DOM(so), SOCK_TYPE(so),
8133 		    so->e_pid, proc_name_address(ep), buf,
8134 		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8135 	} else if (error != 0 && net_io_policy_log) {
8136 		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8137 		    "ERROR (%d)\n", __func__, proc_name_address(p),
8138 		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8139 		    SOCK_DOM(so), SOCK_TYPE(so),
8140 		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
8141 		    proc_name_address(ep), error);
8142 	}
8143 
8144 	/* Update this socket's policy upon success */
8145 	if (error == 0) {
8146 		so->so_policy_gencnt *= -1;
8147 		so_update_policy(so);
8148 #if NECP
8149 		so_update_necp_policy(so, NULL, NULL);
8150 #endif /* NECP */
8151 	}
8152 
8153 	if (ep != PROC_NULL) {
8154 		proc_rele(ep);
8155 	}
8156 
8157 	return error;
8158 }
8159 
8160 int
so_set_effective_uuid(struct socket * so,uuid_t euuid,struct proc * p,boolean_t check_cred)8161 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
8162 {
8163 	uuid_string_t buf;
8164 	uuid_t uuid;
8165 	int error = 0;
8166 
8167 	/* UUID must not be all-zeroes (reserved for kernel) */
8168 	if (uuid_is_null(euuid)) {
8169 		error = EINVAL;
8170 		goto done;
8171 	}
8172 
8173 	/*
8174 	 * If this is an in-kernel socket, prevent its delegate
8175 	 * association from changing unless the socket option is
8176 	 * coming from within the kernel itself.
8177 	 */
8178 	if (so->last_pid == 0 && p != kernproc) {
8179 		error = EACCES;
8180 		goto done;
8181 	}
8182 
8183 	/* Get the UUID of the issuing process */
8184 	proc_getexecutableuuid(p, uuid, sizeof(uuid));
8185 
8186 	/*
8187 	 * If this is issued by a process that's recorded as the
8188 	 * real owner of the socket, or if the uuid is the same as
8189 	 * the process's own uuid, then proceed.  Otherwise ensure
8190 	 * that the issuing process has the necessary privileges.
8191 	 */
8192 	if (check_cred &&
8193 	    (uuid_compare(euuid, so->last_uuid) != 0 ||
8194 	    uuid_compare(euuid, uuid) != 0)) {
8195 		if ((error = priv_check_cred(kauth_cred_get(),
8196 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
8197 			error = EACCES;
8198 			goto done;
8199 		}
8200 	}
8201 
8202 	/*
8203 	 * If a process tries to delegate the socket to itself, then
8204 	 * there's really nothing to do; treat it as a way for the
8205 	 * delegate association to be cleared.  Note that we check
8206 	 * the uuid of the passed-in proc rather than that of the
8207 	 * current process, as we need to check the process issuing
8208 	 * the socket option which could be kernproc itself.  Given
8209 	 * that we don't allow 0 for effective uuid, it means that
8210 	 * a delegated in-kernel socket stays delegated during its
8211 	 * lifetime (which is okay.)
8212 	 */
8213 	if (uuid_compare(euuid, uuid) == 0) {
8214 		so->so_flags &= ~SOF_DELEGATED;
8215 		so->e_upid = 0;
8216 		so->e_pid = 0;
8217 		uuid_clear(so->e_uuid);
8218 	} else {
8219 		so->so_flags |= SOF_DELEGATED;
8220 		/*
8221 		 * Unlike so_set_effective_pid(), we only have the UUID
8222 		 * here and the process ID is not known.  Inherit the
8223 		 * real {pid,upid} of the socket.
8224 		 */
8225 		so->e_upid = so->last_upid;
8226 		so->e_pid = so->last_pid;
8227 		uuid_copy(so->e_uuid, euuid);
8228 	}
8229 	/*
8230 	 * The following will clear the effective process name as it's the same
8231 	 * as the real process
8232 	 */
8233 	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8234 		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
8235 	}
8236 done:
8237 	if (error == 0 && net_io_policy_log) {
8238 		uuid_unparse(so->e_uuid, buf);
8239 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
8240 		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
8241 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8242 		    SOCK_TYPE(so), so->e_pid, buf,
8243 		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8244 	} else if (error != 0 && net_io_policy_log) {
8245 		uuid_unparse(euuid, buf);
8246 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
8247 		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8248 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8249 		    SOCK_TYPE(so), buf, error);
8250 	}
8251 
8252 	/* Update this socket's policy upon success */
8253 	if (error == 0) {
8254 		so->so_policy_gencnt *= -1;
8255 		so_update_policy(so);
8256 #if NECP
8257 		so_update_necp_policy(so, NULL, NULL);
8258 #endif /* NECP */
8259 	}
8260 
8261 	return error;
8262 }
8263 
8264 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8265 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8266     uint32_t ev_datalen)
8267 {
8268 	struct kev_msg ev_msg;
8269 
8270 	/*
8271 	 * A netpolicy event always starts with a netpolicy_event_data
8272 	 * structure, but the caller can provide for a longer event
8273 	 * structure to post, depending on the event code.
8274 	 */
8275 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8276 
8277 	bzero(&ev_msg, sizeof(ev_msg));
8278 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8279 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8280 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8281 	ev_msg.event_code       = ev_code;
8282 
8283 	ev_msg.dv[0].data_ptr   = ev_data;
8284 	ev_msg.dv[0].data_length = ev_datalen;
8285 
8286 	kev_post_msg(&ev_msg);
8287 }
8288 
8289 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8290 socket_post_kev_msg(uint32_t ev_code,
8291     struct kev_socket_event_data *ev_data,
8292     uint32_t ev_datalen)
8293 {
8294 	struct kev_msg ev_msg;
8295 
8296 	bzero(&ev_msg, sizeof(ev_msg));
8297 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8298 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8299 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8300 	ev_msg.event_code = ev_code;
8301 
8302 	ev_msg.dv[0].data_ptr = ev_data;
8303 	ev_msg.dv[0].data_length = ev_datalen;
8304 
8305 	kev_post_msg(&ev_msg);
8306 }
8307 
8308 void
socket_post_kev_msg_closed(struct socket * so)8309 socket_post_kev_msg_closed(struct socket *so)
8310 {
8311 	struct kev_socket_closed ev = {};
8312 	struct sockaddr *socksa = NULL, *peersa = NULL;
8313 	int err;
8314 
8315 	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8316 		return;
8317 	}
8318 	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8319 	if (err == 0) {
8320 		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8321 		    &peersa);
8322 		if (err == 0) {
8323 			memcpy(&ev.ev_data.kev_sockname, socksa,
8324 			    min(socksa->sa_len,
8325 			    sizeof(ev.ev_data.kev_sockname)));
8326 			memcpy(&ev.ev_data.kev_peername, peersa,
8327 			    min(peersa->sa_len,
8328 			    sizeof(ev.ev_data.kev_peername)));
8329 			socket_post_kev_msg(KEV_SOCKET_CLOSED,
8330 			    &ev.ev_data, sizeof(ev));
8331 		}
8332 	}
8333 	free_sockaddr(socksa);
8334 	free_sockaddr(peersa);
8335 }
8336