xref: /xnu-8020.140.41/bsd/kern/uipc_socket.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120 
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125 
126 #include <os/log.h>
127 
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif /* MAC */
131 
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136 
/* Round `a' up to the next multiple of `b' (b must be a power of two). */
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

/* On DEBUG/DEVELOPMENT kernels log raw pointers; otherwise scramble them. */
#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

/*
 * Socket-layer cache of freed PF_INET/SOCK_STREAM sockets (see
 * cached_sock_alloc()/cached_sock_free()).  All of this state is
 * protected by so_cache_mtx, declared below.
 */
static u_int32_t        so_cache_hw;    /* High water mark for socache */
static u_int32_t        so_cache_timeouts;      /* number of timeouts */
static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
static u_int32_t        cached_sock_count = 0;
STAILQ_HEAD(, socket)   so_cache_head;
int     max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t        so_cache_time;  /* last net_uptime() sample */
static int              socketinit_done;        /* guards double socketinit() */
static struct zone      *so_cache_zone; /* backing zone for cached sockets */

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

#include <machine/limits.h>

/* Forward declarations for the socket read (EVFILT_READ) knote filter. */
static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Forward declarations for the socket write (EVFILT_WRITE) knote filter. */
static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Forward declarations for the socket event (EVFILT_SOCK) knote filter. */
static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);
static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Helpers for copying struct timeval socket options in/out of user space. */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183 
/* EVFILT_READ filter operations for sockets. */
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

/* EVFILT_WRITE filter operations for sockets. */
SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

/* EVFILT_SOCK filter operations for socket state events. */
SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/*
 * EVFILT_EXCEPT filter operations; shares the read-filter callbacks,
 * which distinguish the two cases by the knote's filter id.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
219 
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

/* Verbose socket-layer debugging; settable via sysctl or boot-arg. */
int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

/* Counter of sodefunct() invocations, exported read-mostly via sysctl. */
static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

/* Zone backing all non-cached struct socket allocations. */
ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t        so_gencnt;      /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* kdebug trace codes for the socket layer. */
#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

/* Tunable backlog limit used when a listen() backlog is clamped. */
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy  = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger that 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface
 * Set this to 1  for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

/* Log socket defunct events (see SODEFUNCTLOG users below). */
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

/* Log socket throttling events. */
int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

/* Element size of so_cache_zone; computed in socketinit(). */
vm_size_t       so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

/* Extended background idle statistics/tunables; initialized in socketinit(). */
struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
380 
/*
 * One-time socket layer initialization: validates user/kernel structure
 * layout at compile time, creates the socket cache zone, and seeds the
 * extended-background-idle tunables before initializing the inpcb layer.
 * Safe to call more than once; subsequent calls are no-ops.
 */
void
socketinit(void)
{
	/* so_gencnt must be a 64-bit counter, at least 32-bit aligned. */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	/*
	 * struct sa_endpoints must be layout-identical to the user-visible
	 * variant for the current ABI, so it can be copied in/out directly.
	 */
#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	STAILQ_INIT(&so_cache_head);

	/*
	 * Cached socket elements carry the socket plus inpcb and tcpcb
	 * storage in one allocation; the extra 4-byte pads leave room
	 * for the longword alignment done in cached_sock_alloc().
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	/* Seed the extended-background-idle tunables with their defaults. */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
427 
428 static void
cached_sock_alloc(struct socket ** so,zalloc_flags_t how)429 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
430 {
431 	caddr_t temp;
432 	uintptr_t offset;
433 
434 	lck_mtx_lock(&so_cache_mtx);
435 
436 	if (!STAILQ_EMPTY(&so_cache_head)) {
437 		VERIFY(cached_sock_count > 0);
438 
439 		*so = STAILQ_FIRST(&so_cache_head);
440 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
441 		STAILQ_NEXT((*so), so_cache_ent) = NULL;
442 
443 		cached_sock_count--;
444 		lck_mtx_unlock(&so_cache_mtx);
445 
446 		temp = (*so)->so_saved_pcb;
447 		bzero((caddr_t)*so, sizeof(struct socket));
448 
449 		(*so)->so_saved_pcb = temp;
450 	} else {
451 		lck_mtx_unlock(&so_cache_mtx);
452 
453 		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);
454 
455 		/*
456 		 * Define offsets for extra structures into our
457 		 * single block of memory. Align extra structures
458 		 * on longword boundaries.
459 		 */
460 
461 		offset = (uintptr_t)*so;
462 		offset += sizeof(struct socket);
463 
464 		offset = ALIGN(offset);
465 
466 		(*so)->so_saved_pcb = (caddr_t)offset;
467 		offset += get_inpcb_str_size();
468 
469 		offset = ALIGN(offset);
470 
471 		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
472 		    (caddr_t)offset;
473 	}
474 
475 	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
476 }
477 
478 static void
cached_sock_free(struct socket * so)479 cached_sock_free(struct socket *so)
480 {
481 	lck_mtx_lock(&so_cache_mtx);
482 
483 	so_cache_time = net_uptime();
484 	if (++cached_sock_count > max_cached_sock_count) {
485 		--cached_sock_count;
486 		lck_mtx_unlock(&so_cache_mtx);
487 		zfree(so_cache_zone, so);
488 	} else {
489 		if (so_cache_hw < cached_sock_count) {
490 			so_cache_hw = cached_sock_count;
491 		}
492 
493 		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
494 
495 		so->cache_timestamp = so_cache_time;
496 		lck_mtx_unlock(&so_cache_mtx);
497 	}
498 }
499 
500 void
so_update_last_owner_locked(struct socket * so,proc_t self)501 so_update_last_owner_locked(struct socket *so, proc_t self)
502 {
503 	if (so->last_pid != 0) {
504 		/*
505 		 * last_pid and last_upid should remain zero for sockets
506 		 * created using sock_socket. The check above achieves that
507 		 */
508 		if (self == PROC_NULL) {
509 			self = current_proc();
510 		}
511 
512 		if (so->last_upid != proc_uniqueid(self) ||
513 		    so->last_pid != proc_pid(self)) {
514 			so->last_upid = proc_uniqueid(self);
515 			so->last_pid = proc_pid(self);
516 			proc_getexecutableuuid(self, so->last_uuid,
517 			    sizeof(so->last_uuid));
518 			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
519 				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
520 			}
521 		}
522 		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
523 	}
524 }
525 
526 void
so_update_policy(struct socket * so)527 so_update_policy(struct socket *so)
528 {
529 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
530 		(void) inp_update_policy(sotoinpcb(so));
531 	}
532 }
533 
#if NECP
/*
 * Re-evaluate the NECP policy for an IPv4/IPv6 socket, optionally
 * overriding the local and/or remote address used for matching.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	int family = SOCK_DOM(so);

	if (family != PF_INET && family != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
545 
546 boolean_t
so_cache_timer(void)547 so_cache_timer(void)
548 {
549 	struct socket   *p;
550 	int             n_freed = 0;
551 	boolean_t rc = FALSE;
552 
553 	lck_mtx_lock(&so_cache_mtx);
554 	so_cache_timeouts++;
555 	so_cache_time = net_uptime();
556 
557 	while (!STAILQ_EMPTY(&so_cache_head)) {
558 		VERIFY(cached_sock_count > 0);
559 		p = STAILQ_FIRST(&so_cache_head);
560 		if ((so_cache_time - p->cache_timestamp) <
561 		    SO_CACHE_TIME_LIMIT) {
562 			break;
563 		}
564 
565 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
566 		--cached_sock_count;
567 
568 		zfree(so_cache_zone, p);
569 
570 		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
571 			so_cache_max_freed++;
572 			break;
573 		}
574 	}
575 
576 	/* Schedule again if there is more to cleanup */
577 	if (!STAILQ_EMPTY(&so_cache_head)) {
578 		rc = TRUE;
579 	}
580 
581 	lck_mtx_unlock(&so_cache_mtx);
582 	return rc;
583 }
584 
585 /*
586  * Get a socket structure from our zone, and initialize it.
587  * We don't implement `waitok' yet (see comments in uipc_domain.c).
588  * Note that it would probably be better to allocate socket
589  * and PCB at the same time, but I'm not convinced that all
590  * the protocols can be easily modified to do this.
591  */
592 struct socket *
soalloc(int waitok,int dom,int type)593 soalloc(int waitok, int dom, int type)
594 {
595 	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
596 	struct socket *so;
597 
598 	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
599 		cached_sock_alloc(&so, how);
600 	} else {
601 		so = zalloc_flags(socket_zone, how | Z_ZERO);
602 	}
603 	if (so != NULL) {
604 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
605 
606 		/*
607 		 * Increment the socket allocation statistics
608 		 */
609 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
610 	}
611 
612 	return so;
613 }
614 
/*
 * Common socket creation path used by socreate() and socreate_delegate().
 *
 * Looks up the protocol switch entry for (dom, type, proto), allocates
 * the socket, records the creating (and optionally delegated) process
 * identity, and calls the protocol's pru_attach to create the pcb.
 *
 * Parameters:
 *	dom	protocol family (PF_*)
 *	aso	out parameter; receives the new socket on success
 *	type	socket type (SOCK_STREAM, SOCK_DGRAM, ...)
 *	proto	protocol number, or 0 to select by type alone
 *	p	creating process
 *	flags	SOCF_* creation flags (e.g. SOCF_MPTCP)
 *	ep	delegated ("effective") process, or PROC_NULL
 *
 * Returns 0 on success, or EAFNOSUPPORT / EPROTOTYPE / EPROTONOSUPPORT /
 * ENOBUFS, or whatever error pru_attach returns.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;        /* responsible pid, if distinct from owner */
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	/* Select the protocol switch entry: by protocol, else by type. */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	/*
	 * Distinguish the failure modes: unknown domain, protocol that
	 * exists but under a different type, or plain unsupported.
	 */
	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain (and per-type for inet/inet6) creation statistics. */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflow sockets start out non-blocking. */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;

	/* Record the creating process as the socket's last owner. */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	/* Record the delegated ("effective") process, if any. */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	/*
	 * Fall back to the creating process's responsible pid when the
	 * delegated process didn't provide one.
	 */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
836 
837 /*
838  * Returns:	0			Success
839  *		EAFNOSUPPORT
840  *		EPROTOTYPE
841  *		EPROTONOSUPPORT
842  *		ENOBUFS
843  *	<pru_attach>:ENOBUFS[AF_UNIX]
844  *	<pru_attach>:ENOBUFS[TCP]
845  *	<pru_attach>:ENOMEM[TCP]
846  *	<pru_attach>:???		[other protocol families, IPSEC]
847  */
848 int
socreate(int dom,struct socket ** aso,int type,int proto)849 socreate(int dom, struct socket **aso, int type, int proto)
850 {
851 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
852 	           PROC_NULL);
853 }
854 
855 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)856 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
857 {
858 	int error = 0;
859 	struct proc *ep = PROC_NULL;
860 
861 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
862 		error = ESRCH;
863 		goto done;
864 	}
865 
866 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
867 
868 	/*
869 	 * It might not be wise to hold the proc reference when calling
870 	 * socreate_internal since it calls soalloc with M_WAITOK
871 	 */
872 done:
873 	if (ep != PROC_NULL) {
874 		proc_rele(ep);
875 	}
876 
877 	return error;
878 }
879 
880 /*
881  * Returns:	0			Success
882  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
883  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
884  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
885  *	<pru_bind>:EINVAL		Invalid argument
886  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
887  *	<pru_bind>:EACCES		Permission denied
888  *	<pru_bind>:EADDRINUSE		Address in use
889  *	<pru_bind>:EAGAIN		Resource unavailable, try again
890  *	<pru_bind>:EPERM		Operation not permitted
891  *	<pru_bind>:???
892  *	<sf_bind>:???
893  *
894  * Notes:	It's not possible to fully enumerate the return codes above,
895  *		since socket filter authors and protocol family authors may
896  *		not choose to limit their error returns to those listed, even
897  *		though this may result in some software operating incorrectly.
898  *
899  *		The error codes which are enumerated above are those known to
900  *		be returned by the tcp_usr_bind function supplied.
901  */
902 int
sobindlock(struct socket * so,struct sockaddr * nam,int dolock)903 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
904 {
905 	struct proc *p = current_proc();
906 	int error = 0;
907 
908 	if (dolock) {
909 		socket_lock(so, 1);
910 	}
911 
912 	so_update_last_owner_locked(so, p);
913 	so_update_policy(so);
914 
915 #if NECP
916 	so_update_necp_policy(so, nam, NULL);
917 #endif /* NECP */
918 
919 	/*
920 	 * If this is a bind request on a socket that has been marked
921 	 * as inactive, reject it now before we go any further.
922 	 */
923 	if (so->so_flags & SOF_DEFUNCT) {
924 		error = EINVAL;
925 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
926 		    __func__, proc_pid(p), proc_best_name(p),
927 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
928 		    SOCK_DOM(so), SOCK_TYPE(so), error);
929 		goto out;
930 	}
931 
932 	/* Socket filter */
933 	error = sflt_bind(so, nam);
934 
935 	if (error == 0) {
936 		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
937 	}
938 out:
939 	if (dolock) {
940 		socket_unlock(so, 1);
941 	}
942 
943 	if (error == EJUSTRETURN) {
944 		error = 0;
945 	}
946 
947 	return error;
948 }
949 
950 void
sodealloc(struct socket * so)951 sodealloc(struct socket *so)
952 {
953 	kauth_cred_unref(&so->so_cred);
954 
955 	/* Remove any filters */
956 	sflt_termsock(so);
957 
958 	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
959 
960 	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
961 		cached_sock_free(so);
962 	} else {
963 		zfree(socket_zone, so);
964 	}
965 }
966 
967 /*
968  * Returns:	0			Success
969  *		EINVAL
970  *		EOPNOTSUPP
971  *	<pru_listen>:EINVAL[AF_UNIX]
972  *	<pru_listen>:EINVAL[TCP]
973  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
974  *	<pru_listen>:EINVAL[TCP]	Invalid argument
975  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
976  *	<pru_listen>:EACCES[TCP]	Permission denied
977  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
978  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
979  *	<pru_listen>:EPERM[TCP]		Operation not permitted
980  *	<sf_listen>:???
981  *
982  * Notes:	Other <pru_listen> returns depend on the protocol family; all
983  *		<sf_listen> returns depend on what the filter author causes
984  *		their filter to return.
985  */
986 int
solisten(struct socket * so,int backlog)987 solisten(struct socket *so, int backlog)
988 {
989 	struct proc *p = current_proc();
990 	int error = 0;
991 
992 	socket_lock(so, 1);
993 
994 	so_update_last_owner_locked(so, p);
995 	so_update_policy(so);
996 
997 #if NECP
998 	so_update_necp_policy(so, NULL, NULL);
999 #endif /* NECP */
1000 
1001 	if (so->so_proto == NULL) {
1002 		error = EINVAL;
1003 		goto out;
1004 	}
1005 	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1006 		error = EOPNOTSUPP;
1007 		goto out;
1008 	}
1009 
1010 	/*
1011 	 * If the listen request is made on a socket that is not fully
1012 	 * disconnected, or on a socket that has been marked as inactive,
1013 	 * reject the request now.
1014 	 */
1015 	if ((so->so_state &
1016 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1017 	    (so->so_flags & SOF_DEFUNCT)) {
1018 		error = EINVAL;
1019 		if (so->so_flags & SOF_DEFUNCT) {
1020 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1021 			    "(%d)\n", __func__, proc_pid(p),
1022 			    proc_best_name(p),
1023 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1024 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1025 		}
1026 		goto out;
1027 	}
1028 
1029 	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1030 		error = EPERM;
1031 		goto out;
1032 	}
1033 
1034 	error = sflt_listen(so);
1035 	if (error == 0) {
1036 		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1037 	}
1038 
1039 	if (error) {
1040 		if (error == EJUSTRETURN) {
1041 			error = 0;
1042 		}
1043 		goto out;
1044 	}
1045 
1046 	if (TAILQ_EMPTY(&so->so_comp)) {
1047 		so->so_options |= SO_ACCEPTCONN;
1048 	}
1049 	/*
1050 	 * POSIX: The implementation may have an upper limit on the length of
1051 	 * the listen queue-either global or per accepting socket. If backlog
1052 	 * exceeds this limit, the length of the listen queue is set to the
1053 	 * limit.
1054 	 *
1055 	 * If listen() is called with a backlog argument value that is less
1056 	 * than 0, the function behaves as if it had been called with a backlog
1057 	 * argument value of 0.
1058 	 *
1059 	 * A backlog argument of 0 may allow the socket to accept connections,
1060 	 * in which case the length of the listen queue may be set to an
1061 	 * implementation-defined minimum value.
1062 	 */
1063 	if (backlog <= 0 || backlog > somaxconn) {
1064 		backlog = somaxconn;
1065 	}
1066 
1067 	so->so_qlimit = (short)backlog;
1068 out:
1069 	socket_unlock(so, 1);
1070 	return error;
1071 }
1072 
1073 /*
1074  * The "accept list lock" protects the fields related to the listener queues
1075  * because we can unlock a socket to respect the lock ordering between
1076  * the listener socket and its clients sockets. The lock ordering is first to
1077  * acquire the client socket before the listener socket.
1078  *
1079  * The accept list lock serializes access to the following fields:
1080  * - of the listener socket:
1081  *   - so_comp
1082  *   - so_incomp
1083  *   - so_qlen
1084  *   - so_inqlen
1085  * - of client sockets that are in so_comp or so_incomp:
1086  *   - so_head
1087  *   - so_list
1088  *
1089  * As one can see the accept list lock protects the consistent of the
1090  * linkage of the client sockets.
1091  *
1092  * Note that those fields may be read without holding the accept list lock
1093  * for a preflight provided the accept list lock is taken when committing
1094  * to take an action based on the result of the preflight. The preflight
1095  * saves the cost of doing the unlock/lock dance.
1096  */
/*
 * Acquire exclusive ownership of a listener's accept lists (see the
 * "accept list lock" commentary above).  'head' is the listener, locked
 * on entry; 'so' (may be NULL) is a client socket also locked on entry.
 * May drop and re-take both locks while waiting for the holder bit.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* Single shared domain lock: no per-socket dance is required. */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the list; claim it and return. */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/*
	 * Release the client socket before sleeping on the listener's
	 * mutex so we never sleep while holding both locks.
	 */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		/*
		 * Re-acquire in the documented order: client socket first,
		 * then the listener.
		 */
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1127 
1128 void
so_release_accept_list(struct socket * head)1129 so_release_accept_list(struct socket *head)
1130 {
1131 	if (head->so_proto->pr_getlock != NULL) {
1132 		lck_mtx_t *mutex_held;
1133 
1134 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1135 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1136 
1137 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1138 		wakeup((caddr_t)&head->so_incomp);
1139 	}
1140 }
1141 
/*
 * Tear down a socket whose last reference is going away.
 *
 * Called with the socket locked.  If the PCB has not been cleared yet,
 * or a file descriptor still references the socket, only the select
 * threads and upcall flags are cleared and the socket stays allocated.
 * Otherwise the socket is unlinked from its listener (if queued),
 * flushed, and deallocated when 'dealloc' is non-zero.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if (dealloc && ((so->so_flags & SOF_CONTENT_FILTER) != 0)) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/* Not ready to free yet: just quiesce select threads and upcalls. */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue. */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			/* Has a listener but sits on neither queue. */
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1230 
/*
 * Wait for outstanding socket upcalls to drain before close proceeds.
 *
 * Called with the socket lock held; msleep() drops and re-takes the
 * underlying mutex while sleeping.  Only waits when the socket has
 * SOF_UPCALLCLOSEWAIT set and an outstanding upcall use count.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Prevent new upcalls from being armed while we wait. */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/* Sleep until the last upcall reference is dropped. */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1259 
1260 /*
1261  * Close a socket on last file table reference removal.
1262  * Initiate disconnect if connected.
1263  * Free socket when disconnect complete.
1264  */
1265 int
soclose_locked(struct socket * so)1266 soclose_locked(struct socket *so)
1267 {
1268 	int error = 0;
1269 	struct timespec ts;
1270 
1271 	if (so->so_usecount == 0) {
1272 		panic("soclose: so=%p refcount=0", so);
1273 		/* NOTREACHED */
1274 	}
1275 
1276 	sflt_notify(so, sock_evt_closing, NULL);
1277 
1278 	if (so->so_upcallusecount) {
1279 		soclose_wait_locked(so);
1280 	}
1281 
1282 #if CONTENT_FILTER
1283 	/*
1284 	 * We have to wait until the content filters are done
1285 	 */
1286 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1287 		cfil_sock_close_wait(so);
1288 		cfil_sock_is_closed(so);
1289 		cfil_sock_detach(so);
1290 	}
1291 #endif /* CONTENT_FILTER */
1292 
1293 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1294 		soflow_detach(so);
1295 	}
1296 
1297 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1298 		soresume(current_proc(), so, 1);
1299 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1300 	}
1301 
1302 	if ((so->so_options & SO_ACCEPTCONN)) {
1303 		struct socket *sp, *sonext;
1304 		int persocklock = 0;
1305 		int incomp_overflow_only;
1306 
1307 		/*
1308 		 * We do not want new connection to be added
1309 		 * to the connection queues
1310 		 */
1311 		so->so_options &= ~SO_ACCEPTCONN;
1312 
1313 		/*
1314 		 * We can drop the lock on the listener once
1315 		 * we've acquired the incoming list
1316 		 */
1317 		if (so->so_proto->pr_getlock != NULL) {
1318 			persocklock = 1;
1319 			so_acquire_accept_list(so, NULL);
1320 			socket_unlock(so, 0);
1321 		}
1322 again:
1323 		incomp_overflow_only = 1;
1324 
1325 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1326 			/*
1327 			 * Radar 5350314
1328 			 * skip sockets thrown away by tcpdropdropblreq
1329 			 * they will get cleanup by the garbage collection.
1330 			 * otherwise, remove the incomp socket from the queue
1331 			 * and let soabort trigger the appropriate cleanup.
1332 			 */
1333 			if (sp->so_flags & SOF_OVERFLOW) {
1334 				continue;
1335 			}
1336 
1337 			if (persocklock != 0) {
1338 				socket_lock(sp, 1);
1339 			}
1340 
1341 			/*
1342 			 * Radar 27945981
1343 			 * The extra reference for the list insure the
1344 			 * validity of the socket pointer when we perform the
1345 			 * unlock of the head above
1346 			 */
1347 			if (sp->so_state & SS_INCOMP) {
1348 				sp->so_state &= ~SS_INCOMP;
1349 				sp->so_head = NULL;
1350 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1351 				so->so_incqlen--;
1352 				so->so_qlen--;
1353 
1354 				(void) soabort(sp);
1355 			} else {
1356 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1357 				    __func__, sp);
1358 			}
1359 
1360 			if (persocklock != 0) {
1361 				socket_unlock(sp, 1);
1362 			}
1363 		}
1364 
1365 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1366 			/* Dequeue from so_comp since sofree() won't do it */
1367 			if (persocklock != 0) {
1368 				socket_lock(sp, 1);
1369 			}
1370 
1371 			if (sp->so_state & SS_COMP) {
1372 				sp->so_state &= ~SS_COMP;
1373 				sp->so_head = NULL;
1374 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1375 				so->so_qlen--;
1376 
1377 				(void) soabort(sp);
1378 			} else {
1379 				panic("%s sp %p in so_comp but !SS_COMP",
1380 				    __func__, sp);
1381 			}
1382 
1383 			if (persocklock) {
1384 				socket_unlock(sp, 1);
1385 			}
1386 		}
1387 
1388 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1389 #if (DEBUG | DEVELOPMENT)
1390 			panic("%s head %p so_comp not empty", __func__, so);
1391 #endif /* (DEVELOPMENT || DEBUG) */
1392 
1393 			goto again;
1394 		}
1395 
1396 		if (!TAILQ_EMPTY(&so->so_comp)) {
1397 #if (DEBUG | DEVELOPMENT)
1398 			panic("%s head %p so_comp not empty", __func__, so);
1399 #endif /* (DEVELOPMENT || DEBUG) */
1400 
1401 			goto again;
1402 		}
1403 
1404 		if (persocklock) {
1405 			socket_lock(so, 0);
1406 			so_release_accept_list(so);
1407 		}
1408 	}
1409 	if (so->so_pcb == NULL) {
1410 		/* 3915887: mark the socket as ready for dealloc */
1411 		so->so_flags |= SOF_PCBCLEARING;
1412 		goto discard;
1413 	}
1414 	if (so->so_state & SS_ISCONNECTED) {
1415 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1416 			error = sodisconnectlocked(so);
1417 			if (error) {
1418 				goto drop;
1419 			}
1420 		}
1421 		if (so->so_options & SO_LINGER) {
1422 			lck_mtx_t *mutex_held;
1423 
1424 			if ((so->so_state & SS_ISDISCONNECTING) &&
1425 			    (so->so_state & SS_NBIO)) {
1426 				goto drop;
1427 			}
1428 			if (so->so_proto->pr_getlock != NULL) {
1429 				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1430 			} else {
1431 				mutex_held = so->so_proto->pr_domain->dom_mtx;
1432 			}
1433 			while (so->so_state & SS_ISCONNECTED) {
1434 				ts.tv_sec = (so->so_linger / 100);
1435 				ts.tv_nsec = (so->so_linger % 100) *
1436 				    NSEC_PER_USEC * 1000 * 10;
1437 				error = msleep((caddr_t)&so->so_timeo,
1438 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1439 				if (error) {
1440 					/*
1441 					 * It's OK when the time fires,
1442 					 * don't report an error
1443 					 */
1444 					if (error == EWOULDBLOCK) {
1445 						error = 0;
1446 					}
1447 					break;
1448 				}
1449 			}
1450 		}
1451 	}
1452 drop:
1453 	if (so->so_usecount == 0) {
1454 		panic("soclose: usecount is zero so=%p", so);
1455 		/* NOTREACHED */
1456 	}
1457 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1458 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1459 		if (error == 0) {
1460 			error = error2;
1461 		}
1462 	}
1463 	if (so->so_usecount <= 0) {
1464 		panic("soclose: usecount is zero so=%p", so);
1465 		/* NOTREACHED */
1466 	}
1467 discard:
1468 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1469 	    (so->so_state & SS_NOFDREF)) {
1470 		panic("soclose: NOFDREF");
1471 		/* NOTREACHED */
1472 	}
1473 	so->so_state |= SS_NOFDREF;
1474 
1475 	if ((so->so_flags & SOF_KNOTE) != 0) {
1476 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1477 	}
1478 
1479 	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1480 
1481 	VERIFY(so->so_usecount > 0);
1482 	so->so_usecount--;
1483 	sofree(so);
1484 	return error;
1485 }
1486 
1487 int
soclose(struct socket * so)1488 soclose(struct socket *so)
1489 {
1490 	int error = 0;
1491 	socket_lock(so, 1);
1492 
1493 	if (so->so_retaincnt == 0) {
1494 		error = soclose_locked(so);
1495 	} else {
1496 		/*
1497 		 * if the FD is going away, but socket is
1498 		 * retained in kernel remove its reference
1499 		 */
1500 		so->so_usecount--;
1501 		if (so->so_usecount < 2) {
1502 			panic("soclose: retaincnt non null and so=%p "
1503 			    "usecount=%d\n", so, so->so_usecount);
1504 		}
1505 	}
1506 	socket_unlock(so, 1);
1507 	return error;
1508 }
1509 
1510 /*
1511  * Must be called at splnet...
1512  */
1513 /* Should already be locked */
1514 int
soabort(struct socket * so)1515 soabort(struct socket *so)
1516 {
1517 	int error;
1518 
1519 #ifdef MORE_LOCKING_DEBUG
1520 	lck_mtx_t *mutex_held;
1521 
1522 	if (so->so_proto->pr_getlock != NULL) {
1523 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1524 	} else {
1525 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1526 	}
1527 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1528 #endif
1529 
1530 	if ((so->so_flags & SOF_ABORTED) == 0) {
1531 		so->so_flags |= SOF_ABORTED;
1532 		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1533 		if (error) {
1534 			sofree(so);
1535 			return error;
1536 		}
1537 	}
1538 	return 0;
1539 }
1540 
1541 int
soacceptlock(struct socket * so,struct sockaddr ** nam,int dolock)1542 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1543 {
1544 	int error;
1545 
1546 	if (dolock) {
1547 		socket_lock(so, 1);
1548 	}
1549 
1550 	so_update_last_owner_locked(so, PROC_NULL);
1551 	so_update_policy(so);
1552 #if NECP
1553 	so_update_necp_policy(so, NULL, NULL);
1554 #endif /* NECP */
1555 
1556 	if ((so->so_state & SS_NOFDREF) == 0) {
1557 		panic("soaccept: !NOFDREF");
1558 	}
1559 	so->so_state &= ~SS_NOFDREF;
1560 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1561 
1562 	if (dolock) {
1563 		socket_unlock(so, 1);
1564 	}
1565 	return error;
1566 }
1567 
/*
 * soaccept
 *
 * Convenience wrapper: accept with locking enabled.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	const int dolock = 1;

	return soacceptlock(so, nam, dolock);
}
1573 
/*
 * Run the listener's socket filters over a newly accepted socket.
 *
 * Returns 0 to hand the socket to the caller, ECONNABORTED if the
 * addresses could not be obtained, or the filter's error (the socket is
 * closed before returning in the error cases).  EJUSTRETURN from a
 * filter defuncts the socket but still returns it successfully.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Must drop our lock before soclose() re-takes it. */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1629 
1630 /*
1631  * Returns:	0			Success
1632  *		EOPNOTSUPP		Operation not supported on socket
1633  *		EISCONN			Socket is connected
1634  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1635  *	<pru_connect>:EINVAL		Invalid argument
1636  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1637  *	<pru_connect>:EACCES		Permission denied
1638  *	<pru_connect>:EADDRINUSE	Address in use
1639  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1640  *	<pru_connect>:EPERM		Operation not permitted
1641  *	<sf_connect_out>:???		[anything a filter writer might set]
1642  */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic disallowed on this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			/* Attribute the lookup to the delegate when one exists. */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: filter handled the connect itself. */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				/* Connect attempt failed; clear the flag. */
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1738 
/*
 * soconnect
 *
 * Convenience wrapper: connect with locking enabled.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	const int dolock = 1;

	return soconnectlock(so, nam, dolock);
}
1744 
1745 /*
1746  * Returns:	0			Success
1747  *	<pru_connect2>:EINVAL[AF_UNIX]
1748  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1749  *	<pru_connect2>:???		[other protocol families]
1750  *
1751  * Notes:	<pru_connect2> is not supported by [TCP].
1752  */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	/* so2 needs its own lock only with per-socket protocol locks. */
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	/* Pair the two sockets via so1's protocol connect2 handler. */
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1771 
/*
 * Extended connect (connectx(2)) on an already-locked socket.
 *
 * 'src'/'dst' are the optional local and required remote addresses;
 * 'auio'/'bytes_written' carry optional data to send with the connect
 * (e.g. TCP Fast Open via SOF1_PRECONNECT_DATA); 'flags' are the
 * CONNECT_* flags from userland.  Returns 0 or an errno; EOPNOTSUPP
 * for listening/defunct sockets, EPERM when outbound is restricted.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			/* Attribute the lookup to the delegate when one exists. */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connectx() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data armed. */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1894 
1895 int
sodisconnectlocked(struct socket * so)1896 sodisconnectlocked(struct socket *so)
1897 {
1898 	int error;
1899 
1900 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1901 		error = ENOTCONN;
1902 		goto bad;
1903 	}
1904 	if (so->so_state & SS_ISDISCONNECTING) {
1905 		error = EALREADY;
1906 		goto bad;
1907 	}
1908 
1909 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1910 	if (error == 0) {
1911 		sflt_notify(so, sock_evt_disconnected, NULL);
1912 	}
1913 
1914 bad:
1915 	return error;
1916 }
1917 
/* Locking version */
int
sodisconnect(struct socket *so)
{
	int err;

	socket_lock(so, 1);
	err = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return err;
}
1929 
1930 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1931 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1932 {
1933 	int error;
1934 
1935 	/*
1936 	 * Call the protocol disconnectx handler; let it handle all
1937 	 * matters related to the connection state of this session.
1938 	 */
1939 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1940 	if (error == 0) {
1941 		/*
1942 		 * The event applies only for the session, not for
1943 		 * the disconnection of individual subflows.
1944 		 */
1945 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1946 			sflt_notify(so, sock_evt_disconnected, NULL);
1947 		}
1948 	}
1949 	return error;
1950 }
1951 
1952 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1953 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1954 {
1955 	int error;
1956 
1957 	socket_lock(so, 1);
1958 	error = sodisconnectxlocked(so, aid, cid);
1959 	socket_unlock(so, 1);
1960 	return error;
1961 }
1962 
/* Convert MSG_DONTWAIT into the sblock() wait/no-wait flag */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * On return with *sblocked set, the caller owns the send-buffer lock
 * and must release it with sbunlock().  The socket lock is held by
 * the caller throughout (it may be dropped and reacquired internally
 * by sbwait()).
 *
 * Returns:	0			Success
 *		EPIPE
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	[so_error]:???
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int     error = 0;
	int32_t space;
	int     assumelock = 0;

restart:
	/* Take the send-buffer lock, unless we already hold it */
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		/* Label is also entered from the sblock/sbwait error paths above/below */
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		/* NB: binds to the dangling else above when CONTENT_FILTER is set */
		return EPIPE;
	}
	/* Report and clear any pending asynchronous error */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Connection-oriented protocol: allow the send only
			 * while confirming, for control-only sends, or when
			 * preconnect (TFO-style) data is permitted.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless protocol requires a destination */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* Out-of-band data gets a little extra send-buffer headroom */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	/* Atomic sends larger than the high-water mark can never succeed */
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		/* Non-blocking sockets (or recursive filter calls) never wait */
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		/* Drop the sb lock and sleep until space frees up, then retry */
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		goto restart;
	}
	return 0;
}
2100 
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 *
 * Returns:	0			Success
 *		EOPNOTSUPP
 *		EINVAL
 *		ENOBUFS
 *	uiomove:EFAULT
 *	sosendcheck:EPIPE
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:EINTR
 *	sosendcheck:EBADF
 *	sosendcheck:EINTR
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	/*
	 * NOTE(review): 'space' and 'orig_resid' are only assigned on some
	 * paths but are read by the trace macros at out_locked; on an early
	 * error (e.g. first sosendcheck failure) they may be read
	 * uninitialized -- debug-only impact, but worth confirming.
	 */
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/* Bytes to send come either from the uio or the prepackaged chain */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
			orig_resid = resid;
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful for stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	/* Reserve per-socket headroom for protocol/link headers if enabled */
	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	do {
		/* Acquire the sb lock and wait for space as needed */
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				/* Drop the socket lock while allocating and copying in */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, then fall back to trying
					 * for mbufs if we fail early (or
					 * miscalculate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Pop the next buffer off the local freelist */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf, after any leading space */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data in; may fault (EFAULT) */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					/* EJUSTRETURN: filter swallowed the packet */
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Hand the chain to the protocol; it consumes top/control */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/* Ownership of top/control has passed on; reset for next round */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);

out_locked:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	/* Free anything we still own: unsent chain, control, spare buffers */
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2612 
2613 int
sosend_reinject(struct socket * so,struct sockaddr * addr,struct mbuf * top,struct mbuf * control,uint32_t sendflags)2614 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2615 {
2616 	struct mbuf *m0 = NULL, *control_end = NULL;
2617 
2618 	socket_lock_assert_owned(so);
2619 
2620 	/*
2621 	 * top must points to mbuf chain to be sent.
2622 	 * If control is not NULL, top must be packet header
2623 	 */
2624 	VERIFY(top != NULL &&
2625 	    (control == NULL || top->m_flags & M_PKTHDR));
2626 
2627 	/*
2628 	 * If control is not passed in, see if we can get it
2629 	 * from top.
2630 	 */
2631 	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2632 		// Locate start of control if present and start of data
2633 		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2634 			if (m0->m_flags & M_PKTHDR) {
2635 				top = m0;
2636 				break;
2637 			} else if (m0->m_type == MT_CONTROL) {
2638 				if (control == NULL) {
2639 					// Found start of control
2640 					control = m0;
2641 				}
2642 				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2643 					// Found end of control
2644 					control_end = m0;
2645 				}
2646 			}
2647 		}
2648 		if (control_end != NULL) {
2649 			control_end->m_next = NULL;
2650 		}
2651 	}
2652 
2653 	int error = (*so->so_proto->pr_usrreqs->pru_send)
2654 	    (so, sendflags, top, addr, control, current_proc());
2655 
2656 	return error;
2657 }
2658 
/*
 * Send a list of datagrams in a single call (sendmsg_x-style batching).
 *
 * Supported only connected sockets (no address) without ancillary data
 * (control mbuf) for atomic protocols
 *
 * Each uio in uioarray becomes one packet; the set is copied into
 * pre-allocated mbuf packets, run through socket/content filters,
 * then handed to the protocol's pru_send_list in batches.
 */
int
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
{
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t len, resid;
	int error, dontroute;
	int atomic = sosendallatonce(so);
	int sblocked = 0;
	struct proc *p = current_proc();
	u_int uiofirst = 0;     /* index of first uio in the current batch */
	u_int uiolast = 0;      /* one past the last uio in the current batch */
	struct mbuf *top = NULL;
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t bigcl;

	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	/* Batched send is limited to atomic datagram sockets */
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (atomic == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
		error = EINVAL;
		goto out;
	}
	resid = uio_array_resid(uioarray, uiocnt);

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
	if (error) {
		goto release;
	}

	/*
	 * Use big 4 KB clusters when the outgoing interface does not prefer
	 * 2 KB clusters
	 */
	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	do {
		int i;
		int num_needed = 0;
		int chainlength;
		size_t maxpktlen = 0;
		int bytes_to_alloc;

		if (sosendminchain > 0) {
			chainlength = 0;
		} else {
			chainlength = sosendmaxchain;
		}

		/* Drop the socket lock while sizing, allocating and copying */
		socket_unlock(so, 0);

		/*
		 * Find a set of uio that fit in a reasonable number
		 * of mbuf packets
		 */
		for (i = uiofirst; i < uiocnt; i++) {
			struct uio *auio = uioarray[i];

			len = uio_resid(auio);

			/* Do nothing for empty messages */
			if (len == 0) {
				continue;
			}

			num_needed += 1;
			uiolast += 1;

			if (len > maxpktlen) {
				maxpktlen = len;
			}

			chainlength += len;
			if (chainlength > sosendmaxchain) {
				break;
			}
		}
		/*
		 * Nothing left to send
		 */
		if (num_needed == 0) {
			socket_lock(so, 0);
			break;
		}
		/*
		 * Allocate buffer large enough to include headroom space for
		 * network and link header
		 *
		 */
		bytes_to_alloc = (int) maxpktlen + headroom;

		/*
		 * Allocate a single contiguous buffer of the smallest available
		 * size when possible
		 */
		if (bytes_to_alloc > MCLBYTES &&
		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MBIGCLBYTES);
		} else if (bytes_to_alloc > _MHLEN &&
		    bytes_to_alloc <= MCLBYTES) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MCLBYTES);
		} else {
			freelist = m_allocpacket_internal(
				(unsigned int *)&num_needed,
				bytes_to_alloc, NULL, M_WAIT, 1, 0);
		}

		if (freelist == NULL) {
			socket_lock(so, 0);
			error = ENOMEM;
			goto release;
		}
		/*
		 * Copy each uio of the set into its own mbuf packet
		 */
		for (i = uiofirst, m = freelist;
		    i < uiolast && m != NULL;
		    i++) {
			int bytes_to_copy;
			struct mbuf *n;
			struct uio *auio = uioarray[i];

			bytes_to_copy = (int)uio_resid(auio);

			/* Do nothing for empty messages */
			if (bytes_to_copy == 0) {
				continue;
			}
			/*
			 * Leave headroom for protocol headers
			 * in the first mbuf of the chain
			 */
			m->m_data += headroom;

			/* Fill the mbufs of this packet from the current uio */
			for (n = m; n != NULL; n = n->m_next) {
				if ((m->m_flags & M_EXT)) {
					mlen = m->m_ext.ext_size -
					    M_LEADINGSPACE(m);
				} else if ((m->m_flags & M_PKTHDR)) {
					mlen =
					    MHLEN - M_LEADINGSPACE(m);
				} else {
					mlen = MLEN - M_LEADINGSPACE(m);
				}
				len = imin((int)mlen, bytes_to_copy);

				/*
				 * Note: uiomove() decrements the iovec
				 * length
				 */
				error = uiomove(mtod(n, caddr_t),
				    (int)len, auio);
				if (error != 0) {
					break;
				}
				n->m_len = (int32_t)len;
				m->m_pkthdr.len += len;

				VERIFY(m->m_pkthdr.len <= maxpktlen);

				bytes_to_copy -= len;
				resid -= len;
			}
			if (m->m_pkthdr.len == 0) {
				printf(
					"%s:%d so %llx pkt %llx type %u len null\n",
					__func__, __LINE__,
					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
					m->m_type);
			}
			if (error != 0) {
				break;
			}
			m = m->m_nextpkt;
		}

		socket_lock(so, 0);

		if (error) {
			goto release;
		}
		/* The packet list is now fully built; hand ownership to "top" */
		top = freelist;
		freelist = NULL;

		if (dontroute) {
			so->so_options |= SO_DONTROUTE;
		}

		if ((flags & MSG_SKIPCFIL) == 0) {
			struct mbuf **prevnextp = NULL;

			for (i = uiofirst, m = top;
			    i < uiolast && m != NULL;
			    i++) {
				struct mbuf *nextpkt = m->m_nextpkt;

				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, NULL, &m,
				    NULL, 0);
				if (error != 0 && error != EJUSTRETURN) {
					goto release;
				}

#if CONTENT_FILTER
				if (error == 0) {
					/*
					 * Content filter processing
					 */
					error = cfil_sock_data_out(so, NULL, m,
					    NULL, 0, dgram_flow_entry);
					if (error != 0 && error != EJUSTRETURN) {
						goto release;
					}
				}
#endif /* CONTENT_FILTER */
				/*
				 * Remove packet from the list when
				 * swallowed by a filter
				 */
				if (error == EJUSTRETURN) {
					error = 0;
					if (prevnextp != NULL) {
						*prevnextp = nextpkt;
					} else {
						top = nextpkt;
					}
				}

				m = nextpkt;
				if (m != NULL) {
					prevnextp = &m->m_nextpkt;
				}
			}
		}
		if (top != NULL) {
			/* Protocol consumes the packet list */
			error = (*so->so_proto->pr_usrreqs->pru_send_list)
			    (so, 0, top, NULL, NULL, p);
		}

		if (dontroute) {
			so->so_options &= ~SO_DONTROUTE;
		}

		top = NULL;
		uiofirst = uiolast;
	} while (resid > 0 && error == 0);
release:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
out:
	/* Free anything we still own on the error paths */
	if (top != NULL) {
		m_freem(top);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, 0, error);

	return error;
}
2990 
/*
 * Process the leading MT_SONAME mbuf of the record at *mp: run the MAC
 * receive policy check (when configured), optionally duplicate the
 * address into *psa for the caller, and consume the name mbuf unless
 * MSG_PEEK is set.
 *
 * Parameters:
 *	p		calling process; kernproc skips the MAC check
 *	so		socket, locked on entry and on return
 *	psa		if non-NULL, receives a dup_sockaddr() copy of the
 *			sender address (may be NULL on allocation failure)
 *	flags		MSG_PEEK leaves the record queued; MSG_NEEDSA makes
 *			a failed address duplication an error
 *	mp		in/out: head mbuf of the record (MT_SONAME on entry);
 *			on return, the mbuf following the name (or new head)
 *	nextrecordp	in/out: next record on the receive queue
 *	canwait		passed through to dup_sockaddr(); presumably controls
 *			whether the allocation may block — TODO confirm
 *
 * Returns:	0		Success
 *		ERESTART	record dropped by MAC policy check
 *		ENOTCONN	socket became defunct while unlocked
 *		EWOULDBLOCK	dup_sockaddr() failed and MSG_NEEDSA is set
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		/* Policy check runs unlocked; the record is private to us. */
		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			/* Queue drained while unlocked: we become the only record. */
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		/* Re-link our record at the head of the receive queue. */
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	}
	if (flags & MSG_PEEK) {
		/* Peeking: step past the name mbuf, leave it queued. */
		m = m->m_next;
	} else {
		/* Consuming: unlink and free the name mbuf from the buffer. */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3112 
3113 /*
3114  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3115  * so clear the data portion in order not to leak the file pointers
3116  */
3117 static void
sopeek_scm_rights(struct mbuf * rights)3118 sopeek_scm_rights(struct mbuf *rights)
3119 {
3120 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3121 
3122 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3123 		VERIFY(cm->cmsg_len <= rights->m_len);
3124 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3125 	}
3126 }
3127 
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 *
 * Parameters:
 *	so		socket, locked on entry and on return (may be
 *			temporarily unlocked around dom_externalize)
 *	controlp	if non-NULL, receives the chain of control mbufs:
 *			copies when MSG_PEEK, the dequeued originals
 *			otherwise
 *	flags		MSG_PEEK selects copy vs. consume behavior
 *	mp		in/out: current mbuf within the first record
 *	nextrecordp	in/out: next record on the receive queue;
 *			recomputed before return because new records may
 *			have arrived while the socket was unlocked
 *
 * Returns:	0		Success
 *		ENOBUFS		m_copy() failed while peeking
 *		<dom_externalize>:???	protocol-specific failure
 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				/*
				 * Remember the first copy we hand out so the
				 * whole chain can be freed on a later failure.
				 */
				if (*controlp == NULL) {
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/*
				 * Descriptors are not externalized on a peek;
				 * scrub SCM_RIGHTS payloads from the copy.
				 */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/*
			 * Unlink the control mbuf from the receive buffer
			 * and append it to the private chain at *cme.
			 */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Restore receive-queue invariants after dequeuing control mbufs. */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	/* Walk the private chain, externalizing or passing through each msg. */
	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed. */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3268 
3269 /*
3270  * If we have less data than requested, block awaiting more
3271  * (subject to any timeout) if:
3272  *   1. the current count is less than the low water mark, or
3273  *   2. MSG_WAITALL is set, and it is possible to do the entire
3274  *	receive operation at once if we block (resid <= hiwat).
3275  *   3. MSG_DONTWAIT is not set
3276  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3277  * we have to do the receive in sections, and thus risk returning
3278  * a short count if a timeout or signal occurs after we start.
3279  */
3280 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3281 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3282 {
3283 	struct protosw *pr = so->so_proto;
3284 
3285 	/* No mbufs in the receive-queue? Wait! */
3286 	if (m == NULL) {
3287 		return true;
3288 	}
3289 
3290 	/* Not enough data in the receive socket-buffer - we may have to wait */
3291 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3292 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3293 		/*
3294 		 * Application did set the lowater-mark, so we should wait for
3295 		 * this data to be present.
3296 		 */
3297 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3298 			return true;
3299 		}
3300 
3301 		/*
3302 		 * Application wants all the data - so let's try to do the
3303 		 * receive-operation at once by waiting for everything to
3304 		 * be there.
3305 		 */
3306 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3307 			return true;
3308 		}
3309 	}
3310 
3311 	return false;
3312 }
3313 
3314 /*
3315  * Implement receive operations on a socket.
3316  * We depend on the way that records are added to the sockbuf
3317  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3318  * must begin with an address if the protocol so specifies,
3319  * followed by an optional mbuf or mbufs containing ancillary data,
3320  * and then zero or more mbufs of data.
3321  * In order to avoid blocking network interrupts for the entire time here,
3322  * we splx() while doing the actual copy to user space.
3323  * Although the sockbuf is locked, new data may still be appended,
3324  * and thus we must maintain consistency of the sockbuf during that time.
3325  *
3326  * The caller may receive the data as a single mbuf chain by supplying
3327  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3328  * only for the count in uio_resid.
3329  *
3330  * Returns:	0			Success
3331  *		ENOBUFS
3332  *		ENOTCONN
3333  *		EWOULDBLOCK
3334  *	uiomove:EFAULT
3335  *	sblock:EWOULDBLOCK
3336  *	sblock:EINTR
3337  *	sbwait:EBADF
3338  *	sbwait:EINTR
3339  *	sodelayed_copy:EFAULT
3340  *	<pru_rcvoob>:EINVAL[TCP]
3341  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3342  *	<pru_rcvoob>:???
3343  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3344  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3345  *	<pr_domain->dom_externalize>:???
3346  *
3347  * Notes:	Additional return values from calls through <pru_rcvoob> and
3348  *		<pr_domain->dom_externalize> depend on protocols other than
3349  *		TCP or AF_UNIX, which are documented above.
3350  */
3351 int
soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)3352 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3353     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3354 {
3355 	struct mbuf *m, **mp, *ml = NULL;
3356 	struct mbuf *nextrecord, *free_list;
3357 	int flags, error, offset;
3358 	user_ssize_t len;
3359 	struct protosw *pr = so->so_proto;
3360 	int moff, type = 0;
3361 	user_ssize_t orig_resid = uio_resid(uio);
3362 	user_ssize_t delayed_copy_len;
3363 	int can_delay;
3364 	struct proc *p = current_proc();
3365 	boolean_t en_tracing = FALSE;
3366 
3367 	/*
3368 	 * Sanity check on the length passed by caller as we are making 'int'
3369 	 * comparisons
3370 	 */
3371 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3372 		return EINVAL;
3373 	}
3374 
3375 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3376 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3377 	    so->so_rcv.sb_hiwat);
3378 
3379 	socket_lock(so, 1);
3380 	so_update_last_owner_locked(so, p);
3381 	so_update_policy(so);
3382 
3383 #ifdef MORE_LOCKING_DEBUG
3384 	if (so->so_usecount == 1) {
3385 		panic("%s: so=%x no other reference on socket", __func__, so);
3386 		/* NOTREACHED */
3387 	}
3388 #endif
3389 	mp = mp0;
3390 	if (psa != NULL) {
3391 		*psa = NULL;
3392 	}
3393 	if (controlp != NULL) {
3394 		*controlp = NULL;
3395 	}
3396 	if (flagsp != NULL) {
3397 		flags = *flagsp & ~MSG_EOR;
3398 	} else {
3399 		flags = 0;
3400 	}
3401 
3402 	/*
3403 	 * If a recv attempt is made on a previously-accepted socket
3404 	 * that has been marked as inactive (disconnected), reject
3405 	 * the request.
3406 	 */
3407 	if (so->so_flags & SOF_DEFUNCT) {
3408 		struct sockbuf *sb = &so->so_rcv;
3409 
3410 		error = ENOTCONN;
3411 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3412 		    __func__, proc_pid(p), proc_best_name(p),
3413 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3414 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3415 		/*
3416 		 * This socket should have been disconnected and flushed
3417 		 * prior to being returned from sodefunct(); there should
3418 		 * be no data on its receive list, so panic otherwise.
3419 		 */
3420 		if (so->so_state & SS_DEFUNCT) {
3421 			sb_empty_assert(sb, __func__);
3422 		}
3423 		socket_unlock(so, 1);
3424 		return error;
3425 	}
3426 
3427 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3428 	    pr->pr_usrreqs->pru_preconnect) {
3429 		/*
3430 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3431 		 * calling write() right after this. *If* the app calls a read
3432 		 * we do not want to block this read indefinetely. Thus,
3433 		 * we trigger a connect so that the session gets initiated.
3434 		 */
3435 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3436 
3437 		if (error) {
3438 			socket_unlock(so, 1);
3439 			return error;
3440 		}
3441 	}
3442 
3443 	if (ENTR_SHOULDTRACE &&
3444 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3445 		/*
3446 		 * enable energy tracing for inet sockets that go over
3447 		 * non-loopback interfaces only.
3448 		 */
3449 		struct inpcb *inp = sotoinpcb(so);
3450 		if (inp->inp_last_outifp != NULL &&
3451 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3452 			en_tracing = TRUE;
3453 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3454 			    VM_KERNEL_ADDRPERM(so),
3455 			    ((so->so_state & SS_NBIO) ?
3456 			    kEnTrFlagNonBlocking : 0),
3457 			    (int64_t)orig_resid);
3458 		}
3459 	}
3460 
3461 	/*
3462 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3463 	 * regardless of the flags argument. Here is the case were
3464 	 * out-of-band data is not inline.
3465 	 */
3466 	if ((flags & MSG_OOB) ||
3467 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3468 	    (so->so_options & SO_OOBINLINE) == 0 &&
3469 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3470 		m = m_get(M_WAIT, MT_DATA);
3471 		if (m == NULL) {
3472 			socket_unlock(so, 1);
3473 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3474 			    ENOBUFS, 0, 0, 0, 0);
3475 			return ENOBUFS;
3476 		}
3477 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3478 		if (error) {
3479 			goto bad;
3480 		}
3481 		socket_unlock(so, 0);
3482 		do {
3483 			error = uiomove(mtod(m, caddr_t),
3484 			    imin((int)uio_resid(uio), m->m_len), uio);
3485 			m = m_free(m);
3486 		} while (uio_resid(uio) && error == 0 && m != NULL);
3487 		socket_lock(so, 0);
3488 bad:
3489 		if (m != NULL) {
3490 			m_freem(m);
3491 		}
3492 
3493 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3494 			if (error == EWOULDBLOCK || error == EINVAL) {
3495 				/*
3496 				 * Let's try to get normal data:
3497 				 * EWOULDBLOCK: out-of-band data not
3498 				 * receive yet. EINVAL: out-of-band data
3499 				 * already read.
3500 				 */
3501 				error = 0;
3502 				goto nooob;
3503 			} else if (error == 0 && flagsp != NULL) {
3504 				*flagsp |= MSG_OOB;
3505 			}
3506 		}
3507 		socket_unlock(so, 1);
3508 		if (en_tracing) {
3509 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3510 			    VM_KERNEL_ADDRPERM(so), 0,
3511 			    (int64_t)(orig_resid - uio_resid(uio)));
3512 		}
3513 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3514 		    0, 0, 0, 0);
3515 
3516 		return error;
3517 	}
3518 nooob:
3519 	if (mp != NULL) {
3520 		*mp = NULL;
3521 	}
3522 
3523 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3524 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3525 	}
3526 
3527 	free_list = NULL;
3528 	delayed_copy_len = 0;
3529 restart:
3530 #ifdef MORE_LOCKING_DEBUG
3531 	if (so->so_usecount <= 1) {
3532 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3533 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3534 	}
3535 #endif
3536 	/*
3537 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3538 	 * and if so just return to the caller.  This could happen when
3539 	 * soreceive() is called by a socket upcall function during the
3540 	 * time the socket is freed.  The socket buffer would have been
3541 	 * locked across the upcall, therefore we cannot put this thread
3542 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3543 	 * we may livelock), because the lock on the socket buffer will
3544 	 * only be released when the upcall routine returns to its caller.
3545 	 * Because the socket has been officially closed, there can be
3546 	 * no further read on it.
3547 	 *
3548 	 * A multipath subflow socket would have its SS_NOFDREF set by
3549 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3550 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3551 	 */
3552 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3553 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3554 		socket_unlock(so, 1);
3555 		return 0;
3556 	}
3557 
3558 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3559 	if (error) {
3560 		socket_unlock(so, 1);
3561 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3562 		    0, 0, 0, 0);
3563 		if (en_tracing) {
3564 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3565 			    VM_KERNEL_ADDRPERM(so), 0,
3566 			    (int64_t)(orig_resid - uio_resid(uio)));
3567 		}
3568 		return error;
3569 	}
3570 
3571 	m = so->so_rcv.sb_mb;
3572 	if (so_should_wait(so, uio, m, flags)) {
3573 		/*
3574 		 * Panic if we notice inconsistencies in the socket's
3575 		 * receive list; both sb_mb and sb_cc should correctly
3576 		 * reflect the contents of the list, otherwise we may
3577 		 * end up with false positives during select() or poll()
3578 		 * which could put the application in a bad state.
3579 		 */
3580 		SB_MB_CHECK(&so->so_rcv);
3581 
3582 		if (so->so_error) {
3583 			if (m != NULL) {
3584 				goto dontblock;
3585 			}
3586 			error = so->so_error;
3587 			if ((flags & MSG_PEEK) == 0) {
3588 				so->so_error = 0;
3589 			}
3590 			goto release;
3591 		}
3592 		if (so->so_state & SS_CANTRCVMORE) {
3593 #if CONTENT_FILTER
3594 			/*
3595 			 * Deal with half closed connections
3596 			 */
3597 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3598 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3599 				CFIL_LOG(LOG_INFO,
3600 				    "so %llx ignore SS_CANTRCVMORE",
3601 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3602 			} else
3603 #endif /* CONTENT_FILTER */
3604 			if (m != NULL) {
3605 				goto dontblock;
3606 			} else {
3607 				goto release;
3608 			}
3609 		}
3610 		for (; m != NULL; m = m->m_next) {
3611 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3612 				m = so->so_rcv.sb_mb;
3613 				goto dontblock;
3614 			}
3615 		}
3616 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3617 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3618 			error = ENOTCONN;
3619 			goto release;
3620 		}
3621 		if (uio_resid(uio) == 0) {
3622 			goto release;
3623 		}
3624 
3625 		if ((so->so_state & SS_NBIO) ||
3626 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3627 			error = EWOULDBLOCK;
3628 			goto release;
3629 		}
3630 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3631 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3632 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3633 #if EVEN_MORE_LOCKING_DEBUG
3634 		if (socket_debug) {
3635 			printf("Waiting for socket data\n");
3636 		}
3637 #endif
3638 
3639 		/*
3640 		 * Depending on the protocol (e.g. TCP), the following
3641 		 * might cause the socket lock to be dropped and later
3642 		 * be reacquired, and more data could have arrived and
3643 		 * have been appended to the receive socket buffer by
3644 		 * the time it returns.  Therefore, we only sleep in
3645 		 * sbwait() below if and only if the wait-condition is still
3646 		 * true.
3647 		 */
3648 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3649 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3650 		}
3651 
3652 		error = 0;
3653 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3654 			error = sbwait(&so->so_rcv);
3655 		}
3656 
3657 #if EVEN_MORE_LOCKING_DEBUG
3658 		if (socket_debug) {
3659 			printf("SORECEIVE - sbwait returned %d\n", error);
3660 		}
3661 #endif
3662 		if (so->so_usecount < 1) {
3663 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3664 			    __func__, so, so->so_usecount);
3665 			/* NOTREACHED */
3666 		}
3667 		if (error) {
3668 			socket_unlock(so, 1);
3669 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3670 			    0, 0, 0, 0);
3671 			if (en_tracing) {
3672 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3673 				    VM_KERNEL_ADDRPERM(so), 0,
3674 				    (int64_t)(orig_resid - uio_resid(uio)));
3675 			}
3676 			return error;
3677 		}
3678 		goto restart;
3679 	}
3680 dontblock:
3681 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3682 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3683 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3684 	nextrecord = m->m_nextpkt;
3685 
3686 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3687 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3688 		    mp0 == NULL);
3689 		if (error == ERESTART) {
3690 			goto restart;
3691 		} else if (error != 0) {
3692 			goto release;
3693 		}
3694 		orig_resid = 0;
3695 	}
3696 
3697 	/*
3698 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3699 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3700 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3701 	 * perform externalization.
3702 	 */
3703 	if (m != NULL && m->m_type == MT_CONTROL) {
3704 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3705 		if (error != 0) {
3706 			goto release;
3707 		}
3708 		orig_resid = 0;
3709 	}
3710 
3711 	if (m != NULL) {
3712 		if (!(flags & MSG_PEEK)) {
3713 			/*
3714 			 * We get here because m points to an mbuf following
3715 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3716 			 * processed above.  In any case, m should be pointing
3717 			 * to the head of the mbuf chain, and the nextrecord
3718 			 * should be either NULL or equal to m->m_nextpkt.
3719 			 * See comments above about SB_LOCK.
3720 			 */
3721 			if (m != so->so_rcv.sb_mb ||
3722 			    m->m_nextpkt != nextrecord) {
3723 				panic("%s: post-control !sync so=%p m=%p "
3724 				    "nextrecord=%p\n", __func__, so, m,
3725 				    nextrecord);
3726 				/* NOTREACHED */
3727 			}
3728 			if (nextrecord == NULL) {
3729 				so->so_rcv.sb_lastrecord = m;
3730 			}
3731 		}
3732 		type = m->m_type;
3733 		if (type == MT_OOBDATA) {
3734 			flags |= MSG_OOB;
3735 		}
3736 	} else {
3737 		if (!(flags & MSG_PEEK)) {
3738 			SB_EMPTY_FIXUP(&so->so_rcv);
3739 		}
3740 	}
3741 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3742 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3743 
3744 	moff = 0;
3745 	offset = 0;
3746 
3747 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3748 		can_delay = 1;
3749 	} else {
3750 		can_delay = 0;
3751 	}
3752 
3753 	while (m != NULL &&
3754 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3755 		if (m->m_type == MT_OOBDATA) {
3756 			if (type != MT_OOBDATA) {
3757 				break;
3758 			}
3759 		} else if (type == MT_OOBDATA) {
3760 			break;
3761 		}
3762 
3763 		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3764 		    m->m_type != MT_HEADER) {
3765 			break;
3766 		}
3767 		/*
3768 		 * Make sure to allways set MSG_OOB event when getting
3769 		 * out of band data inline.
3770 		 */
3771 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3772 		    (so->so_options & SO_OOBINLINE) != 0 &&
3773 		    (so->so_state & SS_RCVATMARK) != 0) {
3774 			flags |= MSG_OOB;
3775 		}
3776 		so->so_state &= ~SS_RCVATMARK;
3777 		len = uio_resid(uio) - delayed_copy_len;
3778 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3779 			len = so->so_oobmark - offset;
3780 		}
3781 		if (len > m->m_len - moff) {
3782 			len = m->m_len - moff;
3783 		}
3784 		/*
3785 		 * If mp is set, just pass back the mbufs.
3786 		 * Otherwise copy them out via the uio, then free.
3787 		 * Sockbuf must be consistent here (points to current mbuf,
3788 		 * it points to next record) when we drop priority;
3789 		 * we must note any additions to the sockbuf when we
3790 		 * block interrupts again.
3791 		 */
3792 		if (mp == NULL) {
3793 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3794 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3795 			if (can_delay && len == m->m_len) {
3796 				/*
3797 				 * only delay the copy if we're consuming the
3798 				 * mbuf and we're NOT in MSG_PEEK mode
3799 				 * and we have enough data to make it worthwile
3800 				 * to drop and retake the lock... can_delay
3801 				 * reflects the state of the 2 latter
3802 				 * constraints moff should always be zero
3803 				 * in these cases
3804 				 */
3805 				delayed_copy_len += len;
3806 			} else {
3807 				if (delayed_copy_len) {
3808 					error = sodelayed_copy(so, uio,
3809 					    &free_list, &delayed_copy_len);
3810 
3811 					if (error) {
3812 						goto release;
3813 					}
3814 					/*
3815 					 * can only get here if MSG_PEEK is not
3816 					 * set therefore, m should point at the
3817 					 * head of the rcv queue; if it doesn't,
3818 					 * it means something drastically
3819 					 * changed while we were out from behind
3820 					 * the lock in sodelayed_copy. perhaps
3821 					 * a RST on the stream. in any event,
3822 					 * the stream has been interrupted. it's
3823 					 * probably best just to return whatever
3824 					 * data we've moved and let the caller
3825 					 * sort it out...
3826 					 */
3827 					if (m != so->so_rcv.sb_mb) {
3828 						break;
3829 					}
3830 				}
3831 				socket_unlock(so, 0);
3832 				error = uiomove(mtod(m, caddr_t) + moff,
3833 				    (int)len, uio);
3834 				socket_lock(so, 0);
3835 
3836 				if (error) {
3837 					goto release;
3838 				}
3839 			}
3840 		} else {
3841 			uio_setresid(uio, (uio_resid(uio) - len));
3842 		}
3843 		if (len == m->m_len - moff) {
3844 			if (m->m_flags & M_EOR) {
3845 				flags |= MSG_EOR;
3846 			}
3847 			if (flags & MSG_PEEK) {
3848 				m = m->m_next;
3849 				moff = 0;
3850 			} else {
3851 				nextrecord = m->m_nextpkt;
3852 				sbfree(&so->so_rcv, m);
3853 				m->m_nextpkt = NULL;
3854 
3855 				if (mp != NULL) {
3856 					*mp = m;
3857 					mp = &m->m_next;
3858 					so->so_rcv.sb_mb = m = m->m_next;
3859 					*mp = NULL;
3860 				} else {
3861 					if (free_list == NULL) {
3862 						free_list = m;
3863 					} else {
3864 						ml->m_next = m;
3865 					}
3866 					ml = m;
3867 					so->so_rcv.sb_mb = m = m->m_next;
3868 					ml->m_next = NULL;
3869 				}
3870 				if (m != NULL) {
3871 					m->m_nextpkt = nextrecord;
3872 					if (nextrecord == NULL) {
3873 						so->so_rcv.sb_lastrecord = m;
3874 					}
3875 				} else {
3876 					so->so_rcv.sb_mb = nextrecord;
3877 					SB_EMPTY_FIXUP(&so->so_rcv);
3878 				}
3879 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3880 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3881 			}
3882 		} else {
3883 			if (flags & MSG_PEEK) {
3884 				moff += len;
3885 			} else {
3886 				if (mp != NULL) {
3887 					int copy_flag;
3888 
3889 					if (flags & MSG_DONTWAIT) {
3890 						copy_flag = M_DONTWAIT;
3891 					} else {
3892 						copy_flag = M_WAIT;
3893 					}
3894 					*mp = m_copym(m, 0, (int)len, copy_flag);
3895 					/*
3896 					 * Failed to allocate an mbuf?
3897 					 * Adjust uio_resid back, it was
3898 					 * adjusted down by len bytes which
3899 					 * we didn't copy over.
3900 					 */
3901 					if (*mp == NULL) {
3902 						uio_setresid(uio,
3903 						    (uio_resid(uio) + len));
3904 						break;
3905 					}
3906 				}
3907 				m->m_data += len;
3908 				m->m_len -= len;
3909 				so->so_rcv.sb_cc -= len;
3910 			}
3911 		}
3912 		if (so->so_oobmark) {
3913 			if ((flags & MSG_PEEK) == 0) {
3914 				so->so_oobmark -= len;
3915 				if (so->so_oobmark == 0) {
3916 					so->so_state |= SS_RCVATMARK;
3917 					break;
3918 				}
3919 			} else {
3920 				offset += len;
3921 				if (offset == so->so_oobmark) {
3922 					break;
3923 				}
3924 			}
3925 		}
3926 		if (flags & MSG_EOR) {
3927 			break;
3928 		}
3929 		/*
3930 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3931 		 * (for non-atomic socket), we must not quit until
3932 		 * "uio->uio_resid == 0" or an error termination.
3933 		 * If a signal/timeout occurs, return with a short
3934 		 * count but without error.  Keep sockbuf locked
3935 		 * against other readers.
3936 		 */
3937 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3938 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3939 		    !sosendallatonce(so) && !nextrecord) {
3940 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3941 #if CONTENT_FILTER
3942 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3943 #endif /* CONTENT_FILTER */
3944 			    )) {
3945 				goto release;
3946 			}
3947 
3948 			/*
3949 			 * Depending on the protocol (e.g. TCP), the following
3950 			 * might cause the socket lock to be dropped and later
3951 			 * be reacquired, and more data could have arrived and
3952 			 * have been appended to the receive socket buffer by
3953 			 * the time it returns.  Therefore, we only sleep in
3954 			 * sbwait() below if and only if the socket buffer is
3955 			 * empty, in order to avoid a false sleep.
3956 			 */
3957 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3958 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3959 			}
3960 
3961 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3962 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3963 
3964 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3965 				error = 0;
3966 				goto release;
3967 			}
3968 			/*
3969 			 * have to wait until after we get back from the sbwait
3970 			 * to do the copy because we will drop the lock if we
3971 			 * have enough data that has been delayed... by dropping
3972 			 * the lock we open up a window allowing the netisr
3973 			 * thread to process the incoming packets and to change
3974 			 * the state of this socket... we're issuing the sbwait
3975 			 * because the socket is empty and we're expecting the
3976 			 * netisr thread to wake us up when more packets arrive;
3977 			 * if we allow that processing to happen and then sbwait
3978 			 * we could stall forever with packets sitting in the
3979 			 * socket if no further packets arrive from the remote
3980 			 * side.
3981 			 *
3982 			 * we want to copy before we've collected all the data
3983 			 * to satisfy this request to allow the copy to overlap
3984 			 * the incoming packet processing on an MP system
3985 			 */
3986 			if (delayed_copy_len > sorecvmincopy &&
3987 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3988 				error = sodelayed_copy(so, uio,
3989 				    &free_list, &delayed_copy_len);
3990 
3991 				if (error) {
3992 					goto release;
3993 				}
3994 			}
3995 			m = so->so_rcv.sb_mb;
3996 			if (m != NULL) {
3997 				nextrecord = m->m_nextpkt;
3998 			}
3999 			SB_MB_CHECK(&so->so_rcv);
4000 		}
4001 	}
4002 #ifdef MORE_LOCKING_DEBUG
4003 	if (so->so_usecount <= 1) {
4004 		panic("%s: after big while so=%p ref=%d on socket",
4005 		    __func__, so, so->so_usecount);
4006 		/* NOTREACHED */
4007 	}
4008 #endif
4009 
4010 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4011 		if (so->so_options & SO_DONTTRUNC) {
4012 			flags |= MSG_RCVMORE;
4013 		} else {
4014 			flags |= MSG_TRUNC;
4015 			if ((flags & MSG_PEEK) == 0) {
4016 				(void) sbdroprecord(&so->so_rcv);
4017 			}
4018 		}
4019 	}
4020 
4021 	/*
4022 	 * pru_rcvd below (for TCP) may cause more data to be received
4023 	 * if the socket lock is dropped prior to sending the ACK; some
4024 	 * legacy OpenTransport applications don't handle this well
4025 	 * (if it receives less data than requested while MSG_HAVEMORE
4026 	 * is set), and so we set the flag now based on what we know
4027 	 * prior to calling pru_rcvd.
4028 	 */
4029 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4030 		flags |= MSG_HAVEMORE;
4031 	}
4032 
4033 	if ((flags & MSG_PEEK) == 0) {
4034 		if (m == NULL) {
4035 			so->so_rcv.sb_mb = nextrecord;
4036 			/*
4037 			 * First part is an inline SB_EMPTY_FIXUP().  Second
4038 			 * part makes sure sb_lastrecord is up-to-date if
4039 			 * there is still data in the socket buffer.
4040 			 */
4041 			if (so->so_rcv.sb_mb == NULL) {
4042 				so->so_rcv.sb_mbtail = NULL;
4043 				so->so_rcv.sb_lastrecord = NULL;
4044 			} else if (nextrecord->m_nextpkt == NULL) {
4045 				so->so_rcv.sb_lastrecord = nextrecord;
4046 			}
4047 			SB_MB_CHECK(&so->so_rcv);
4048 		}
4049 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4050 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4051 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4052 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4053 		}
4054 	}
4055 
4056 	if (delayed_copy_len) {
4057 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4058 		if (error) {
4059 			goto release;
4060 		}
4061 	}
4062 	if (free_list != NULL) {
4063 		m_freem_list(free_list);
4064 		free_list = NULL;
4065 	}
4066 
4067 	if (orig_resid == uio_resid(uio) && orig_resid &&
4068 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4069 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4070 		goto restart;
4071 	}
4072 
4073 	if (flagsp != NULL) {
4074 		*flagsp |= flags;
4075 	}
4076 release:
4077 #ifdef MORE_LOCKING_DEBUG
4078 	if (so->so_usecount <= 1) {
4079 		panic("%s: release so=%p ref=%d on socket", __func__,
4080 		    so, so->so_usecount);
4081 		/* NOTREACHED */
4082 	}
4083 #endif
4084 	if (delayed_copy_len) {
4085 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4086 	}
4087 
4088 	if (free_list != NULL) {
4089 		m_freem_list(free_list);
4090 	}
4091 
4092 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4093 
4094 	if (en_tracing) {
4095 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4096 		    VM_KERNEL_ADDRPERM(so),
4097 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4098 		    (int64_t)(orig_resid - uio_resid(uio)));
4099 	}
4100 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4101 	    so->so_rcv.sb_cc, 0, error);
4102 
4103 	return error;
4104 }
4105 
4106 /*
4107  * Returns:	0			Success
4108  *	uiomove:EFAULT
4109  */
4110 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4111 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4112     user_ssize_t *resid)
4113 {
4114 	int error = 0;
4115 	struct mbuf *m;
4116 
4117 	m = *free_list;
4118 
4119 	socket_unlock(so, 0);
4120 
4121 	while (m != NULL && error == 0) {
4122 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4123 		m = m->m_next;
4124 	}
4125 	m_freem_list(*free_list);
4126 
4127 	*free_list = NULL;
4128 	*resid = 0;
4129 
4130 	socket_lock(so, 0);
4131 
4132 	return error;
4133 }
4134 
4135 static int
sodelayed_copy_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,struct mbuf ** free_list,user_ssize_t * resid)4136 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4137     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4138 {
4139 #pragma unused(so)
4140 	int error = 0;
4141 	struct mbuf *ml, *m;
4142 	int i = 0;
4143 	struct uio *auio;
4144 
4145 	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4146 	    ml = ml->m_nextpkt, i++) {
4147 		auio = msgarray[i].uio;
4148 		for (m = ml; m != NULL; m = m->m_next) {
4149 			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4150 			if (error != 0) {
4151 				goto out;
4152 			}
4153 		}
4154 	}
4155 out:
4156 	m_freem_list(*free_list);
4157 
4158 	*free_list = NULL;
4159 	*resid = 0;
4160 
4161 	return error;
4162 }
4163 
/*
 * Receive a batch of datagrams in one call: fill up to "uiocnt" elements
 * of "msgarray" with one packet each (data, and optionally the source
 * address and control mbufs when requested via the element's "which"
 * bits).  Datagram (SOCK_DGRAM) sockets only.
 *
 * On return *flagsp (if non-NULL) is OR-ed with result flags such as
 * MSG_HAVEMORE / MSG_RCVMORE / MSG_TRUNC.  Returns 0 on success or an
 * errno; a partial result (some packets received before an error or
 * EWOULDBLOCK) is still delivered to the caller's uios.
 */
int
soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
    int *flagsp)
{
	struct mbuf *m;
	struct mbuf *nextrecord;
	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
	int error;
	user_ssize_t len, pktlen, delayed_copy_len = 0;
	struct protosw *pr = so->so_proto;
	user_ssize_t resid;
	struct proc *p = current_proc();
	struct uio *auio = NULL;
	int npkts = 0;
	int sblocked = 0;
	struct sockaddr **psa = NULL;
	struct mbuf **controlp = NULL;
	int can_delay;
	int flags;
	struct mbuf *free_others = NULL;

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
	    so, uiocnt,
	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

	/*
	 * Sanity checks:
	 * - Only supports don't wait flags
	 * - Only support datagram sockets (could be extended to raw)
	 * - Must be atomic
	 * - Protocol must support packet chains
	 * - The uio array is NULL (should we panic?)
	 */
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}
	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
	    MSG_NBIO)) {
		printf("%s invalid flags 0x%x\n", __func__, flags);
		error = EINVAL;
		goto out;
	}
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (sosendallatonce(so) == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (msgarray == NULL) {
		printf("%s uioarray is NULL\n", __func__);
		error = EINVAL;
		goto out;
	}
	if (uiocnt == 0) {
		printf("%s uiocnt is 0\n", __func__);
		error = EINVAL;
		goto out;
	}
	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	resid = recv_msg_array_resid(msgarray, uiocnt);
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Delayed (batched) copyout is only possible when we actually consume
	 * the data (not MSG_PEEK) and the batching threshold is enabled.
	 */
	if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

next:
	/*
	 * The uio may be empty
	 */
	if (npkts >= uiocnt) {
		error = 0;
		goto release;
	}
restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		/*
		 * Do not block if we got some data
		 */
		if (free_list != NULL) {
			error = 0;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error) {
			goto release;
		}
		goto restart;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");

	/*
	 * Consume the current uio index as we have a datagram
	 */
	auio = msgarray[npkts].uio;
	resid = uio_resid(auio);
	msgarray[npkts].which |= SOCK_MSG_DATA;
	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
	    &msgarray[npkts].psa : NULL;
	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
	    &msgarray[npkts].controlp : NULL;
	npkts += 1;
	nextrecord = m->m_nextpkt;

	/* Peel off the leading address mbuf of the record, if present */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
	}

	/* Then any control mbufs that follow it */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
	}

	if (m->m_pkthdr.len == 0) {
		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
		    __func__, __LINE__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
		    m->m_type);
	}

	/*
	 * Loop to copy the mbufs of the current record
	 * Support zero length packets
	 */
	ml = NULL;
	pktlen = 0;
	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
		if (m->m_len == 0) {
			panic("%p m_len zero", m);
		}
		if (m->m_type == 0) {
			panic("%p m_type zero", m);
		}
		/*
		 * Clip to the residual length
		 */
		if (len > m->m_len) {
			len = m->m_len;
		}
		pktlen += len;
		/*
		 * Copy the mbufs via the uio or delay the copy
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (len > 0 && can_delay == 0) {
			socket_unlock(so, 0);
			error = uiomove(mtod(m, caddr_t), (int)len, auio);
			socket_lock(so, 0);
			if (error) {
				goto release;
			}
		} else {
			delayed_copy_len += len;
		}

		if (len == m->m_len) {
			/*
			 * m was entirely copied
			 */
			sbfree(&so->so_rcv, m);
			nextrecord = m->m_nextpkt;
			m->m_nextpkt = NULL;

			/*
			 * Set the first packet to the head of the free list
			 */
			if (free_list == NULL) {
				free_list = m;
			}
			/*
			 * Link current packet to tail of free list
			 */
			if (ml == NULL) {
				if (free_tail != NULL) {
					free_tail->m_nextpkt = m;
				}
				free_tail = m;
			}
			/*
			 * Link current mbuf to last mbuf of current packet
			 */
			if (ml != NULL) {
				ml->m_next = m;
			}
			ml = m;

			/*
			 * Move next buf to head of socket buffer
			 */
			so->so_rcv.sb_mb = m = ml->m_next;
			ml->m_next = NULL;

			if (m != NULL) {
				m->m_nextpkt = nextrecord;
				if (nextrecord == NULL) {
					so->so_rcv.sb_lastrecord = m;
				}
			} else {
				so->so_rcv.sb_mb = nextrecord;
				SB_EMPTY_FIXUP(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
		} else {
			/*
			 * Stop the loop on partial copy
			 */
			break;
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%llx ref=%d on socket",
		    __func__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
		/* NOTREACHED */
	}
#endif
	/*
	 * Tell the caller we made a partial copy
	 */
	if (m != NULL) {
		if (so->so_options & SO_DONTTRUNC) {
			/*
			 * Copyout first the freelist then the partial mbuf
			 */
			socket_unlock(so, 0);
			if (delayed_copy_len) {
				error = sodelayed_copy_list(so, msgarray,
				    uiocnt, &free_list, &delayed_copy_len);
			}

			if (error == 0) {
				error = uiomove(mtod(m, caddr_t), (int)len,
				    auio);
			}
			socket_lock(so, 0);
			if (error) {
				goto release;
			}

			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
			flags |= MSG_RCVMORE;
		} else {
			(void) sbdroprecord(&so->so_rcv);
			nextrecord = so->so_rcv.sb_mb;
			m = NULL;
			flags |= MSG_TRUNC;
		}
	}

	if (m == NULL) {
		so->so_rcv.sb_mb = nextrecord;
		/*
		 * First part is an inline SB_EMPTY_FIXUP().  Second
		 * part makes sure sb_lastrecord is up-to-date if
		 * there is still data in the socket buffer.
		 */
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_mbtail = NULL;
			so->so_rcv.sb_lastrecord = NULL;
		} else if (nextrecord->m_nextpkt == NULL) {
			so->so_rcv.sb_lastrecord = nextrecord;
		}
		SB_MB_CHECK(&so->so_rcv);
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	/*
	 * We can continue to the next packet as long as:
	 * - We haven't exhausted the uio array
	 * - There was no error
	 * - A packet was not truncated
	 * - We can still receive more data
	 */
	if (npkts < uiocnt && error == 0 &&
	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		goto next;
	}
	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	/* Flush any copies still pending from the delayed-copy batch */
	if (delayed_copy_len) {
		error = sodelayed_copy_list(so, msgarray, uiocnt,
		    &free_list, &delayed_copy_len);
	}
out:
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}
	if (free_others != NULL) {
		m_freem_list(free_others);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4622 
4623 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4624 so_statistics_event_to_nstat_event(int64_t *input_options,
4625     uint64_t *nstat_event)
4626 {
4627 	int error = 0;
4628 	switch (*input_options) {
4629 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4630 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4631 		break;
4632 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4633 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4634 		break;
4635 #if (DEBUG || DEVELOPMENT)
4636 	case SO_STATISTICS_EVENT_RESERVED_1:
4637 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4638 		break;
4639 	case SO_STATISTICS_EVENT_RESERVED_2:
4640 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4641 		break;
4642 #endif /* (DEBUG || DEVELOPMENT) */
4643 	default:
4644 		error = EINVAL;
4645 		break;
4646 	}
4647 	return error;
4648 }
4649 
4650 /*
4651  * Returns:	0			Success
4652  *		EINVAL
4653  *		ENOTCONN
4654  *	<pru_shutdown>:EINVAL
4655  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4656  *	<pru_shutdown>:ENOBUFS[TCP]
4657  *	<pru_shutdown>:EMSGSIZE[TCP]
4658  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4659  *	<pru_shutdown>:ENETUNREACH[TCP]
4660  *	<pru_shutdown>:ENETDOWN[TCP]
4661  *	<pru_shutdown>:ENOMEM[TCP]
4662  *	<pru_shutdown>:EACCES[TCP]
4663  *	<pru_shutdown>:EMSGSIZE[TCP]
4664  *	<pru_shutdown>:ENOBUFS[TCP]
4665  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4666  *	<pru_shutdown>:???		[other protocol families]
4667  */
4668 int
soshutdown(struct socket * so,int how)4669 soshutdown(struct socket *so, int how)
4670 {
4671 	int error;
4672 
4673 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4674 
4675 	switch (how) {
4676 	case SHUT_RD:
4677 	case SHUT_WR:
4678 	case SHUT_RDWR:
4679 		socket_lock(so, 1);
4680 		if ((so->so_state &
4681 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4682 			error = ENOTCONN;
4683 		} else {
4684 			error = soshutdownlock(so, how);
4685 		}
4686 		socket_unlock(so, 1);
4687 		break;
4688 	default:
4689 		error = EINVAL;
4690 		break;
4691 	}
4692 
4693 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4694 
4695 	return error;
4696 }
4697 
4698 int
soshutdownlock_final(struct socket * so,int how)4699 soshutdownlock_final(struct socket *so, int how)
4700 {
4701 	struct protosw *pr = so->so_proto;
4702 	int error = 0;
4703 
4704 	sflt_notify(so, sock_evt_shutdown, &how);
4705 
4706 	if (how != SHUT_WR) {
4707 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4708 			/* read already shut down */
4709 			error = ENOTCONN;
4710 			goto done;
4711 		}
4712 		sorflush(so);
4713 	}
4714 	if (how != SHUT_RD) {
4715 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4716 			/* write already shut down */
4717 			error = ENOTCONN;
4718 			goto done;
4719 		}
4720 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4721 	}
4722 done:
4723 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4724 	return error;
4725 }
4726 
/*
 * Shutdown with the socket lock held, giving an attached content filter
 * first refusal: the filter may defer the shutdown (EJUSTRETURN, mapped
 * to success) or fail it outright before soshutdownlock_final() runs.
 */
int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* the filter owns the deferred shutdown */
			error = 0;
			goto done;
		}
		if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return error;
}
4753 
4754 void
sowflush(struct socket * so)4755 sowflush(struct socket *so)
4756 {
4757 	struct sockbuf *sb = &so->so_snd;
4758 
4759 	/*
4760 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4761 	 * to prevent the socket buffer from being unexpectedly altered
4762 	 * while it is used by another thread in socket send/receive.
4763 	 *
4764 	 * sblock() must not fail here, hence the assertion.
4765 	 */
4766 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4767 	VERIFY(sb->sb_flags & SB_LOCK);
4768 
4769 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4770 	sb->sb_flags            |= SB_DROP;
4771 	sb->sb_upcall           = NULL;
4772 	sb->sb_upcallarg        = NULL;
4773 
4774 	sbunlock(sb, TRUE);     /* keep socket locked */
4775 
4776 	selthreadclear(&sb->sb_sel);
4777 	sbrelease(sb);
4778 }
4779 
/*
 * Flush and tear down the receive side of the socket: mark it unable to
 * receive, snapshot the sockbuf into a local copy for disposal, reset
 * the live sockbuf in place (with SB_DROP set to bar further appends),
 * and finally dispose of the snapshot's mbufs outside the live buffer.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc               = sb->sb_cc;
	asb.sb_hiwat            = sb->sb_hiwat;
	asb.sb_mbcnt            = sb->sb_mbcnt;
	asb.sb_mbmax            = sb->sb_mbmax;
	asb.sb_ctl              = sb->sb_ctl;
	asb.sb_lowat            = sb->sb_lowat;
	asb.sb_mb               = sb->sb_mb;
	asb.sb_mbtail           = sb->sb_mbtail;
	asb.sb_lastrecord       = sb->sb_lastrecord;
	asb.sb_so               = sb->sb_so;
	asb.sb_flags            = sb->sb_flags;
	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags            |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc               = 0;
	sb->sb_hiwat            = 0;
	sb->sb_mbcnt            = 0;
	sb->sb_mbmax            = 0;
	sb->sb_ctl              = 0;
	sb->sb_lowat            = 0;
	sb->sb_mb               = NULL;
	sb->sb_mbtail           = NULL;
	sb->sb_lastrecord       = NULL;
	sb->sb_timeo.tv_sec     = 0;
	sb->sb_timeo.tv_usec    = 0;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let rights-passing protocols (e.g. UNIX domain) dispose of
	 * any in-flight file descriptors in the flushed records. */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4880 
4881 /*
4882  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4883  * an additional variant to handle the case where the option value needs
4884  * to be some kind of integer, but not a specific size.
4885  * In addition to their use here, these functions are also called by the
4886  * protocol-level pr_ctloutput() routines.
4887  *
4888  * Returns:	0			Success
4889  *		EINVAL
4890  *	copyin:EFAULT
4891  */
4892 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4893 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4894 {
4895 	size_t  valsize;
4896 
4897 	/*
4898 	 * If the user gives us more than we wanted, we ignore it,
4899 	 * but if we don't get the minimum length the caller
4900 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4901 	 * is set to however much we actually retrieved.
4902 	 */
4903 	if ((valsize = sopt->sopt_valsize) < minlen) {
4904 		return EINVAL;
4905 	}
4906 	if (valsize > len) {
4907 		sopt->sopt_valsize = valsize = len;
4908 	}
4909 
4910 	if (sopt->sopt_p != kernproc) {
4911 		return copyin(sopt->sopt_val, buf, valsize);
4912 	}
4913 
4914 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4915 	return 0;
4916 }
4917 
4918 /*
4919  * sooptcopyin_timeval
4920  *   Copy in a timeval value into tv_p, and take into account whether the
4921  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4922  *   code here so that we can verify the 64-bit tv_sec value before we lose
4923  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4924  */
4925 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4926 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4927 {
4928 	int                     error;
4929 
4930 	if (proc_is64bit(sopt->sopt_p)) {
4931 		struct user64_timeval   tv64;
4932 
4933 		if (sopt->sopt_valsize < sizeof(tv64)) {
4934 			return EINVAL;
4935 		}
4936 
4937 		sopt->sopt_valsize = sizeof(tv64);
4938 		if (sopt->sopt_p != kernproc) {
4939 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4940 			if (error != 0) {
4941 				return error;
4942 			}
4943 		} else {
4944 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4945 			    sizeof(tv64));
4946 		}
4947 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4948 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4949 			return EDOM;
4950 		}
4951 
4952 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4953 		tv_p->tv_usec = tv64.tv_usec;
4954 	} else {
4955 		struct user32_timeval   tv32;
4956 
4957 		if (sopt->sopt_valsize < sizeof(tv32)) {
4958 			return EINVAL;
4959 		}
4960 
4961 		sopt->sopt_valsize = sizeof(tv32);
4962 		if (sopt->sopt_p != kernproc) {
4963 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4964 			if (error != 0) {
4965 				return error;
4966 			}
4967 		} else {
4968 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4969 			    sizeof(tv32));
4970 		}
4971 #ifndef __LP64__
4972 		/*
4973 		 * K64todo "comparison is always false due to
4974 		 * limited range of data type"
4975 		 */
4976 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4977 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4978 			return EDOM;
4979 		}
4980 #endif
4981 		tv_p->tv_sec = tv32.tv_sec;
4982 		tv_p->tv_usec = tv32.tv_usec;
4983 	}
4984 	return 0;
4985 }
4986 
4987 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4988 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4989     boolean_t ignore_delegate)
4990 {
4991 	kauth_cred_t cred =  NULL;
4992 	proc_t ep = PROC_NULL;
4993 	uid_t uid;
4994 	int error = 0;
4995 
4996 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4997 		ep = proc_find(so->e_pid);
4998 		if (ep) {
4999 			cred = kauth_cred_proc_ref(ep);
5000 		}
5001 	}
5002 
5003 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5004 
5005 	/* uid is 0 for root */
5006 	if (uid != 0 || !allow_root) {
5007 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5008 	}
5009 	if (cred) {
5010 		kauth_cred_unref(&cred);
5011 	}
5012 	if (ep != PROC_NULL) {
5013 		proc_rele(ep);
5014 	}
5015 
5016 	return error;
5017 }
5018 
5019 /*
5020  * Returns:	0			Success
5021  *		EINVAL
5022  *		ENOPROTOOPT
5023  *		ENOBUFS
5024  *		EDOM
5025  *	sooptcopyin:EINVAL
5026  *	sooptcopyin:EFAULT
5027  *	sooptcopyin_timeval:EINVAL
5028  *	sooptcopyin_timeval:EFAULT
5029  *	sooptcopyin_timeval:EDOM
5030  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5031  *	<pr_ctloutput>:???w
5032  *	sflt_attach_private:???		[whatever a filter author chooses]
5033  *	<sf_setoption>:???		[whatever a filter author chooses]
5034  *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
 *		their filter to return.
5038  */
/*
 * Set a socket option (the setsockopt() back end).
 *
 * so:     socket being modified
 * sopt:   option descriptor (level, name, value, direction)
 * dolock: when non-zero, the socket lock is acquired and released
 *         here; otherwise the caller must already hold it.
 *
 * Non-SOL_SOCKET levels are forwarded to the protocol's pr_ctloutput.
 * SOL_SOCKET options are handled inline below, after giving socket
 * filters (sflt_setsockopt) and the protocol (pru_socheckopt) a
 * chance to intercept.
 */
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int     error, optval;
	int64_t long_optval;
	struct  linger l;
	struct  timeval tv;

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/*
	 * Reject setsockopt on a fully shut-down socket unless the
	 * owner explicitly opted in via SO_NP_EXTENSIONS/SONPX_SETOPTSHUT.
	 */
	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	/* Socket filters get first crack; EJUSTRETURN means "handled". */
	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
			if (error != 0) {
				goto out;
			}

			/*
			 * so_linger is stored in clock ticks: SO_LINGER is
			 * already in ticks, SO_LINGER_SEC is in seconds.
			 */
			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    (short)l.l_linger : (short)(l.l_linger * hz);
			if (l.l_onoff != 0) {
				so->so_options |= SO_LINGER;
			} else {
				so->so_options &= ~SO_LINGER;
			}
			break;

		/*
		 * Simple boolean options whose bit value equals the
		 * option name; set or clear the bit in so_options.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval) {
				so->so_options |= sopt->sopt_name;
			} else {
				so->so_options &= ~sopt->sopt_name;
			}
#if SKYWALK
			inp_update_netns_flags(so);
#endif /* SKYWALK */
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				/*
				 * The user picked an explicit buffer size:
				 * mark it user-sized and stop auto-tuning.
				 */
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				u_int32_t hiwat = so->so_snd.sb_hiwat;

				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						hiwat += unp->unp_conn->unp_cc;
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ?
				    hiwat : optval;

				/* Wake writers if the new mark is already met. */
				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				data_len = so->so_rcv.sb_cc
				    - so->so_rcv.sb_ctl;
				/* Wake readers if enough data is queued. */
				if (data_len >= so->so_rcv.sb_lowat) {
					sorwakeup(so);
				}
				break;
			}
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0) {
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			/* Attach a socket filter (NKE) by handle. */
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof(nke),
			    sizeof(nke));
			if (error != 0) {
				goto out;
			}

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}

		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOSIGPIPE;
			} else {
				so->so_flags &= ~SOF_NOSIGPIPE;
			}
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOADDRAVAIL;
			} else {
				so->so_flags &= ~SOF_NOADDRAVAIL;
			}
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_REUSESHAREUID;
			} else {
				so->so_flags &= ~SOF_REUSESHAREUID;
			}
			break;

		case SO_NOTIFYCONFLICT:
			/* Superuser-only option. */
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOTIFYCONFLICT;
			} else {
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			}
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			error = so_set_restrictions(so, optval);
			break;

		case SO_AWDL_UNRESTRICTED:
			/* Only meaningful for IPv4/IPv6 sockets. */
			if (SOCK_DOM(so) != PF_INET &&
			    SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				/* Granting access requires the AWDL privilege. */
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_AWDL, false, false);
				if (error == 0) {
					inp_set_awdl_unrestricted(
						sotoinpcb(so));
				}
			} else {
				inp_clear_awdl_unrestricted(sotoinpcb(so));
			}
			break;
		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			/* Privilege check only on the 0 -> 1 transition. */
			if (optval != 0 &&
			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
				if (error == 0) {
					inp_set_intcoproc_allowed(
						sotoinpcb(so));
				}
			} else if (optval == 0) {
				inp_clear_intcoproc_allowed(sotoinpcb(so));
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			} else {
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			}
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_BINDRANDOMPORT;
			} else {
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			}
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
			    sizeof(sonpx));
			if (error != 0) {
				goto out;
			}
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				} else {
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
				}
			}
			break;
		}

		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			/*
			 * Values at or above the offset encode a net
			 * service type rather than a plain traffic class.
			 */
			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
				error = so_set_net_service_type(so, netsvc);
				goto out;
			}
			error = so_set_traffic_class(so, optval);
			if (error != 0) {
				goto out;
			}
			/* A plain traffic class clears any net service type. */
			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			} else {
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			}
			break;
		}

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
			if (error != 0) {
				goto out;
			}
			error = so_set_tcdbg(so, &so_tcdbg);
			if (error != 0) {
				goto out;
			}
			break;
		}
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0) {
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			} else {
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			}
			break;

#if (DEVELOPMENT || DEBUG)
		case SO_DEFUNCTIT:
			/* Test hook: immediately defunct this socket. */
			error = sosetdefunct(current_proc(), so, 0, FALSE);
			if (error == 0) {
				error = sodefunct(current_proc(), so, 0);
			}

			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0) {
					error = EBADF;
				}
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval) {
				so->so_flags &= ~SOF_NODEFUNCT;
			} else {
				so->so_flags |= SOF_NODEFUNCT;
			}

			/* Log the new defunct eligibility for debugging. */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
				    "[%s %s:%d -> %s:%d] is now marked "
				    "as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof(s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof(d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			} else {
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
				    "is now marked as %seligible for "
				    "defunct\n",
				    __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			}
			break;

		case SO_ISDEFUNCT:
			/* This option is not settable */
			error = EINVAL;
			break;

		case SO_OPPORTUNISTIC:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_opportunistic(so, optval);
			}
			break;

		case SO_FLUSH:
			/* This option is handled by lower layer(s) */
			error = 0;
			break;

		case SO_RECV_ANYIF:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_recv_anyif(so, optval);
			}
			break;

		case SO_TRAFFIC_MGT_BACKGROUND: {
			/* This option is handled by lower layer(s) */
			error = 0;
			break;
		}

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_set(so, sopt);
			break;
#endif  /* FLOW_DIVERT */


		case SO_DELEGATED:
			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval))) != 0) {
				break;
			}

			/* optval carries the effective pid for delegation. */
			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
			break;

		case SO_DELEGATED_UUID: {
			uuid_t euuid;

			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
			    sizeof(euuid))) != 0) {
				break;
			}

			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
			break;
		}

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				// Clear out the old client UUID if present
				necp_inpcb_remove_cb(inp);
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			pid_t current_pid = proc_pid(current_proc());
			error = necp_client_register_socket_flow(current_pid,
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				/* Registration failed; forget the UUID. */
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			if (inp->inp_lport != 0) {
				// There is a bound local port, so this is not
				// a fresh socket. Assign to the client.
				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
			}

			break;
		}
		case SO_NECP_LISTENUUID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			/* Listener UUID may only be set once. */
			if (!uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = necp_client_register_socket_listener(proc_pid(current_proc()),
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			// Mark that the port registration is held by NECP
			inp->inp_flags2 |= INP2_EXTERNAL_PORT;

			break;
		}
#endif /* NECP */

		case SO_EXTENDED_BK_IDLE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_extended_bk_idle(so, optval);
			}
			break;

		case SO_MARK_CELLFALLBACK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_CELLFALLBACK;
			} else {
				so->so_flags1 |= SOF1_CELLFALLBACK;
			}
			break;

		case SO_FALLBACK_MODE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < SO_FALLBACK_MODE_NONE ||
			    optval > SO_FALLBACK_MODE_PREFER) {
				error = EINVAL;
				goto out;
			}
			so->so_fallback_mode = (u_int8_t)optval;
			break;

		case SO_MARK_KNOWN_TRACKER: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
			} else {
				so->so_flags1 |= SOF1_KNOWN_TRACKER;
			}
			break;
		}

		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
			} else {
				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
			}
			break;
		}

		case SO_MARK_APPROVED_APP_DOMAIN: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
			} else {
				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
			}
			break;
		}

		case SO_STATISTICS_EVENT:
			error = sooptcopyin(sopt, &long_optval,
			    sizeof(long_optval), sizeof(long_optval));
			if (error != 0) {
				goto out;
			}
			u_int64_t nstat_event = 0;
			/* Translate the event code before forwarding to nstat. */
			error = so_statistics_event_to_nstat_event(
				&long_optval, &nstat_event);
			if (error != 0) {
				goto out;
			}
			nstat_pcb_event(sotoinpcb(so), nstat_event);
			break;

		case SO_NET_SERVICE_TYPE: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			error = so_set_net_service_type(so, optval);
			break;
		}

		case SO_QOSMARKING_POLICY_OVERRIDE:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
			if (error != 0) {
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
			} else {
				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
			}
			break;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			error = sooptcopyin(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
			if (error != 0) {
				goto out;
			}
			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;

			/* A null UUID and zero proto disables MPKL send info. */
			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
			} else {
				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
			}
			break;
		}
		case SO_WANT_KEV_SOCKET_CLOSED: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
			} else {
				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
			}
			break;
		}
		case SO_MARK_WAKE_PKT: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_MARK_WAKE_PKT;
			} else {
				so->so_flags |= SOF_MARK_WAKE_PKT;
			}
			break;
		}
		case SO_RECV_WAKE_PKT: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_RECV_WAKE_PKT;
			} else {
				so->so_flags |= SOF_RECV_WAKE_PKT;
			}
			break;
		}
		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * On success, let the protocol observe the SOL_SOCKET
		 * option as well; its return value is deliberately ignored.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
5917 
5918 /* Helper routines for getsockopt */
5919 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5920 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5921 {
5922 	int     error;
5923 	size_t  valsize;
5924 
5925 	error = 0;
5926 
5927 	/*
5928 	 * Documented get behavior is that we always return a value,
5929 	 * possibly truncated to fit in the user's buffer.
5930 	 * Traditional behavior is that we always tell the user
5931 	 * precisely how much we copied, rather than something useful
5932 	 * like the total amount we had available for her.
5933 	 * Note that this interface is not idempotent; the entire answer must
5934 	 * generated ahead of time.
5935 	 */
5936 	valsize = MIN(len, sopt->sopt_valsize);
5937 	sopt->sopt_valsize = valsize;
5938 	if (sopt->sopt_val != USER_ADDR_NULL) {
5939 		if (sopt->sopt_p != kernproc) {
5940 			error = copyout(buf, sopt->sopt_val, valsize);
5941 		} else {
5942 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5943 		}
5944 	}
5945 	return error;
5946 }
5947 
5948 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5949 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5950 {
5951 	int                     error;
5952 	size_t                  len;
5953 	struct user64_timeval   tv64 = {};
5954 	struct user32_timeval   tv32 = {};
5955 	const void *            val;
5956 	size_t                  valsize;
5957 
5958 	error = 0;
5959 	if (proc_is64bit(sopt->sopt_p)) {
5960 		len = sizeof(tv64);
5961 		tv64.tv_sec = tv_p->tv_sec;
5962 		tv64.tv_usec = tv_p->tv_usec;
5963 		val = &tv64;
5964 	} else {
5965 		len = sizeof(tv32);
5966 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5967 		tv32.tv_usec = tv_p->tv_usec;
5968 		val = &tv32;
5969 	}
5970 	valsize = MIN(len, sopt->sopt_valsize);
5971 	sopt->sopt_valsize = valsize;
5972 	if (sopt->sopt_val != USER_ADDR_NULL) {
5973 		if (sopt->sopt_p != kernproc) {
5974 			error = copyout(val, sopt->sopt_val, valsize);
5975 		} else {
5976 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5977 		}
5978 	}
5979 	return error;
5980 }
5981 
5982 /*
5983  * Return:	0			Success
5984  *		ENOPROTOOPT
5985  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5986  *	<pr_ctloutput>:???
5987  *	<sf_getoption>:???
5988  */
5989 int
sogetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5990 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5991 {
5992 	int     error, optval;
5993 	struct  linger l;
5994 	struct  timeval tv;
5995 
5996 	if (sopt->sopt_dir != SOPT_GET) {
5997 		sopt->sopt_dir = SOPT_GET;
5998 	}
5999 
6000 	if (dolock) {
6001 		socket_lock(so, 1);
6002 	}
6003 
6004 	error = sflt_getsockopt(so, sopt);
6005 	if (error != 0) {
6006 		if (error == EJUSTRETURN) {
6007 			error = 0;
6008 		}
6009 		goto out;
6010 	}
6011 
6012 	if (sopt->sopt_level != SOL_SOCKET) {
6013 		if (so->so_proto != NULL &&
6014 		    so->so_proto->pr_ctloutput != NULL) {
6015 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
6016 			goto out;
6017 		}
6018 		error = ENOPROTOOPT;
6019 	} else {
6020 		/*
6021 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
6022 		 * the protocol layer, if needed.  A zero value returned from
6023 		 * the handler means use default socket-level processing as
6024 		 * done by the rest of this routine.  Otherwise, any other
6025 		 * return value indicates that the option is unsupported.
6026 		 */
6027 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6028 		    pru_socheckopt(so, sopt)) != 0) {
6029 			goto out;
6030 		}
6031 
6032 		error = 0;
6033 		switch (sopt->sopt_name) {
6034 		case SO_LINGER:
6035 		case SO_LINGER_SEC:
6036 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6037 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6038 			    so->so_linger : so->so_linger / hz;
6039 			error = sooptcopyout(sopt, &l, sizeof(l));
6040 			break;
6041 
6042 		case SO_USELOOPBACK:
6043 		case SO_DONTROUTE:
6044 		case SO_DEBUG:
6045 		case SO_KEEPALIVE:
6046 		case SO_REUSEADDR:
6047 		case SO_REUSEPORT:
6048 		case SO_BROADCAST:
6049 		case SO_OOBINLINE:
6050 		case SO_TIMESTAMP:
6051 		case SO_TIMESTAMP_MONOTONIC:
6052 		case SO_TIMESTAMP_CONTINUOUS:
6053 		case SO_DONTTRUNC:
6054 		case SO_WANTMORE:
6055 		case SO_WANTOOBFLAG:
6056 		case SO_NOWAKEFROMSLEEP:
6057 		case SO_NOAPNFALLBK:
6058 			optval = so->so_options & sopt->sopt_name;
6059 integer:
6060 			error = sooptcopyout(sopt, &optval, sizeof(optval));
6061 			break;
6062 
6063 		case SO_TYPE:
6064 			optval = so->so_type;
6065 			goto integer;
6066 
6067 		case SO_NREAD:
6068 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6069 				int pkt_total;
6070 				struct mbuf *m1;
6071 
6072 				pkt_total = 0;
6073 				m1 = so->so_rcv.sb_mb;
6074 				while (m1 != NULL) {
6075 					if (m1->m_type == MT_DATA ||
6076 					    m1->m_type == MT_HEADER ||
6077 					    m1->m_type == MT_OOBDATA) {
6078 						pkt_total += m1->m_len;
6079 					}
6080 					m1 = m1->m_next;
6081 				}
6082 				optval = pkt_total;
6083 			} else {
6084 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6085 			}
6086 			goto integer;
6087 
6088 		case SO_NUMRCVPKT:
6089 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6090 				int cnt = 0;
6091 				struct mbuf *m1;
6092 
6093 				m1 = so->so_rcv.sb_mb;
6094 				while (m1 != NULL) {
6095 					cnt += 1;
6096 					m1 = m1->m_nextpkt;
6097 				}
6098 				optval = cnt;
6099 				goto integer;
6100 			} else {
6101 				error = ENOPROTOOPT;
6102 				break;
6103 			}
6104 
6105 		case SO_NWRITE:
6106 			optval = so->so_snd.sb_cc;
6107 			goto integer;
6108 
6109 		case SO_ERROR:
6110 			optval = so->so_error;
6111 			so->so_error = 0;
6112 			goto integer;
6113 
6114 		case SO_SNDBUF: {
6115 			u_int32_t hiwat = so->so_snd.sb_hiwat;
6116 
6117 			if (so->so_snd.sb_flags & SB_UNIX) {
6118 				struct unpcb *unp =
6119 				    (struct unpcb *)(so->so_pcb);
6120 				if (unp != NULL && unp->unp_conn != NULL) {
6121 					hiwat += unp->unp_conn->unp_cc;
6122 				}
6123 			}
6124 
6125 			optval = hiwat;
6126 			goto integer;
6127 		}
6128 		case SO_RCVBUF:
6129 			optval = so->so_rcv.sb_hiwat;
6130 			goto integer;
6131 
6132 		case SO_SNDLOWAT:
6133 			optval = so->so_snd.sb_lowat;
6134 			goto integer;
6135 
6136 		case SO_RCVLOWAT:
6137 			optval = so->so_rcv.sb_lowat;
6138 			goto integer;
6139 
6140 		case SO_SNDTIMEO:
6141 		case SO_RCVTIMEO:
6142 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
6143 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6144 
6145 			error = sooptcopyout_timeval(sopt, &tv);
6146 			break;
6147 
6148 		case SO_NOSIGPIPE:
6149 			optval = (so->so_flags & SOF_NOSIGPIPE);
6150 			goto integer;
6151 
6152 		case SO_NOADDRERR:
6153 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6154 			goto integer;
6155 
6156 		case SO_REUSESHAREUID:
6157 			optval = (so->so_flags & SOF_REUSESHAREUID);
6158 			goto integer;
6159 
6160 
6161 		case SO_NOTIFYCONFLICT:
6162 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6163 			goto integer;
6164 
6165 		case SO_RESTRICTIONS:
6166 			optval = so_get_restrictions(so);
6167 			goto integer;
6168 
6169 		case SO_AWDL_UNRESTRICTED:
6170 			if (SOCK_DOM(so) == PF_INET ||
6171 			    SOCK_DOM(so) == PF_INET6) {
6172 				optval = inp_get_awdl_unrestricted(
6173 					sotoinpcb(so));
6174 				goto integer;
6175 			} else {
6176 				error = EOPNOTSUPP;
6177 			}
6178 			break;
6179 
6180 		case SO_INTCOPROC_ALLOW:
6181 			if (SOCK_DOM(so) == PF_INET6) {
6182 				optval = inp_get_intcoproc_allowed(
6183 					sotoinpcb(so));
6184 				goto integer;
6185 			} else {
6186 				error = EOPNOTSUPP;
6187 			}
6188 			break;
6189 
6190 		case SO_LABEL:
6191 			error = EOPNOTSUPP;
6192 			break;
6193 
6194 		case SO_PEERLABEL:
6195 			error = EOPNOTSUPP;
6196 			break;
6197 
6198 #ifdef __APPLE_API_PRIVATE
6199 		case SO_UPCALLCLOSEWAIT:
6200 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6201 			goto integer;
6202 #endif
6203 		case SO_RANDOMPORT:
6204 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6205 			goto integer;
6206 
6207 		case SO_NP_EXTENSIONS: {
6208 			struct so_np_extensions sonpx = {};
6209 
6210 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6211 			    SONPX_SETOPTSHUT : 0;
6212 			sonpx.npx_mask = SONPX_MASK_VALID;
6213 
6214 			error = sooptcopyout(sopt, &sonpx,
6215 			    sizeof(struct so_np_extensions));
6216 			break;
6217 		}
6218 
6219 		case SO_TRAFFIC_CLASS:
6220 			optval = so->so_traffic_class;
6221 			goto integer;
6222 
6223 		case SO_RECV_TRAFFIC_CLASS:
6224 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6225 			goto integer;
6226 
6227 #if (DEVELOPMENT || DEBUG)
6228 		case SO_TRAFFIC_CLASS_DBG:
6229 			error = sogetopt_tcdbg(so, sopt);
6230 			break;
6231 #endif /* (DEVELOPMENT || DEBUG) */
6232 
6233 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6234 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6235 			goto integer;
6236 
6237 		case SO_DEFUNCTOK:
6238 			optval = !(so->so_flags & SOF_NODEFUNCT);
6239 			goto integer;
6240 
6241 		case SO_ISDEFUNCT:
6242 			optval = (so->so_flags & SOF_DEFUNCT);
6243 			goto integer;
6244 
6245 		case SO_OPPORTUNISTIC:
6246 			optval = so_get_opportunistic(so);
6247 			goto integer;
6248 
6249 		case SO_FLUSH:
6250 			/* This option is not gettable */
6251 			error = EINVAL;
6252 			break;
6253 
6254 		case SO_RECV_ANYIF:
6255 			optval = so_get_recv_anyif(so);
6256 			goto integer;
6257 
6258 		case SO_TRAFFIC_MGT_BACKGROUND:
6259 			/* This option is handled by lower layer(s) */
6260 			if (so->so_proto != NULL &&
6261 			    so->so_proto->pr_ctloutput != NULL) {
6262 				(void) so->so_proto->pr_ctloutput(so, sopt);
6263 			}
6264 			break;
6265 
6266 #if FLOW_DIVERT
6267 		case SO_FLOW_DIVERT_TOKEN:
6268 			error = flow_divert_token_get(so, sopt);
6269 			break;
6270 #endif  /* FLOW_DIVERT */
6271 
6272 #if NECP
6273 		case SO_NECP_ATTRIBUTES:
6274 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6275 				/* Handled by MPTCP itself */
6276 				break;
6277 			}
6278 
6279 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6280 				error = EINVAL;
6281 				goto out;
6282 			}
6283 
6284 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6285 			break;
6286 
6287 		case SO_NECP_CLIENTUUID: {
6288 			uuid_t *ncu;
6289 
6290 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6291 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6292 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6293 				ncu = &sotoinpcb(so)->necp_client_uuid;
6294 			} else {
6295 				error = EINVAL;
6296 				goto out;
6297 			}
6298 
6299 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6300 			break;
6301 		}
6302 
6303 		case SO_NECP_LISTENUUID: {
6304 			uuid_t *nlu;
6305 
6306 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6307 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6308 					nlu = &sotoinpcb(so)->necp_client_uuid;
6309 				} else {
6310 					error = ENOENT;
6311 					goto out;
6312 				}
6313 			} else {
6314 				error = EINVAL;
6315 				goto out;
6316 			}
6317 
6318 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6319 			break;
6320 		}
6321 #endif /* NECP */
6322 
6323 #if CONTENT_FILTER
6324 		case SO_CFIL_SOCK_ID: {
6325 			cfil_sock_id_t sock_id;
6326 
6327 			sock_id = cfil_sock_id_from_socket(so);
6328 
6329 			error = sooptcopyout(sopt, &sock_id,
6330 			    sizeof(cfil_sock_id_t));
6331 			break;
6332 		}
6333 #endif  /* CONTENT_FILTER */
6334 
6335 		case SO_EXTENDED_BK_IDLE:
6336 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6337 			goto integer;
6338 		case SO_MARK_CELLFALLBACK:
6339 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6340 			    ? 1 : 0;
6341 			goto integer;
6342 		case SO_FALLBACK_MODE:
6343 			optval = so->so_fallback_mode;
6344 			goto integer;
6345 		case SO_MARK_KNOWN_TRACKER: {
6346 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6347 			    ? 1 : 0;
6348 			goto integer;
6349 		}
6350 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6351 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6352 			    ? 1 : 0;
6353 			goto integer;
6354 		}
6355 		case SO_MARK_APPROVED_APP_DOMAIN: {
6356 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6357 			    ? 1 : 0;
6358 			goto integer;
6359 		}
6360 		case SO_NET_SERVICE_TYPE: {
6361 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6362 				optval = so->so_netsvctype;
6363 			} else {
6364 				optval = NET_SERVICE_TYPE_BE;
6365 			}
6366 			goto integer;
6367 		}
6368 		case SO_NETSVC_MARKING_LEVEL:
6369 			optval = so_get_netsvc_marking_level(so);
6370 			goto integer;
6371 
6372 		case SO_MPKL_SEND_INFO: {
6373 			struct so_mpkl_send_info so_mpkl_send_info;
6374 
6375 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6376 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6377 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6378 			    sizeof(struct so_mpkl_send_info));
6379 			break;
6380 		}
6381 		case SO_MARK_WAKE_PKT:
6382 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6383 			goto integer;
6384 		case SO_RECV_WAKE_PKT:
6385 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6386 			goto integer;
6387 		default:
6388 			error = ENOPROTOOPT;
6389 			break;
6390 		}
6391 	}
6392 out:
6393 	if (dolock) {
6394 		socket_unlock(so, 1);
6395 	}
6396 	return error;
6397 }
6398 
6399 /*
6400  * The size limits on our soopt_getm is different from that on FreeBSD.
6401  * We limit the size of options to MCLBYTES. This will have to change
6402  * if we need to define options that need more space than MCLBYTES.
6403  */
6404 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6405 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6406 {
6407 	struct mbuf *m, *m_prev;
6408 	int sopt_size = (int)sopt->sopt_valsize;
6409 	int how;
6410 
6411 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6412 		return EMSGSIZE;
6413 	}
6414 
6415 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6416 	MGET(m, how, MT_DATA);
6417 	if (m == NULL) {
6418 		return ENOBUFS;
6419 	}
6420 	if (sopt_size > MLEN) {
6421 		MCLGET(m, how);
6422 		if ((m->m_flags & M_EXT) == 0) {
6423 			m_free(m);
6424 			return ENOBUFS;
6425 		}
6426 		m->m_len = min(MCLBYTES, sopt_size);
6427 	} else {
6428 		m->m_len = min(MLEN, sopt_size);
6429 	}
6430 	sopt_size -= m->m_len;
6431 	*mp = m;
6432 	m_prev = m;
6433 
6434 	while (sopt_size > 0) {
6435 		MGET(m, how, MT_DATA);
6436 		if (m == NULL) {
6437 			m_freem(*mp);
6438 			return ENOBUFS;
6439 		}
6440 		if (sopt_size > MLEN) {
6441 			MCLGET(m, how);
6442 			if ((m->m_flags & M_EXT) == 0) {
6443 				m_freem(*mp);
6444 				m_freem(m);
6445 				return ENOBUFS;
6446 			}
6447 			m->m_len = min(MCLBYTES, sopt_size);
6448 		} else {
6449 			m->m_len = min(MLEN, sopt_size);
6450 		}
6451 		sopt_size -= m->m_len;
6452 		m_prev->m_next = m;
6453 		m_prev = m;
6454 	}
6455 	return 0;
6456 }
6457 
6458 /* copyin sopt data into mbuf chain */
6459 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6460 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6461 {
6462 	struct mbuf *m0 = m;
6463 
6464 	if (sopt->sopt_val == USER_ADDR_NULL) {
6465 		return 0;
6466 	}
6467 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6468 		if (sopt->sopt_p != kernproc) {
6469 			int error;
6470 
6471 			error = copyin(sopt->sopt_val, mtod(m, char *),
6472 			    m->m_len);
6473 			if (error != 0) {
6474 				m_freem(m0);
6475 				return error;
6476 			}
6477 		} else {
6478 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6479 			    mtod(m, char *), m->m_len);
6480 		}
6481 		sopt->sopt_valsize -= m->m_len;
6482 		sopt->sopt_val += m->m_len;
6483 		m = m->m_next;
6484 	}
6485 	/* should be allocated enoughly at ip6_sooptmcopyin() */
6486 	if (m != NULL) {
6487 		panic("soopt_mcopyin");
6488 		/* NOTREACHED */
6489 	}
6490 	return 0;
6491 }
6492 
6493 /* copyout mbuf chain data into soopt */
6494 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6495 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6496 {
6497 	struct mbuf *m0 = m;
6498 	size_t valsize = 0;
6499 
6500 	if (sopt->sopt_val == USER_ADDR_NULL) {
6501 		return 0;
6502 	}
6503 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6504 		if (sopt->sopt_p != kernproc) {
6505 			int error;
6506 
6507 			error = copyout(mtod(m, char *), sopt->sopt_val,
6508 			    m->m_len);
6509 			if (error != 0) {
6510 				m_freem(m0);
6511 				return error;
6512 			}
6513 		} else {
6514 			bcopy(mtod(m, char *),
6515 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6516 		}
6517 		sopt->sopt_valsize -= m->m_len;
6518 		sopt->sopt_val += m->m_len;
6519 		valsize += m->m_len;
6520 		m = m->m_next;
6521 	}
6522 	if (m != NULL) {
6523 		/* enough soopt buffer should be given from user-land */
6524 		m_freem(m0);
6525 		return EINVAL;
6526 	}
6527 	sopt->sopt_valsize = valsize;
6528 	return 0;
6529 }
6530 
6531 void
sohasoutofband(struct socket * so)6532 sohasoutofband(struct socket *so)
6533 {
6534 	if (so->so_pgid < 0) {
6535 		gsignal(-so->so_pgid, SIGURG);
6536 	} else if (so->so_pgid > 0) {
6537 		proc_signal(so->so_pgid, SIGURG);
6538 	}
6539 	selwakeup(&so->so_rcv.sb_sel);
6540 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6541 		KNOTE(&so->so_rcv.sb_sel.si_note,
6542 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6543 	}
6544 }
6545 
6546 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6547 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6548 {
6549 #pragma unused(cred)
6550 	struct proc *p = current_proc();
6551 	int revents = 0;
6552 
6553 	socket_lock(so, 1);
6554 	so_update_last_owner_locked(so, PROC_NULL);
6555 	so_update_policy(so);
6556 
6557 	if (events & (POLLIN | POLLRDNORM)) {
6558 		if (soreadable(so)) {
6559 			revents |= events & (POLLIN | POLLRDNORM);
6560 		}
6561 	}
6562 
6563 	if (events & (POLLOUT | POLLWRNORM)) {
6564 		if (sowriteable(so)) {
6565 			revents |= events & (POLLOUT | POLLWRNORM);
6566 		}
6567 	}
6568 
6569 	if (events & (POLLPRI | POLLRDBAND)) {
6570 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6571 			revents |= events & (POLLPRI | POLLRDBAND);
6572 		}
6573 	}
6574 
6575 	if (revents == 0) {
6576 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6577 			/*
6578 			 * Darwin sets the flag first,
6579 			 * BSD calls selrecord first
6580 			 */
6581 			so->so_rcv.sb_flags |= SB_SEL;
6582 			selrecord(p, &so->so_rcv.sb_sel, wql);
6583 		}
6584 
6585 		if (events & (POLLOUT | POLLWRNORM)) {
6586 			/*
6587 			 * Darwin sets the flag first,
6588 			 * BSD calls selrecord first
6589 			 */
6590 			so->so_snd.sb_flags |= SB_SEL;
6591 			selrecord(p, &so->so_snd.sb_sel, wql);
6592 		}
6593 	}
6594 
6595 	socket_unlock(so, 1);
6596 	return revents;
6597 }
6598 
6599 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6600 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6601 {
6602 	struct socket *so = (struct socket *)fp_get_data(fp);
6603 	int result;
6604 
6605 	socket_lock(so, 1);
6606 	so_update_last_owner_locked(so, PROC_NULL);
6607 	so_update_policy(so);
6608 
6609 	switch (kn->kn_filter) {
6610 	case EVFILT_READ:
6611 		kn->kn_filtid = EVFILTID_SOREAD;
6612 		break;
6613 	case EVFILT_WRITE:
6614 		kn->kn_filtid = EVFILTID_SOWRITE;
6615 		break;
6616 	case EVFILT_SOCK:
6617 		kn->kn_filtid = EVFILTID_SCK;
6618 		break;
6619 	case EVFILT_EXCEPT:
6620 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6621 		break;
6622 	default:
6623 		socket_unlock(so, 1);
6624 		knote_set_error(kn, EINVAL);
6625 		return 0;
6626 	}
6627 
6628 	/*
6629 	 * call the appropriate sub-filter attach
6630 	 * with the socket still locked
6631 	 */
6632 	result = knote_fops(kn)->f_attach(kn, kev);
6633 
6634 	socket_unlock(so, 1);
6635 
6636 	return result;
6637 }
6638 
/*
 * Shared evaluation logic for EVFILT_READ socket knotes.
 *
 * Decides whether the read event should fire and, when `kev` is
 * non-NULL (the process path), fills it with the event data.
 * Called with the socket lock held.  Returns non-zero if the
 * event is active.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* listener fires when the completed-connection queue is non-empty */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			/* out-of-band data pending: report bytes up to the mark */
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	/* EOF: peer closed and (with content filter) no buffered data remains */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6717 
6718 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6719 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6720 {
6721 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6722 
6723 	/* socket locked */
6724 
6725 	/*
6726 	 * If the caller explicitly asked for OOB results (e.g. poll())
6727 	 * from EVFILT_READ, then save that off in the hookid field
6728 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6729 	 */
6730 	if (kn->kn_filter == EVFILT_READ &&
6731 	    kn->kn_flags & EV_OOBAND) {
6732 		kn->kn_flags &= ~EV_OOBAND;
6733 		kn->kn_hook32 = EV_OOBAND;
6734 	} else {
6735 		kn->kn_hook32 = 0;
6736 	}
6737 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6738 		so->so_rcv.sb_flags |= SB_KNOTE;
6739 	}
6740 
6741 	/* indicate if event is already fired */
6742 	return filt_soread_common(kn, NULL, so);
6743 }
6744 
6745 static void
filt_sordetach(struct knote * kn)6746 filt_sordetach(struct knote *kn)
6747 {
6748 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6749 
6750 	socket_lock(so, 1);
6751 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6752 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6753 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6754 		}
6755 	}
6756 	socket_unlock(so, 1);
6757 }
6758 
6759 /*ARGSUSED*/
6760 static int
filt_soread(struct knote * kn,long hint)6761 filt_soread(struct knote *kn, long hint)
6762 {
6763 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6764 	int retval;
6765 
6766 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6767 		socket_lock(so, 1);
6768 	}
6769 
6770 	retval = filt_soread_common(kn, NULL, so);
6771 
6772 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6773 		socket_unlock(so, 1);
6774 	}
6775 
6776 	return retval;
6777 }
6778 
6779 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6780 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6781 {
6782 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6783 	int retval;
6784 
6785 	socket_lock(so, 1);
6786 
6787 	/* save off the new input fflags and data */
6788 	kn->kn_sfflags = kev->fflags;
6789 	kn->kn_sdata = kev->data;
6790 
6791 	/* determine if changes result in fired events */
6792 	retval = filt_soread_common(kn, NULL, so);
6793 
6794 	socket_unlock(so, 1);
6795 
6796 	return retval;
6797 }
6798 
6799 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6800 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6801 {
6802 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6803 	int retval;
6804 
6805 	socket_lock(so, 1);
6806 	retval = filt_soread_common(kn, kev, so);
6807 	socket_unlock(so, 1);
6808 
6809 	return retval;
6810 }
6811 
6812 int
so_wait_for_if_feedback(struct socket * so)6813 so_wait_for_if_feedback(struct socket *so)
6814 {
6815 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6816 	    (so->so_state & SS_ISCONNECTED)) {
6817 		struct inpcb *inp = sotoinpcb(so);
6818 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6819 			return 1;
6820 		}
6821 	}
6822 	return 0;
6823 }
6824 
/*
 * Shared evaluation logic for EVFILT_WRITE socket knotes.
 *
 * Decides whether the write event should fire and, when `kev` is
 * non-NULL (the process path), fills it with the available send
 * space.  Called with the socket lock held.  Returns non-zero if
 * the event is active.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	/* EOF: no more sends possible on this socket */
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	/* pre-connect data sockets are always considered writable */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;

	/*
	 * When NOTE_LOWAT is requested, clamp the caller-supplied
	 * low water mark to the send buffer's high water mark.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat) {
			lowwat = so->so_snd.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data >= lowwat) {
		/*
		 * With SOF_NOTSENT_LOWAT, defer to the transport's
		 * unsent-data check (TCP or MPTCP) instead of raw
		 * buffer space.
		 */
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* suppress the event while waiting for interface feedback */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6898 
6899 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6900 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6901 {
6902 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6903 
6904 	/* socket locked */
6905 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6906 		so->so_snd.sb_flags |= SB_KNOTE;
6907 	}
6908 
6909 	/* determine if its already fired */
6910 	return filt_sowrite_common(kn, NULL, so);
6911 }
6912 
6913 static void
filt_sowdetach(struct knote * kn)6914 filt_sowdetach(struct knote *kn)
6915 {
6916 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6917 	socket_lock(so, 1);
6918 
6919 	if (so->so_snd.sb_flags & SB_KNOTE) {
6920 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6921 			so->so_snd.sb_flags &= ~SB_KNOTE;
6922 		}
6923 	}
6924 	socket_unlock(so, 1);
6925 }
6926 
6927 /*ARGSUSED*/
6928 static int
filt_sowrite(struct knote * kn,long hint)6929 filt_sowrite(struct knote *kn, long hint)
6930 {
6931 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6932 	int ret;
6933 
6934 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6935 		socket_lock(so, 1);
6936 	}
6937 
6938 	ret = filt_sowrite_common(kn, NULL, so);
6939 
6940 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6941 		socket_unlock(so, 1);
6942 	}
6943 
6944 	return ret;
6945 }
6946 
6947 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6948 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6949 {
6950 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6951 	int ret;
6952 
6953 	socket_lock(so, 1);
6954 
6955 	/*save off the new input fflags and data */
6956 	kn->kn_sfflags = kev->fflags;
6957 	kn->kn_sdata = kev->data;
6958 
6959 	/* determine if these changes result in a triggered event */
6960 	ret = filt_sowrite_common(kn, NULL, so);
6961 
6962 	socket_unlock(so, 1);
6963 
6964 	return ret;
6965 }
6966 
6967 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6968 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6969 {
6970 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6971 	int ret;
6972 
6973 	socket_lock(so, 1);
6974 	ret = filt_sowrite_common(kn, kev, so);
6975 	socket_unlock(so, 1);
6976 
6977 	return ret;
6978 }
6979 
/*
 * Shared evaluation logic for EVFILT_SOCK knotes.
 *
 * Translates transient hints (`ev_hint`) plus current socket state
 * into NOTE_* fflags, tracks level-triggered events in kn_hook32 so
 * they are delivered at most once while active, and, when `kev` is
 * non-NULL (the process path), fills in the outgoing kevent.
 * Called with the socket lock held.  Returns non-zero if an event
 * should be delivered.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* edge-triggered hints map directly to fflags */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/*
	 * Connection state events are level-triggered: active either via
	 * an explicit hint or the current socket state.
	 */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		/* only protocols that publish connection info support this */
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}

	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	/* read side closed (with content filter: and no buffered data left) */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* NOTE_SUSPEND and NOTE_RESUME are mutually exclusive */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* event data: pending error if any, else current connection state */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7118 
7119 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7120 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7121 {
7122 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7123 
7124 	/* socket locked */
7125 	kn->kn_hook32 = 0;
7126 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7127 		so->so_flags |= SOF_KNOTE;
7128 	}
7129 
7130 	/* determine if event already fired */
7131 	return filt_sockev_common(kn, NULL, so, 0);
7132 }
7133 
7134 static void
filt_sockdetach(struct knote * kn)7135 filt_sockdetach(struct knote *kn)
7136 {
7137 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7138 	socket_lock(so, 1);
7139 
7140 	if ((so->so_flags & SOF_KNOTE) != 0) {
7141 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7142 			so->so_flags &= ~SOF_KNOTE;
7143 		}
7144 	}
7145 	socket_unlock(so, 1);
7146 }
7147 
7148 static int
filt_sockev(struct knote * kn,long hint)7149 filt_sockev(struct knote *kn, long hint)
7150 {
7151 	int ret = 0, locked = 0;
7152 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7153 	long ev_hint = (hint & SO_FILT_HINT_EV);
7154 
7155 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7156 		socket_lock(so, 1);
7157 		locked = 1;
7158 	}
7159 
7160 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7161 
7162 	if (locked) {
7163 		socket_unlock(so, 1);
7164 	}
7165 
7166 	return ret;
7167 }
7168 
7169 
7170 
/*
 *	filt_socktouch - update event state
 *
 *	Applies the caller's new filter parameters to the knote and
 *	re-evaluates whether an event should fire.  kn_hook32 records
 *	which level-triggered events have already been delivered, so
 *	state for events no longer of interest is cleared here.
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}
7213 
7214 /*
7215  *	filt_sockprocess - query event fired state and return data
7216  */
7217 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7218 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7219 {
7220 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7221 	int ret = 0;
7222 
7223 	socket_lock(so, 1);
7224 
7225 	ret = filt_sockev_common(kn, kev, so, 0);
7226 
7227 	socket_unlock(so, 1);
7228 
7229 	return ret;
7230 }
7231 
/*
 * Fill *statep with SOCKEV_CONNECTED / SOCKEV_DISCONNECTED bits
 * derived from the socket's current state.
 */
void
get_sockev_state(struct socket *so, u_int32_t *statep)
{
	u_int32_t state = *(statep);

	/*
	 * If the state variable already carries a value from a previous
	 * event, leave it untouched rather than overwriting it.
	 */
	if (state != 0) {
		return;
	}

	if (so->so_state & SS_ISCONNECTED) {
		state |= SOCKEV_CONNECTED;
	} else {
		state &= ~(SOCKEV_CONNECTED);
	}
	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
	*(statep) = state;
}
7253 
7254 #define SO_LOCK_HISTORY_STR_LEN \
7255 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7256 
7257 __private_extern__ const char *
solockhistory_nr(struct socket * so)7258 solockhistory_nr(struct socket *so)
7259 {
7260 	size_t n = 0;
7261 	int i;
7262 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7263 
7264 	bzero(lock_history_str, sizeof(lock_history_str));
7265 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7266 		n += scnprintf(lock_history_str + n,
7267 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7268 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7269 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7270 	}
7271 	return lock_history_str;
7272 }
7273 
7274 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7275 socket_getlock(struct socket *so, int flags)
7276 {
7277 	if (so->so_proto->pr_getlock != NULL) {
7278 		return (*so->so_proto->pr_getlock)(so, flags);
7279 	} else {
7280 		return so->so_proto->pr_domain->dom_mtx;
7281 	}
7282 }
7283 
7284 void
socket_lock(struct socket * so,int refcount)7285 socket_lock(struct socket *so, int refcount)
7286 {
7287 	void *lr_saved;
7288 
7289 	lr_saved = __builtin_return_address(0);
7290 
7291 	if (so->so_proto->pr_lock) {
7292 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7293 	} else {
7294 #ifdef MORE_LOCKING_DEBUG
7295 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7296 		    LCK_MTX_ASSERT_NOTOWNED);
7297 #endif
7298 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7299 		if (refcount) {
7300 			so->so_usecount++;
7301 		}
7302 		so->lock_lr[so->next_lock_lr] = lr_saved;
7303 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7304 	}
7305 }
7306 
7307 void
socket_lock_assert_owned(struct socket * so)7308 socket_lock_assert_owned(struct socket *so)
7309 {
7310 	lck_mtx_t *mutex_held;
7311 
7312 	if (so->so_proto->pr_getlock != NULL) {
7313 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7314 	} else {
7315 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7316 	}
7317 
7318 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7319 }
7320 
7321 int
socket_try_lock(struct socket * so)7322 socket_try_lock(struct socket *so)
7323 {
7324 	lck_mtx_t *mtx;
7325 
7326 	if (so->so_proto->pr_getlock != NULL) {
7327 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7328 	} else {
7329 		mtx = so->so_proto->pr_domain->dom_mtx;
7330 	}
7331 
7332 	return lck_mtx_try_lock(mtx);
7333 }
7334 
/*
 * Release the socket lock taken by socket_lock(), optionally dropping a
 * use-count reference (refcount != 0).  Protocols supplying pr_unlock are
 * delegated to entirely; otherwise the unlock site is recorded in the
 * debug history ring and, if the last reference is dropped here, the
 * socket is torn down via sofreelastref() before the mutex is released.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	/* caller's return address, for the lock-history debug ring */
	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* record this unlock site in the circular debug history */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* dropping a ref that was never taken is fatal */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			/* last reference gone: release the socket's resources */
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7375 
/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	/*
	 * Verify the caller really holds the socket's mutex (per-socket
	 * protocol lock if available, domain mutex otherwise) before
	 * releasing the last reference.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}
7391 
/*
 * Take an additional use-count reference on the socket: lock with
 * refcount == 1 (take a reference), then unlock with refcount == 0
 * so the reference just taken is kept.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7398 
/*
 * Drop a use-count reference previously taken with soreference():
 * lock without taking a reference, then unlock with refcount == 1,
 * which releases the reference (and may free the socket if it was
 * the last one — see socket_unlock()).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7405 
7406 /*
7407  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7408  * possibility of using jumbo clusters.  Caller must ensure to hold
7409  * the socket lock.
7410  */
7411 void
somultipages(struct socket * so,boolean_t set)7412 somultipages(struct socket *so, boolean_t set)
7413 {
7414 	if (set) {
7415 		so->so_flags |= SOF_MULTIPAGES;
7416 	} else {
7417 		so->so_flags &= ~SOF_MULTIPAGES;
7418 	}
7419 }
7420 
7421 void
soif2kcl(struct socket * so,boolean_t set)7422 soif2kcl(struct socket *so, boolean_t set)
7423 {
7424 	if (set) {
7425 		so->so_flags1 |= SOF1_IF_2KCL;
7426 	} else {
7427 		so->so_flags1 &= ~SOF1_IF_2KCL;
7428 	}
7429 }
7430 
7431 int
so_isdstlocal(struct socket * so)7432 so_isdstlocal(struct socket *so)
7433 {
7434 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7435 
7436 	if (SOCK_DOM(so) == PF_INET) {
7437 		return inaddr_local(inp->inp_faddr);
7438 	} else if (SOCK_DOM(so) == PF_INET6) {
7439 		return in6addr_local(&inp->in6p_faddr);
7440 	}
7441 
7442 	return 0;
7443 }
7444 
/*
 * First phase of defuncting a socket: decide eligibility, mark the
 * socket SOF_DEFUNCT, set SB_DROP on both socket buffers so no further
 * data can be appended, and flush any data already queued.  The second
 * phase (shutdown/disconnect) is performed by sodefunct().
 *
 * Returns 0 on success (or if already defunct), EOPNOTSUPP when the
 * socket is exempt: it carries SOF_NODEFUNCT and force is not allowed,
 * or it qualifies for an extended background idle grace period.
 * Caller holds the socket lock.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: sanity-check that both buffers carry SB_DROP */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			/* Socket opted out and force is not permitted */
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llx [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* Forced: strip the opt-out flag and proceed */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		/*
		 * Extended background idle requested: grant a grace period
		 * only when the socket is not on cellular, not delegated,
		 * the feature window is configured, and force is off.
		 */
		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			/* re-armed later by so_check_extended_bk_idle_time() */
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7554 
/*
 * Second phase of defuncting a socket (sosetdefunct() must have run
 * first — SOF_DEFUNCT is required on entry, enforced by panic).  Wakes
 * any waiters, shuts down both directions, disconnects, releases any
 * remaining buffered data, and marks the socket SS_DEFUNCT.  Idempotent:
 * returns immediately if SS_DEFUNCT is already set.  Always returns 0.
 * Caller holds the socket lock.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		/* Log with printable local/foreign addresses for inet sockets */
		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* Drop whatever is still queued in either direction */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7662 
/*
 * Take a socket out of the extended background idle "in progress" state,
 * e.g. when its owning process becomes active again.  With locked == 0
 * the socket lock is taken (with a reference) for the duration; with
 * locked != 0 the caller already holds it.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		/* Undo the state set up by sosetdefunct()'s grace-period path */
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7692 
7693 /*
7694  * Does not attempt to account for sockets that are delegated from
7695  * the current process
7696  */
7697 int
so_set_extended_bk_idle(struct socket * so,int optval)7698 so_set_extended_bk_idle(struct socket *so, int optval)
7699 {
7700 	int error = 0;
7701 
7702 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7703 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7704 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7705 		error = EOPNOTSUPP;
7706 	} else if (optval == 0) {
7707 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7708 
7709 		soresume(current_proc(), so, 1);
7710 	} else {
7711 		struct proc *p = current_proc();
7712 		struct fileproc *fp;
7713 		int count = 0;
7714 
7715 		/*
7716 		 * Unlock socket to avoid lock ordering issue with
7717 		 * the proc fd table lock
7718 		 */
7719 		socket_unlock(so, 0);
7720 
7721 		proc_fdlock(p);
7722 		fdt_foreach(fp, p) {
7723 			struct socket *so2;
7724 
7725 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7726 				continue;
7727 			}
7728 
7729 			so2 = (struct socket *)fp_get_data(fp);
7730 			if (so != so2 &&
7731 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7732 				count++;
7733 			}
7734 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7735 				break;
7736 			}
7737 		}
7738 		proc_fdunlock(p);
7739 
7740 		socket_lock(so, 0);
7741 
7742 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7743 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7744 			error = EBUSY;
7745 		} else if (so->so_flags & SOF_DELEGATED) {
7746 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7747 			error = EBUSY;
7748 		} else {
7749 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7750 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7751 		}
7752 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7753 		    "%s marked for extended bk idle\n",
7754 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7755 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7756 		    SOCK_DOM(so), SOCK_TYPE(so),
7757 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7758 		    "is" : "not");
7759 	}
7760 
7761 	return error;
7762 }
7763 
/*
 * Terminate a socket's extended background idle grace period: clear the
 * in-progress state, roll back the active counter, then run both defunct
 * phases (sosetdefunct with force, followed by sodefunct if the first
 * phase marked the socket SOF_DEFUNCT).
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7782 
7783 void
so_drain_extended_bk_idle(struct socket * so)7784 so_drain_extended_bk_idle(struct socket *so)
7785 {
7786 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7787 		/*
7788 		 * Only penalize sockets that have outstanding data
7789 		 */
7790 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7791 			so_stop_extended_bk_idle(so);
7792 
7793 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7794 		}
7795 	}
7796 }
7797 
7798 /*
7799  * Return values tells if socket is still in extended background idle
7800  */
7801 int
so_check_extended_bk_idle_time(struct socket * so)7802 so_check_extended_bk_idle_time(struct socket *so)
7803 {
7804 	int ret = 1;
7805 
7806 	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7807 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7808 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7809 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7810 		    SOCK_DOM(so), SOCK_TYPE(so));
7811 		if (net_uptime() - so->so_extended_bk_start >
7812 		    soextbkidlestat.so_xbkidle_time) {
7813 			so_stop_extended_bk_idle(so);
7814 
7815 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7816 
7817 			ret = 0;
7818 		} else {
7819 			struct inpcb *inp = (struct inpcb *)so->so_pcb;
7820 
7821 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7822 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7823 		}
7824 	}
7825 
7826 	return ret;
7827 }
7828 
/*
 * Resume every socket of a process that entered extended background
 * idle: walk the fd table under proc_fdlock, call soresume() on each
 * socket descriptor, then clear the process's P_LXBKIDLEINPROG flag.
 * No-op unless the flag is set.
 */
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			/* skip descriptors that are not sockets */
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
7850 
7851 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7852 so_set_recv_anyif(struct socket *so, int optval)
7853 {
7854 	int ret = 0;
7855 
7856 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7857 		if (optval) {
7858 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7859 		} else {
7860 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7861 		}
7862 #if SKYWALK
7863 		inp_update_netns_flags(so);
7864 #endif /* SKYWALK */
7865 	}
7866 
7867 
7868 	return ret;
7869 }
7870 
7871 __private_extern__ int
so_get_recv_anyif(struct socket * so)7872 so_get_recv_anyif(struct socket *so)
7873 {
7874 	int ret = 0;
7875 
7876 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7877 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7878 	}
7879 
7880 	return ret;
7881 }
7882 
/*
 * Apply deny-type restrictions (a bitwise OR of SO_RESTRICT_DENY_*
 * values) to the socket.  Restrictions are one-way: bits can only be
 * set, never cleared.  Newly-set cellular/expensive/constrained bits
 * are propagated to the inpcb for inet sockets and to MPTCP for
 * PF_MULTIPATH sockets.  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7941 
7942 uint32_t
so_get_restrictions(struct socket * so)7943 so_get_restrictions(struct socket *so)
7944 {
7945 	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7946 	       SO_RESTRICT_DENY_OUT |
7947 	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7948 }
7949 
/*
 * Delegate the socket to the process identified by `epid` (recording
 * that process's pid/upid/UUID as the socket's "effective" owner), or
 * clear the delegation when epid is the caller's own pid.  When
 * check_cred is set, a caller that is neither the socket's recorded
 * owner nor epid itself must hold PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.
 * On success the socket's (NECP) policy is re-evaluated.  Returns 0 or
 * EINVAL/EACCES/ESRCH on failure.  Caller holds the socket lock.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			/* report EACCES regardless of the underlying error */
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* also track the delegate's "responsible" process, if distinct */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* NOTE(review): gencnt sign flip appears to force a policy
		 * refresh in so_update_policy() — confirm against its impl. */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
8065 
/*
 * Delegate the socket to the process identified by executable UUID
 * `euuid`, or clear the delegation when euuid matches the issuing
 * process's own UUID.  Mirrors so_set_effective_pid(), but since only a
 * UUID is available, the socket's real {pid, upid} are inherited as the
 * effective values.  When check_cred is set, a caller whose UUID matches
 * neither the socket's recorded owner nor euuid must hold
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.  Returns 0 or EINVAL/EACCES.
 * Caller holds the socket lock.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			/* report EACCES regardless of the underlying error */
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* NOTE(review): gencnt sign flip appears to force a policy
		 * refresh in so_update_policy() — confirm against its impl. */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8169 
8170 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8171 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8172     uint32_t ev_datalen)
8173 {
8174 	struct kev_msg ev_msg;
8175 
8176 	/*
8177 	 * A netpolicy event always starts with a netpolicy_event_data
8178 	 * structure, but the caller can provide for a longer event
8179 	 * structure to post, depending on the event code.
8180 	 */
8181 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8182 
8183 	bzero(&ev_msg, sizeof(ev_msg));
8184 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8185 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8186 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8187 	ev_msg.event_code       = ev_code;
8188 
8189 	ev_msg.dv[0].data_ptr   = ev_data;
8190 	ev_msg.dv[0].data_length = ev_datalen;
8191 
8192 	kev_post_msg(&ev_msg);
8193 }
8194 
8195 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8196 socket_post_kev_msg(uint32_t ev_code,
8197     struct kev_socket_event_data *ev_data,
8198     uint32_t ev_datalen)
8199 {
8200 	struct kev_msg ev_msg;
8201 
8202 	bzero(&ev_msg, sizeof(ev_msg));
8203 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8204 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8205 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8206 	ev_msg.event_code = ev_code;
8207 
8208 	ev_msg.dv[0].data_ptr = ev_data;
8209 	ev_msg.dv[0].data_length = ev_datalen;
8210 
8211 	kev_post_msg(&ev_msg);
8212 }
8213 
8214 void
socket_post_kev_msg_closed(struct socket * so)8215 socket_post_kev_msg_closed(struct socket *so)
8216 {
8217 	struct kev_socket_closed ev = {};
8218 	struct sockaddr *socksa = NULL, *peersa = NULL;
8219 	int err;
8220 
8221 	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8222 		return;
8223 	}
8224 	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8225 	if (err == 0) {
8226 		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8227 		    &peersa);
8228 		if (err == 0) {
8229 			memcpy(&ev.ev_data.kev_sockname, socksa,
8230 			    min(socksa->sa_len,
8231 			    sizeof(ev.ev_data.kev_sockname)));
8232 			memcpy(&ev.ev_data.kev_peername, peersa,
8233 			    min(peersa->sa_len,
8234 			    sizeof(ev.ev_data.kev_peername)));
8235 			socket_post_kev_msg(KEV_SOCKET_CLOSED,
8236 			    &ev.ev_data, sizeof(ev));
8237 		}
8238 	}
8239 	free_sockaddr(socksa);
8240 	free_sockaddr(peersa);
8241 }
8242