xref: /xnu-10002.1.13/bsd/kern/uipc_socket.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <netinet/in.h>
106 #include <netinet/in_pcb.h>
107 #include <netinet/in_tclass.h>
108 #include <netinet/in_var.h>
109 #include <netinet/tcp_var.h>
110 #include <netinet/ip6.h>
111 #include <netinet6/ip6_var.h>
112 #include <netinet/flow_divert.h>
113 #include <kern/zalloc.h>
114 #include <kern/locks.h>
115 #include <machine/limits.h>
116 #include <libkern/OSAtomic.h>
117 #include <pexpert/pexpert.h>
118 #include <kern/assert.h>
119 #include <kern/task.h>
120 #include <kern/policy_internal.h>
121 
122 #include <sys/kpi_mbuf.h>
123 #include <sys/mcache.h>
124 #include <sys/unpcb.h>
125 #include <libkern/section_keywords.h>
126 
127 #include <os/log.h>
128 
129 #if CONFIG_MACF
130 #include <security/mac_framework.h>
131 #endif /* MAC */
132 
133 #if MULTIPATH
134 #include <netinet/mp_pcb.h>
135 #include <netinet/mptcp_var.h>
136 #endif /* MULTIPATH */
137 
138 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
139 
140 #if DEBUG || DEVELOPMENT
141 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
142 #else
143 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
144 #endif
145 
146 /* TODO: this should be in a header file somewhere */
147 extern char *proc_name_address(void *p);
148 
149 static u_int32_t        so_cache_hw;    /* High water mark for socache */
150 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
151 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
152 static u_int32_t        cached_sock_count = 0;
153 STAILQ_HEAD(, socket)   so_cache_head;
154 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
155 static uint64_t        so_cache_time;
156 static int              socketinit_done;
157 static struct zone      *so_cache_zone;
158 
159 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
160 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
161 
/*
 * Forward declarations for the kevent socket filters (EVFILT_READ,
 * EVFILT_WRITE, EVFILT_SOCK/EVFILT_EXCEPT); the corresponding filterops
 * tables are defined below.
 *
 * Note: the redundant #include <machine/limits.h> that used to live here
 * was dropped; the header is already included near the top of this file.
 */
static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);
static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Copy struct timeval socket options in/out across 32/64-bit user ABIs */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
184 
/*
 * Filter operations for EVFILT_READ on sockets.  f_isfd = 1 tells the
 * kevent subsystem that the knote ident is a file descriptor.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

/* Filter operations for EVFILT_WRITE on sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

/* Filter operations for EVFILT_SOCK (generic socket state events) */
SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/*
 * Filter operations for EVFILT_EXCEPT on sockets; shares the read-side
 * callbacks, which distinguish the exception case via the knote filter.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
220 
221 SYSCTL_DECL(_kern_ipc);
222 
223 #define EVEN_MORE_LOCKING_DEBUG 0
224 
225 int socket_debug = 0;
226 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
227     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
228 
229 #if (DEBUG || DEVELOPMENT)
230 #define DEFAULT_SOSEND_ASSERT_PANIC 1
231 #else
232 #define DEFAULT_SOSEND_ASSERT_PANIC 0
233 #endif /* (DEBUG || DEVELOPMENT) */
234 
235 int sosend_assert_panic = 0;
236 SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
237     CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
238 
239 static unsigned long sodefunct_calls = 0;
240 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
241     &sodefunct_calls, "");
242 
243 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
244 so_gen_t        so_gencnt;      /* generation count for sockets */
245 
246 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
247 
248 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
249 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
250 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
251 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
252 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
253 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
254 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
255 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
256 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
257 
258 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
259 
260 int somaxconn = SOMAXCONN;
261 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
262     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
263 
264 /* Should we get a maximum also ??? */
265 static int sosendmaxchain = 65536;
266 static int sosendminchain = 16384;
267 static int sorecvmincopy  = 16384;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
269     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
270 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
271     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
272 
273 /*
274  * Set to enable jumbo clusters (if available) for large writes when
275  * the socket is marked with SOF_MULTIPAGES; see below.
276  */
277 int sosendjcl = 1;
278 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
279     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
280 
281 /*
282  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
283  * writes on the socket for all protocols on any network interfaces,
284  * depending upon sosendjcl above.  Be extra careful when setting this
285  * to 1, because sending down packets that cross physical pages down to
286  * broken drivers (those that falsely assume that the physical pages
287  * are contiguous) might lead to system panics or silent data corruption.
288  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
289  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
290  * capable.  Set this to 1 only for testing/debugging purposes.
291  */
292 int sosendjcl_ignore_capab = 0;
293 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
294     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
295 
296 /*
297  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
298  * writes on the socket for all protocols on any network interfaces.
299  * Be extra careful when setting this to 1, because sending down packets with
300  * clusters larger that 2 KB might lead to system panics or data corruption.
301  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
302  * on the outgoing interface
303  * Set this to 1  for testing/debugging purposes only.
304  */
305 int sosendbigcl_ignore_capab = 0;
306 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
307     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
308 
309 int sodefunctlog = 0;
310 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
311     &sodefunctlog, 0, "");
312 
313 int sothrottlelog = 0;
314 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
315     &sothrottlelog, 0, "");
316 
317 int sorestrictrecv = 1;
318 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
319     &sorestrictrecv, 0, "Enable inbound interface restrictions");
320 
321 int sorestrictsend = 1;
322 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
323     &sorestrictsend, 0, "Enable outbound interface restrictions");
324 
325 int soreserveheadroom = 1;
326 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
327     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
328 
329 #if (DEBUG || DEVELOPMENT)
330 int so_notsent_lowat_check = 1;
331 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
332     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
333 #endif /* DEBUG || DEVELOPMENT */
334 
335 int so_accept_list_waits = 0;
336 #if (DEBUG || DEVELOPMENT)
337 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
338     &so_accept_list_waits, 0, "number of waits for listener incomp list");
339 #endif /* DEBUG || DEVELOPMENT */
340 
341 extern struct inpcbinfo tcbinfo;
342 
343 /* TODO: these should be in header file */
344 extern int get_inpcb_str_size(void);
345 extern int get_tcp_str_size(void);
346 
347 vm_size_t       so_cache_zone_element_size;
348 
349 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
350     user_ssize_t *);
351 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
352 static void cached_sock_free(struct socket *);
353 
354 /*
355  * Maximum of extended background idle sockets per process
356  * Set to zero to disable further setting of the option
357  */
358 
359 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
360 #define SO_IDLE_BK_IDLE_TIME            600
361 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
362 
363 struct soextbkidlestat soextbkidlestat;
364 
365 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
366     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
367     "Maximum of extended background idle sockets per process");
368 
369 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
370     &soextbkidlestat.so_xbkidle_time, 0,
371     "Time in seconds to keep extended background idle sockets");
372 
373 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
374     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
375     "High water mark for extended background idle sockets");
376 
377 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
378     &soextbkidlestat, soextbkidlestat, "");
379 
380 int so_set_extended_bk_idle(struct socket *, int);
381 
382 #define SO_MAX_MSG_X 1024
383 
384 /*
385  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
386  * setting the DSCP code on the packet based on the service class; see
387  * <rdar://problem/11277343> for details.
388  */
389 __private_extern__ u_int32_t sotcdb = 0;
390 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
391     &sotcdb, 0, "");
392 
/*
 * One-time initialization of the socket layer: sanity-check ABI layout
 * assumptions, parse boot-args, set up the cached-socket ("socache")
 * zone and free list, seed the extended-background-idle defaults, and
 * initialize the inpcb subsystem.  Safe against repeated invocation via
 * the socketinit_done latch.
 */
void
socketinit(void)
{
	/*
	 * so_gencnt must be 64-bit and at least 32-bit aligned so the
	 * atomic increment in soalloc() is well-defined.
	 */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	/*
	 * The native sa_endpoints layout must match the user64 variant
	 * field-for-field so no copyin translation is required.
	 */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	/* Likewise for 32-bit kernels against the user32 layout */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow boot-args to override the compiled-in debug defaults */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	STAILQ_INIT(&so_cache_head);

	/*
	 * A socache element holds the socket plus an inpcb and a tcpcb in
	 * one allocation, with 4 bytes of alignment slack before each of
	 * the trailing structures; cached_sock_alloc() carves out the
	 * matching offsets.
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	/* Defaults for extended background idle socket handling */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
442 
/*
 * Allocate a socket from the "socache" layer cache.  If a previously
 * cached element is available it is reused (its embedded pcb pointer is
 * preserved across the bzero); otherwise a fresh element is carved out
 * of so_cache_zone with space for the socket, an inpcb, and a tcpcb in
 * a single block.  On return *so is tagged SOF1_CACHED_IN_SOCK_LAYER so
 * teardown routes it back through cached_sock_free().
 */
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		/*
		 * Safe to drop the lock here: the element is off the free
		 * list, so no one else can reach it while we scrub it.
		 */
		lck_mtx_unlock(&so_cache_mtx);

		/* Preserve the saved pcb pointer across the wipe */
		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		/* inpcb sits right after the socket, longword aligned */
		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		/* tcpcb follows the inpcb, again longword aligned */
		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
492 
/*
 * Return a SOF1_CACHED_IN_SOCK_LAYER socket to the socache free list,
 * or free it to the zone outright if the cache is already at capacity
 * (max_cached_sock_count).  Cached entries are timestamped so that
 * so_cache_timer() can age them out later.
 */
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		/* Cache full: undo the tentative count bump and free */
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		/* Track the high-water mark for the socache statistics */
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		/* Stamp with current uptime for later aging */
		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}
514 
/*
 * Refresh the "last owner" bookkeeping (pid, unique pid, executable
 * UUID, and originator UUID) on a socket to reflect the given process,
 * notifying the protocol via pr_update_last_owner when the owner has
 * actually changed.  Caller must hold the socket lock.  `self' may be
 * PROC_NULL, in which case the current process is used.
 */
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		/* Only rewrite the owner fields when they actually change */
		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		/* Originator UUID is refreshed unconditionally */
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}
540 
541 void
so_update_policy(struct socket * so)542 so_update_policy(struct socket *so)
543 {
544 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
545 		(void) inp_update_policy(sotoinpcb(so));
546 	}
547 }
548 
#if NECP
/*
 * Re-evaluate the socket's NECP policy, optionally overriding the local
 * and/or remote address used for the policy match.  Only meaningful for
 * Internet-domain sockets; all others are ignored.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	int dom = SOCK_DOM(so);

	if (dom != PF_INET && dom != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
560 
/*
 * Periodic socache reaper: free cached sockets whose timestamp is older
 * than SO_CACHE_TIME_LIMIT, at most SO_CACHE_MAX_FREE_BATCH per call.
 * Returns TRUE when entries remain on the list so the caller should
 * reschedule the timer, FALSE otherwise.
 */
boolean_t
so_cache_timer(void)
{
	struct socket   *p;
	int             n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		/*
		 * The list is in insertion (i.e. timestamp) order, so
		 * once we hit an entry that is too young we can stop.
		 */
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		/* Bound the work done per timer tick */
		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}
599 
600 /*
601  * Get a socket structure from our zone, and initialize it.
602  * We don't implement `waitok' yet (see comments in uipc_domain.c).
603  * Note that it would probably be better to allocate socket
604  * and PCB at the same time, but I'm not convinced that all
605  * the protocols can be easily modified to do this.
606  */
607 struct socket *
soalloc(int waitok,int dom,int type)608 soalloc(int waitok, int dom, int type)
609 {
610 	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
611 	struct socket *so;
612 
613 	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
614 		cached_sock_alloc(&so, how);
615 	} else {
616 		so = zalloc_flags(socket_zone, how | Z_ZERO);
617 	}
618 	if (so != NULL) {
619 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
620 
621 		/*
622 		 * Increment the socket allocation statistics
623 		 */
624 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
625 	}
626 
627 	return so;
628 }
629 
/*
 * Core socket creation path shared by socreate() and
 * socreate_delegate(): look up the protocol, allocate the socket,
 * record ownership/delegation/responsibility attributes, attach the
 * protocol pcb, and install socket filters.  On success *aso holds the
 * new socket; on failure it is NULL and an errno is returned.
 *
 * `p' is the creating process, `ep' the effective (delegated) process
 * or PROC_NULL, and `flags' may carry SOCF_MPTCP for MPTCP subflows.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;        /* "responsible" pid, if different from owner */
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	/* A nonzero proto pins the exact protocol; else match by type */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/* Distinguish unknown domain / wrong type / no protocol */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain socket creation statistics */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflows start out non-blocking */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;

	/* Record the creating process as the last owner */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	/* Record delegation info when created on behalf of another proc */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	/* Fall back to the creator's responsible pid if none from ep */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	/* Superuser-created sockets are marked privileged */
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
852 
853 /*
854  * Returns:	0			Success
855  *		EAFNOSUPPORT
856  *		EPROTOTYPE
857  *		EPROTONOSUPPORT
858  *		ENOBUFS
859  *	<pru_attach>:ENOBUFS[AF_UNIX]
860  *	<pru_attach>:ENOBUFS[TCP]
861  *	<pru_attach>:ENOMEM[TCP]
862  *	<pru_attach>:???		[other protocol families, IPSEC]
863  */
864 int
socreate(int dom,struct socket ** aso,int type,int proto)865 socreate(int dom, struct socket **aso, int type, int proto)
866 {
867 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
868 	           PROC_NULL);
869 }
870 
871 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)872 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
873 {
874 	int error = 0;
875 	struct proc *ep = PROC_NULL;
876 
877 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
878 		error = ESRCH;
879 		goto done;
880 	}
881 
882 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
883 
884 	/*
885 	 * It might not be wise to hold the proc reference when calling
886 	 * socreate_internal since it calls soalloc with M_WAITOK
887 	 */
888 done:
889 	if (ep != PROC_NULL) {
890 		proc_rele(ep);
891 	}
892 
893 	return error;
894 }
895 
896 /*
897  * Returns:	0			Success
898  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
899  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
900  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
901  *	<pru_bind>:EINVAL		Invalid argument
902  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
903  *	<pru_bind>:EACCES		Permission denied
904  *	<pru_bind>:EADDRINUSE		Address in use
905  *	<pru_bind>:EAGAIN		Resource unavailable, try again
906  *	<pru_bind>:EPERM		Operation not permitted
907  *	<pru_bind>:???
908  *	<sf_bind>:???
909  *
910  * Notes:	It's not possible to fully enumerate the return codes above,
911  *		since socket filter authors and protocol family authors may
912  *		not choose to limit their error returns to those listed, even
913  *		though this may result in some software operating incorrectly.
914  *
915  *		The error codes which are enumerated above are those known to
916  *		be returned by the tcp_usr_bind function supplied.
917  */
/*
 * Bind address `nam' to socket `so'.  When `dolock' is set, the socket
 * lock is taken and released here; otherwise the caller must already
 * hold it.  The socket-filter bind hook runs before the protocol's
 * pru_bind; a filter returning EJUSTRETURN suppresses the protocol
 * call and is reported to the caller as success.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy state now that the lock is held */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means "handled": report success */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
965 
/*
 * Final teardown of a socket structure: drop the credential reference,
 * detach any remaining socket filters, bump the global generation
 * count, and return the memory to wherever it came from (per-layer
 * cache or zone).  The caller must guarantee that no other references
 * to `so' remain.
 */
void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* A fresh generation count invalidates stale references */
	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}
982 
983 /*
984  * Returns:	0			Success
985  *		EINVAL
986  *		EOPNOTSUPP
987  *	<pru_listen>:EINVAL[AF_UNIX]
988  *	<pru_listen>:EINVAL[TCP]
989  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
990  *	<pru_listen>:EINVAL[TCP]	Invalid argument
991  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
992  *	<pru_listen>:EACCES[TCP]	Permission denied
993  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
994  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
995  *	<pru_listen>:EPERM[TCP]		Operation not permitted
996  *	<sf_listen>:???
997  *
998  * Notes:	Other <pru_listen> returns depend on the protocol family; all
999  *		<sf_listen> returns depend on what the filter author causes
1000  *		their filter to return.
1001  */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * Mark the socket as accepting connections up front (only when the
	 * completed queue is empty); every error path below clears
	 * SO_ACCEPTCONN again before bailing out.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* listen(2) only makes sense for connection-oriented protocols */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Socket filters first; EJUSTRETURN suppresses the protocol call */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue -- either global or per accepting socket. If
	 * backlog exceeds this limit, the length of the listen queue is set
	 * to the limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
1094 
1095 /*
1096  * The "accept list lock" protects the fields related to the listener queues
1097  * because we can unlock a socket to respect the lock ordering between
1098  * the listener socket and its clients sockets. The lock ordering is first to
1099  * acquire the client socket before the listener socket.
1100  *
1101  * The accept list lock serializes access to the following fields:
1102  * - of the listener socket:
1103  *   - so_comp
1104  *   - so_incomp
1105  *   - so_qlen
1106  *   - so_inqlen
1107  * - of client sockets that are in so_comp or so_incomp:
1108  *   - so_head
1109  *   - so_list
1110  *
 * As one can see the accept list lock protects the consistency of the
1112  * linkage of the client sockets.
1113  *
1114  * Note that those fields may be read without holding the accept list lock
1115  * for a preflight provided the accept list lock is taken when committing
1116  * to take an action based on the result of the preflight. The preflight
1117  * saves the cost of doing the unlock/lock dance.
1118  */
/*
 * Acquire the listener's accept-list "lock" (the SOF1_ACCEPT_LIST_HELD
 * flag, guarded by the head's mutex).  `so', when non-NULL, is a client
 * socket already locked by the caller; it is unlocked while we sleep so
 * the current holder can make progress, and re-acquired afterwards in
 * client-before-listener order.  No-op for protocols without
 * per-socket locks (single shared domain mutex).
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the accept list, claim it directly */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/* Drop the client's lock before sleeping on the head's mutex */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		/*
		 * Re-lock the client first to respect the
		 * client-before-listener lock ordering.
		 */
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1149 
1150 void
so_release_accept_list(struct socket * head)1151 so_release_accept_list(struct socket *head)
1152 {
1153 	if (head->so_proto->pr_getlock != NULL) {
1154 		lck_mtx_t *mutex_held;
1155 
1156 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1157 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1158 
1159 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1160 		wakeup((caddr_t)&head->so_incomp);
1161 	}
1162 }
1163 
/*
 * Tear down `so' when the last reference is dropped; called with the
 * socket locked.  Detaches flow-divert / content-filter / flow-tracker
 * state, then either:
 *  - leaves the socket alive, only clearing select and upcall state,
 *    when the PCB is not yet cleared, a file descriptor reference
 *    remains, or the socket is on a listener's completed queue; or
 *  - unlinks it from its listener's incomplete queue (if any), flushes
 *    both socket buffers and, when `dealloc' is set, frees the memory
 *    via sodealloc().
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready for full teardown: still has a PCB or an fd
	 * reference.  Just neutralize select/upcall state and bail.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1252 
/*
 * Wait for outstanding socket upcalls to drain before close proceeds.
 * Socket must be locked.  Only waits when upcalls are pending AND the
 * socket opted in via SOF_UPCALLCLOSEWAIT.  Clears the buffers' upcall
 * flags first so no new upcalls get scheduled, then sleeps on
 * &so->so_upcallusecount (woken elsewhere when the count drains).
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1281 
1282 /*
1283  * Close a socket on last file table reference removal.
1284  * Initiate disconnect if connected.
1285  * Free socket when disconnect complete.
1286  */
1287 int
soclose_locked(struct socket * so)1288 soclose_locked(struct socket *so)
1289 {
1290 	int error = 0;
1291 	struct timespec ts;
1292 
1293 	if (so->so_usecount == 0) {
1294 		panic("soclose: so=%p refcount=0", so);
1295 		/* NOTREACHED */
1296 	}
1297 
1298 	sflt_notify(so, sock_evt_closing, NULL);
1299 
1300 	if (so->so_upcallusecount) {
1301 		soclose_wait_locked(so);
1302 	}
1303 
1304 #if CONTENT_FILTER
1305 	/*
1306 	 * We have to wait until the content filters are done
1307 	 */
1308 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1309 		cfil_sock_close_wait(so);
1310 		cfil_sock_is_closed(so);
1311 		cfil_sock_detach(so);
1312 	}
1313 #endif /* CONTENT_FILTER */
1314 
1315 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1316 		soflow_detach(so);
1317 	}
1318 
1319 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1320 		soresume(current_proc(), so, 1);
1321 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1322 	}
1323 
1324 	if ((so->so_options & SO_ACCEPTCONN)) {
1325 		struct socket *sp, *sonext;
1326 		int persocklock = 0;
1327 		int incomp_overflow_only;
1328 
1329 		/*
1330 		 * We do not want new connection to be added
1331 		 * to the connection queues
1332 		 */
1333 		so->so_options &= ~SO_ACCEPTCONN;
1334 
1335 		/*
1336 		 * We can drop the lock on the listener once
1337 		 * we've acquired the incoming list
1338 		 */
1339 		if (so->so_proto->pr_getlock != NULL) {
1340 			persocklock = 1;
1341 			so_acquire_accept_list(so, NULL);
1342 			socket_unlock(so, 0);
1343 		}
1344 again:
1345 		incomp_overflow_only = 1;
1346 
1347 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1348 			/*
1349 			 * Radar 5350314
1350 			 * skip sockets thrown away by tcpdropdropblreq
1351 			 * they will get cleanup by the garbage collection.
1352 			 * otherwise, remove the incomp socket from the queue
1353 			 * and let soabort trigger the appropriate cleanup.
1354 			 */
1355 			if (sp->so_flags & SOF_OVERFLOW) {
1356 				continue;
1357 			}
1358 
1359 			if (persocklock != 0) {
1360 				socket_lock(sp, 1);
1361 			}
1362 
1363 			/*
1364 			 * Radar 27945981
1365 			 * The extra reference for the list insure the
1366 			 * validity of the socket pointer when we perform the
1367 			 * unlock of the head above
1368 			 */
1369 			if (sp->so_state & SS_INCOMP) {
1370 				sp->so_state &= ~SS_INCOMP;
1371 				sp->so_head = NULL;
1372 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1373 				so->so_incqlen--;
1374 				so->so_qlen--;
1375 
1376 				(void) soabort(sp);
1377 			} else {
1378 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1379 				    __func__, sp);
1380 			}
1381 
1382 			if (persocklock != 0) {
1383 				socket_unlock(sp, 1);
1384 			}
1385 		}
1386 
1387 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1388 			/* Dequeue from so_comp since sofree() won't do it */
1389 			if (persocklock != 0) {
1390 				socket_lock(sp, 1);
1391 			}
1392 
1393 			if (sp->so_state & SS_COMP) {
1394 				sp->so_state &= ~SS_COMP;
1395 				sp->so_head = NULL;
1396 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1397 				so->so_qlen--;
1398 
1399 				(void) soabort(sp);
1400 			} else {
1401 				panic("%s sp %p in so_comp but !SS_COMP",
1402 				    __func__, sp);
1403 			}
1404 
1405 			if (persocklock) {
1406 				socket_unlock(sp, 1);
1407 			}
1408 		}
1409 
1410 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1411 #if (DEBUG | DEVELOPMENT)
1412 			panic("%s head %p so_comp not empty", __func__, so);
1413 #endif /* (DEVELOPMENT || DEBUG) */
1414 
1415 			goto again;
1416 		}
1417 
1418 		if (!TAILQ_EMPTY(&so->so_comp)) {
1419 #if (DEBUG | DEVELOPMENT)
1420 			panic("%s head %p so_comp not empty", __func__, so);
1421 #endif /* (DEVELOPMENT || DEBUG) */
1422 
1423 			goto again;
1424 		}
1425 
1426 		if (persocklock) {
1427 			socket_lock(so, 0);
1428 			so_release_accept_list(so);
1429 		}
1430 	}
1431 	if (so->so_pcb == NULL) {
1432 		/* 3915887: mark the socket as ready for dealloc */
1433 		so->so_flags |= SOF_PCBCLEARING;
1434 		goto discard;
1435 	}
1436 
1437 	if (so->so_state & SS_ISCONNECTED) {
1438 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1439 			error = sodisconnectlocked(so);
1440 			if (error) {
1441 				goto drop;
1442 			}
1443 		}
1444 		if (so->so_options & SO_LINGER) {
1445 			if ((so->so_state & SS_ISDISCONNECTING) &&
1446 			    (so->so_state & SS_NBIO)) {
1447 				goto drop;
1448 			}
1449 			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1450 				lck_mtx_t *mutex_held;
1451 
1452 				if (so->so_proto->pr_getlock != NULL) {
1453 					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1454 				} else {
1455 					mutex_held = so->so_proto->pr_domain->dom_mtx;
1456 				}
1457 				ts.tv_sec = (so->so_linger / 100);
1458 				ts.tv_nsec = (so->so_linger % 100) *
1459 				    NSEC_PER_USEC * 1000 * 10;
1460 				error = msleep((caddr_t)&so->so_timeo,
1461 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1462 				if (error) {
1463 					/*
1464 					 * It's OK when the time fires,
1465 					 * don't report an error
1466 					 */
1467 					if (error == EWOULDBLOCK) {
1468 						error = 0;
1469 					}
1470 					break;
1471 				}
1472 			}
1473 		}
1474 	}
1475 drop:
1476 	if (so->so_usecount == 0) {
1477 		panic("soclose: usecount is zero so=%p", so);
1478 		/* NOTREACHED */
1479 	}
1480 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1481 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1482 		if (error == 0) {
1483 			error = error2;
1484 		}
1485 	}
1486 	if (so->so_usecount <= 0) {
1487 		panic("soclose: usecount is zero so=%p", so);
1488 		/* NOTREACHED */
1489 	}
1490 discard:
1491 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1492 	    (so->so_state & SS_NOFDREF)) {
1493 		panic("soclose: NOFDREF");
1494 		/* NOTREACHED */
1495 	}
1496 	so->so_state |= SS_NOFDREF;
1497 
1498 	if ((so->so_flags & SOF_KNOTE) != 0) {
1499 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1500 	}
1501 
1502 	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1503 
1504 	VERIFY(so->so_usecount > 0);
1505 	so->so_usecount--;
1506 	sofree(so);
1507 	return error;
1508 }
1509 
1510 int
soclose(struct socket * so)1511 soclose(struct socket *so)
1512 {
1513 	int error = 0;
1514 	socket_lock(so, 1);
1515 
1516 	if (so->so_retaincnt == 0) {
1517 		error = soclose_locked(so);
1518 	} else {
1519 		/*
1520 		 * if the FD is going away, but socket is
1521 		 * retained in kernel remove its reference
1522 		 */
1523 		so->so_usecount--;
1524 		if (so->so_usecount < 2) {
1525 			panic("soclose: retaincnt non null and so=%p "
1526 			    "usecount=%d\n", so, so->so_usecount);
1527 		}
1528 	}
1529 	socket_unlock(so, 1);
1530 	return error;
1531 }
1532 
/*
 * Abort a connection that was never handed to userland (e.g. one
 * dropped from a listener's queues).  Must be called at splnet with
 * the socket already locked.  SOF_ABORTED makes the pru_abort call
 * one-shot; if the protocol's abort handler fails, the socket
 * reference is dropped via sofree() and the error is propagated.
 */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}
1563 
/*
 * Complete an accept(2): clear SS_NOFDREF (a file descriptor is about
 * to reference this socket) and let the protocol fill in the peer
 * address.  With `dolock' the socket lock is taken/released here.
 * Panics if SS_NOFDREF is already clear -- the socket would then be
 * referenced by an fd it should not yet have.
 */
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1590 
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	/* Accept with the socket lock taken and released internally. */
	return soacceptlock(so, nam, 1);
}
1596 
/*
 * Run the accept socket filters over a newly accepted socket `so'
 * coming off listener `head'.  On any failure the new socket is
 * closed here (SS_NOFDREF is cleared first so soclose() sees no fd
 * reference) and the error is returned so the caller can drop it.
 * EJUSTRETURN from a filter keeps the socket but marks it defunct.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1652 
1653 /*
1654  * Returns:	0			Success
1655  *		EOPNOTSUPP		Operation not supported on socket
1656  *		EISCONN			Socket is connected
1657  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1658  *	<pru_connect>:EINVAL		Invalid argument
1659  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1660  *	<pru_connect>:EACCES		Permission denied
1661  *	<pru_connect>:EADDRINUSE	Address in use
1662  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1663  *	<pru_connect>:EPERM		Operation not permitted
1664  *	<sf_connect_out>:???		[anything a filter writer might set]
1665  */
/*
 * Connect `so' to address `nam'.  With `dolock' set the socket lock
 * is taken/released here.  Rejects listeners and defunct sockets.
 * For connection-based protocols a second connect fails with EISCONN;
 * otherwise an existing connection is torn down first, which lets the
 * caller "disconnect" a datagram socket by connecting to a null
 * address.  Tracker lookup and NECP re-evaluation run before the
 * socket filters and the protocol's pru_connect.
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: filter handled the connect itself */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1762 
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	/* Connect with the socket lock taken and released internally. */
	return soconnectlock(so, nam, 1);
}
1768 
1769 /*
1770  * Returns:	0			Success
1771  *	<pru_connect2>:EINVAL[AF_UNIX]
1772  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1773  *	<pru_connect2>:???		[other protocol families]
1774  *
1775  * Notes:	<pru_connect2> is not supported by [TCP].
1776  */
/*
 * Connect two sockets to each other (socketpair-style).  Locks both
 * endpoints -- so2 only when its protocol uses per-socket locks --
 * then hands off to so1's pru_connect2.  Per the notes above, TCP
 * does not support this operation.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1795 
/*
 * connectx(2) back-end; called with the socket already locked.
 * Rejects listeners and defunct sockets, then (unless PR_MULTICONN
 * allows multiple connections) applies the same connect-once /
 * disconnect-first policy as soconnectlock().  Handles the
 * CONNECT_DATA_IDEMPOTENT / CONNECT_RESUME_ON_READ_WRITE flag
 * combinations (TCP Fast Open style pre-connect data) before running
 * the socket filters and the protocol's pru_connectx.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connectx() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1918 
1919 int
sodisconnectlocked(struct socket * so)1920 sodisconnectlocked(struct socket *so)
1921 {
1922 	int error;
1923 
1924 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1925 		error = ENOTCONN;
1926 		goto bad;
1927 	}
1928 	if (so->so_state & SS_ISDISCONNECTING) {
1929 		error = EALREADY;
1930 		goto bad;
1931 	}
1932 
1933 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1934 	if (error == 0) {
1935 		sflt_notify(so, sock_evt_disconnected, NULL);
1936 	}
1937 
1938 bad:
1939 	return error;
1940 }
1941 
/*
 * Locking wrapper around sodisconnectlocked(): acquires and releases
 * the socket lock around the disconnect.
 */
int
sodisconnect(struct socket *so)
{
	int result;

	socket_lock(so, 1);
	result = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return result;
}
1953 
1954 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1955 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1956 {
1957 	int error;
1958 
1959 	/*
1960 	 * Call the protocol disconnectx handler; let it handle all
1961 	 * matters related to the connection state of this session.
1962 	 */
1963 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1964 	if (error == 0) {
1965 		/*
1966 		 * The event applies only for the session, not for
1967 		 * the disconnection of individual subflows.
1968 		 */
1969 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1970 			sflt_notify(so, sock_evt_disconnected, NULL);
1971 		}
1972 	}
1973 	return error;
1974 }
1975 
1976 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1977 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1978 {
1979 	int error;
1980 
1981 	socket_lock(so, 1);
1982 	error = sodisconnectxlocked(so, aid, cid);
1983 	socket_unlock(so, 1);
1984 	return error;
1985 }
1986 
/*
 * sblock() wait flag derived from the caller's msg flags: MSG_DONTWAIT
 * requests a non-blocking lock attempt (0), otherwise wait for the
 * send buffer lock (SBL_WAIT).
 */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1988 
1989 /*
1990  * sosendcheck will lock the socket buffer if it isn't locked and
1991  * verify that there is space for the data being inserted.
1992  *
1993  * Returns:	0			Success
1994  *		EPIPE
1995  *	sblock:EWOULDBLOCK
1996  *	sblock:EINTR
1997  *	sbwait:EBADF
1998  *	sbwait:EINTR
1999  *	[so_error]:???
2000  */
2001 int
sosendcheck(struct socket * so,struct sockaddr * addr,user_ssize_t resid,int32_t clen,int32_t atomic,int flags,int * sblocked)2002 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
2003     int32_t clen, int32_t atomic, int flags, int *sblocked)
2004 {
2005 	int     error = 0;
2006 	int32_t space;
2007 	int     assumelock = 0;
2008 
2009 restart:
2010 	if (*sblocked == 0) {
2011 		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2012 		    so->so_send_filt_thread != 0 &&
2013 		    so->so_send_filt_thread == current_thread()) {
2014 			/*
2015 			 * We're being called recursively from a filter,
2016 			 * allow this to continue. Radar 4150520.
2017 			 * Don't set sblocked because we don't want
2018 			 * to perform an unlock later.
2019 			 */
2020 			assumelock = 1;
2021 		} else {
2022 			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2023 			if (error) {
2024 				if (so->so_flags & SOF_DEFUNCT) {
2025 					goto defunct;
2026 				}
2027 				return error;
2028 			}
2029 			*sblocked = 1;
2030 		}
2031 	}
2032 
2033 	/*
2034 	 * If a send attempt is made on a socket that has been marked
2035 	 * as inactive (disconnected), reject the request.
2036 	 */
2037 	if (so->so_flags & SOF_DEFUNCT) {
2038 defunct:
2039 		error = EPIPE;
2040 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
2041 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
2042 		    so->so_gencnt,
2043 		    SOCK_DOM(so), SOCK_TYPE(so), error);
2044 		return error;
2045 	}
2046 
2047 	if (so->so_state & SS_CANTSENDMORE) {
2048 #if CONTENT_FILTER
2049 		/*
2050 		 * Can re-inject data of half closed connections
2051 		 */
2052 		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2053 		    so->so_snd.sb_cfil_thread == current_thread() &&
2054 		    cfil_sock_data_pending(&so->so_snd) != 0) {
2055 			CFIL_LOG(LOG_INFO,
2056 			    "so %llx ignore SS_CANTSENDMORE",
2057 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2058 		} else
2059 #endif /* CONTENT_FILTER */
2060 		return EPIPE;
2061 	}
2062 	if (so->so_error) {
2063 		error = so->so_error;
2064 		so->so_error = 0;
2065 		return error;
2066 	}
2067 
2068 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2069 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2070 			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2071 			    (resid != 0 || clen == 0) &&
2072 			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2073 				return ENOTCONN;
2074 			}
2075 		} else if (addr == 0) {
2076 			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2077 			       ENOTCONN : EDESTADDRREQ;
2078 		}
2079 	}
2080 
2081 	space = sbspace(&so->so_snd);
2082 
2083 	if (flags & MSG_OOB) {
2084 		space += 1024;
2085 	}
2086 	if ((atomic && resid > so->so_snd.sb_hiwat) ||
2087 	    clen > so->so_snd.sb_hiwat) {
2088 		return EMSGSIZE;
2089 	}
2090 
2091 	if ((space < resid + clen &&
2092 	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2093 	    space < clen)) ||
2094 	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2095 		/*
2096 		 * don't block the connectx call when there's more data
2097 		 * than can be copied.
2098 		 */
2099 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2100 			if (space == 0) {
2101 				return EWOULDBLOCK;
2102 			}
2103 			if (space < (int32_t)so->so_snd.sb_lowat) {
2104 				return 0;
2105 			}
2106 		}
2107 		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2108 		    assumelock) {
2109 			return EWOULDBLOCK;
2110 		}
2111 		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2112 		*sblocked = 0;
2113 		error = sbwait(&so->so_snd);
2114 		if (error) {
2115 			if (so->so_flags & SOF_DEFUNCT) {
2116 				goto defunct;
2117 			}
2118 			return error;
2119 		}
2120 		goto restart;
2121 	}
2122 	return 0;
2123 }
2124 
2125 /*
2126  * Send on a socket.
2127  * If send must go all at once and message is larger than
2128  * send buffering, then hard error.
2129  * Lock against other senders.
2130  * If must go all at once and not enough room now, then
2131  * inform user that this would block and do nothing.
2132  * Otherwise, if nonblocking, send as much as possible.
2133  * The data to be sent is described by "uio" if nonzero,
2134  * otherwise by the mbuf chain "top" (which must be null
2135  * if uio is not).  Data provided in mbuf chain must be small
2136  * enough to send all at once.
2137  *
2138  * Returns nonzero on error, timeout or signal; callers
2139  * must check for short counts if EINTR/ERESTART are returned.
2140  * Data and control buffers are freed on return.
2141  *
2142  * Returns:	0			Success
2143  *		EOPNOTSUPP
2144  *		EINVAL
2145  *		ENOBUFS
2146  *	uiomove:EFAULT
2147  *	sosendcheck:EPIPE
2148  *	sosendcheck:EWOULDBLOCK
2149  *	sosendcheck:EINTR
2150  *	sosendcheck:EBADF
2151  *	sosendcheck:EINTR
2152  *	sosendcheck:???			[value from so_error]
2153  *	<pru_send>:ECONNRESET[TCP]
2154  *	<pru_send>:EINVAL[TCP]
2155  *	<pru_send>:ENOBUFS[TCP]
2156  *	<pru_send>:EADDRINUSE[TCP]
2157  *	<pru_send>:EADDRNOTAVAIL[TCP]
2158  *	<pru_send>:EAFNOSUPPORT[TCP]
2159  *	<pru_send>:EACCES[TCP]
2160  *	<pru_send>:EAGAIN[TCP]
2161  *	<pru_send>:EPERM[TCP]
2162  *	<pru_send>:EMSGSIZE[TCP]
2163  *	<pru_send>:EHOSTUNREACH[TCP]
2164  *	<pru_send>:ENETUNREACH[TCP]
2165  *	<pru_send>:ENETDOWN[TCP]
2166  *	<pru_send>:ENOMEM[TCP]
2167  *	<pru_send>:ENOBUFS[TCP]
2168  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2169  *	<pru_send>:EINVAL[AF_UNIX]
2170  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2171  *	<pru_send>:EPIPE[AF_UNIX]
2172  *	<pru_send>:ENOTCONN[AF_UNIX]
2173  *	<pru_send>:EISCONN[AF_UNIX]
2174  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2175  *	<sf_data_out>:???		[whatever a filter author chooses]
2176  *
2177  * Notes:	Other <pru_send> returns depend on the protocol family; all
2178  *		<sf_data_out> returns depend on what the filter author causes
2179  *		their filter to return.
2180  */
2181 int
sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)2182 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2183     struct mbuf *top, struct mbuf *control, int flags)
2184 {
2185 	struct mbuf **mp;
2186 	struct mbuf *m, *freelist = NULL;
2187 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2188 	user_ssize_t space, len, resid, orig_resid;
2189 	int clen = 0, error, dontroute, sendflags;
2190 	int atomic = sosendallatonce(so) || top;
2191 	int sblocked = 0;
2192 	struct proc *p = current_proc();
2193 	uint16_t headroom = 0;
2194 	ssize_t mlen;
2195 	boolean_t en_tracing = FALSE;
2196 
2197 	if (uio != NULL) {
2198 		resid = uio_resid(uio);
2199 	} else {
2200 		resid = top->m_pkthdr.len;
2201 	}
2202 	orig_resid = resid;
2203 
2204 	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2205 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2206 
2207 	socket_lock(so, 1);
2208 
2209 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2210 		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2211 	}
2212 
2213 	/*
2214 	 * trace if tracing & network (vs. unix) sockets & and
2215 	 * non-loopback
2216 	 */
2217 	if (ENTR_SHOULDTRACE &&
2218 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2219 		struct inpcb *inp = sotoinpcb(so);
2220 		if (inp->inp_last_outifp != NULL &&
2221 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2222 			en_tracing = TRUE;
2223 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2224 			    VM_KERNEL_ADDRPERM(so),
2225 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2226 			    (int64_t)resid);
2227 		}
2228 	}
2229 
2230 	/*
2231 	 * Re-injection should not affect process accounting
2232 	 */
2233 	if ((flags & MSG_SKIPCFIL) == 0) {
2234 		so_update_last_owner_locked(so, p);
2235 		so_update_policy(so);
2236 
2237 #if NECP
2238 		so_update_necp_policy(so, NULL, addr);
2239 #endif /* NECP */
2240 	}
2241 
2242 	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2243 		error = EOPNOTSUPP;
2244 		goto out_locked;
2245 	}
2246 
2247 	/*
2248 	 * In theory resid should be unsigned.
2249 	 * However, space must be signed, as it might be less than 0
2250 	 * if we over-committed, and we must use a signed comparison
2251 	 * of space and resid.  On the other hand, a negative resid
2252 	 * causes us to loop sending 0-length segments to the protocol.
2253 	 *
2254 	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2255 	 *
2256 	 * Note: We limit resid to be a positive int value as we use
2257 	 * imin() to set bytes_to_copy -- radr://14558484
2258 	 */
2259 	if (resid < 0 || resid > INT_MAX ||
2260 	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2261 		error = EINVAL;
2262 		goto out_locked;
2263 	}
2264 
2265 	dontroute = (flags & MSG_DONTROUTE) &&
2266 	    (so->so_options & SO_DONTROUTE) == 0 &&
2267 	    (so->so_proto->pr_flags & PR_ATOMIC);
2268 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2269 
2270 	if (control != NULL) {
2271 		clen = control->m_len;
2272 	}
2273 
2274 	if (soreserveheadroom != 0) {
2275 		headroom = so->so_pktheadroom;
2276 	}
2277 
2278 	do {
2279 		error = sosendcheck(so, addr, resid, clen, atomic, flags,
2280 		    &sblocked);
2281 		if (error) {
2282 			goto out_locked;
2283 		}
2284 
2285 		mp = &top;
2286 		space = sbspace(&so->so_snd) - clen;
2287 		space += ((flags & MSG_OOB) ? 1024 : 0);
2288 
2289 		do {
2290 			if (uio == NULL) {
2291 				/*
2292 				 * Data is prepackaged in "top".
2293 				 */
2294 				resid = 0;
2295 				if (flags & MSG_EOR) {
2296 					top->m_flags |= M_EOR;
2297 				}
2298 			} else {
2299 				int chainlength;
2300 				int bytes_to_copy;
2301 				boolean_t jumbocl;
2302 				boolean_t bigcl;
2303 				int bytes_to_alloc;
2304 
2305 				bytes_to_copy = imin((int)resid, (int)space);
2306 
2307 				bytes_to_alloc = bytes_to_copy;
2308 				if (top == NULL) {
2309 					bytes_to_alloc += headroom;
2310 				}
2311 
2312 				if (sosendminchain > 0) {
2313 					chainlength = 0;
2314 				} else {
2315 					chainlength = sosendmaxchain;
2316 				}
2317 
2318 				/*
2319 				 * Use big 4 KB cluster when the outgoing interface
2320 				 * does not prefer 2 KB clusters
2321 				 */
2322 				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2323 				    sosendbigcl_ignore_capab;
2324 
2325 				/*
2326 				 * Attempt to use larger than system page-size
2327 				 * clusters for large writes only if there is
2328 				 * a jumbo cluster pool and if the socket is
2329 				 * marked accordingly.
2330 				 */
2331 				jumbocl = sosendjcl && njcl > 0 &&
2332 				    ((so->so_flags & SOF_MULTIPAGES) ||
2333 				    sosendjcl_ignore_capab) &&
2334 				    bigcl;
2335 
2336 				socket_unlock(so, 0);
2337 
2338 				do {
2339 					int num_needed;
2340 					int hdrs_needed = (top == NULL) ? 1 : 0;
2341 
2342 					/*
2343 					 * try to maintain a local cache of mbuf
2344 					 * clusters needed to complete this
2345 					 * write the list is further limited to
2346 					 * the number that are currently needed
2347 					 * to fill the socket this mechanism
2348 					 * allows a large number of mbufs/
2349 					 * clusters to be grabbed under a single
2350 					 * mbuf lock... if we can't get any
2351 					 * clusters, than fall back to trying
2352 					 * for mbufs if we fail early (or
2353 					 * miscalcluate the number needed) make
2354 					 * sure to release any clusters we
2355 					 * haven't yet consumed.
2356 					 */
2357 					if (freelist == NULL &&
2358 					    bytes_to_alloc > MBIGCLBYTES &&
2359 					    jumbocl) {
2360 						num_needed =
2361 						    bytes_to_alloc / M16KCLBYTES;
2362 
2363 						if ((bytes_to_alloc -
2364 						    (num_needed * M16KCLBYTES))
2365 						    >= MINCLSIZE) {
2366 							num_needed++;
2367 						}
2368 
2369 						freelist =
2370 						    m_getpackets_internal(
2371 							(unsigned int *)&num_needed,
2372 							hdrs_needed, M_WAIT, 0,
2373 							M16KCLBYTES);
2374 						/*
2375 						 * Fall back to 4K cluster size
2376 						 * if allocation failed
2377 						 */
2378 					}
2379 
2380 					if (freelist == NULL &&
2381 					    bytes_to_alloc > MCLBYTES &&
2382 					    bigcl) {
2383 						num_needed =
2384 						    bytes_to_alloc / MBIGCLBYTES;
2385 
2386 						if ((bytes_to_alloc -
2387 						    (num_needed * MBIGCLBYTES)) >=
2388 						    MINCLSIZE) {
2389 							num_needed++;
2390 						}
2391 
2392 						freelist =
2393 						    m_getpackets_internal(
2394 							(unsigned int *)&num_needed,
2395 							hdrs_needed, M_WAIT, 0,
2396 							MBIGCLBYTES);
2397 						/*
2398 						 * Fall back to cluster size
2399 						 * if allocation failed
2400 						 */
2401 					}
2402 
2403 					/*
2404 					 * Allocate a cluster as we want to
2405 					 * avoid to split the data in more
2406 					 * that one segment and using MINCLSIZE
2407 					 * would lead us to allocate two mbufs
2408 					 */
2409 					if (soreserveheadroom != 0 &&
2410 					    freelist == NULL &&
2411 					    ((top == NULL &&
2412 					    bytes_to_alloc > _MHLEN) ||
2413 					    bytes_to_alloc > _MLEN)) {
2414 						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2415 						    MCLBYTES;
2416 						freelist =
2417 						    m_getpackets_internal(
2418 							(unsigned int *)&num_needed,
2419 							hdrs_needed, M_WAIT, 0,
2420 							MCLBYTES);
2421 						/*
2422 						 * Fall back to a single mbuf
2423 						 * if allocation failed
2424 						 */
2425 					} else if (freelist == NULL &&
2426 					    bytes_to_alloc > MINCLSIZE) {
2427 						num_needed =
2428 						    bytes_to_alloc / MCLBYTES;
2429 
2430 						if ((bytes_to_alloc -
2431 						    (num_needed * MCLBYTES)) >=
2432 						    MINCLSIZE) {
2433 							num_needed++;
2434 						}
2435 
2436 						freelist =
2437 						    m_getpackets_internal(
2438 							(unsigned int *)&num_needed,
2439 							hdrs_needed, M_WAIT, 0,
2440 							MCLBYTES);
2441 						/*
2442 						 * Fall back to a single mbuf
2443 						 * if allocation failed
2444 						 */
2445 					}
2446 					/*
2447 					 * For datagram protocols, leave
2448 					 * headroom for protocol headers
2449 					 * in the first cluster of the chain
2450 					 */
2451 					if (freelist != NULL && atomic &&
2452 					    top == NULL && headroom > 0) {
2453 						freelist->m_data += headroom;
2454 					}
2455 
2456 					/*
2457 					 * Fall back to regular mbufs without
2458 					 * reserving the socket headroom
2459 					 */
2460 					if (freelist == NULL) {
2461 						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2462 							if (top == NULL) {
2463 								MGETHDR(freelist,
2464 								    M_WAIT, MT_DATA);
2465 							} else {
2466 								MGET(freelist,
2467 								    M_WAIT, MT_DATA);
2468 							}
2469 						}
2470 
2471 						if (freelist == NULL) {
2472 							error = ENOBUFS;
2473 							socket_lock(so, 0);
2474 							goto out_locked;
2475 						}
2476 						/*
2477 						 * For datagram protocols,
2478 						 * leave room for protocol
2479 						 * headers in first mbuf.
2480 						 */
2481 						if (atomic && top == NULL &&
2482 						    bytes_to_copy > 0 &&
2483 						    bytes_to_copy < MHLEN) {
2484 							MH_ALIGN(freelist,
2485 							    bytes_to_copy);
2486 						}
2487 					}
2488 					m = freelist;
2489 					freelist = m->m_next;
2490 					m->m_next = NULL;
2491 
2492 					if ((m->m_flags & M_EXT)) {
2493 						mlen = m->m_ext.ext_size -
2494 						    M_LEADINGSPACE(m);
2495 					} else if ((m->m_flags & M_PKTHDR)) {
2496 						mlen = MHLEN - M_LEADINGSPACE(m);
2497 						m_add_crumb(m, PKT_CRUMB_SOSEND);
2498 					} else {
2499 						mlen = MLEN - M_LEADINGSPACE(m);
2500 					}
2501 					len = imin((int)mlen, bytes_to_copy);
2502 
2503 					chainlength += len;
2504 
2505 					space -= len;
2506 
2507 					error = uiomove(mtod(m, caddr_t),
2508 					    (int)len, uio);
2509 
2510 					resid = uio_resid(uio);
2511 
2512 					m->m_len = (int32_t)len;
2513 					*mp = m;
2514 					top->m_pkthdr.len += len;
2515 					if (error) {
2516 						break;
2517 					}
2518 					mp = &m->m_next;
2519 					if (resid <= 0) {
2520 						if (flags & MSG_EOR) {
2521 							top->m_flags |= M_EOR;
2522 						}
2523 						break;
2524 					}
2525 					bytes_to_copy = imin((int)resid, (int)space);
2526 				} while (space > 0 &&
2527 				    (chainlength < sosendmaxchain || atomic ||
2528 				    resid < MINCLSIZE));
2529 
2530 				socket_lock(so, 0);
2531 
2532 				if (error) {
2533 					goto out_locked;
2534 				}
2535 			}
2536 
2537 			if (dontroute) {
2538 				so->so_options |= SO_DONTROUTE;
2539 			}
2540 
2541 			/*
2542 			 * Compute flags here, for pru_send and NKEs
2543 			 *
2544 			 * If the user set MSG_EOF, the protocol
2545 			 * understands this flag and nothing left to
2546 			 * send then use PRU_SEND_EOF instead of PRU_SEND.
2547 			 */
2548 			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2549 			    ((flags & MSG_EOF) &&
2550 			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2551 			    (resid <= 0)) ? PRUS_EOF :
2552 			    /* If there is more to send set PRUS_MORETOCOME */
2553 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2554 
2555 			if ((flags & MSG_SKIPCFIL) == 0) {
2556 				/*
2557 				 * Socket filter processing
2558 				 */
2559 				error = sflt_data_out(so, addr, &top,
2560 				    &control, (sendflags & MSG_OOB) ?
2561 				    sock_data_filt_flag_oob : 0);
2562 				if (error) {
2563 					if (error == EJUSTRETURN) {
2564 						error = 0;
2565 						goto packet_consumed;
2566 					}
2567 					goto out_locked;
2568 				}
2569 #if CONTENT_FILTER
2570 				/*
2571 				 * Content filter processing
2572 				 */
2573 				error = cfil_sock_data_out(so, addr, top,
2574 				    control, sendflags, dgram_flow_entry);
2575 				if (error) {
2576 					if (error == EJUSTRETURN) {
2577 						error = 0;
2578 						goto packet_consumed;
2579 					}
2580 					goto out_locked;
2581 				}
2582 #endif /* CONTENT_FILTER */
2583 			}
2584 			error = (*so->so_proto->pr_usrreqs->pru_send)
2585 			    (so, sendflags, top, addr, control, p);
2586 
2587 packet_consumed:
2588 			if (dontroute) {
2589 				so->so_options &= ~SO_DONTROUTE;
2590 			}
2591 
2592 			clen = 0;
2593 			control = NULL;
2594 			top = NULL;
2595 			mp = &top;
2596 			if (error) {
2597 				goto out_locked;
2598 			}
2599 		} while (resid && space > 0);
2600 	} while (resid);
2601 
2602 
2603 out_locked:
2604 	if (resid > orig_resid) {
2605 		char pname[MAXCOMLEN] = {};
2606 		pid_t current_pid = proc_pid(current_proc());
2607 		proc_name(current_pid, pname, sizeof(pname));
2608 
2609 		if (sosend_assert_panic != 0) {
2610 			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2611 			    so, resid, orig_resid, pname, current_pid);
2612 		} else {
2613 			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2614 			    so->so_gencnt, resid, orig_resid, pname, current_pid);
2615 		}
2616 	}
2617 
2618 	if (sblocked) {
2619 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2620 	} else {
2621 		socket_unlock(so, 1);
2622 	}
2623 	if (top != NULL) {
2624 		m_freem(top);
2625 	}
2626 	if (control != NULL) {
2627 		m_freem(control);
2628 	}
2629 	if (freelist != NULL) {
2630 		m_freem_list(freelist);
2631 	}
2632 
2633 	if (dgram_flow_entry != NULL) {
2634 		soflow_free_flow(dgram_flow_entry);
2635 	}
2636 
2637 	soclearfastopen(so);
2638 
2639 	if (en_tracing) {
2640 		/* resid passed here is the bytes left in uio */
2641 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2642 		    VM_KERNEL_ADDRPERM(so),
2643 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2644 		    (int64_t)(orig_resid - resid));
2645 	}
2646 	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2647 	    so->so_snd.sb_cc, space, error);
2648 
2649 	return error;
2650 }
2651 
/*
 * Re-send an mbuf chain on an already-locked socket by handing it
 * directly to the protocol's pru_send (used for re-injection after
 * filtering).  Any leading MT_CONTROL mbufs embedded in "top" are
 * detached and passed separately as control data.
 */
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header.
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				/* First packet-header mbuf marks the data. */
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		/* Cut the control run off from the data mbufs. */
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());

	return error;
}
2697 
2698 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp)2699 mbuf_detach_control_from_list(struct mbuf **mp)
2700 {
2701 	struct mbuf *control = NULL;
2702 	struct mbuf *m = *mp;
2703 
2704 	if (m->m_type == MT_CONTROL) {
2705 		struct mbuf *control_end;
2706 		struct mbuf *n;
2707 
2708 		n = control_end = control = m;
2709 
2710 		/*
2711 		 * Break the chain per mbuf type
2712 		 */
2713 		while (n != NULL && n->m_type == MT_CONTROL) {
2714 			control_end = n;
2715 			n = n->m_next;
2716 		}
2717 		control_end->m_next = NULL;
2718 		*mp = n;
2719 	}
2720 	VERIFY(*mp != NULL);
2721 
2722 	return control;
2723 }
2724 
2725 /*
2726  * Supported only connected sockets (no address) without ancillary data
2727  * (control mbuf) for atomic protocols
2728  */
2729 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2730 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2731 {
2732 	struct mbuf *m;
2733 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2734 	int error, dontroute;
2735 	int atomic = sosendallatonce(so);
2736 	int sblocked = 0;
2737 	struct proc *p = current_proc();
2738 	struct mbuf *top = pktlist;
2739 	bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2740 
2741 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2742 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2743 
2744 	if (so->so_type != SOCK_DGRAM) {
2745 		error = EINVAL;
2746 		os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2747 		    error);
2748 		goto out;
2749 	}
2750 	if (atomic == 0) {
2751 		error = EINVAL;
2752 		os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2753 		    error);
2754 		goto out;
2755 	}
2756 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2757 		error = ENOTCONN;
2758 		os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2759 		    error);
2760 		goto out;
2761 	}
2762 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2763 		error = EINVAL;
2764 		os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2765 		    flags, error);
2766 		goto out;
2767 	}
2768 
2769 	socket_lock(so, 1);
2770 	so_update_last_owner_locked(so, p);
2771 	so_update_policy(so);
2772 
2773 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2774 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0);
2775 	}
2776 
2777 #if NECP
2778 	so_update_necp_policy(so, NULL, NULL);
2779 #endif /* NECP */
2780 
2781 	dontroute = (flags & MSG_DONTROUTE) &&
2782 	    (so->so_options & SO_DONTROUTE) == 0 &&
2783 	    (so->so_proto->pr_flags & PR_ATOMIC);
2784 	if (dontroute) {
2785 		so->so_options |= SO_DONTROUTE;
2786 	}
2787 
2788 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2789 
2790 	error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2791 	if (error) {
2792 		os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2793 		    error);
2794 		goto release;
2795 	}
2796 
2797 	if (!skip_filt) {
2798 		struct mbuf **prevnextp = NULL;
2799 
2800 		for (m = top; m != NULL; m = m->m_nextpkt) {
2801 			struct mbuf *control = NULL;
2802 			struct mbuf *last_control = NULL;
2803 			struct mbuf *nextpkt;
2804 
2805 			/*
2806 			 * Remove packet from the list of packets
2807 			 */
2808 			nextpkt = m->m_nextpkt;
2809 			if (prevnextp != NULL) {
2810 				*prevnextp = nextpkt;
2811 			} else {
2812 				top = nextpkt;
2813 			}
2814 			m->m_nextpkt = NULL;
2815 
2816 			/*
2817 			 * Break the chain per mbuf type
2818 			 */
2819 			if (m->m_type == MT_CONTROL) {
2820 				control = mbuf_detach_control_from_list(&m);
2821 			}
2822 			/*
2823 			 * Socket filter processing
2824 			 */
2825 			error = sflt_data_out(so, NULL, &m,
2826 			    &control, 0);
2827 			if (error != 0 && error != EJUSTRETURN) {
2828 				os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2829 				    error);
2830 				goto release;
2831 			}
2832 
2833 #if CONTENT_FILTER
2834 			if (error == 0) {
2835 				/*
2836 				 * Content filter processing
2837 				 */
2838 				error = cfil_sock_data_out(so, NULL, m,
2839 				    control, 0, dgram_flow_entry);
2840 				if (error != 0 && error != EJUSTRETURN) {
2841 					os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2842 					    error);
2843 					goto release;
2844 				}
2845 			}
2846 #endif /* CONTENT_FILTER */
2847 			if (error == EJUSTRETURN) {
2848 				/*
2849 				 * When swallowed by a filter, the packet is not
2850 				 * in the list anymore
2851 				 */
2852 				error = 0;
2853 			} else {
2854 				/*
2855 				 * Rebuild the mbuf chain of the packet
2856 				 */
2857 				if (control != NULL) {
2858 					last_control->m_next = m;
2859 					m = control;
2860 				}
2861 				/*
2862 				 * Reinsert the packet in the list of packets
2863 				 */
2864 				m->m_nextpkt = nextpkt;
2865 				if (prevnextp != NULL) {
2866 					*prevnextp = m;
2867 				} else {
2868 					top = m;
2869 				}
2870 				prevnextp = &m->m_nextpkt;
2871 			}
2872 		}
2873 	}
2874 
2875 	if (top != NULL) {
2876 		if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2877 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2878 			    (so, top, pktcnt, flags);
2879 			if (error != 0) {
2880 				os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2881 				    error);
2882 			}
2883 			top = NULL;
2884 		} else {
2885 			*pktcnt = 0;
2886 			for (m = top; m != NULL; m = top) {
2887 				struct mbuf *control = NULL;
2888 
2889 				top = m->m_nextpkt;
2890 				m->m_nextpkt = NULL;
2891 
2892 				/*
2893 				 * Break the chain per mbuf type
2894 				 */
2895 				if (m->m_type == MT_CONTROL) {
2896 					control = mbuf_detach_control_from_list(&m);
2897 				}
2898 
2899 				error = (*so->so_proto->pr_usrreqs->pru_send)
2900 				    (so, 0, m, NULL, control, current_proc());
2901 				if (error != 0) {
2902 					os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2903 					    error);
2904 					goto release;
2905 				}
2906 				*pktcnt += 1;
2907 			}
2908 		}
2909 	}
2910 
2911 release:
2912 	if (dontroute) {
2913 		so->so_options &= ~SO_DONTROUTE;
2914 	}
2915 	if (sblocked) {
2916 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2917 	} else {
2918 		socket_unlock(so, 1);
2919 	}
2920 out:
2921 	if (top != NULL) {
2922 		os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2923 		    error);
2924 		m_freem_list(top);
2925 	}
2926 
2927 	if (dgram_flow_entry != NULL) {
2928 		soflow_free_flow(dgram_flow_entry);
2929 	}
2930 
2931 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2932 	    so->so_snd.sb_cc, 0, error);
2933 
2934 	return error;
2935 }
2936 
/*
 * Consume the sender's address (the leading MT_SONAME mbuf) from the
 * record at the head of the receive socket buffer.
 *
 * On entry *mp points at the MT_SONAME mbuf and *nextrecordp at the
 * following record; both are updated on return to reflect what was
 * consumed.  If psa is non-NULL the address is duplicated into *psa
 * (allocation governed by `canwait'); otherwise, if maddrp is non-NULL,
 * the mbuf itself is unlinked and handed off to the caller via *maddrp.
 * With MSG_PEEK the socket buffer is left untouched.
 *
 * Called with the socket locked; the lock may be dropped and re-acquired
 * around the MAC policy check.
 *
 * May return ERESTART when packet is dropped by MAC policy check
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    struct mbuf **maddrp,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	/* Caller guarantees the record starts with an address mbuf. */
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		/* Account for the last mbuf of the record as well. */
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			/* Buffer drained while unlocked: our record is all there is. */
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	/* Return the address: either a duplicated sockaddr or the raw mbuf. */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			/* Caller demanded an address but allocation failed. */
			error = EWOULDBLOCK;
			goto done;
		}
	} else if (maddrp != NULL) {
		/* Hand the MT_SONAME mbuf itself to the caller. */
		*maddrp = m;
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the buffer intact, just step past the address. */
		m = m->m_next;
	} else {
		/* Unlink the address mbuf from the head of the record. */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		if (maddrp == NULL) {
			/* Nobody wants the mbuf: free it and advance sb_mb. */
			MFREE(m, so->so_rcv.sb_mb);
		} else {
			/* Ownership transferred via *maddrp above; just detach it. */
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
		}
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			/* Record fully consumed: re-link remaining records. */
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3066 
3067 /*
3068  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3069  * so clear the data portion in order not to leak the file pointers
3070  */
3071 static void
sopeek_scm_rights(struct mbuf * rights)3072 sopeek_scm_rights(struct mbuf *rights)
3073 {
3074 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3075 
3076 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3077 		VERIFY(cm->cmsg_len <= rights->m_len);
3078 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3079 	}
3080 }
3081 
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 *
 * On entry *mp points at the first MT_CONTROL mbuf of the head record
 * and *nextrecordp at the next record; both are updated on return.
 * Control messages are returned to the caller through *controlp (may be
 * NULL when the caller doesn't want them).  Called with the socket
 * locked; the lock may be dropped around dom_externalize.
 *
 * Returns 0 on success, or ENOBUFS if a MSG_PEEK copy fails.
 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;	/* tail pointer of the detached control chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;	/* first peeked copy, for cleanup on ENOBUFS */
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* Remember head of the copied chain. */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/*
				 * Scrub un-externalized SCM_RIGHTS payloads
				 * from the peeked copy (file pointers).
				 */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Unlink this control mbuf and append it to the cm chain. */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Re-link the remaining records after detaching the control mbufs. */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	/* Hand the detached control mbufs to the caller, one at a time. */
	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed: drop it. */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3222 
3223 /*
3224  * If we have less data than requested, block awaiting more
3225  * (subject to any timeout) if:
3226  *   1. the current count is less than the low water mark, or
3227  *   2. MSG_WAITALL is set, and it is possible to do the entire
3228  *	receive operation at once if we block (resid <= hiwat).
3229  *   3. MSG_DONTWAIT is not set
3230  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3231  * we have to do the receive in sections, and thus risk returning
3232  * a short count if a timeout or signal occurs after we start.
3233  */
3234 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3235 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3236 {
3237 	struct protosw *pr = so->so_proto;
3238 
3239 	/* No mbufs in the receive-queue? Wait! */
3240 	if (m == NULL) {
3241 		return true;
3242 	}
3243 
3244 	/* Not enough data in the receive socket-buffer - we may have to wait */
3245 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3246 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3247 		/*
3248 		 * Application did set the lowater-mark, so we should wait for
3249 		 * this data to be present.
3250 		 */
3251 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3252 			return true;
3253 		}
3254 
3255 		/*
3256 		 * Application wants all the data - so let's try to do the
3257 		 * receive-operation at once by waiting for everything to
3258 		 * be there.
3259 		 */
3260 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3261 			return true;
3262 		}
3263 	}
3264 
3265 	return false;
3266 }
3267 
3268 /*
3269  * Implement receive operations on a socket.
3270  * We depend on the way that records are added to the sockbuf
3271  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3272  * must begin with an address if the protocol so specifies,
3273  * followed by an optional mbuf or mbufs containing ancillary data,
3274  * and then zero or more mbufs of data.
3275  * In order to avoid blocking network interrupts for the entire time here,
3276  * we splx() while doing the actual copy to user space.
3277  * Although the sockbuf is locked, new data may still be appended,
3278  * and thus we must maintain consistency of the sockbuf during that time.
3279  *
3280  * The caller may receive the data as a single mbuf chain by supplying
3281  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3282  * only for the count in uio_resid.
3283  *
3284  * Returns:	0			Success
3285  *		ENOBUFS
3286  *		ENOTCONN
3287  *		EWOULDBLOCK
3288  *	uiomove:EFAULT
3289  *	sblock:EWOULDBLOCK
3290  *	sblock:EINTR
3291  *	sbwait:EBADF
3292  *	sbwait:EINTR
3293  *	sodelayed_copy:EFAULT
3294  *	<pru_rcvoob>:EINVAL[TCP]
3295  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3296  *	<pru_rcvoob>:???
3297  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3298  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3299  *	<pr_domain->dom_externalize>:???
3300  *
3301  * Notes:	Additional return values from calls through <pru_rcvoob> and
3302  *		<pr_domain->dom_externalize> depend on protocols other than
3303  *		TCP or AF_UNIX, which are documented above.
3304  */
3305 int
soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)3306 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3307     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3308 {
3309 	struct mbuf *m, **mp, *ml = NULL;
3310 	struct mbuf *nextrecord, *free_list;
3311 	int flags, error, offset;
3312 	user_ssize_t len;
3313 	struct protosw *pr = so->so_proto;
3314 	int moff, type = 0;
3315 	user_ssize_t orig_resid = uio_resid(uio);
3316 	user_ssize_t delayed_copy_len;
3317 	int can_delay;
3318 	struct proc *p = current_proc();
3319 	boolean_t en_tracing = FALSE;
3320 
3321 	/*
3322 	 * Sanity check on the length passed by caller as we are making 'int'
3323 	 * comparisons
3324 	 */
3325 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3326 		return EINVAL;
3327 	}
3328 
3329 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3330 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3331 	    so->so_rcv.sb_hiwat);
3332 
3333 	socket_lock(so, 1);
3334 	so_update_last_owner_locked(so, p);
3335 	so_update_policy(so);
3336 
3337 #ifdef MORE_LOCKING_DEBUG
3338 	if (so->so_usecount == 1) {
3339 		panic("%s: so=%x no other reference on socket", __func__, so);
3340 		/* NOTREACHED */
3341 	}
3342 #endif
3343 	mp = mp0;
3344 	if (psa != NULL) {
3345 		*psa = NULL;
3346 	}
3347 	if (controlp != NULL) {
3348 		*controlp = NULL;
3349 	}
3350 	if (flagsp != NULL) {
3351 		flags = *flagsp & ~MSG_EOR;
3352 	} else {
3353 		flags = 0;
3354 	}
3355 
3356 	/*
3357 	 * If a recv attempt is made on a previously-accepted socket
3358 	 * that has been marked as inactive (disconnected), reject
3359 	 * the request.
3360 	 */
3361 	if (so->so_flags & SOF_DEFUNCT) {
3362 		struct sockbuf *sb = &so->so_rcv;
3363 
3364 		error = ENOTCONN;
3365 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
3366 		    __func__, proc_pid(p), proc_best_name(p),
3367 		    so->so_gencnt,
3368 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3369 		/*
3370 		 * This socket should have been disconnected and flushed
3371 		 * prior to being returned from sodefunct(); there should
3372 		 * be no data on its receive list, so panic otherwise.
3373 		 */
3374 		if (so->so_state & SS_DEFUNCT) {
3375 			sb_empty_assert(sb, __func__);
3376 		}
3377 		socket_unlock(so, 1);
3378 		return error;
3379 	}
3380 
3381 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3382 	    pr->pr_usrreqs->pru_preconnect) {
3383 		/*
3384 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3385 		 * calling write() right after this. *If* the app calls a read
3386 		 * we do not want to block this read indefinetely. Thus,
3387 		 * we trigger a connect so that the session gets initiated.
3388 		 */
3389 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3390 
3391 		if (error) {
3392 			socket_unlock(so, 1);
3393 			return error;
3394 		}
3395 	}
3396 
3397 	if (ENTR_SHOULDTRACE &&
3398 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3399 		/*
3400 		 * enable energy tracing for inet sockets that go over
3401 		 * non-loopback interfaces only.
3402 		 */
3403 		struct inpcb *inp = sotoinpcb(so);
3404 		if (inp->inp_last_outifp != NULL &&
3405 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3406 			en_tracing = TRUE;
3407 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3408 			    VM_KERNEL_ADDRPERM(so),
3409 			    ((so->so_state & SS_NBIO) ?
3410 			    kEnTrFlagNonBlocking : 0),
3411 			    (int64_t)orig_resid);
3412 		}
3413 	}
3414 
3415 	/*
3416 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3417 	 * regardless of the flags argument. Here is the case were
3418 	 * out-of-band data is not inline.
3419 	 */
3420 	if ((flags & MSG_OOB) ||
3421 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3422 	    (so->so_options & SO_OOBINLINE) == 0 &&
3423 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3424 		m = m_get(M_WAIT, MT_DATA);
3425 		if (m == NULL) {
3426 			socket_unlock(so, 1);
3427 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3428 			    ENOBUFS, 0, 0, 0, 0);
3429 			return ENOBUFS;
3430 		}
3431 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3432 		if (error) {
3433 			goto bad;
3434 		}
3435 		socket_unlock(so, 0);
3436 		do {
3437 			error = uiomove(mtod(m, caddr_t),
3438 			    imin((int)uio_resid(uio), m->m_len), uio);
3439 			m = m_free(m);
3440 		} while (uio_resid(uio) && error == 0 && m != NULL);
3441 		socket_lock(so, 0);
3442 bad:
3443 		if (m != NULL) {
3444 			m_freem(m);
3445 		}
3446 
3447 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3448 			if (error == EWOULDBLOCK || error == EINVAL) {
3449 				/*
3450 				 * Let's try to get normal data:
3451 				 * EWOULDBLOCK: out-of-band data not
3452 				 * receive yet. EINVAL: out-of-band data
3453 				 * already read.
3454 				 */
3455 				error = 0;
3456 				goto nooob;
3457 			} else if (error == 0 && flagsp != NULL) {
3458 				*flagsp |= MSG_OOB;
3459 			}
3460 		}
3461 		socket_unlock(so, 1);
3462 		if (en_tracing) {
3463 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3464 			    VM_KERNEL_ADDRPERM(so), 0,
3465 			    (int64_t)(orig_resid - uio_resid(uio)));
3466 		}
3467 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3468 		    0, 0, 0, 0);
3469 
3470 		return error;
3471 	}
3472 nooob:
3473 	if (mp != NULL) {
3474 		*mp = NULL;
3475 	}
3476 
3477 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3478 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3479 	}
3480 
3481 	free_list = NULL;
3482 	delayed_copy_len = 0;
3483 restart:
3484 #ifdef MORE_LOCKING_DEBUG
3485 	if (so->so_usecount <= 1) {
3486 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3487 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3488 	}
3489 #endif
3490 	/*
3491 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3492 	 * and if so just return to the caller.  This could happen when
3493 	 * soreceive() is called by a socket upcall function during the
3494 	 * time the socket is freed.  The socket buffer would have been
3495 	 * locked across the upcall, therefore we cannot put this thread
3496 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3497 	 * we may livelock), because the lock on the socket buffer will
3498 	 * only be released when the upcall routine returns to its caller.
3499 	 * Because the socket has been officially closed, there can be
3500 	 * no further read on it.
3501 	 *
3502 	 * A multipath subflow socket would have its SS_NOFDREF set by
3503 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3504 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3505 	 */
3506 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3507 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3508 		socket_unlock(so, 1);
3509 		return 0;
3510 	}
3511 
3512 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3513 	if (error) {
3514 		socket_unlock(so, 1);
3515 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3516 		    0, 0, 0, 0);
3517 		if (en_tracing) {
3518 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3519 			    VM_KERNEL_ADDRPERM(so), 0,
3520 			    (int64_t)(orig_resid - uio_resid(uio)));
3521 		}
3522 		return error;
3523 	}
3524 
3525 	m = so->so_rcv.sb_mb;
3526 	if (so_should_wait(so, uio, m, flags)) {
3527 		/*
3528 		 * Panic if we notice inconsistencies in the socket's
3529 		 * receive list; both sb_mb and sb_cc should correctly
3530 		 * reflect the contents of the list, otherwise we may
3531 		 * end up with false positives during select() or poll()
3532 		 * which could put the application in a bad state.
3533 		 */
3534 		SB_MB_CHECK(&so->so_rcv);
3535 
3536 		if (so->so_error) {
3537 			if (m != NULL) {
3538 				goto dontblock;
3539 			}
3540 			error = so->so_error;
3541 			if ((flags & MSG_PEEK) == 0) {
3542 				so->so_error = 0;
3543 			}
3544 			goto release;
3545 		}
3546 		if (so->so_state & SS_CANTRCVMORE) {
3547 #if CONTENT_FILTER
3548 			/*
3549 			 * Deal with half closed connections
3550 			 */
3551 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3552 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3553 				CFIL_LOG(LOG_INFO,
3554 				    "so %llx ignore SS_CANTRCVMORE",
3555 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3556 			} else
3557 #endif /* CONTENT_FILTER */
3558 			if (m != NULL) {
3559 				goto dontblock;
3560 			} else {
3561 				goto release;
3562 			}
3563 		}
3564 		for (; m != NULL; m = m->m_next) {
3565 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3566 				m = so->so_rcv.sb_mb;
3567 				goto dontblock;
3568 			}
3569 		}
3570 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3571 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3572 			error = ENOTCONN;
3573 			goto release;
3574 		}
3575 		if (uio_resid(uio) == 0) {
3576 			goto release;
3577 		}
3578 
3579 		if ((so->so_state & SS_NBIO) ||
3580 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3581 			error = EWOULDBLOCK;
3582 			goto release;
3583 		}
3584 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3585 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3586 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3587 #if EVEN_MORE_LOCKING_DEBUG
3588 		if (socket_debug) {
3589 			printf("Waiting for socket data\n");
3590 		}
3591 #endif
3592 
3593 		/*
3594 		 * Depending on the protocol (e.g. TCP), the following
3595 		 * might cause the socket lock to be dropped and later
3596 		 * be reacquired, and more data could have arrived and
3597 		 * have been appended to the receive socket buffer by
3598 		 * the time it returns.  Therefore, we only sleep in
3599 		 * sbwait() below if and only if the wait-condition is still
3600 		 * true.
3601 		 */
3602 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3603 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3604 		}
3605 
3606 		error = 0;
3607 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3608 			error = sbwait(&so->so_rcv);
3609 		}
3610 
3611 #if EVEN_MORE_LOCKING_DEBUG
3612 		if (socket_debug) {
3613 			printf("SORECEIVE - sbwait returned %d\n", error);
3614 		}
3615 #endif
3616 		if (so->so_usecount < 1) {
3617 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3618 			    __func__, so, so->so_usecount);
3619 			/* NOTREACHED */
3620 		}
3621 		if (error) {
3622 			socket_unlock(so, 1);
3623 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3624 			    0, 0, 0, 0);
3625 			if (en_tracing) {
3626 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3627 				    VM_KERNEL_ADDRPERM(so), 0,
3628 				    (int64_t)(orig_resid - uio_resid(uio)));
3629 			}
3630 			return error;
3631 		}
3632 		goto restart;
3633 	}
3634 dontblock:
3635 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3636 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3637 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3638 	nextrecord = m->m_nextpkt;
3639 
3640 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3641 		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
3642 		    mp0 == NULL);
3643 		if (error == ERESTART) {
3644 			goto restart;
3645 		} else if (error != 0) {
3646 			goto release;
3647 		}
3648 		orig_resid = 0;
3649 	}
3650 
3651 	/*
3652 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3653 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3654 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3655 	 * perform externalization.
3656 	 */
3657 	if (m != NULL && m->m_type == MT_CONTROL) {
3658 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3659 		if (error != 0) {
3660 			goto release;
3661 		}
3662 		orig_resid = 0;
3663 	}
3664 
3665 	if (m != NULL) {
3666 		if (!(flags & MSG_PEEK)) {
3667 			/*
3668 			 * We get here because m points to an mbuf following
3669 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3670 			 * processed above.  In any case, m should be pointing
3671 			 * to the head of the mbuf chain, and the nextrecord
3672 			 * should be either NULL or equal to m->m_nextpkt.
3673 			 * See comments above about SB_LOCK.
3674 			 */
3675 			if (m != so->so_rcv.sb_mb ||
3676 			    m->m_nextpkt != nextrecord) {
3677 				panic("%s: post-control !sync so=%p m=%p "
3678 				    "nextrecord=%p\n", __func__, so, m,
3679 				    nextrecord);
3680 				/* NOTREACHED */
3681 			}
3682 			if (nextrecord == NULL) {
3683 				so->so_rcv.sb_lastrecord = m;
3684 			}
3685 		}
3686 		type = m->m_type;
3687 		if (type == MT_OOBDATA) {
3688 			flags |= MSG_OOB;
3689 		}
3690 	} else {
3691 		if (!(flags & MSG_PEEK)) {
3692 			SB_EMPTY_FIXUP(&so->so_rcv);
3693 		}
3694 	}
3695 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3696 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3697 
3698 	moff = 0;
3699 	offset = 0;
3700 
3701 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3702 		can_delay = 1;
3703 	} else {
3704 		can_delay = 0;
3705 	}
3706 
3707 	while (m != NULL &&
3708 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3709 		if (m->m_type == MT_OOBDATA) {
3710 			if (type != MT_OOBDATA) {
3711 				break;
3712 			}
3713 		} else if (type == MT_OOBDATA) {
3714 			break;
3715 		}
3716 
3717 		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3718 		    m->m_type != MT_HEADER) {
3719 			break;
3720 		}
3721 		/*
3722 		 * Make sure to allways set MSG_OOB event when getting
3723 		 * out of band data inline.
3724 		 */
3725 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3726 		    (so->so_options & SO_OOBINLINE) != 0 &&
3727 		    (so->so_state & SS_RCVATMARK) != 0) {
3728 			flags |= MSG_OOB;
3729 		}
3730 		so->so_state &= ~SS_RCVATMARK;
3731 		len = uio_resid(uio) - delayed_copy_len;
3732 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3733 			len = so->so_oobmark - offset;
3734 		}
3735 		if (len > m->m_len - moff) {
3736 			len = m->m_len - moff;
3737 		}
3738 		/*
3739 		 * If mp is set, just pass back the mbufs.
3740 		 * Otherwise copy them out via the uio, then free.
3741 		 * Sockbuf must be consistent here (points to current mbuf,
3742 		 * it points to next record) when we drop priority;
3743 		 * we must note any additions to the sockbuf when we
3744 		 * block interrupts again.
3745 		 */
3746 		if (mp == NULL) {
3747 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3748 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3749 			if (can_delay && len == m->m_len) {
3750 				/*
3751 				 * only delay the copy if we're consuming the
3752 				 * mbuf and we're NOT in MSG_PEEK mode
3753 				 * and we have enough data to make it worthwile
3754 				 * to drop and retake the lock... can_delay
3755 				 * reflects the state of the 2 latter
3756 				 * constraints moff should always be zero
3757 				 * in these cases
3758 				 */
3759 				delayed_copy_len += len;
3760 			} else {
3761 				if (delayed_copy_len) {
3762 					error = sodelayed_copy(so, uio,
3763 					    &free_list, &delayed_copy_len);
3764 
3765 					if (error) {
3766 						goto release;
3767 					}
3768 					/*
3769 					 * can only get here if MSG_PEEK is not
3770 					 * set therefore, m should point at the
3771 					 * head of the rcv queue; if it doesn't,
3772 					 * it means something drastically
3773 					 * changed while we were out from behind
3774 					 * the lock in sodelayed_copy. perhaps
3775 					 * a RST on the stream. in any event,
3776 					 * the stream has been interrupted. it's
3777 					 * probably best just to return whatever
3778 					 * data we've moved and let the caller
3779 					 * sort it out...
3780 					 */
3781 					if (m != so->so_rcv.sb_mb) {
3782 						break;
3783 					}
3784 				}
3785 				socket_unlock(so, 0);
3786 				error = uiomove(mtod(m, caddr_t) + moff,
3787 				    (int)len, uio);
3788 				socket_lock(so, 0);
3789 
3790 				if (error) {
3791 					goto release;
3792 				}
3793 			}
3794 		} else {
3795 			uio_setresid(uio, (uio_resid(uio) - len));
3796 		}
3797 		if (len == m->m_len - moff) {
3798 			if (m->m_flags & M_EOR) {
3799 				flags |= MSG_EOR;
3800 			}
3801 			if (flags & MSG_PEEK) {
3802 				m = m->m_next;
3803 				moff = 0;
3804 			} else {
3805 				nextrecord = m->m_nextpkt;
3806 				sbfree(&so->so_rcv, m);
3807 				m->m_nextpkt = NULL;
3808 
3809 				if (mp != NULL) {
3810 					*mp = m;
3811 					mp = &m->m_next;
3812 					so->so_rcv.sb_mb = m = m->m_next;
3813 					*mp = NULL;
3814 				} else {
3815 					if (free_list == NULL) {
3816 						free_list = m;
3817 					} else {
3818 						ml->m_next = m;
3819 					}
3820 					ml = m;
3821 					so->so_rcv.sb_mb = m = m->m_next;
3822 					ml->m_next = NULL;
3823 				}
3824 				if (m != NULL) {
3825 					m->m_nextpkt = nextrecord;
3826 					if (nextrecord == NULL) {
3827 						so->so_rcv.sb_lastrecord = m;
3828 					}
3829 				} else {
3830 					so->so_rcv.sb_mb = nextrecord;
3831 					SB_EMPTY_FIXUP(&so->so_rcv);
3832 				}
3833 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3834 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3835 			}
3836 		} else {
3837 			if (flags & MSG_PEEK) {
3838 				moff += len;
3839 			} else {
3840 				if (mp != NULL) {
3841 					int copy_flag;
3842 
3843 					if (flags & MSG_DONTWAIT) {
3844 						copy_flag = M_DONTWAIT;
3845 					} else {
3846 						copy_flag = M_WAIT;
3847 					}
3848 					*mp = m_copym(m, 0, (int)len, copy_flag);
3849 					/*
3850 					 * Failed to allocate an mbuf?
3851 					 * Adjust uio_resid back, it was
3852 					 * adjusted down by len bytes which
3853 					 * we didn't copy over.
3854 					 */
3855 					if (*mp == NULL) {
3856 						uio_setresid(uio,
3857 						    (uio_resid(uio) + len));
3858 						break;
3859 					}
3860 				}
3861 				m->m_data += len;
3862 				m->m_len -= len;
3863 				so->so_rcv.sb_cc -= len;
3864 			}
3865 		}
3866 		if (so->so_oobmark) {
3867 			if ((flags & MSG_PEEK) == 0) {
3868 				so->so_oobmark -= len;
3869 				if (so->so_oobmark == 0) {
3870 					so->so_state |= SS_RCVATMARK;
3871 					break;
3872 				}
3873 			} else {
3874 				offset += len;
3875 				if (offset == so->so_oobmark) {
3876 					break;
3877 				}
3878 			}
3879 		}
3880 		if (flags & MSG_EOR) {
3881 			break;
3882 		}
3883 		/*
3884 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3885 		 * (for non-atomic socket), we must not quit until
3886 		 * "uio->uio_resid == 0" or an error termination.
3887 		 * If a signal/timeout occurs, return with a short
3888 		 * count but without error.  Keep sockbuf locked
3889 		 * against other readers.
3890 		 */
3891 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3892 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3893 		    !sosendallatonce(so) && !nextrecord) {
3894 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3895 #if CONTENT_FILTER
3896 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3897 #endif /* CONTENT_FILTER */
3898 			    )) {
3899 				goto release;
3900 			}
3901 
3902 			/*
3903 			 * Depending on the protocol (e.g. TCP), the following
3904 			 * might cause the socket lock to be dropped and later
3905 			 * be reacquired, and more data could have arrived and
3906 			 * have been appended to the receive socket buffer by
3907 			 * the time it returns.  Therefore, we only sleep in
3908 			 * sbwait() below if and only if the socket buffer is
3909 			 * empty, in order to avoid a false sleep.
3910 			 */
3911 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3912 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3913 			}
3914 
3915 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3916 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3917 
3918 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3919 				error = 0;
3920 				goto release;
3921 			}
3922 			/*
3923 			 * have to wait until after we get back from the sbwait
3924 			 * to do the copy because we will drop the lock if we
3925 			 * have enough data that has been delayed... by dropping
3926 			 * the lock we open up a window allowing the netisr
3927 			 * thread to process the incoming packets and to change
3928 			 * the state of this socket... we're issuing the sbwait
3929 			 * because the socket is empty and we're expecting the
3930 			 * netisr thread to wake us up when more packets arrive;
3931 			 * if we allow that processing to happen and then sbwait
3932 			 * we could stall forever with packets sitting in the
3933 			 * socket if no further packets arrive from the remote
3934 			 * side.
3935 			 *
3936 			 * we want to copy before we've collected all the data
3937 			 * to satisfy this request to allow the copy to overlap
3938 			 * the incoming packet processing on an MP system
3939 			 */
3940 			if (delayed_copy_len > sorecvmincopy &&
3941 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3942 				error = sodelayed_copy(so, uio,
3943 				    &free_list, &delayed_copy_len);
3944 
3945 				if (error) {
3946 					goto release;
3947 				}
3948 			}
3949 			m = so->so_rcv.sb_mb;
3950 			if (m != NULL) {
3951 				nextrecord = m->m_nextpkt;
3952 			}
3953 			SB_MB_CHECK(&so->so_rcv);
3954 		}
3955 	}
3956 #ifdef MORE_LOCKING_DEBUG
3957 	if (so->so_usecount <= 1) {
3958 		panic("%s: after big while so=%p ref=%d on socket",
3959 		    __func__, so, so->so_usecount);
3960 		/* NOTREACHED */
3961 	}
3962 #endif
3963 
3964 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3965 		if (so->so_options & SO_DONTTRUNC) {
3966 			flags |= MSG_RCVMORE;
3967 		} else {
3968 			flags |= MSG_TRUNC;
3969 			if ((flags & MSG_PEEK) == 0) {
3970 				(void) sbdroprecord(&so->so_rcv);
3971 			}
3972 		}
3973 	}
3974 
3975 	/*
3976 	 * pru_rcvd below (for TCP) may cause more data to be received
3977 	 * if the socket lock is dropped prior to sending the ACK; some
3978 	 * legacy OpenTransport applications don't handle this well
3979 	 * (if it receives less data than requested while MSG_HAVEMORE
3980 	 * is set), and so we set the flag now based on what we know
3981 	 * prior to calling pru_rcvd.
3982 	 */
3983 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3984 		flags |= MSG_HAVEMORE;
3985 	}
3986 
3987 	if ((flags & MSG_PEEK) == 0) {
3988 		if (m == NULL) {
3989 			so->so_rcv.sb_mb = nextrecord;
3990 			/*
3991 			 * First part is an inline SB_EMPTY_FIXUP().  Second
3992 			 * part makes sure sb_lastrecord is up-to-date if
3993 			 * there is still data in the socket buffer.
3994 			 */
3995 			if (so->so_rcv.sb_mb == NULL) {
3996 				so->so_rcv.sb_mbtail = NULL;
3997 				so->so_rcv.sb_lastrecord = NULL;
3998 			} else if (nextrecord->m_nextpkt == NULL) {
3999 				so->so_rcv.sb_lastrecord = nextrecord;
4000 			}
4001 			SB_MB_CHECK(&so->so_rcv);
4002 		}
4003 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4004 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4005 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4006 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4007 		}
4008 	}
4009 
4010 	if (delayed_copy_len) {
4011 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4012 		if (error) {
4013 			goto release;
4014 		}
4015 	}
4016 	if (free_list != NULL) {
4017 		m_freem_list(free_list);
4018 		free_list = NULL;
4019 	}
4020 
4021 	if (orig_resid == uio_resid(uio) && orig_resid &&
4022 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4023 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4024 		goto restart;
4025 	}
4026 
4027 	if (flagsp != NULL) {
4028 		*flagsp |= flags;
4029 	}
4030 release:
4031 #ifdef MORE_LOCKING_DEBUG
4032 	if (so->so_usecount <= 1) {
4033 		panic("%s: release so=%p ref=%d on socket", __func__,
4034 		    so, so->so_usecount);
4035 		/* NOTREACHED */
4036 	}
4037 #endif
4038 	if (delayed_copy_len) {
4039 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4040 	}
4041 
4042 	if (free_list != NULL) {
4043 		m_freem_list(free_list);
4044 	}
4045 
4046 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4047 
4048 	if (en_tracing) {
4049 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4050 		    VM_KERNEL_ADDRPERM(so),
4051 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4052 		    (int64_t)(orig_resid - uio_resid(uio)));
4053 	}
4054 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4055 	    so->so_rcv.sb_cc, 0, error);
4056 
4057 	return error;
4058 }
4059 
4060 /*
4061  * Returns:	0			Success
4062  *	uiomove:EFAULT
4063  */
4064 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4065 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4066     user_ssize_t *resid)
4067 {
4068 	int error = 0;
4069 	struct mbuf *m;
4070 
4071 	m = *free_list;
4072 
4073 	socket_unlock(so, 0);
4074 
4075 	while (m != NULL && error == 0) {
4076 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4077 		m = m->m_next;
4078 	}
4079 	m_freem_list(*free_list);
4080 
4081 	*free_list = NULL;
4082 	*resid = 0;
4083 
4084 	socket_lock(so, 0);
4085 
4086 	return error;
4087 }
4088 
/*
 * soreceive_m_list: receive multiple datagram records at once as mbuf chains.
 *
 * Up to *pktcntp records are dequeued from the socket's receive buffer
 * without copying data to user space.  On return *pktcntp holds the number
 * of records actually received and *mp0 heads a list of data mbuf chains
 * linked through m_nextpkt.  When maddrp/controlp are non-NULL they receive
 * parallel m_nextpkt-linked lists of each record's source-address and
 * control mbufs; otherwise those mbufs are collected on a local free list
 * and released before returning.
 *
 * Returns:	0			Success
 *		EINVAL			Bad parameters (see sanity checks)
 *		ENOTCONN		Defunct socket, or unconnected socket
 *					on a connection-required protocol
 *		EWOULDBLOCK		Non-blocking and no data available
 */
int
soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	struct mbuf *nextrecord;
	int flags, error;
	struct protosw *pr = so->so_proto;
	struct proc *p = current_proc();
	u_int npkts = 0;                /* records received so far */
	struct mbuf *free_list = NULL;  /* mbufs to dispose of on exit */
	int sblocked = 0;               /* nonzero while sockbuf lock held */

	/*
	 * Sanity check on the parameters passed by caller
	 */
	if (mp0 == NULL || pktcntp == NULL) {
		return EINVAL;
	}
	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
		return EINVAL;
	}

	mp = mp0;
	*mp0 = NULL;
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (maddrp != NULL) {
		*maddrp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

	*mp = NULL;

restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto out;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto out;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		for (; m != NULL; m = m->m_next) {
			if (m->m_flags & M_EOR) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		/* Release the sockbuf lock while waiting for data to arrive */
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error != 0) {
			goto release;
		}
		goto restart;
	}
dontblock:
	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	/*
	 * For PR_ADDR protocols an MT_SONAME mbuf at the head of the
	 * record carries the datagram's source address.
	 */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		struct mbuf *maddr = NULL;

		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
		    &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}

		if (maddr != NULL) {
			maddr->m_nextpkt = NULL;
			maddr->m_next = NULL;
			if (maddrp != NULL) {
				*maddrp = maddr;
				maddrp = &maddr->m_nextpkt;
			} else {
				/* caller does not want addresses: free later */
				maddr->m_next = free_list;
				free_list = maddr;
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.
	 * We call into the protocol to perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *control = NULL;

		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		if (control != NULL) {
			control->m_nextpkt = NULL;
			control->m_next = NULL;
			if (controlp != NULL) {
				*controlp = control;
				controlp = &control->m_nextpkt;
			} else {
				/* caller does not want control data: free later */
				control->m_next = free_list;
				free_list = control;
			}
		}
	}

	/*
	 * Link the packet to the list
	 */
	if (m != NULL) {
		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
		    m->m_type != MT_HEADER) {
			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
		}
		m->m_nextpkt = NULL;
		*mp = m;
		mp = &m->m_nextpkt;
	}
	/* Adjust sockbuf accounting for every mbuf removed from it */
	while (m != NULL) {
		sbfree(&so->so_rcv, m);

		m = m->m_next;
	}

	so->so_rcv.sb_mb = nextrecord;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second
	 * part makes sure sb_lastrecord is up-to-date if
	 * there is still data in the socket buffer.
	 */
	if (so->so_rcv.sb_mb == NULL) {
		so->so_rcv.sb_mbtail = NULL;
		so->so_rcv.sb_lastrecord = NULL;
	} else if (nextrecord->m_nextpkt == NULL) {
		so->so_rcv.sb_lastrecord = nextrecord;
	}
	SB_MB_CHECK(&so->so_rcv);

	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	npkts += 1;

	/*
	 * Continue as long as we have received fewer packets than
	 * requested and the socket buffer is not empty; with MSG_WAITALL
	 * we are willing to block again for further packets.
	 */
	if (npkts < *pktcntp) {
		if (so->so_rcv.sb_mb != NULL) {
			goto dontblock;
		}
		if ((flags & MSG_WAITALL) != 0) {
			goto restart;
		}
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

out:
	*pktcntp = npkts;
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4397 
4398 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4399 so_statistics_event_to_nstat_event(int64_t *input_options,
4400     uint64_t *nstat_event)
4401 {
4402 	int error = 0;
4403 	switch (*input_options) {
4404 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4405 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4406 		break;
4407 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4408 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4409 		break;
4410 #if (DEBUG || DEVELOPMENT)
4411 	case SO_STATISTICS_EVENT_RESERVED_1:
4412 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4413 		break;
4414 	case SO_STATISTICS_EVENT_RESERVED_2:
4415 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4416 		break;
4417 #endif /* (DEBUG || DEVELOPMENT) */
4418 	default:
4419 		error = EINVAL;
4420 		break;
4421 	}
4422 	return error;
4423 }
4424 
4425 /*
4426  * Returns:	0			Success
4427  *		EINVAL
4428  *		ENOTCONN
4429  *	<pru_shutdown>:EINVAL
4430  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4431  *	<pru_shutdown>:ENOBUFS[TCP]
4432  *	<pru_shutdown>:EMSGSIZE[TCP]
4433  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4434  *	<pru_shutdown>:ENETUNREACH[TCP]
4435  *	<pru_shutdown>:ENETDOWN[TCP]
4436  *	<pru_shutdown>:ENOMEM[TCP]
4437  *	<pru_shutdown>:EACCES[TCP]
4438  *	<pru_shutdown>:EMSGSIZE[TCP]
4439  *	<pru_shutdown>:ENOBUFS[TCP]
4440  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4441  *	<pru_shutdown>:???		[other protocol families]
4442  */
4443 int
soshutdown(struct socket * so,int how)4444 soshutdown(struct socket *so, int how)
4445 {
4446 	int error;
4447 
4448 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4449 
4450 	switch (how) {
4451 	case SHUT_RD:
4452 	case SHUT_WR:
4453 	case SHUT_RDWR:
4454 		socket_lock(so, 1);
4455 		if ((so->so_state &
4456 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4457 			error = ENOTCONN;
4458 		} else {
4459 			error = soshutdownlock(so, how);
4460 		}
4461 		socket_unlock(so, 1);
4462 		break;
4463 	default:
4464 		error = EINVAL;
4465 		break;
4466 	}
4467 
4468 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4469 
4470 	return error;
4471 }
4472 
4473 int
soshutdownlock_final(struct socket * so,int how)4474 soshutdownlock_final(struct socket *so, int how)
4475 {
4476 	struct protosw *pr = so->so_proto;
4477 	int error = 0;
4478 
4479 	sflt_notify(so, sock_evt_shutdown, &how);
4480 
4481 	if (how != SHUT_WR) {
4482 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4483 			/* read already shut down */
4484 			error = ENOTCONN;
4485 			goto done;
4486 		}
4487 		sorflush(so);
4488 	}
4489 	if (how != SHUT_RD) {
4490 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4491 			/* write already shut down */
4492 			error = ENOTCONN;
4493 			goto done;
4494 		}
4495 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4496 	}
4497 done:
4498 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4499 	return error;
4500 }
4501 
/*
 * soshutdownlock: shutdown with the socket lock already held.
 *
 * Gives an attached content filter the opportunity to defer or reject
 * the shutdown before the real work is done in soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* Filter completes the shutdown later; report success */
			error = 0;
			goto done;
		} else if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return error;
}
4528 
/*
 * sowflush: tear down the send side of a socket.
 *
 * Under the sockbuf lock, disables select/upcall notifications and marks
 * the buffer SB_DROP so no further data can be appended, then clears the
 * select-thread linkage and releases the buffer via sbrelease().
 */
void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/* Disable notifications and bar further appends before unlocking */
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/* SB_SEL was cleared above, so no new select wakeups can be queued */
	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}
4554 
/*
 * sorflush: flush and tear down the receive side of a socket.
 *
 * Notifies socket filters, marks the socket unable to receive more data,
 * then — under the sockbuf lock — detaches the receive buffer's contents
 * into a local sockbuf ("asb") so they can be disposed of (including any
 * in-flight rights via dom_dispose) after the original buffer has been
 * reset and unlocked.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc               = sb->sb_cc;
	asb.sb_hiwat            = sb->sb_hiwat;
	asb.sb_mbcnt            = sb->sb_mbcnt;
	asb.sb_mbmax            = sb->sb_mbmax;
	asb.sb_ctl              = sb->sb_ctl;
	asb.sb_lowat            = sb->sb_lowat;
	asb.sb_mb               = sb->sb_mb;
	asb.sb_mbtail           = sb->sb_mbtail;
	asb.sb_lastrecord       = sb->sb_lastrecord;
	asb.sb_so               = sb->sb_so;
	asb.sb_flags            = sb->sb_flags;
	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags            |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc               = 0;
	sb->sb_hiwat            = 0;
	sb->sb_mbcnt            = 0;
	sb->sb_mbmax            = 0;
	sb->sb_ctl              = 0;
	sb->sb_lowat            = 0;
	sb->sb_mb               = NULL;
	sb->sb_mbtail           = NULL;
	sb->sb_lastrecord       = NULL;
	sb->sb_timeo.tv_sec     = 0;
	sb->sb_timeo.tv_usec    = 0;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let the protocol domain dispose of any rights (e.g. passed fds) */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4655 
4656 /*
4657  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4658  * an additional variant to handle the case where the option value needs
4659  * to be some kind of integer, but not a specific size.
4660  * In addition to their use here, these functions are also called by the
4661  * protocol-level pr_ctloutput() routines.
4662  *
4663  * Returns:	0			Success
4664  *		EINVAL
4665  *	copyin:EFAULT
4666  */
4667 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4668 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4669 {
4670 	size_t  valsize;
4671 
4672 	/*
4673 	 * If the user gives us more than we wanted, we ignore it,
4674 	 * but if we don't get the minimum length the caller
4675 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4676 	 * is set to however much we actually retrieved.
4677 	 */
4678 	if ((valsize = sopt->sopt_valsize) < minlen) {
4679 		return EINVAL;
4680 	}
4681 	if (valsize > len) {
4682 		sopt->sopt_valsize = valsize = len;
4683 	}
4684 
4685 	if (sopt->sopt_p != kernproc) {
4686 		return copyin(sopt->sopt_val, buf, valsize);
4687 	}
4688 
4689 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4690 	return 0;
4691 }
4692 
/*
 * sooptcopyin_timeval
 *   Copy in a timeval value into tv_p, and take into account whether
 *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *   code here so that we can verify the 64-bit tv_sec value before we lose
 *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 *
 * Returns:	0	Success
 *		EINVAL	Option value smaller than the expected timeval
 *		EDOM	tv_sec/tv_usec outside the accepted range
 *	copyin:EFAULT
 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int                     error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval   tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		/* Kernel callers pass a kernel pointer; bcopy instead of copyin */
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof(tv64));
		}
		/* Validate before the narrowing assignment below */
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval   tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof(tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
4761 
4762 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4763 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4764     boolean_t ignore_delegate)
4765 {
4766 	kauth_cred_t cred =  NULL;
4767 	proc_t ep = PROC_NULL;
4768 	uid_t uid;
4769 	int error = 0;
4770 
4771 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4772 		ep = proc_find(so->e_pid);
4773 		if (ep) {
4774 			cred = kauth_cred_proc_ref(ep);
4775 		}
4776 	}
4777 
4778 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4779 
4780 	/* uid is 0 for root */
4781 	if (uid != 0 || !allow_root) {
4782 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4783 	}
4784 	if (cred) {
4785 		kauth_cred_unref(&cred);
4786 	}
4787 	if (ep != PROC_NULL) {
4788 		proc_rele(ep);
4789 	}
4790 
4791 	return error;
4792 }
4793 
4794 /*
4795  * Returns:	0			Success
4796  *		EINVAL
4797  *		ENOPROTOOPT
4798  *		ENOBUFS
4799  *		EDOM
4800  *	sooptcopyin:EINVAL
4801  *	sooptcopyin:EFAULT
4802  *	sooptcopyin_timeval:EINVAL
4803  *	sooptcopyin_timeval:EFAULT
4804  *	sooptcopyin_timeval:EDOM
4805  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4806  *	<pr_ctloutput>:???w
4807  *	sflt_attach_private:???		[whatever a filter author chooses]
4808  *	<sf_setoption>:???		[whatever a filter author chooses]
4809  *
4810  * Notes:	Other <pru_listen> returns depend on the protocol family; all
4811  *		<sf_listen> returns depend on what the filter author causes
4812  *		their filter to return.
4813  */
4814 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4815 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4816 {
4817 	int     error, optval;
4818 	int64_t long_optval;
4819 	struct  linger l;
4820 	struct  timeval tv;
4821 
4822 	if (sopt->sopt_dir != SOPT_SET) {
4823 		sopt->sopt_dir = SOPT_SET;
4824 	}
4825 
4826 	if (dolock) {
4827 		socket_lock(so, 1);
4828 	}
4829 
4830 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4831 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4832 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4833 		/* the socket has been shutdown, no more sockopt's */
4834 		error = EINVAL;
4835 		goto out;
4836 	}
4837 
4838 	error = sflt_setsockopt(so, sopt);
4839 	if (error != 0) {
4840 		if (error == EJUSTRETURN) {
4841 			error = 0;
4842 		}
4843 		goto out;
4844 	}
4845 
4846 	if (sopt->sopt_level != SOL_SOCKET) {
4847 		if (so->so_proto != NULL &&
4848 		    so->so_proto->pr_ctloutput != NULL) {
4849 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4850 			goto out;
4851 		}
4852 		error = ENOPROTOOPT;
4853 	} else {
4854 		/*
4855 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4856 		 * the protocol layer, if needed.  A zero value returned from
4857 		 * the handler means use default socket-level processing as
4858 		 * done by the rest of this routine.  Otherwise, any other
4859 		 * return value indicates that the option is unsupported.
4860 		 */
4861 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4862 		    pru_socheckopt(so, sopt)) != 0) {
4863 			goto out;
4864 		}
4865 
4866 		error = 0;
4867 		switch (sopt->sopt_name) {
4868 		case SO_LINGER:
4869 		case SO_LINGER_SEC: {
4870 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4871 			if (error != 0) {
4872 				goto out;
4873 			}
4874 			/* Make sure to use sane values */
4875 			if (sopt->sopt_name == SO_LINGER) {
4876 				so->so_linger = (short)l.l_linger;
4877 			} else {
4878 				so->so_linger = (short)((long)l.l_linger * hz);
4879 			}
4880 			if (l.l_onoff != 0) {
4881 				so->so_options |= SO_LINGER;
4882 			} else {
4883 				so->so_options &= ~SO_LINGER;
4884 			}
4885 			break;
4886 		}
4887 		case SO_DEBUG:
4888 		case SO_KEEPALIVE:
4889 		case SO_DONTROUTE:
4890 		case SO_USELOOPBACK:
4891 		case SO_BROADCAST:
4892 		case SO_REUSEADDR:
4893 		case SO_REUSEPORT:
4894 		case SO_OOBINLINE:
4895 		case SO_TIMESTAMP:
4896 		case SO_TIMESTAMP_MONOTONIC:
4897 		case SO_TIMESTAMP_CONTINUOUS:
4898 		case SO_DONTTRUNC:
4899 		case SO_WANTMORE:
4900 		case SO_WANTOOBFLAG:
4901 		case SO_NOWAKEFROMSLEEP:
4902 		case SO_NOAPNFALLBK:
4903 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4904 			    sizeof(optval));
4905 			if (error != 0) {
4906 				goto out;
4907 			}
4908 			if (optval) {
4909 				so->so_options |= sopt->sopt_name;
4910 			} else {
4911 				so->so_options &= ~sopt->sopt_name;
4912 			}
4913 #if SKYWALK
4914 			inp_update_netns_flags(so);
4915 #endif /* SKYWALK */
4916 			break;
4917 
4918 		case SO_SNDBUF:
4919 		case SO_RCVBUF:
4920 		case SO_SNDLOWAT:
4921 		case SO_RCVLOWAT:
4922 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4923 			    sizeof(optval));
4924 			if (error != 0) {
4925 				goto out;
4926 			}
4927 
4928 			/*
4929 			 * Values < 1 make no sense for any of these
4930 			 * options, so disallow them.
4931 			 */
4932 			if (optval < 1) {
4933 				error = EINVAL;
4934 				goto out;
4935 			}
4936 
4937 			switch (sopt->sopt_name) {
4938 			case SO_SNDBUF:
4939 			case SO_RCVBUF: {
4940 				struct sockbuf *sb =
4941 				    (sopt->sopt_name == SO_SNDBUF) ?
4942 				    &so->so_snd : &so->so_rcv;
4943 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
4944 					error = ENOBUFS;
4945 					goto out;
4946 				}
4947 				sb->sb_flags |= SB_USRSIZE;
4948 				sb->sb_flags &= ~SB_AUTOSIZE;
4949 				sb->sb_idealsize = (u_int32_t)optval;
4950 				break;
4951 			}
4952 			/*
4953 			 * Make sure the low-water is never greater than
4954 			 * the high-water.
4955 			 */
4956 			case SO_SNDLOWAT: {
4957 				int space = sbspace(&so->so_snd);
4958 				uint32_t hiwat = so->so_snd.sb_hiwat;
4959 
4960 				if (so->so_snd.sb_flags & SB_UNIX) {
4961 					struct unpcb *unp =
4962 					    (struct unpcb *)(so->so_pcb);
4963 					if (unp != NULL &&
4964 					    unp->unp_conn != NULL) {
4965 						struct socket *so2 = unp->unp_conn->unp_socket;
4966 						hiwat += unp->unp_conn->unp_cc;
4967 						space = sbspace(&so2->so_rcv);
4968 					}
4969 				}
4970 
4971 				so->so_snd.sb_lowat =
4972 				    (optval > hiwat) ?
4973 				    hiwat : optval;
4974 
4975 				if (space >= so->so_snd.sb_lowat) {
4976 					sowwakeup(so);
4977 				}
4978 				break;
4979 			}
4980 			case SO_RCVLOWAT: {
4981 				int64_t data_len;
4982 				so->so_rcv.sb_lowat =
4983 				    (optval > so->so_rcv.sb_hiwat) ?
4984 				    so->so_rcv.sb_hiwat : optval;
4985 				if (so->so_rcv.sb_flags & SB_UNIX) {
4986 					struct unpcb *unp =
4987 					    (struct unpcb *)(so->so_pcb);
4988 					if (unp != NULL &&
4989 					    unp->unp_conn != NULL) {
4990 						struct socket *so2 = unp->unp_conn->unp_socket;
4991 						data_len = so2->so_snd.sb_cc
4992 						    - so2->so_snd.sb_ctl;
4993 					} else {
4994 						data_len = so->so_rcv.sb_cc
4995 						    - so->so_rcv.sb_ctl;
4996 					}
4997 				} else {
4998 					data_len = so->so_rcv.sb_cc
4999 					    - so->so_rcv.sb_ctl;
5000 				}
5001 
5002 				if (data_len >= so->so_rcv.sb_lowat) {
5003 					sorwakeup(so);
5004 				}
5005 				break;
5006 			}
5007 			}
5008 			break;
5009 
5010 		case SO_SNDTIMEO:
5011 		case SO_RCVTIMEO:
5012 			error = sooptcopyin_timeval(sopt, &tv);
5013 			if (error != 0) {
5014 				goto out;
5015 			}
5016 
5017 			switch (sopt->sopt_name) {
5018 			case SO_SNDTIMEO:
5019 				so->so_snd.sb_timeo = tv;
5020 				break;
5021 			case SO_RCVTIMEO:
5022 				so->so_rcv.sb_timeo = tv;
5023 				break;
5024 			}
5025 			break;
5026 
5027 		case SO_NKE: {
5028 			struct so_nke nke;
5029 
5030 			error = sooptcopyin(sopt, &nke, sizeof(nke),
5031 			    sizeof(nke));
5032 			if (error != 0) {
5033 				goto out;
5034 			}
5035 
5036 			error = sflt_attach_internal(so, nke.nke_handle);
5037 			break;
5038 		}
5039 
5040 		case SO_NOSIGPIPE:
5041 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5042 			    sizeof(optval));
5043 			if (error != 0) {
5044 				goto out;
5045 			}
5046 			if (optval != 0) {
5047 				so->so_flags |= SOF_NOSIGPIPE;
5048 			} else {
5049 				so->so_flags &= ~SOF_NOSIGPIPE;
5050 			}
5051 			break;
5052 
5053 		case SO_NOADDRERR:
5054 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5055 			    sizeof(optval));
5056 			if (error != 0) {
5057 				goto out;
5058 			}
5059 			if (optval != 0) {
5060 				so->so_flags |= SOF_NOADDRAVAIL;
5061 			} else {
5062 				so->so_flags &= ~SOF_NOADDRAVAIL;
5063 			}
5064 			break;
5065 
5066 		case SO_REUSESHAREUID:
5067 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5068 			    sizeof(optval));
5069 			if (error != 0) {
5070 				goto out;
5071 			}
5072 			if (optval != 0) {
5073 				so->so_flags |= SOF_REUSESHAREUID;
5074 			} else {
5075 				so->so_flags &= ~SOF_REUSESHAREUID;
5076 			}
5077 			break;
5078 
5079 		case SO_NOTIFYCONFLICT:
5080 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5081 				error = EPERM;
5082 				goto out;
5083 			}
5084 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5085 			    sizeof(optval));
5086 			if (error != 0) {
5087 				goto out;
5088 			}
5089 			if (optval != 0) {
5090 				so->so_flags |= SOF_NOTIFYCONFLICT;
5091 			} else {
5092 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5093 			}
5094 			break;
5095 
5096 		case SO_RESTRICTIONS:
5097 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5098 			    sizeof(optval));
5099 			if (error != 0) {
5100 				goto out;
5101 			}
5102 
5103 			error = so_set_restrictions(so, optval);
5104 			break;
5105 
5106 		case SO_AWDL_UNRESTRICTED:
5107 			if (SOCK_DOM(so) != PF_INET &&
5108 			    SOCK_DOM(so) != PF_INET6) {
5109 				error = EOPNOTSUPP;
5110 				goto out;
5111 			}
5112 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5113 			    sizeof(optval));
5114 			if (error != 0) {
5115 				goto out;
5116 			}
5117 			if (optval != 0) {
5118 				error = soopt_cred_check(so,
5119 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5120 				if (error == 0) {
5121 					inp_set_awdl_unrestricted(
5122 						sotoinpcb(so));
5123 				}
5124 			} else {
5125 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5126 			}
5127 			break;
5128 		case SO_INTCOPROC_ALLOW:
5129 			if (SOCK_DOM(so) != PF_INET6) {
5130 				error = EOPNOTSUPP;
5131 				goto out;
5132 			}
5133 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5134 			    sizeof(optval));
5135 			if (error != 0) {
5136 				goto out;
5137 			}
5138 			if (optval != 0 &&
5139 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5140 				error = soopt_cred_check(so,
5141 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5142 				if (error == 0) {
5143 					inp_set_intcoproc_allowed(
5144 						sotoinpcb(so));
5145 				}
5146 			} else if (optval == 0) {
5147 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5148 			}
5149 			break;
5150 
5151 		case SO_LABEL:
5152 			error = EOPNOTSUPP;
5153 			break;
5154 
5155 		case SO_UPCALLCLOSEWAIT:
5156 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5157 			    sizeof(optval));
5158 			if (error != 0) {
5159 				goto out;
5160 			}
5161 			if (optval != 0) {
5162 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5163 			} else {
5164 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5165 			}
5166 			break;
5167 
5168 		case SO_RANDOMPORT:
5169 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5170 			    sizeof(optval));
5171 			if (error != 0) {
5172 				goto out;
5173 			}
5174 			if (optval != 0) {
5175 				so->so_flags |= SOF_BINDRANDOMPORT;
5176 			} else {
5177 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5178 			}
5179 			break;
5180 
5181 		case SO_NP_EXTENSIONS: {
5182 			struct so_np_extensions sonpx;
5183 
5184 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5185 			    sizeof(sonpx));
5186 			if (error != 0) {
5187 				goto out;
5188 			}
5189 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5190 				error = EINVAL;
5191 				goto out;
5192 			}
5193 			/*
5194 			 * Only one bit defined for now
5195 			 */
5196 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5197 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5198 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5199 				} else {
5200 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5201 				}
5202 			}
5203 			break;
5204 		}
5205 
5206 		case SO_TRAFFIC_CLASS: {
5207 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5208 			    sizeof(optval));
5209 			if (error != 0) {
5210 				goto out;
5211 			}
5212 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5213 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5214 				error = so_set_net_service_type(so, netsvc);
5215 				goto out;
5216 			}
5217 			error = so_set_traffic_class(so, optval);
5218 			if (error != 0) {
5219 				goto out;
5220 			}
5221 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5222 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5223 			break;
5224 		}
5225 
5226 		case SO_RECV_TRAFFIC_CLASS: {
5227 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5228 			    sizeof(optval));
5229 			if (error != 0) {
5230 				goto out;
5231 			}
5232 			if (optval == 0) {
5233 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5234 			} else {
5235 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5236 			}
5237 			break;
5238 		}
5239 
5240 #if (DEVELOPMENT || DEBUG)
5241 		case SO_TRAFFIC_CLASS_DBG: {
5242 			struct so_tcdbg so_tcdbg;
5243 
5244 			error = sooptcopyin(sopt, &so_tcdbg,
5245 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5246 			if (error != 0) {
5247 				goto out;
5248 			}
5249 			error = so_set_tcdbg(so, &so_tcdbg);
5250 			if (error != 0) {
5251 				goto out;
5252 			}
5253 			break;
5254 		}
5255 #endif /* (DEVELOPMENT || DEBUG) */
5256 
5257 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5258 			error = priv_check_cred(kauth_cred_get(),
5259 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5260 			if (error != 0) {
5261 				goto out;
5262 			}
5263 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5264 			    sizeof(optval));
5265 			if (error != 0) {
5266 				goto out;
5267 			}
5268 			if (optval == 0) {
5269 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5270 			} else {
5271 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5272 			}
5273 			break;
5274 
5275 #if (DEVELOPMENT || DEBUG)
5276 		case SO_DEFUNCTIT:
5277 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5278 			if (error == 0) {
5279 				error = sodefunct(current_proc(), so, 0);
5280 			}
5281 
5282 			break;
5283 #endif /* (DEVELOPMENT || DEBUG) */
5284 
5285 		case SO_DEFUNCTOK:
5286 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5287 			    sizeof(optval));
5288 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5289 				if (error == 0) {
5290 					error = EBADF;
5291 				}
5292 				goto out;
5293 			}
5294 			/*
5295 			 * Any process can set SO_DEFUNCTOK (clear
5296 			 * SOF_NODEFUNCT), but only root can clear
5297 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5298 			 */
5299 			if (optval == 0 &&
5300 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5301 				error = EPERM;
5302 				goto out;
5303 			}
5304 			if (optval) {
5305 				so->so_flags &= ~SOF_NODEFUNCT;
5306 			} else {
5307 				so->so_flags |= SOF_NODEFUNCT;
5308 			}
5309 
5310 			if (SOCK_DOM(so) == PF_INET ||
5311 			    SOCK_DOM(so) == PF_INET6) {
5312 				char s[MAX_IPv6_STR_LEN];
5313 				char d[MAX_IPv6_STR_LEN];
5314 				struct inpcb *inp = sotoinpcb(so);
5315 
5316 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5317 				    "[%s %s:%d -> %s:%d] is now marked "
5318 				    "as %seligible for "
5319 				    "defunct\n", __func__, proc_selfpid(),
5320 				    proc_best_name(current_proc()),
5321 				    so->so_gencnt,
5322 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5323 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5324 				    ((SOCK_DOM(so) == PF_INET) ?
5325 				    (void *)&inp->inp_laddr.s_addr :
5326 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5327 				    ntohs(inp->in6p_lport),
5328 				    inet_ntop(SOCK_DOM(so),
5329 				    (SOCK_DOM(so) == PF_INET) ?
5330 				    (void *)&inp->inp_faddr.s_addr :
5331 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5332 				    ntohs(inp->in6p_fport),
5333 				    (so->so_flags & SOF_NODEFUNCT) ?
5334 				    "not " : "");
5335 			} else {
5336 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5337 				    "is now marked as %seligible for "
5338 				    "defunct\n",
5339 				    __func__, proc_selfpid(),
5340 				    proc_best_name(current_proc()),
5341 				    so->so_gencnt,
5342 				    SOCK_DOM(so), SOCK_TYPE(so),
5343 				    (so->so_flags & SOF_NODEFUNCT) ?
5344 				    "not " : "");
5345 			}
5346 			break;
5347 
5348 		case SO_ISDEFUNCT:
5349 			/* This option is not settable */
5350 			error = EINVAL;
5351 			break;
5352 
5353 		case SO_OPPORTUNISTIC:
5354 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5355 			    sizeof(optval));
5356 			if (error == 0) {
5357 				error = so_set_opportunistic(so, optval);
5358 			}
5359 			break;
5360 
5361 		case SO_FLUSH:
5362 			/* This option is handled by lower layer(s) */
5363 			error = 0;
5364 			break;
5365 
5366 		case SO_RECV_ANYIF:
5367 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5368 			    sizeof(optval));
5369 			if (error == 0) {
5370 				error = so_set_recv_anyif(so, optval);
5371 			}
5372 			break;
5373 
5374 		case SO_TRAFFIC_MGT_BACKGROUND: {
5375 			/* This option is handled by lower layer(s) */
5376 			error = 0;
5377 			break;
5378 		}
5379 
5380 #if FLOW_DIVERT
5381 		case SO_FLOW_DIVERT_TOKEN:
5382 			error = flow_divert_token_set(so, sopt);
5383 			break;
5384 #endif  /* FLOW_DIVERT */
5385 
5386 
5387 		case SO_DELEGATED:
5388 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5389 			    sizeof(optval))) != 0) {
5390 				break;
5391 			}
5392 
5393 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5394 			break;
5395 
5396 		case SO_DELEGATED_UUID: {
5397 			uuid_t euuid;
5398 
5399 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5400 			    sizeof(euuid))) != 0) {
5401 				break;
5402 			}
5403 
5404 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5405 			break;
5406 		}
5407 
5408 #if NECP
5409 		case SO_NECP_ATTRIBUTES:
5410 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5411 				/* Handled by MPTCP itself */
5412 				break;
5413 			}
5414 
5415 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5416 				error = EINVAL;
5417 				goto out;
5418 			}
5419 
5420 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5421 			break;
5422 
5423 		case SO_NECP_CLIENTUUID: {
5424 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5425 				/* Handled by MPTCP itself */
5426 				break;
5427 			}
5428 
5429 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5430 				error = EINVAL;
5431 				goto out;
5432 			}
5433 
5434 			struct inpcb *inp = sotoinpcb(so);
5435 			if (!uuid_is_null(inp->necp_client_uuid)) {
5436 				// Clear out the old client UUID if present
5437 				necp_inpcb_remove_cb(inp);
5438 			}
5439 
5440 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5441 			    sizeof(uuid_t), sizeof(uuid_t));
5442 			if (error != 0) {
5443 				goto out;
5444 			}
5445 
5446 			if (uuid_is_null(inp->necp_client_uuid)) {
5447 				error = EINVAL;
5448 				goto out;
5449 			}
5450 
5451 			pid_t current_pid = proc_pid(current_proc());
5452 			error = necp_client_register_socket_flow(current_pid,
5453 			    inp->necp_client_uuid, inp);
5454 			if (error != 0) {
5455 				uuid_clear(inp->necp_client_uuid);
5456 				goto out;
5457 			}
5458 
5459 			if (inp->inp_lport != 0) {
5460 				// There is a bound local port, so this is not
5461 				// a fresh socket. Assign to the client.
5462 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5463 			}
5464 
5465 			break;
5466 		}
5467 		case SO_NECP_LISTENUUID: {
5468 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5469 				error = EINVAL;
5470 				goto out;
5471 			}
5472 
5473 			struct inpcb *inp = sotoinpcb(so);
5474 			if (!uuid_is_null(inp->necp_client_uuid)) {
5475 				error = EINVAL;
5476 				goto out;
5477 			}
5478 
5479 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5480 			    sizeof(uuid_t), sizeof(uuid_t));
5481 			if (error != 0) {
5482 				goto out;
5483 			}
5484 
5485 			if (uuid_is_null(inp->necp_client_uuid)) {
5486 				error = EINVAL;
5487 				goto out;
5488 			}
5489 
5490 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5491 			    inp->necp_client_uuid, inp);
5492 			if (error != 0) {
5493 				uuid_clear(inp->necp_client_uuid);
5494 				goto out;
5495 			}
5496 
5497 			// Mark that the port registration is held by NECP
5498 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5499 
5500 			break;
5501 		}
5502 
5503 		case SO_RESOLVER_SIGNATURE: {
5504 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5505 				error = EINVAL;
5506 				goto out;
5507 			}
5508 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5509 			break;
5510 		}
5511 #endif /* NECP */
5512 
5513 		case SO_EXTENDED_BK_IDLE:
5514 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5515 			    sizeof(optval));
5516 			if (error == 0) {
5517 				error = so_set_extended_bk_idle(so, optval);
5518 			}
5519 			break;
5520 
5521 		case SO_MARK_CELLFALLBACK:
5522 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5523 			    sizeof(optval));
5524 			if (error != 0) {
5525 				goto out;
5526 			}
5527 			if (optval < 0) {
5528 				error = EINVAL;
5529 				goto out;
5530 			}
5531 			if (optval == 0) {
5532 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5533 			} else {
5534 				so->so_flags1 |= SOF1_CELLFALLBACK;
5535 			}
5536 			break;
5537 
5538 		case SO_MARK_CELLFALLBACK_UUID:
5539 		{
5540 			struct so_mark_cellfallback_uuid_args args;
5541 
5542 			error = sooptcopyin(sopt, &args, sizeof(args),
5543 			    sizeof(args));
5544 			if (error != 0) {
5545 				goto out;
5546 			}
5547 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5548 			    args.flow_cellfallback);
5549 			break;
5550 		}
5551 
5552 		case SO_FALLBACK_MODE:
5553 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5554 			    sizeof(optval));
5555 			if (error != 0) {
5556 				goto out;
5557 			}
5558 			if (optval < SO_FALLBACK_MODE_NONE ||
5559 			    optval > SO_FALLBACK_MODE_PREFER) {
5560 				error = EINVAL;
5561 				goto out;
5562 			}
5563 			so->so_fallback_mode = (u_int8_t)optval;
5564 			break;
5565 
5566 		case SO_MARK_KNOWN_TRACKER: {
5567 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5568 			    sizeof(optval));
5569 			if (error != 0) {
5570 				goto out;
5571 			}
5572 			if (optval < 0) {
5573 				error = EINVAL;
5574 				goto out;
5575 			}
5576 			if (optval == 0) {
5577 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5578 			} else {
5579 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5580 			}
5581 			break;
5582 		}
5583 
5584 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5585 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5586 			    sizeof(optval));
5587 			if (error != 0) {
5588 				goto out;
5589 			}
5590 			if (optval < 0) {
5591 				error = EINVAL;
5592 				goto out;
5593 			}
5594 			if (optval == 0) {
5595 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5596 			} else {
5597 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5598 			}
5599 			break;
5600 		}
5601 
5602 		case SO_MARK_APPROVED_APP_DOMAIN: {
5603 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5604 			    sizeof(optval));
5605 			if (error != 0) {
5606 				goto out;
5607 			}
5608 			if (optval < 0) {
5609 				error = EINVAL;
5610 				goto out;
5611 			}
5612 			if (optval == 0) {
5613 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5614 			} else {
5615 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5616 			}
5617 			break;
5618 		}
5619 
5620 		case SO_STATISTICS_EVENT:
5621 			error = sooptcopyin(sopt, &long_optval,
5622 			    sizeof(long_optval), sizeof(long_optval));
5623 			if (error != 0) {
5624 				goto out;
5625 			}
5626 			u_int64_t nstat_event = 0;
5627 			error = so_statistics_event_to_nstat_event(
5628 				&long_optval, &nstat_event);
5629 			if (error != 0) {
5630 				goto out;
5631 			}
5632 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5633 			break;
5634 
5635 		case SO_NET_SERVICE_TYPE: {
5636 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5637 			    sizeof(optval));
5638 			if (error != 0) {
5639 				goto out;
5640 			}
5641 			error = so_set_net_service_type(so, optval);
5642 			break;
5643 		}
5644 
5645 		case SO_QOSMARKING_POLICY_OVERRIDE:
5646 			error = priv_check_cred(kauth_cred_get(),
5647 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5648 			if (error != 0) {
5649 				goto out;
5650 			}
5651 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5652 			    sizeof(optval));
5653 			if (error != 0) {
5654 				goto out;
5655 			}
5656 			if (optval == 0) {
5657 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5658 			} else {
5659 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5660 			}
5661 			break;
5662 
5663 		case SO_MPKL_SEND_INFO: {
5664 			struct so_mpkl_send_info so_mpkl_send_info;
5665 
5666 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5667 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5668 			if (error != 0) {
5669 				goto out;
5670 			}
5671 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5672 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5673 
5674 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5675 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5676 			} else {
5677 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5678 			}
5679 			break;
5680 		}
5681 		case SO_WANT_KEV_SOCKET_CLOSED: {
5682 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5683 			    sizeof(optval));
5684 			if (error != 0) {
5685 				goto out;
5686 			}
5687 			if (optval == 0) {
5688 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5689 			} else {
5690 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5691 			}
5692 			break;
5693 		}
5694 		case SO_MARK_WAKE_PKT: {
5695 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5696 			    sizeof(optval));
5697 			if (error != 0) {
5698 				goto out;
5699 			}
5700 			if (optval == 0) {
5701 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5702 			} else {
5703 				so->so_flags |= SOF_MARK_WAKE_PKT;
5704 			}
5705 			break;
5706 		}
5707 		case SO_RECV_WAKE_PKT: {
5708 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5709 			    sizeof(optval));
5710 			if (error != 0) {
5711 				goto out;
5712 			}
5713 			if (optval == 0) {
5714 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5715 			} else {
5716 				so->so_flags |= SOF_RECV_WAKE_PKT;
5717 			}
5718 			break;
5719 		}
5720 		case SO_APPLICATION_ID: {
5721 			so_application_id_t application_id = { 0 };
5722 
5723 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5724 				error = EINVAL;
5725 				goto out;
5726 			}
5727 			error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5728 			    sizeof(application_id));
5729 			if (error != 0) {
5730 				goto out;
5731 			}
5732 
5733 			// The user needs to match
5734 			if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5735 				error = EINVAL;
5736 				printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5737 				goto out;
5738 			}
5739 			error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5740 			if (error != 0) {
5741 				printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5742 				goto out;
5743 			}
5744 			if (application_id.persona_id != PERSONA_ID_NONE) {
5745 				so->so_persona_id = application_id.persona_id;
5746 			}
5747 			break;
5748 		}
5749 		default:
5750 			error = ENOPROTOOPT;
5751 			break;
5752 		}
5753 		if (error == 0 && so->so_proto != NULL &&
5754 		    so->so_proto->pr_ctloutput != NULL) {
5755 			(void) so->so_proto->pr_ctloutput(so, sopt);
5756 		}
5757 	}
5758 out:
5759 	if (dolock) {
5760 		socket_unlock(so, 1);
5761 	}
5762 	return error;
5763 }
5764 
5765 /* Helper routines for getsockopt */
5766 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5767 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5768 {
5769 	int     error;
5770 	size_t  valsize;
5771 
5772 	error = 0;
5773 
5774 	/*
5775 	 * Documented get behavior is that we always return a value,
5776 	 * possibly truncated to fit in the user's buffer.
5777 	 * Traditional behavior is that we always tell the user
5778 	 * precisely how much we copied, rather than something useful
5779 	 * like the total amount we had available for her.
5780 	 * Note that this interface is not idempotent; the entire answer must
5781 	 * generated ahead of time.
5782 	 */
5783 	valsize = MIN(len, sopt->sopt_valsize);
5784 	sopt->sopt_valsize = valsize;
5785 	if (sopt->sopt_val != USER_ADDR_NULL) {
5786 		if (sopt->sopt_p != kernproc) {
5787 			error = copyout(buf, sopt->sopt_val, valsize);
5788 		} else {
5789 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5790 		}
5791 	}
5792 	return error;
5793 }
5794 
5795 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5796 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5797 {
5798 	int                     error;
5799 	size_t                  len;
5800 	struct user64_timeval   tv64 = {};
5801 	struct user32_timeval   tv32 = {};
5802 	const void *            val;
5803 	size_t                  valsize;
5804 
5805 	error = 0;
5806 	if (proc_is64bit(sopt->sopt_p)) {
5807 		len = sizeof(tv64);
5808 		tv64.tv_sec = tv_p->tv_sec;
5809 		tv64.tv_usec = tv_p->tv_usec;
5810 		val = &tv64;
5811 	} else {
5812 		len = sizeof(tv32);
5813 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5814 		tv32.tv_usec = tv_p->tv_usec;
5815 		val = &tv32;
5816 	}
5817 	valsize = MIN(len, sopt->sopt_valsize);
5818 	sopt->sopt_valsize = valsize;
5819 	if (sopt->sopt_val != USER_ADDR_NULL) {
5820 		if (sopt->sopt_p != kernproc) {
5821 			error = copyout(val, sopt->sopt_val, valsize);
5822 		} else {
5823 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5824 		}
5825 	}
5826 	return error;
5827 }
5828 
5829 /*
5830  * Return:	0			Success
5831  *		ENOPROTOOPT
5832  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5833  *	<pr_ctloutput>:???
5834  *	<sf_getoption>:???
5835  */
5836 int
sogetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5837 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5838 {
5839 	int     error, optval;
5840 	struct  linger l;
5841 	struct  timeval tv;
5842 
5843 	if (sopt->sopt_dir != SOPT_GET) {
5844 		sopt->sopt_dir = SOPT_GET;
5845 	}
5846 
5847 	if (dolock) {
5848 		socket_lock(so, 1);
5849 	}
5850 
5851 	error = sflt_getsockopt(so, sopt);
5852 	if (error != 0) {
5853 		if (error == EJUSTRETURN) {
5854 			error = 0;
5855 		}
5856 		goto out;
5857 	}
5858 
5859 	if (sopt->sopt_level != SOL_SOCKET) {
5860 		if (so->so_proto != NULL &&
5861 		    so->so_proto->pr_ctloutput != NULL) {
5862 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5863 			goto out;
5864 		}
5865 		error = ENOPROTOOPT;
5866 	} else {
5867 		/*
5868 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5869 		 * the protocol layer, if needed.  A zero value returned from
5870 		 * the handler means use default socket-level processing as
5871 		 * done by the rest of this routine.  Otherwise, any other
5872 		 * return value indicates that the option is unsupported.
5873 		 */
5874 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5875 		    pru_socheckopt(so, sopt)) != 0) {
5876 			goto out;
5877 		}
5878 
5879 		error = 0;
5880 		switch (sopt->sopt_name) {
5881 		case SO_LINGER:
5882 		case SO_LINGER_SEC:
5883 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5884 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5885 			    so->so_linger : so->so_linger / hz;
5886 			error = sooptcopyout(sopt, &l, sizeof(l));
5887 			break;
5888 
5889 		case SO_USELOOPBACK:
5890 		case SO_DONTROUTE:
5891 		case SO_DEBUG:
5892 		case SO_KEEPALIVE:
5893 		case SO_REUSEADDR:
5894 		case SO_REUSEPORT:
5895 		case SO_BROADCAST:
5896 		case SO_OOBINLINE:
5897 		case SO_TIMESTAMP:
5898 		case SO_TIMESTAMP_MONOTONIC:
5899 		case SO_TIMESTAMP_CONTINUOUS:
5900 		case SO_DONTTRUNC:
5901 		case SO_WANTMORE:
5902 		case SO_WANTOOBFLAG:
5903 		case SO_NOWAKEFROMSLEEP:
5904 		case SO_NOAPNFALLBK:
5905 			optval = so->so_options & sopt->sopt_name;
5906 integer:
5907 			error = sooptcopyout(sopt, &optval, sizeof(optval));
5908 			break;
5909 
5910 		case SO_TYPE:
5911 			optval = so->so_type;
5912 			goto integer;
5913 
5914 		case SO_NREAD:
5915 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5916 				int pkt_total;
5917 				struct mbuf *m1;
5918 
5919 				pkt_total = 0;
5920 				m1 = so->so_rcv.sb_mb;
5921 				while (m1 != NULL) {
5922 					if (m1->m_type == MT_DATA ||
5923 					    m1->m_type == MT_HEADER ||
5924 					    m1->m_type == MT_OOBDATA) {
5925 						pkt_total += m1->m_len;
5926 					}
5927 					m1 = m1->m_next;
5928 				}
5929 				optval = pkt_total;
5930 			} else {
5931 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5932 			}
5933 			goto integer;
5934 
5935 		case SO_NUMRCVPKT:
5936 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5937 				int cnt = 0;
5938 				struct mbuf *m1;
5939 
5940 				m1 = so->so_rcv.sb_mb;
5941 				while (m1 != NULL) {
5942 					cnt += 1;
5943 					m1 = m1->m_nextpkt;
5944 				}
5945 				optval = cnt;
5946 				goto integer;
5947 			} else {
5948 				error = ENOPROTOOPT;
5949 				break;
5950 			}
5951 
5952 		case SO_NWRITE:
5953 			optval = so->so_snd.sb_cc;
5954 			goto integer;
5955 
5956 		case SO_ERROR:
5957 			optval = so->so_error;
5958 			so->so_error = 0;
5959 			goto integer;
5960 
5961 		case SO_SNDBUF: {
5962 			u_int32_t hiwat = so->so_snd.sb_hiwat;
5963 
5964 			if (so->so_snd.sb_flags & SB_UNIX) {
5965 				struct unpcb *unp =
5966 				    (struct unpcb *)(so->so_pcb);
5967 				if (unp != NULL && unp->unp_conn != NULL) {
5968 					hiwat += unp->unp_conn->unp_cc;
5969 				}
5970 			}
5971 
5972 			optval = hiwat;
5973 			goto integer;
5974 		}
5975 		case SO_RCVBUF:
5976 			optval = so->so_rcv.sb_hiwat;
5977 			goto integer;
5978 
5979 		case SO_SNDLOWAT:
5980 			optval = so->so_snd.sb_lowat;
5981 			goto integer;
5982 
5983 		case SO_RCVLOWAT:
5984 			optval = so->so_rcv.sb_lowat;
5985 			goto integer;
5986 
5987 		case SO_SNDTIMEO:
5988 		case SO_RCVTIMEO:
5989 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
5990 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5991 
5992 			error = sooptcopyout_timeval(sopt, &tv);
5993 			break;
5994 
5995 		case SO_NOSIGPIPE:
5996 			optval = (so->so_flags & SOF_NOSIGPIPE);
5997 			goto integer;
5998 
5999 		case SO_NOADDRERR:
6000 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6001 			goto integer;
6002 
6003 		case SO_REUSESHAREUID:
6004 			optval = (so->so_flags & SOF_REUSESHAREUID);
6005 			goto integer;
6006 
6007 
6008 		case SO_NOTIFYCONFLICT:
6009 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6010 			goto integer;
6011 
6012 		case SO_RESTRICTIONS:
6013 			optval = so_get_restrictions(so);
6014 			goto integer;
6015 
6016 		case SO_AWDL_UNRESTRICTED:
6017 			if (SOCK_DOM(so) == PF_INET ||
6018 			    SOCK_DOM(so) == PF_INET6) {
6019 				optval = inp_get_awdl_unrestricted(
6020 					sotoinpcb(so));
6021 				goto integer;
6022 			} else {
6023 				error = EOPNOTSUPP;
6024 			}
6025 			break;
6026 
6027 		case SO_INTCOPROC_ALLOW:
6028 			if (SOCK_DOM(so) == PF_INET6) {
6029 				optval = inp_get_intcoproc_allowed(
6030 					sotoinpcb(so));
6031 				goto integer;
6032 			} else {
6033 				error = EOPNOTSUPP;
6034 			}
6035 			break;
6036 
6037 		case SO_LABEL:
6038 			error = EOPNOTSUPP;
6039 			break;
6040 
6041 		case SO_PEERLABEL:
6042 			error = EOPNOTSUPP;
6043 			break;
6044 
6045 #ifdef __APPLE_API_PRIVATE
6046 		case SO_UPCALLCLOSEWAIT:
6047 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6048 			goto integer;
6049 #endif
6050 		case SO_RANDOMPORT:
6051 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6052 			goto integer;
6053 
6054 		case SO_NP_EXTENSIONS: {
6055 			struct so_np_extensions sonpx = {};
6056 
6057 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6058 			    SONPX_SETOPTSHUT : 0;
6059 			sonpx.npx_mask = SONPX_MASK_VALID;
6060 
6061 			error = sooptcopyout(sopt, &sonpx,
6062 			    sizeof(struct so_np_extensions));
6063 			break;
6064 		}
6065 
6066 		case SO_TRAFFIC_CLASS:
6067 			optval = so->so_traffic_class;
6068 			goto integer;
6069 
6070 		case SO_RECV_TRAFFIC_CLASS:
6071 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6072 			goto integer;
6073 
6074 #if (DEVELOPMENT || DEBUG)
6075 		case SO_TRAFFIC_CLASS_DBG:
6076 			error = sogetopt_tcdbg(so, sopt);
6077 			break;
6078 #endif /* (DEVELOPMENT || DEBUG) */
6079 
6080 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6081 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6082 			goto integer;
6083 
6084 		case SO_DEFUNCTOK:
6085 			optval = !(so->so_flags & SOF_NODEFUNCT);
6086 			goto integer;
6087 
6088 		case SO_ISDEFUNCT:
6089 			optval = (so->so_flags & SOF_DEFUNCT);
6090 			goto integer;
6091 
6092 		case SO_OPPORTUNISTIC:
6093 			optval = so_get_opportunistic(so);
6094 			goto integer;
6095 
6096 		case SO_FLUSH:
6097 			/* This option is not gettable */
6098 			error = EINVAL;
6099 			break;
6100 
6101 		case SO_RECV_ANYIF:
6102 			optval = so_get_recv_anyif(so);
6103 			goto integer;
6104 
6105 		case SO_TRAFFIC_MGT_BACKGROUND:
6106 			/* This option is handled by lower layer(s) */
6107 			if (so->so_proto != NULL &&
6108 			    so->so_proto->pr_ctloutput != NULL) {
6109 				(void) so->so_proto->pr_ctloutput(so, sopt);
6110 			}
6111 			break;
6112 
6113 #if FLOW_DIVERT
6114 		case SO_FLOW_DIVERT_TOKEN:
6115 			error = flow_divert_token_get(so, sopt);
6116 			break;
6117 #endif  /* FLOW_DIVERT */
6118 
6119 #if NECP
6120 		case SO_NECP_ATTRIBUTES:
6121 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6122 				/* Handled by MPTCP itself */
6123 				break;
6124 			}
6125 
6126 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6127 				error = EINVAL;
6128 				goto out;
6129 			}
6130 
6131 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6132 			break;
6133 
6134 		case SO_NECP_CLIENTUUID: {
6135 			uuid_t *ncu;
6136 
6137 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6138 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6139 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6140 				ncu = &sotoinpcb(so)->necp_client_uuid;
6141 			} else {
6142 				error = EINVAL;
6143 				goto out;
6144 			}
6145 
6146 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6147 			break;
6148 		}
6149 
6150 		case SO_NECP_LISTENUUID: {
6151 			uuid_t *nlu;
6152 
6153 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6154 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6155 					nlu = &sotoinpcb(so)->necp_client_uuid;
6156 				} else {
6157 					error = ENOENT;
6158 					goto out;
6159 				}
6160 			} else {
6161 				error = EINVAL;
6162 				goto out;
6163 			}
6164 
6165 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6166 			break;
6167 		}
6168 
6169 		case SO_RESOLVER_SIGNATURE: {
6170 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6171 				error = EINVAL;
6172 				goto out;
6173 			}
6174 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6175 			break;
6176 		}
6177 
6178 #endif /* NECP */
6179 
6180 #if CONTENT_FILTER
6181 		case SO_CFIL_SOCK_ID: {
6182 			cfil_sock_id_t sock_id;
6183 
6184 			sock_id = cfil_sock_id_from_socket(so);
6185 
6186 			error = sooptcopyout(sopt, &sock_id,
6187 			    sizeof(cfil_sock_id_t));
6188 			break;
6189 		}
6190 #endif  /* CONTENT_FILTER */
6191 
6192 		case SO_EXTENDED_BK_IDLE:
6193 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6194 			goto integer;
6195 		case SO_MARK_CELLFALLBACK:
6196 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6197 			    ? 1 : 0;
6198 			goto integer;
6199 		case SO_FALLBACK_MODE:
6200 			optval = so->so_fallback_mode;
6201 			goto integer;
6202 		case SO_MARK_KNOWN_TRACKER: {
6203 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6204 			    ? 1 : 0;
6205 			goto integer;
6206 		}
6207 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6208 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6209 			    ? 1 : 0;
6210 			goto integer;
6211 		}
6212 		case SO_MARK_APPROVED_APP_DOMAIN: {
6213 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6214 			    ? 1 : 0;
6215 			goto integer;
6216 		}
6217 		case SO_NET_SERVICE_TYPE: {
6218 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6219 				optval = so->so_netsvctype;
6220 			} else {
6221 				optval = NET_SERVICE_TYPE_BE;
6222 			}
6223 			goto integer;
6224 		}
6225 		case SO_NETSVC_MARKING_LEVEL:
6226 			optval = so_get_netsvc_marking_level(so);
6227 			goto integer;
6228 
6229 		case SO_MPKL_SEND_INFO: {
6230 			struct so_mpkl_send_info so_mpkl_send_info;
6231 
6232 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6233 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6234 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6235 			    sizeof(struct so_mpkl_send_info));
6236 			break;
6237 		}
6238 		case SO_MARK_WAKE_PKT:
6239 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6240 			goto integer;
6241 		case SO_RECV_WAKE_PKT:
6242 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6243 			goto integer;
6244 		case SO_APPLICATION_ID: {
6245 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6246 				error = EINVAL;
6247 				goto out;
6248 			}
6249 			so_application_id_t application_id = { 0 };
6250 			application_id.uid = kauth_cred_getuid(so->so_cred);
6251 			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
6252 			application_id.persona_id = so->so_persona_id;
6253 			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
6254 			break;
6255 		}
6256 		default:
6257 			error = ENOPROTOOPT;
6258 			break;
6259 		}
6260 	}
6261 out:
6262 	if (dolock) {
6263 		socket_unlock(so, 1);
6264 	}
6265 	return error;
6266 }
6267 
6268 /*
6269  * The size limits on our soopt_getm is different from that on FreeBSD.
6270  * We limit the size of options to MCLBYTES. This will have to change
6271  * if we need to define options that need more space than MCLBYTES.
6272  */
6273 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6274 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6275 {
6276 	struct mbuf *m, *m_prev;
6277 	int sopt_size = (int)sopt->sopt_valsize;
6278 	int how;
6279 
6280 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6281 		return EMSGSIZE;
6282 	}
6283 
6284 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6285 	MGET(m, how, MT_DATA);
6286 	if (m == NULL) {
6287 		return ENOBUFS;
6288 	}
6289 	if (sopt_size > MLEN) {
6290 		MCLGET(m, how);
6291 		if ((m->m_flags & M_EXT) == 0) {
6292 			m_free(m);
6293 			return ENOBUFS;
6294 		}
6295 		m->m_len = min(MCLBYTES, sopt_size);
6296 	} else {
6297 		m->m_len = min(MLEN, sopt_size);
6298 	}
6299 	sopt_size -= m->m_len;
6300 	*mp = m;
6301 	m_prev = m;
6302 
6303 	while (sopt_size > 0) {
6304 		MGET(m, how, MT_DATA);
6305 		if (m == NULL) {
6306 			m_freem(*mp);
6307 			return ENOBUFS;
6308 		}
6309 		if (sopt_size > MLEN) {
6310 			MCLGET(m, how);
6311 			if ((m->m_flags & M_EXT) == 0) {
6312 				m_freem(*mp);
6313 				m_freem(m);
6314 				return ENOBUFS;
6315 			}
6316 			m->m_len = min(MCLBYTES, sopt_size);
6317 		} else {
6318 			m->m_len = min(MLEN, sopt_size);
6319 		}
6320 		sopt_size -= m->m_len;
6321 		m_prev->m_next = m;
6322 		m_prev = m;
6323 	}
6324 	return 0;
6325 }
6326 
6327 /* copyin sopt data into mbuf chain */
6328 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6329 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6330 {
6331 	struct mbuf *m0 = m;
6332 
6333 	if (sopt->sopt_val == USER_ADDR_NULL) {
6334 		return 0;
6335 	}
6336 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6337 		if (sopt->sopt_p != kernproc) {
6338 			int error;
6339 
6340 			error = copyin(sopt->sopt_val, mtod(m, char *),
6341 			    m->m_len);
6342 			if (error != 0) {
6343 				m_freem(m0);
6344 				return error;
6345 			}
6346 		} else {
6347 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6348 			    mtod(m, char *), m->m_len);
6349 		}
6350 		sopt->sopt_valsize -= m->m_len;
6351 		sopt->sopt_val += m->m_len;
6352 		m = m->m_next;
6353 	}
6354 	/* should be allocated enoughly at ip6_sooptmcopyin() */
6355 	if (m != NULL) {
6356 		panic("soopt_mcopyin");
6357 		/* NOTREACHED */
6358 	}
6359 	return 0;
6360 }
6361 
6362 /* copyout mbuf chain data into soopt */
6363 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6364 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6365 {
6366 	struct mbuf *m0 = m;
6367 	size_t valsize = 0;
6368 
6369 	if (sopt->sopt_val == USER_ADDR_NULL) {
6370 		return 0;
6371 	}
6372 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6373 		if (sopt->sopt_p != kernproc) {
6374 			int error;
6375 
6376 			error = copyout(mtod(m, char *), sopt->sopt_val,
6377 			    m->m_len);
6378 			if (error != 0) {
6379 				m_freem(m0);
6380 				return error;
6381 			}
6382 		} else {
6383 			bcopy(mtod(m, char *),
6384 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6385 		}
6386 		sopt->sopt_valsize -= m->m_len;
6387 		sopt->sopt_val += m->m_len;
6388 		valsize += m->m_len;
6389 		m = m->m_next;
6390 	}
6391 	if (m != NULL) {
6392 		/* enough soopt buffer should be given from user-land */
6393 		m_freem(m0);
6394 		return EINVAL;
6395 	}
6396 	sopt->sopt_valsize = valsize;
6397 	return 0;
6398 }
6399 
6400 void
sohasoutofband(struct socket * so)6401 sohasoutofband(struct socket *so)
6402 {
6403 	if (so->so_pgid < 0) {
6404 		gsignal(-so->so_pgid, SIGURG);
6405 	} else if (so->so_pgid > 0) {
6406 		proc_signal(so->so_pgid, SIGURG);
6407 	}
6408 	selwakeup(&so->so_rcv.sb_sel);
6409 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6410 		KNOTE(&so->so_rcv.sb_sel.si_note,
6411 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6412 	}
6413 }
6414 
6415 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6416 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6417 {
6418 #pragma unused(cred)
6419 	struct proc *p = current_proc();
6420 	int revents = 0;
6421 
6422 	socket_lock(so, 1);
6423 	so_update_last_owner_locked(so, PROC_NULL);
6424 	so_update_policy(so);
6425 
6426 	if (events & (POLLIN | POLLRDNORM)) {
6427 		if (soreadable(so)) {
6428 			revents |= events & (POLLIN | POLLRDNORM);
6429 		}
6430 	}
6431 
6432 	if (events & (POLLOUT | POLLWRNORM)) {
6433 		if (sowriteable(so)) {
6434 			revents |= events & (POLLOUT | POLLWRNORM);
6435 		}
6436 	}
6437 
6438 	if (events & (POLLPRI | POLLRDBAND)) {
6439 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6440 			revents |= events & (POLLPRI | POLLRDBAND);
6441 		}
6442 	}
6443 
6444 	if (revents == 0) {
6445 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6446 			/*
6447 			 * Darwin sets the flag first,
6448 			 * BSD calls selrecord first
6449 			 */
6450 			so->so_rcv.sb_flags |= SB_SEL;
6451 			selrecord(p, &so->so_rcv.sb_sel, wql);
6452 		}
6453 
6454 		if (events & (POLLOUT | POLLWRNORM)) {
6455 			/*
6456 			 * Darwin sets the flag first,
6457 			 * BSD calls selrecord first
6458 			 */
6459 			so->so_snd.sb_flags |= SB_SEL;
6460 			selrecord(p, &so->so_snd.sb_sel, wql);
6461 		}
6462 	}
6463 
6464 	socket_unlock(so, 1);
6465 	return revents;
6466 }
6467 
6468 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6469 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6470 {
6471 	struct socket *so = (struct socket *)fp_get_data(fp);
6472 	int result;
6473 
6474 	socket_lock(so, 1);
6475 	so_update_last_owner_locked(so, PROC_NULL);
6476 	so_update_policy(so);
6477 
6478 	switch (kn->kn_filter) {
6479 	case EVFILT_READ:
6480 		kn->kn_filtid = EVFILTID_SOREAD;
6481 		break;
6482 	case EVFILT_WRITE:
6483 		kn->kn_filtid = EVFILTID_SOWRITE;
6484 		break;
6485 	case EVFILT_SOCK:
6486 		kn->kn_filtid = EVFILTID_SCK;
6487 		break;
6488 	case EVFILT_EXCEPT:
6489 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6490 		break;
6491 	default:
6492 		socket_unlock(so, 1);
6493 		knote_set_error(kn, EINVAL);
6494 		return 0;
6495 	}
6496 
6497 	/*
6498 	 * call the appropriate sub-filter attach
6499 	 * with the socket still locked
6500 	 */
6501 	result = knote_fops(kn)->f_attach(kn, kev);
6502 
6503 	socket_unlock(so, 1);
6504 
6505 	return result;
6506 }
6507 
/*
 * Common evaluation for the EVFILT_READ socket filter.
 *
 * Returns non-zero when the knote should fire.  If `kev' is non-NULL and
 * the event fired, the 64-bit result (readable protocol bytes, or the
 * listen queue length for listening sockets) is published through
 * knote_fill_kevent().
 *
 * Caller must hold the socket lock.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	/* OOB data pending (or at the mark) fires immediately if requested */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	/*
	 * Read-shutdown counts as readable (EOF), unless a content filter
	 * still holds undelivered data for this socket.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6586 
6587 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6588 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6589 {
6590 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6591 
6592 	/* socket locked */
6593 
6594 	/*
6595 	 * If the caller explicitly asked for OOB results (e.g. poll())
6596 	 * from EVFILT_READ, then save that off in the hookid field
6597 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6598 	 */
6599 	if (kn->kn_filter == EVFILT_READ &&
6600 	    kn->kn_flags & EV_OOBAND) {
6601 		kn->kn_flags &= ~EV_OOBAND;
6602 		kn->kn_hook32 = EV_OOBAND;
6603 	} else {
6604 		kn->kn_hook32 = 0;
6605 	}
6606 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6607 		so->so_rcv.sb_flags |= SB_KNOTE;
6608 	}
6609 
6610 	/* indicate if event is already fired */
6611 	return filt_soread_common(kn, NULL, so);
6612 }
6613 
6614 static void
filt_sordetach(struct knote * kn)6615 filt_sordetach(struct knote *kn)
6616 {
6617 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6618 
6619 	socket_lock(so, 1);
6620 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6621 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6622 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6623 		}
6624 	}
6625 	socket_unlock(so, 1);
6626 }
6627 
6628 /*ARGSUSED*/
6629 static int
filt_soread(struct knote * kn,long hint)6630 filt_soread(struct knote *kn, long hint)
6631 {
6632 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6633 	int retval;
6634 
6635 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6636 		socket_lock(so, 1);
6637 	}
6638 
6639 	retval = filt_soread_common(kn, NULL, so);
6640 
6641 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6642 		socket_unlock(so, 1);
6643 	}
6644 
6645 	return retval;
6646 }
6647 
6648 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6649 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6650 {
6651 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6652 	int retval;
6653 
6654 	socket_lock(so, 1);
6655 
6656 	/* save off the new input fflags and data */
6657 	kn->kn_sfflags = kev->fflags;
6658 	kn->kn_sdata = kev->data;
6659 
6660 	/* determine if changes result in fired events */
6661 	retval = filt_soread_common(kn, NULL, so);
6662 
6663 	socket_unlock(so, 1);
6664 
6665 	return retval;
6666 }
6667 
6668 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6669 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6670 {
6671 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6672 	int retval;
6673 
6674 	socket_lock(so, 1);
6675 	retval = filt_soread_common(kn, kev, so);
6676 	socket_unlock(so, 1);
6677 
6678 	return retval;
6679 }
6680 
6681 int
so_wait_for_if_feedback(struct socket * so)6682 so_wait_for_if_feedback(struct socket *so)
6683 {
6684 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6685 	    (so->so_state & SS_ISCONNECTED)) {
6686 		struct inpcb *inp = sotoinpcb(so);
6687 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6688 			return 1;
6689 		}
6690 	}
6691 	return 0;
6692 }
6693 
/*
 * Common evaluation for the EVFILT_WRITE socket filter.
 *
 * Returns non-zero when the knote should fire.  If `kev' is non-NULL and
 * the event fired, the available send-buffer space is published through
 * knote_fill_kevent().
 *
 * Caller must hold the socket lock.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	/* Write-shutdown counts as writable (EOF). */
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	/* Pre-connect data (TFO-style) is always acceptable. */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* NOTE_LOWAT overrides the send low water mark, clamped to hiwat. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		/*
		 * With SOF_NOTSENT_LOWAT the protocol decides writability
		 * based on unsent data, not just buffer space.
		 */
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress the event while waiting for interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6805 
6806 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6807 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6808 {
6809 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6810 
6811 	/* socket locked */
6812 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6813 		so->so_snd.sb_flags |= SB_KNOTE;
6814 	}
6815 
6816 	/* determine if its already fired */
6817 	return filt_sowrite_common(kn, NULL, so);
6818 }
6819 
6820 static void
filt_sowdetach(struct knote * kn)6821 filt_sowdetach(struct knote *kn)
6822 {
6823 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6824 	socket_lock(so, 1);
6825 
6826 	if (so->so_snd.sb_flags & SB_KNOTE) {
6827 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6828 			so->so_snd.sb_flags &= ~SB_KNOTE;
6829 		}
6830 	}
6831 	socket_unlock(so, 1);
6832 }
6833 
6834 /*ARGSUSED*/
6835 static int
filt_sowrite(struct knote * kn,long hint)6836 filt_sowrite(struct knote *kn, long hint)
6837 {
6838 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6839 	int ret;
6840 
6841 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6842 		socket_lock(so, 1);
6843 	}
6844 
6845 	ret = filt_sowrite_common(kn, NULL, so);
6846 
6847 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6848 		socket_unlock(so, 1);
6849 	}
6850 
6851 	return ret;
6852 }
6853 
6854 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6855 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6856 {
6857 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6858 	int ret;
6859 
6860 	socket_lock(so, 1);
6861 
6862 	/*save off the new input fflags and data */
6863 	kn->kn_sfflags = kev->fflags;
6864 	kn->kn_sdata = kev->data;
6865 
6866 	/* determine if these changes result in a triggered event */
6867 	ret = filt_sowrite_common(kn, NULL, so);
6868 
6869 	socket_unlock(so, 1);
6870 
6871 	return ret;
6872 }
6873 
6874 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6875 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6876 {
6877 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6878 	int ret;
6879 
6880 	socket_lock(so, 1);
6881 	ret = filt_sowrite_common(kn, kev, so);
6882 	socket_unlock(so, 1);
6883 
6884 	return ret;
6885 }
6886 
/*
 * Common evaluation for the EVFILT_SOCK filter.
 *
 * Merges edge-triggered hints (`ev_hint') with level-triggered socket
 * state into kn_fflags, then decides whether to fire.  kn_hook32 records
 * which level-triggered events were already delivered so they are not
 * redelivered while still active.  Returns non-zero when the knote should
 * fire; if `kev' is non-NULL the event data is delivered and the
 * delivered-state bookkeeping in kn_hook32 is updated.
 *
 * Caller must hold the socket lock.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered events: latched directly from the hint bits. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* Level-triggered events: derived from hint OR current state. */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	/*
	 * Read side closed, unless a content filter still holds
	 * undelivered data for this socket.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* SUSPEND and RESUME are mutually exclusive; latest one wins. */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* A pending socket error forces delivery (with EV_EOF). */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7027 
7028 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7029 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7030 {
7031 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7032 
7033 	/* socket locked */
7034 	kn->kn_hook32 = 0;
7035 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7036 		so->so_flags |= SOF_KNOTE;
7037 	}
7038 
7039 	/* determine if event already fired */
7040 	return filt_sockev_common(kn, NULL, so, 0);
7041 }
7042 
7043 static void
filt_sockdetach(struct knote * kn)7044 filt_sockdetach(struct knote *kn)
7045 {
7046 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7047 	socket_lock(so, 1);
7048 
7049 	if ((so->so_flags & SOF_KNOTE) != 0) {
7050 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7051 			so->so_flags &= ~SOF_KNOTE;
7052 		}
7053 	}
7054 	socket_unlock(so, 1);
7055 }
7056 
7057 static int
filt_sockev(struct knote * kn,long hint)7058 filt_sockev(struct knote *kn, long hint)
7059 {
7060 	int ret = 0, locked = 0;
7061 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7062 	long ev_hint = (hint & SO_FILT_HINT_EV);
7063 
7064 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7065 		socket_lock(so, 1);
7066 		locked = 1;
7067 	}
7068 
7069 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7070 
7071 	if (locked) {
7072 		socket_unlock(so, 1);
7073 	}
7074 
7075 	return ret;
7076 }
7077 
7078 
7079 
7080 /*
7081  *	filt_socktouch - update event state
7082  */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	/* bits whose interest changed relative to what was delivered */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}
7122 
7123 /*
7124  *	filt_sockprocess - query event fired state and return data
7125  */
7126 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7127 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7128 {
7129 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7130 	int ret = 0;
7131 
7132 	socket_lock(so, 1);
7133 
7134 	ret = filt_sockev_common(kn, kev, so, 0);
7135 
7136 	socket_unlock(so, 1);
7137 
7138 	return ret;
7139 }
7140 
7141 void
get_sockev_state(struct socket * so,u_int32_t * statep)7142 get_sockev_state(struct socket *so, u_int32_t *statep)
7143 {
7144 	u_int32_t state = *(statep);
7145 
7146 	/*
7147 	 * If the state variable is already used by a previous event,
7148 	 * reset it.
7149 	 */
7150 	if (state != 0) {
7151 		return;
7152 	}
7153 
7154 	if (so->so_state & SS_ISCONNECTED) {
7155 		state |= SOCKEV_CONNECTED;
7156 	} else {
7157 		state &= ~(SOCKEV_CONNECTED);
7158 	}
7159 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7160 	*(statep) = state;
7161 }
7162 
7163 #define SO_LOCK_HISTORY_STR_LEN \
7164 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7165 
7166 __private_extern__ const char *
solockhistory_nr(struct socket * so)7167 solockhistory_nr(struct socket *so)
7168 {
7169 	size_t n = 0;
7170 	int i;
7171 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7172 
7173 	bzero(lock_history_str, sizeof(lock_history_str));
7174 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7175 		n += scnprintf(lock_history_str + n,
7176 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7177 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7178 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7179 	}
7180 	return lock_history_str;
7181 }
7182 
7183 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7184 socket_getlock(struct socket *so, int flags)
7185 {
7186 	if (so->so_proto->pr_getlock != NULL) {
7187 		return (*so->so_proto->pr_getlock)(so, flags);
7188 	} else {
7189 		return so->so_proto->pr_domain->dom_mtx;
7190 	}
7191 }
7192 
7193 void
socket_lock(struct socket * so,int refcount)7194 socket_lock(struct socket *so, int refcount)
7195 {
7196 	void *lr_saved;
7197 
7198 	lr_saved = __builtin_return_address(0);
7199 
7200 	if (so->so_proto->pr_lock) {
7201 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7202 	} else {
7203 #ifdef MORE_LOCKING_DEBUG
7204 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7205 		    LCK_MTX_ASSERT_NOTOWNED);
7206 #endif
7207 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7208 		if (refcount) {
7209 			so->so_usecount++;
7210 		}
7211 		so->lock_lr[so->next_lock_lr] = lr_saved;
7212 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7213 	}
7214 }
7215 
7216 void
socket_lock_assert_owned(struct socket * so)7217 socket_lock_assert_owned(struct socket *so)
7218 {
7219 	lck_mtx_t *mutex_held;
7220 
7221 	if (so->so_proto->pr_getlock != NULL) {
7222 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7223 	} else {
7224 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7225 	}
7226 
7227 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7228 }
7229 
7230 int
socket_try_lock(struct socket * so)7231 socket_try_lock(struct socket *so)
7232 {
7233 	lck_mtx_t *mtx;
7234 
7235 	if (so->so_proto->pr_getlock != NULL) {
7236 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7237 	} else {
7238 		mtx = so->so_proto->pr_domain->dom_mtx;
7239 	}
7240 
7241 	return lck_mtx_try_lock(mtx);
7242 }
7243 
/*
 * Release the socket lock taken by socket_lock().  When `refcount' is
 * non-zero, also drop one use-count reference; releasing the final
 * reference tears the socket down via sofreelastref() before unlocking.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	/* remember the caller for lock-history debugging */
	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* underflow indicates a lock/unlock pairing bug */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				/* last reference: free while still holding the lock */
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7284 
7285 /* Called with socket locked, will unlock socket */
7286 void
sofree(struct socket * so)7287 sofree(struct socket *so)
7288 {
7289 	lck_mtx_t *mutex_held;
7290 
7291 	if (so->so_proto->pr_getlock != NULL) {
7292 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7293 	} else {
7294 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7295 	}
7296 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7297 
7298 	sofreelastref(so, 0);
7299 }
7300 
/*
 * Take one additional use-count reference on the socket.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7307 
/*
 * Drop one use-count reference on the socket.  On the domain-mutex
 * path, dropping the last reference releases the socket (see
 * socket_unlock() with refcount != 0).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);     /* lock only */
	socket_unlock(so, 1);   /* unlock & drop one reference */
}
7314 
7315 /*
7316  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7317  * possibility of using jumbo clusters.  Caller must ensure to hold
7318  * the socket lock.
7319  */
7320 void
somultipages(struct socket * so,boolean_t set)7321 somultipages(struct socket *so, boolean_t set)
7322 {
7323 	if (set) {
7324 		so->so_flags |= SOF_MULTIPAGES;
7325 	} else {
7326 		so->so_flags &= ~SOF_MULTIPAGES;
7327 	}
7328 }
7329 
7330 void
soif2kcl(struct socket * so,boolean_t set)7331 soif2kcl(struct socket *so, boolean_t set)
7332 {
7333 	if (set) {
7334 		so->so_flags1 |= SOF1_IF_2KCL;
7335 	} else {
7336 		so->so_flags1 &= ~SOF1_IF_2KCL;
7337 	}
7338 }
7339 
7340 int
so_isdstlocal(struct socket * so)7341 so_isdstlocal(struct socket *so)
7342 {
7343 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7344 
7345 	if (SOCK_DOM(so) == PF_INET) {
7346 		return inaddr_local(inp->inp_faddr);
7347 	} else if (SOCK_DOM(so) == PF_INET6) {
7348 		return in6addr_local(&inp->in6p_faddr);
7349 	}
7350 
7351 	return 0;
7352 }
7353 
/*
 * Mark a socket eligible for defunct: set SOF_DEFUNCT, put both socket
 * buffers in SB_DROP mode and flush any queued data.  Returns 0 on
 * success (or if the socket was already defunct), EOPNOTSUPP when the
 * socket is exempt (SOF_NODEFUNCT with noforce, or extended background
 * idle was granted instead).  sodefunct() performs the actual teardown.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both buffers must already be dropping data */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		/* Exempt socket: refuse unless the caller forces it */
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* Forced: strip the exemption and proceed */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * Socket asked for extended background idle; decide whether
		 * to grant it instead of defuncting.  Cellular interfaces,
		 * delegated sockets and a disabled idle-time budget all
		 * disqualify it.
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* Grant extended idle: start the clock, arm the timer */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7463 
/*
 * Perform the actual defunct teardown of a socket previously marked
 * via sosetdefunct(): notify the protocol, wake blocked threads, shut
 * down both directions, disconnect, drain the buffers, and set
 * SS_DEFUNCT.  Idempotent once SS_DEFUNCT is set.  Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	/* Must only be called after sosetdefunct() set SOF_DEFUNCT */
	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	/* Already torn down: nothing to do */
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			/* Log the local/foreign endpoints being defuncted */
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void)  (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* Release whatever data is still queued in either direction */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7576 
/*
 * Resume a socket from extended background idle: clear the in-progress
 * state and the owning process's P_LXBKIDLEINPROG flag, and update the
 * idle statistics.  When locked == 0 the socket lock is taken (with a
 * reference) and dropped here.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		/* Active count must never go negative */
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7606 
7607 /*
7608  * Does not attempt to account for sockets that are delegated from
7609  * the current process
7610  */
7611 int
so_set_extended_bk_idle(struct socket * so,int optval)7612 so_set_extended_bk_idle(struct socket *so, int optval)
7613 {
7614 	int error = 0;
7615 
7616 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7617 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7618 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7619 		error = EOPNOTSUPP;
7620 	} else if (optval == 0) {
7621 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7622 
7623 		soresume(current_proc(), so, 1);
7624 	} else {
7625 		struct proc *p = current_proc();
7626 		struct fileproc *fp;
7627 		int count = 0;
7628 
7629 		/*
7630 		 * Unlock socket to avoid lock ordering issue with
7631 		 * the proc fd table lock
7632 		 */
7633 		socket_unlock(so, 0);
7634 
7635 		proc_fdlock(p);
7636 		fdt_foreach(fp, p) {
7637 			struct socket *so2;
7638 
7639 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7640 				continue;
7641 			}
7642 
7643 			so2 = (struct socket *)fp_get_data(fp);
7644 			if (so != so2 &&
7645 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7646 				count++;
7647 			}
7648 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7649 				break;
7650 			}
7651 		}
7652 		proc_fdunlock(p);
7653 
7654 		socket_lock(so, 0);
7655 
7656 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7657 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7658 			error = EBUSY;
7659 		} else if (so->so_flags & SOF_DELEGATED) {
7660 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7661 			error = EBUSY;
7662 		} else {
7663 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7664 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7665 		}
7666 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7667 		    "%s marked for extended bk idle\n",
7668 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7669 		    so->so_gencnt,
7670 		    SOCK_DOM(so), SOCK_TYPE(so),
7671 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7672 		    "is" : "not");
7673 	}
7674 
7675 	return error;
7676 }
7677 
/*
 * Terminate a socket's extended background idle grant and immediately
 * force it defunct (sosetdefunct with noforce == FALSE, then sodefunct
 * once SOF_DEFUNCT is set).
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	/* Active count must never go negative */
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7696 
7697 void
so_drain_extended_bk_idle(struct socket * so)7698 so_drain_extended_bk_idle(struct socket *so)
7699 {
7700 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7701 		/*
7702 		 * Only penalize sockets that have outstanding data
7703 		 */
7704 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7705 			so_stop_extended_bk_idle(so);
7706 
7707 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7708 		}
7709 	}
7710 }
7711 
/*
 * Return value tells if the socket is still in extended background
 * idle: 1 while idle (or not idle at all), 0 once the idle time budget
 * has expired and the socket has been forced defunct.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));
		/* Budget exhausted: stop the grant and defunct the socket */
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			/* Still within budget: reschedule the lazy timer */
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}
7742 
/*
 * Resume every socket owned by a process that is marked as having a
 * socket in extended background idle (P_LXBKIDLEINPROG), then clear
 * that process flag.
 */
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		/* Walk the process's fd table, resuming each socket */
		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
7764 
7765 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7766 so_set_recv_anyif(struct socket *so, int optval)
7767 {
7768 	int ret = 0;
7769 
7770 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7771 		if (optval) {
7772 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7773 		} else {
7774 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7775 		}
7776 #if SKYWALK
7777 		inp_update_netns_flags(so);
7778 #endif /* SKYWALK */
7779 	}
7780 
7781 
7782 	return ret;
7783 }
7784 
7785 __private_extern__ int
so_get_recv_anyif(struct socket * so)7786 so_get_recv_anyif(struct socket *so)
7787 {
7788 	int ret = 0;
7789 
7790 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7791 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7792 	}
7793 
7794 	return ret;
7795 }
7796 
7797 int
so_set_restrictions(struct socket * so,uint32_t vals)7798 so_set_restrictions(struct socket *so, uint32_t vals)
7799 {
7800 	int nocell_old, nocell_new;
7801 	int noexpensive_old, noexpensive_new;
7802 	int noconstrained_old, noconstrained_new;
7803 
7804 	/*
7805 	 * Deny-type restrictions are trapdoors; once set they cannot be
7806 	 * unset for the lifetime of the socket.  This allows them to be
7807 	 * issued by a framework on behalf of the application without
7808 	 * having to worry that they can be undone.
7809 	 *
7810 	 * Note here that socket-level restrictions overrides any protocol
7811 	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7812 	 * socket restriction issued on the socket has a higher precendence
7813 	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7814 	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7815 	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7816 	 */
7817 	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7818 	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7819 	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7820 	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7821 	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7822 	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7823 	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7824 	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7825 	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7826 
7827 	/* we can only set, not clear restrictions */
7828 	if ((nocell_new - nocell_old) == 0 &&
7829 	    (noexpensive_new - noexpensive_old) == 0 &&
7830 	    (noconstrained_new - noconstrained_old) == 0) {
7831 		return 0;
7832 	}
7833 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7834 		if (nocell_new - nocell_old != 0) {
7835 			/*
7836 			 * if deny cellular is now set, do what's needed
7837 			 * for INPCB
7838 			 */
7839 			inp_set_nocellular(sotoinpcb(so));
7840 		}
7841 		if (noexpensive_new - noexpensive_old != 0) {
7842 			inp_set_noexpensive(sotoinpcb(so));
7843 		}
7844 		if (noconstrained_new - noconstrained_old != 0) {
7845 			inp_set_noconstrained(sotoinpcb(so));
7846 		}
7847 	}
7848 
7849 	if (SOCK_DOM(so) == PF_MULTIPATH) {
7850 		mptcp_set_restrictions(so);
7851 	}
7852 
7853 	return 0;
7854 }
7855 
/*
 * Report the deny-type restrictions currently set on the socket.
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED can be set via
 * so_set_restrictions() but is not part of the mask returned here --
 * confirm whether that omission is intentional.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
7863 
/*
 * Delegate the socket to the process identified by epid: record that
 * process's unique id, pid and executable UUID as the socket's
 * "effective" owner and set SOF_DELEGATED.  Delegating to the issuing
 * process itself clears the delegation.  On success the socket's
 * policies are re-evaluated.  Returns 0, or EINVAL / EACCES / ESRCH.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): the || below requires the privilege unless epid
	 * matches BOTH so->last_pid and proc_pid(p), which is stricter
	 * than the "or" described above -- confirm intent before changing.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* Also record the responsible process, when distinct */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Sign flip presumably forces a policy refresh -- see so_update_policy() */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
7979 
/*
 * Delegate the socket to the process identified by executable UUID
 * euuid.  Unlike so_set_effective_pid(), no pid is available, so the
 * socket's real {pid,upid} are inherited as the effective ones.
 * Delegating to the issuer's own UUID clears the delegation.  On
 * success the socket's policies are re-evaluated.
 * Returns 0, EINVAL or EACCES.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): as in so_set_effective_pid(), the || below
	 * requires the privilege unless euuid matches BOTH last_uuid
	 * and the issuer's uuid -- stricter than the "or" described
	 * above; confirm intent before changing.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Sign flip presumably forces a policy refresh -- see so_update_policy() */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8083 
8084 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8085 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8086     uint32_t ev_datalen)
8087 {
8088 	struct kev_msg ev_msg;
8089 
8090 	/*
8091 	 * A netpolicy event always starts with a netpolicy_event_data
8092 	 * structure, but the caller can provide for a longer event
8093 	 * structure to post, depending on the event code.
8094 	 */
8095 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8096 
8097 	bzero(&ev_msg, sizeof(ev_msg));
8098 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8099 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8100 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8101 	ev_msg.event_code       = ev_code;
8102 
8103 	ev_msg.dv[0].data_ptr   = ev_data;
8104 	ev_msg.dv[0].data_length = ev_datalen;
8105 
8106 	kev_post_msg(&ev_msg);
8107 }
8108 
8109 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8110 socket_post_kev_msg(uint32_t ev_code,
8111     struct kev_socket_event_data *ev_data,
8112     uint32_t ev_datalen)
8113 {
8114 	struct kev_msg ev_msg;
8115 
8116 	bzero(&ev_msg, sizeof(ev_msg));
8117 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8118 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8119 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8120 	ev_msg.event_code = ev_code;
8121 
8122 	ev_msg.dv[0].data_ptr = ev_data;
8123 	ev_msg.dv[0].data_length = ev_datalen;
8124 
8125 	kev_post_msg(&ev_msg);
8126 }
8127 
8128 void
socket_post_kev_msg_closed(struct socket * so)8129 socket_post_kev_msg_closed(struct socket *so)
8130 {
8131 	struct kev_socket_closed ev = {};
8132 	struct sockaddr *socksa = NULL, *peersa = NULL;
8133 	int err;
8134 
8135 	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8136 		return;
8137 	}
8138 	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8139 	if (err == 0) {
8140 		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8141 		    &peersa);
8142 		if (err == 0) {
8143 			memcpy(&ev.ev_data.kev_sockname, socksa,
8144 			    min(socksa->sa_len,
8145 			    sizeof(ev.ev_data.kev_sockname)));
8146 			memcpy(&ev.ev_data.kev_peername, peersa,
8147 			    min(peersa->sa_len,
8148 			    sizeof(ev.ev_data.kev_peername)));
8149 			socket_post_kev_msg(KEV_SOCKET_CLOSED,
8150 			    &ev.ev_data, sizeof(ev));
8151 		}
8152 	}
8153 	free_sockaddr(socksa);
8154 	free_sockaddr(peersa);
8155 }
8156 
/*
 * Assertion-failure handler: panics with the failed expression, file
 * and line.  Declared to return int so it can be used in expression
 * contexts, but it never returns (noreturn + panic).
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8165