xref: /xnu-10063.121.3/bsd/kern/uipc_socket.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <netinet/in.h>
106 #include <netinet/in_pcb.h>
107 #include <netinet/in_tclass.h>
108 #include <netinet/in_var.h>
109 #include <netinet/tcp_var.h>
110 #include <netinet/ip6.h>
111 #include <netinet6/ip6_var.h>
112 #include <netinet/flow_divert.h>
113 #include <kern/zalloc.h>
114 #include <kern/locks.h>
115 #include <machine/limits.h>
116 #include <libkern/OSAtomic.h>
117 #include <pexpert/pexpert.h>
118 #include <kern/assert.h>
119 #include <kern/task.h>
120 #include <kern/policy_internal.h>
121 
122 #include <sys/kpi_mbuf.h>
123 #include <sys/mcache.h>
124 #include <sys/unpcb.h>
125 #include <libkern/section_keywords.h>
126 
127 #include <os/log.h>
128 
129 #if CONFIG_MACF
130 #include <security/mac_framework.h>
131 #endif /* MAC */
132 
133 #if MULTIPATH
134 #include <netinet/mp_pcb.h>
135 #include <netinet/mptcp_var.h>
136 #endif /* MULTIPATH */
137 
138 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
139 
140 #if DEBUG || DEVELOPMENT
141 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
142 #else
143 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
144 #endif
145 
146 /* TODO: this should be in a header file somewhere */
147 extern char *proc_name_address(void *p);
148 
149 static u_int32_t        so_cache_hw;    /* High water mark for socache */
150 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
151 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
152 static u_int32_t        cached_sock_count = 0;
153 STAILQ_HEAD(, socket)   so_cache_head;
154 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
155 static uint64_t        so_cache_time;
156 static int              socketinit_done;
157 static struct zone      *so_cache_zone;
158 
159 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
160 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
161 
162 #include <machine/limits.h>
163 
164 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
165 static void     filt_sordetach(struct knote *kn);
166 static int      filt_soread(struct knote *kn, long hint);
167 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
168 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
169 
170 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
171 static void     filt_sowdetach(struct knote *kn);
172 static int      filt_sowrite(struct knote *kn, long hint);
173 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
174 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
175 
176 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
177 static void     filt_sockdetach(struct knote *kn);
178 static int      filt_sockev(struct knote *kn, long hint);
179 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
180 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
181 
182 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
183 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
184 
185 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
186 	.f_isfd = 1,
187 	.f_attach = filt_sorattach,
188 	.f_detach = filt_sordetach,
189 	.f_event = filt_soread,
190 	.f_touch = filt_sortouch,
191 	.f_process = filt_sorprocess,
192 };
193 
194 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
195 	.f_isfd = 1,
196 	.f_attach = filt_sowattach,
197 	.f_detach = filt_sowdetach,
198 	.f_event = filt_sowrite,
199 	.f_touch = filt_sowtouch,
200 	.f_process = filt_sowprocess,
201 };
202 
203 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
204 	.f_isfd = 1,
205 	.f_attach = filt_sockattach,
206 	.f_detach = filt_sockdetach,
207 	.f_event = filt_sockev,
208 	.f_touch = filt_socktouch,
209 	.f_process = filt_sockprocess,
210 };
211 
212 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
213 	.f_isfd = 1,
214 	.f_attach = filt_sorattach,
215 	.f_detach = filt_sordetach,
216 	.f_event = filt_soread,
217 	.f_touch = filt_sortouch,
218 	.f_process = filt_sorprocess,
219 };
220 
221 SYSCTL_DECL(_kern_ipc);
222 
223 #define EVEN_MORE_LOCKING_DEBUG 0
224 
225 int socket_debug = 0;
226 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
227     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
228 
229 #if (DEBUG || DEVELOPMENT)
230 #define DEFAULT_SOSEND_ASSERT_PANIC 1
231 #else
232 #define DEFAULT_SOSEND_ASSERT_PANIC 0
233 #endif /* (DEBUG || DEVELOPMENT) */
234 
235 int sosend_assert_panic = 0;
236 SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
237     CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
238 
239 static unsigned long sodefunct_calls = 0;
240 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
241     &sodefunct_calls, "");
242 
243 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
244 so_gen_t        so_gencnt;      /* generation count for sockets */
245 
246 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
247 
248 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
249 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
250 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
251 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
252 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
253 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
254 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
255 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
256 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
257 
258 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
259 
260 int somaxconn = SOMAXCONN;
261 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
262     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
263 
264 /* Should we get a maximum also ??? */
265 static int sosendmaxchain = 65536;
266 static int sosendminchain = 16384;
267 static int sorecvmincopy  = 16384;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
269     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
270 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
271     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
272 
273 /*
274  * Set to enable jumbo clusters (if available) for large writes when
275  * the socket is marked with SOF_MULTIPAGES; see below.
276  */
277 int sosendjcl = 1;
278 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
279     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
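/*
 * Illustrative sketch (not part of the kernel build): the tunables above
 * are exported to userland as "kern.ipc.*" OIDs and can be read with
 * sysctlbyname(3).  A minimal reader:
 */
#if 0   /* userland example */
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	/* Nonzero means jumbo clusters may be used for large writes */
	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0) {
		printf("sosendjcl = %d\n", val);
	}
	return 0;
}
#endif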
280 
281 /*
282  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
283  * writes on the socket for all protocols on any network interfaces,
284  * depending upon sosendjcl above.  Be extra careful when setting this
285  * to 1, because sending down packets that cross physical pages down to
286  * broken drivers (those that falsely assume that the physical pages
287  * are contiguous) might lead to system panics or silent data corruption.
288  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
289  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
290  * capable.  Set this to 1 only for testing/debugging purposes.
291  */
292 int sosendjcl_ignore_capab = 0;
293 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
294     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
295 
296 /*
297  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
298  * writes on the socket for all protocols on any network interfaces.
299  * Be extra careful when setting this to 1, because sending down packets with
300  * clusters larger than 2 KB might lead to system panics or data corruption.
301  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
302  * on the outgoing interface.
303  * Set this to 1 for testing/debugging purposes only.
304  */
305 int sosendbigcl_ignore_capab = 0;
306 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
307     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
308 
309 int sodefunctlog = 0;
310 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
311     &sodefunctlog, 0, "");
312 
313 int sothrottlelog = 0;
314 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
315     &sothrottlelog, 0, "");
316 
317 int sorestrictrecv = 1;
318 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
319     &sorestrictrecv, 0, "Enable inbound interface restrictions");
320 
321 int sorestrictsend = 1;
322 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
323     &sorestrictsend, 0, "Enable outbound interface restrictions");
324 
325 int soreserveheadroom = 1;
326 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
327     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
328 
329 #if (DEBUG || DEVELOPMENT)
330 int so_notsent_lowat_check = 1;
331 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
332     &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
333 #endif /* DEBUG || DEVELOPMENT */
334 
335 int so_accept_list_waits = 0;
336 #if (DEBUG || DEVELOPMENT)
337 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
338     &so_accept_list_waits, 0, "number of waits for listener incomp list");
339 #endif /* DEBUG || DEVELOPMENT */
340 
341 extern struct inpcbinfo tcbinfo;
342 
343 /* TODO: these should be in a header file */
344 extern int get_inpcb_str_size(void);
345 extern int get_tcp_str_size(void);
346 
347 vm_size_t       so_cache_zone_element_size;
348 
349 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
350     user_ssize_t *);
351 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
352 static void cached_sock_free(struct socket *);
353 
354 /*
355  * Maximum number of extended background idle sockets per process.
356  * Set to zero to disable further setting of the option.
357  */
358 
359 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
360 #define SO_IDLE_BK_IDLE_TIME            600
361 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
362 
363 struct soextbkidlestat soextbkidlestat;
364 
365 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
366     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
367     "Maximum of extended background idle sockets per process");
368 
369 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
370     &soextbkidlestat.so_xbkidle_time, 0,
371     "Time in seconds to keep extended background idle sockets");
372 
373 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
374     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
375     "High water mark for extended background idle sockets");
376 
377 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
378     &soextbkidlestat, soextbkidlestat, "");
379 
380 int so_set_extended_bk_idle(struct socket *, int);
381 
382 #define SO_MAX_MSG_X 1024
383 
384 /*
385  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
386  * setting the DSCP code on the packet based on the service class; see
387  * <rdar://problem/11277343> for details.
388  */
389 __private_extern__ u_int32_t sotcdb = 0;
390 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
391     &sotcdb, 0, "");
392 
393 void
394 socketinit(void)
395 {
396 	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
397 	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
398 
399 #ifdef __LP64__
400 	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
401 	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
402 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
403 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
404 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
405 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
406 #else
407 	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
408 	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
409 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
410 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
411 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
412 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
413 #endif
414 
415 	if (socketinit_done) {
416 		printf("socketinit: already called...\n");
417 		return;
418 	}
419 	socketinit_done = 1;
420 
421 	PE_parse_boot_argn("socket_debug", &socket_debug,
422 	    sizeof(socket_debug));
423 
424 	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
425 	    sizeof(sosend_assert_panic));
426 
427 	STAILQ_INIT(&so_cache_head);
428 
429 	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
430 	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
431 
432 	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
433 	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
434 
435 	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
436 	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
437 	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
438 	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
439 
440 	in_pcbinit();
441 }
442 
443 static void
444 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
445 {
446 	caddr_t temp;
447 	uintptr_t offset;
448 
449 	lck_mtx_lock(&so_cache_mtx);
450 
451 	if (!STAILQ_EMPTY(&so_cache_head)) {
452 		VERIFY(cached_sock_count > 0);
453 
454 		*so = STAILQ_FIRST(&so_cache_head);
455 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
456 		STAILQ_NEXT((*so), so_cache_ent) = NULL;
457 
458 		cached_sock_count--;
459 		lck_mtx_unlock(&so_cache_mtx);
460 
461 		temp = (*so)->so_saved_pcb;
462 		bzero((caddr_t)*so, sizeof(struct socket));
463 
464 		(*so)->so_saved_pcb = temp;
465 	} else {
466 		lck_mtx_unlock(&so_cache_mtx);
467 
468 		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);
469 
470 		/*
471 		 * Define offsets for extra structures into our
472 		 * single block of memory. Align extra structures
473 		 * on longword boundaries.
474 		 */
475 
476 		offset = (uintptr_t)*so;
477 		offset += sizeof(struct socket);
478 
479 		offset = ALIGN(offset);
480 
481 		(*so)->so_saved_pcb = (caddr_t)offset;
482 		offset += get_inpcb_str_size();
483 
484 		offset = ALIGN(offset);
485 
486 		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
487 		    (caddr_t)offset;
488 	}
489 
490 	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
491 }
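/*
 * Layout sketch of the single so_cache_zone block carved up above (sizes
 * are illustrative; the real ones come from get_inpcb_str_size() and
 * get_tcp_str_size(), with ALIGN() padding in between):
 *
 *	+-------------------------+  <- *so
 *	| struct socket           |
 *	+-------------------------+  <- ALIGN(*so + sizeof(struct socket))
 *	| inpcb  (so_saved_pcb)   |
 *	+-------------------------+  <- ALIGN(pcb + get_inpcb_str_size())
 *	| tcpcb  (inp_saved_ppcb) |
 *	+-------------------------+
 */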
492 
493 static void
494 cached_sock_free(struct socket *so)
495 {
496 	lck_mtx_lock(&so_cache_mtx);
497 
498 	so_cache_time = net_uptime();
499 	if (++cached_sock_count > max_cached_sock_count) {
500 		--cached_sock_count;
501 		lck_mtx_unlock(&so_cache_mtx);
502 		zfree(so_cache_zone, so);
503 	} else {
504 		if (so_cache_hw < cached_sock_count) {
505 			so_cache_hw = cached_sock_count;
506 		}
507 
508 		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
509 
510 		so->cache_timestamp = so_cache_time;
511 		lck_mtx_unlock(&so_cache_mtx);
512 	}
513 }
514 
515 void
516 so_update_last_owner_locked(struct socket *so, proc_t self)
517 {
518 	if (so->last_pid != 0) {
519 		/*
520 		 * last_pid and last_upid should remain zero for sockets
521 		 * created using sock_socket. The check above achieves that
522 		 */
523 		if (self == PROC_NULL) {
524 			self = current_proc();
525 		}
526 
527 		if (so->last_upid != proc_uniqueid(self) ||
528 		    so->last_pid != proc_pid(self)) {
529 			so->last_upid = proc_uniqueid(self);
530 			so->last_pid = proc_pid(self);
531 			proc_getexecutableuuid(self, so->last_uuid,
532 			    sizeof(so->last_uuid));
533 			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
534 				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
535 			}
536 		}
537 		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
538 	}
539 }
540 
541 void
542 so_update_policy(struct socket *so)
543 {
544 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
545 		(void) inp_update_policy(sotoinpcb(so));
546 	}
547 }
548 
549 #if NECP
550 static void
551 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
552     struct sockaddr *override_remote_addr)
553 {
554 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
555 		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
556 		    override_remote_addr, 0);
557 	}
558 }
559 #endif /* NECP */
560 
561 boolean_t
562 so_cache_timer(void)
563 {
564 	struct socket   *p;
565 	int             n_freed = 0;
566 	boolean_t rc = FALSE;
567 
568 	lck_mtx_lock(&so_cache_mtx);
569 	so_cache_timeouts++;
570 	so_cache_time = net_uptime();
571 
572 	while (!STAILQ_EMPTY(&so_cache_head)) {
573 		VERIFY(cached_sock_count > 0);
574 		p = STAILQ_FIRST(&so_cache_head);
575 		if ((so_cache_time - p->cache_timestamp) <
576 		    SO_CACHE_TIME_LIMIT) {
577 			break;
578 		}
579 
580 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
581 		--cached_sock_count;
582 
583 		zfree(so_cache_zone, p);
584 
585 		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
586 			so_cache_max_freed++;
587 			break;
588 		}
589 	}
590 
591 	/* Schedule again if there is more to cleanup */
592 	if (!STAILQ_EMPTY(&so_cache_head)) {
593 		rc = TRUE;
594 	}
595 
596 	lck_mtx_unlock(&so_cache_mtx);
597 	return rc;
598 }
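/*
 * Illustrative caller sketch; the actual scheduling lives in the stack's
 * timer machinery, not in this file.  The return value is a re-arm hint:
 * keep firing while entries remain to be reaped.
 */
#if 0
static void
so_cache_reap_example(void)            /* hypothetical driver */
{
	boolean_t rearm;

	do {
		/* reap entries older than SO_CACHE_TIME_LIMIT */
		rearm = so_cache_timer();
		/* pause between passes (hypothetical interval) */
		delay_for_interval(1, NSEC_PER_SEC);
	} while (rearm);
}
#endif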
599 
600 /*
601  * Get a socket structure from our zone, and initialize it.
602  * We don't implement `waitok' yet (see comments in uipc_domain.c).
603  * Note that it would probably be better to allocate socket
604  * and PCB at the same time, but I'm not convinced that all
605  * the protocols can be easily modified to do this.
606  */
607 struct socket *
608 soalloc(int waitok, int dom, int type)
609 {
610 	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
611 	struct socket *so;
612 
613 	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
614 		cached_sock_alloc(&so, how);
615 	} else {
616 		so = zalloc_flags(socket_zone, how | Z_ZERO);
617 	}
618 	if (so != NULL) {
619 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
620 
621 		/*
622 		 * Increment the socket allocation statistics
623 		 */
624 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
625 	}
626 
627 	return so;
628 }
629 
630 int
631 socreate_internal(int dom, struct socket **aso, int type, int proto,
632     struct proc *p, uint32_t flags, struct proc *ep)
633 {
634 	struct protosw *prp;
635 	struct socket *so;
636 	int error = 0;
637 	pid_t rpid = -1;
638 
639 #if TCPDEBUG
640 	extern int tcpconsdebug;
641 #endif
642 
643 	VERIFY(aso != NULL);
644 	*aso = NULL;
645 
646 	if (proto != 0) {
647 		prp = pffindproto(dom, proto, type);
648 	} else {
649 		prp = pffindtype(dom, type);
650 	}
651 
652 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
653 		if (pffinddomain(dom) == NULL) {
654 			return EAFNOSUPPORT;
655 		}
656 		if (proto != 0) {
657 			if (pffindprotonotype(dom, proto) != NULL) {
658 				return EPROTOTYPE;
659 			}
660 		}
661 		return EPROTONOSUPPORT;
662 	}
663 	if (prp->pr_type != type) {
664 		return EPROTOTYPE;
665 	}
666 	so = soalloc(1, dom, type);
667 	if (so == NULL) {
668 		return ENOBUFS;
669 	}
670 
671 	switch (dom) {
672 	case PF_LOCAL:
673 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
674 		break;
675 	case PF_INET:
676 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
677 		if (type == SOCK_STREAM) {
678 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
679 		} else {
680 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
681 		}
682 		break;
683 	case PF_ROUTE:
684 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
685 		break;
686 	case PF_NDRV:
687 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
688 		break;
689 	case PF_KEY:
690 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
691 		break;
692 	case PF_INET6:
693 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
694 		if (type == SOCK_STREAM) {
695 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
696 		} else {
697 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
698 		}
699 		break;
700 	case PF_SYSTEM:
701 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
702 		break;
703 	case PF_MULTIPATH:
704 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
705 		break;
706 	default:
707 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
708 		break;
709 	}
710 
711 	if (flags & SOCF_MPTCP) {
712 		so->so_state |= SS_NBIO;
713 	}
714 
715 	TAILQ_INIT(&so->so_incomp);
716 	TAILQ_INIT(&so->so_comp);
717 	so->so_type = (short)type;
718 	so->so_family = prp->pr_domain->dom_family;
719 	so->so_protocol = prp->pr_protocol;
720 	so->last_upid = proc_uniqueid(p);
721 	so->last_pid = proc_pid(p);
722 	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
723 	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
724 
725 	so->so_rpid = -1;
726 	uuid_clear(so->so_ruuid);
727 
728 	if (ep != PROC_NULL && ep != p) {
729 		so->e_upid = proc_uniqueid(ep);
730 		so->e_pid = proc_pid(ep);
731 		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
732 		so->so_flags |= SOF_DELEGATED;
733 		if (ep->p_responsible_pid != so->e_pid) {
734 			rpid = ep->p_responsible_pid;
735 			so->so_rpid = rpid;
736 			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
737 		}
738 	}
739 
740 	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
741 		rpid = p->p_responsible_pid;
742 		so->so_rpid = rpid;
743 		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
744 	}
745 
746 	so->so_cred = kauth_cred_proc_ref(p);
747 	if (!suser(kauth_cred_get(), NULL)) {
748 		so->so_state |= SS_PRIV;
749 	}
750 
751 	so->so_persona_id = current_persona_get_id();
752 	so->so_proto = prp;
753 	so->so_rcv.sb_flags |= SB_RECV;
754 	so->so_rcv.sb_so = so->so_snd.sb_so = so;
755 	so->next_lock_lr = 0;
756 	so->next_unlock_lr = 0;
757 
758 	/*
759 	 * Attachment will create the per pcb lock if necessary and
760 	 * increase refcount for creation, make sure it's done before
761 	 * socket is inserted in lists.
762 	 */
763 	so->so_usecount++;
764 
765 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
766 	if (error != 0) {
767 		/*
768 		 * Warning:
769 		 * If so_pcb is not zero, the socket will be leaked,
770		 * so the protocol attachment handler must be coded carefully
771 		 */
772 		if (so->so_pcb != NULL) {
773 			os_log_error(OS_LOG_DEFAULT,
774 			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
775 			    error, dom, proto, type);
776 		}
777 		/*
778 		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
779 		 */
780 		so->so_state |= SS_NOFDREF;
781 		so->so_flags |= SOF_PCBCLEARING;
782 		VERIFY(so->so_usecount > 0);
783 		so->so_usecount--;
784 		sofreelastref(so, 1);   /* will deallocate the socket */
785 		return error;
786 	}
787 
788 	/*
789 	 * Note: needs so_pcb to be set after pru_attach
790 	 */
791 	if (prp->pr_update_last_owner != NULL) {
792 		(*prp->pr_update_last_owner)(so, p, ep);
793 	}
794 
795 	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);
796 
797 	/* Attach socket filters for this protocol */
798 	sflt_initsock(so);
799 #if TCPDEBUG
800 	if (tcpconsdebug == 2) {
801 		so->so_options |= SO_DEBUG;
802 	}
803 #endif
804 	so_set_default_traffic_class(so);
805 
806 	/*
807 	 * If this thread or task is marked to create backgrounded sockets,
808 	 * mark the socket as background.
809 	 */
810 	if (!(flags & SOCF_MPTCP) &&
811 	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
812 		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
813 		so->so_background_thread = current_thread();
814 	}
815 
816 	switch (dom) {
817 	/*
818 	 * Don't mark Unix domain or system
819 	 * eligible for defunct by default.
820 	 */
821 	case PF_LOCAL:
822 	case PF_SYSTEM:
823 		so->so_flags |= SOF_NODEFUNCT;
824 		break;
825 	default:
826 		break;
827 	}
828 
829 	/*
830 	 * Entitlements can't be checked at socket creation time except if the
831 	 * application requested a feature guarded by a privilege (c.f., socket
832 	 * delegation).
833 	 * The priv(9) and the Sandboxing APIs are designed with the idea that
834 	 * a privilege check should only be triggered by a userland request.
835 	 * A privilege check at socket creation time is time consuming and
836 	 * could trigger many authorisation error messages from the security
837 	 * APIs.
838 	 */
839 
840 	*aso = so;
841 
842 	return 0;
843 }
844 
845 /*
846  * Returns:	0			Success
847  *		EAFNOSUPPORT
848  *		EPROTOTYPE
849  *		EPROTONOSUPPORT
850  *		ENOBUFS
851  *	<pru_attach>:ENOBUFS[AF_UNIX]
852  *	<pru_attach>:ENOBUFS[TCP]
853  *	<pru_attach>:ENOMEM[TCP]
854  *	<pru_attach>:???		[other protocol families, IPSEC]
855  */
856 int
857 socreate(int dom, struct socket **aso, int type, int proto)
858 {
859 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
860 	           PROC_NULL);
861 }
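/*
 * Userland sketch (example only): the socket(2) arguments map directly
 * onto the dom/type/proto parameters of socreate() above.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>

int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
/* in-kernel equivalent: socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP) */
#endif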
862 
863 int
864 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
865 {
866 	int error = 0;
867 	struct proc *ep = PROC_NULL;
868 
869 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
870 		error = ESRCH;
871 		goto done;
872 	}
873 
874 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
875 
876 	/*
877 	 * It might not be wise to hold the proc reference when calling
878 	 * socreate_internal since it calls soalloc with M_WAITOK
879 	 */
880 done:
881 	if (ep != PROC_NULL) {
882 		proc_rele(ep);
883 	}
884 
885 	return error;
886 }
887 
888 /*
889  * Returns:	0			Success
890  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
891  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
892  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
893  *	<pru_bind>:EINVAL		Invalid argument
894  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
895  *	<pru_bind>:EACCES		Permission denied
896  *	<pru_bind>:EADDRINUSE		Address in use
897  *	<pru_bind>:EAGAIN		Resource unavailable, try again
898  *	<pru_bind>:EPERM		Operation not permitted
899  *	<pru_bind>:???
900  *	<sf_bind>:???
901  *
902  * Notes:	It's not possible to fully enumerate the return codes above,
903  *		since socket filter authors and protocol family authors may
904  *		not choose to limit their error returns to those listed, even
905  *		though this may result in some software operating incorrectly.
906  *
907  *		The error codes which are enumerated above are those known to
908  *		be returned by the tcp_usr_bind function supplied.
909  */
910 int
911 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
912 {
913 	struct proc *p = current_proc();
914 	int error = 0;
915 
916 	if (dolock) {
917 		socket_lock(so, 1);
918 	}
919 
920 	so_update_last_owner_locked(so, p);
921 	so_update_policy(so);
922 
923 #if NECP
924 	so_update_necp_policy(so, nam, NULL);
925 #endif /* NECP */
926 
927 	/*
928 	 * If this is a bind request on a socket that has been marked
929 	 * as inactive, reject it now before we go any further.
930 	 */
931 	if (so->so_flags & SOF_DEFUNCT) {
932 		error = EINVAL;
933		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
934 		    __func__, proc_pid(p), proc_best_name(p),
935 		    so->so_gencnt,
936 		    SOCK_DOM(so), SOCK_TYPE(so), error);
937 		goto out;
938 	}
939 
940 	/* Socket filter */
941 	error = sflt_bind(so, nam);
942 
943 	if (error == 0) {
944 		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
945 	}
946 out:
947 	if (dolock) {
948 		socket_unlock(so, 1);
949 	}
950 
951 	if (error == EJUSTRETURN) {
952 		error = 0;
953 	}
954 
955 	return error;
956 }
957 
958 void
959 sodealloc(struct socket *so)
960 {
961 	kauth_cred_unref(&so->so_cred);
962 
963 	/* Remove any filters */
964 	sflt_termsock(so);
965 
966 	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
967 
968 	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
969 		cached_sock_free(so);
970 	} else {
971 		zfree(socket_zone, so);
972 	}
973 }
974 
975 /*
976  * Returns:	0			Success
977  *		EINVAL
978  *		EOPNOTSUPP
979  *	<pru_listen>:EINVAL[AF_UNIX]
980  *	<pru_listen>:EINVAL[TCP]
981  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
982  *	<pru_listen>:EINVAL[TCP]	Invalid argument
983  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
984  *	<pru_listen>:EACCES[TCP]	Permission denied
985  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
986  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
987  *	<pru_listen>:EPERM[TCP]		Operation not permitted
988  *	<sf_listen>:???
989  *
990  * Notes:	Other <pru_listen> returns depend on the protocol family; all
991  *		<sf_listen> returns depend on what the filter author causes
992  *		their filter to return.
993  */
994 int
995 solisten(struct socket *so, int backlog)
996 {
997 	struct proc *p = current_proc();
998 	int error = 0;
999 
1000 	socket_lock(so, 1);
1001 
1002 	so_update_last_owner_locked(so, p);
1003 	so_update_policy(so);
1004 
1005 	if (TAILQ_EMPTY(&so->so_comp)) {
1006 		so->so_options |= SO_ACCEPTCONN;
1007 	}
1008 
1009 #if NECP
1010 	so_update_necp_policy(so, NULL, NULL);
1011 #endif /* NECP */
1012 
1013 	if (so->so_proto == NULL) {
1014 		error = EINVAL;
1015 		so->so_options &= ~SO_ACCEPTCONN;
1016 		goto out;
1017 	}
1018 	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1019 		error = EOPNOTSUPP;
1020 		so->so_options &= ~SO_ACCEPTCONN;
1021 		goto out;
1022 	}
1023 
1024 	/*
1025 	 * If the listen request is made on a socket that is not fully
1026 	 * disconnected, or on a socket that has been marked as inactive,
1027 	 * reject the request now.
1028 	 */
1029 	if ((so->so_state &
1030 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1031 	    (so->so_flags & SOF_DEFUNCT)) {
1032 		error = EINVAL;
1033 		if (so->so_flags & SOF_DEFUNCT) {
1034			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1035 			    "(%d)\n", __func__, proc_pid(p),
1036 			    proc_best_name(p),
1037 			    so->so_gencnt,
1038 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1039 		}
1040 		so->so_options &= ~SO_ACCEPTCONN;
1041 		goto out;
1042 	}
1043 
1044 	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1045 		error = EPERM;
1046 		so->so_options &= ~SO_ACCEPTCONN;
1047 		goto out;
1048 	}
1049 
1050 	error = sflt_listen(so);
1051 	if (error == 0) {
1052 		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1053 	}
1054 
1055 	if (error) {
1056 		if (error == EJUSTRETURN) {
1057 			error = 0;
1058 		}
1059 		so->so_options &= ~SO_ACCEPTCONN;
1060 		goto out;
1061 	}
1062 
1063 	/*
1064 	 * POSIX: The implementation may have an upper limit on the length of
1065  * the listen queue, either global or per accepting socket. If backlog
1066 	 * exceeds this limit, the length of the listen queue is set to the
1067 	 * limit.
1068 	 *
1069 	 * If listen() is called with a backlog argument value that is less
1070 	 * than 0, the function behaves as if it had been called with a backlog
1071 	 * argument value of 0.
1072 	 *
1073 	 * A backlog argument of 0 may allow the socket to accept connections,
1074 	 * in which case the length of the listen queue may be set to an
1075 	 * implementation-defined minimum value.
1076 	 */
1077 	if (backlog <= 0 || backlog > somaxconn) {
1078 		backlog = somaxconn;
1079 	}
1080 
1081 	so->so_qlimit = (short)backlog;
1082 out:
1083 	socket_unlock(so, 1);
1084 	return error;
1085 }
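/*
 * Userland sketch (example only) of the backlog clamping implemented
 * above: both out-of-range cases collapse to kern.ipc.somaxconn.
 */
#if 0
listen(fd, 0);        /* backlog <= 0: so_qlimit becomes somaxconn */
listen(fd, 1 << 20);  /* backlog > somaxconn: also clamped to somaxconn */
#endif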
1086 
1087 /*
1088  * The "accept list lock" protects the fields related to the listener queues
1089  * because we can unlock a socket to respect the lock ordering between
1090  * because we can unlock a socket to respect the lock ordering between
1091  * the listener socket and its client sockets. The lock ordering requires
1092  * acquiring the client socket's lock before the listener socket's lock.
1093  * The accept list lock serializes access to the following fields:
1094  * - of the listener socket:
1095  *   - so_comp
1096  *   - so_incomp
1097  *   - so_qlen
1098  *   - so_inqlen
1099  * - of client sockets that are in so_comp or so_incomp:
1100  *   - so_head
1101  *   - so_list
1102  *
1103  * As one can see, the accept list lock protects the consistency of the
1104  * linkage of the client sockets.
1105  *
1106  * Note that those fields may be read without holding the accept list lock
1107  * for a preflight provided the accept list lock is taken when committing
1108  * to take an action based on the result of the preflight. The preflight
1109  * saves the cost of doing the unlock/lock dance.
1110  */
1111 void
1112 so_acquire_accept_list(struct socket *head, struct socket *so)
1113 {
1114 	lck_mtx_t *mutex_held;
1115 
1116 	if (head->so_proto->pr_getlock == NULL) {
1117 		return;
1118 	}
1119 	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1120 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1121 
1122 	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1123 		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1124 		return;
1125 	}
1126 	if (so != NULL) {
1127 		socket_unlock(so, 0);
1128 	}
1129 	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1130 		so_accept_list_waits += 1;
1131 		msleep((caddr_t)&head->so_incomp, mutex_held,
1132 		    PSOCK | PCATCH, __func__, NULL);
1133 	}
1134 	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1135 	if (so != NULL) {
1136 		socket_unlock(head, 0);
1137 		socket_lock(so, 0);
1138 		socket_lock(head, 0);
1139 	}
1140 }
1141 
1142 void
1143 so_release_accept_list(struct socket *head)
1144 {
1145 	if (head->so_proto->pr_getlock != NULL) {
1146 		lck_mtx_t *mutex_held;
1147 
1148 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1149 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1150 
1151 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1152 		wakeup((caddr_t)&head->so_incomp);
1153 	}
1154 }
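/*
 * Sketch of the preflight pattern described above (illustrative only):
 * read the queue fields unlocked, then retest under the accept list lock
 * before committing to an action.
 */
#if 0
	if (!TAILQ_EMPTY(&head->so_comp)) {             /* unlocked preflight */
		so_acquire_accept_list(head, NULL);
		if (!TAILQ_EMPTY(&head->so_comp)) {     /* retest, serialized */
			/* ... dequeue a completed connection ... */
		}
		so_release_accept_list(head);
	}
#endif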
1155 
1156 void
1157 sofreelastref(struct socket *so, int dealloc)
1158 {
1159 	struct socket *head = so->so_head;
1160 
1161 	/* Assume socket is locked */
1162 
1163 #if FLOW_DIVERT
1164 	if (so->so_flags & SOF_FLOW_DIVERT) {
1165 		flow_divert_detach(so);
1166 	}
1167 #endif  /* FLOW_DIVERT */
1168 
1169 #if CONTENT_FILTER
1170 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1171 		cfil_sock_detach(so);
1172 	}
1173 #endif /* CONTENT_FILTER */
1174 
1175 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1176 		soflow_detach(so);
1177 	}
1178 
1179 	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1180 		selthreadclear(&so->so_snd.sb_sel);
1181 		selthreadclear(&so->so_rcv.sb_sel);
1182 		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1183 		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1184 		so->so_event = sonullevent;
1185 		return;
1186 	}
1187 	if (head != NULL) {
1188 		/*
1189 		 * Need to lock the listener when the protocol has
1190 		 * per socket locks
1191 		 */
1192 		if (head->so_proto->pr_getlock != NULL) {
1193 			socket_lock(head, 1);
1194 			so_acquire_accept_list(head, so);
1195 		}
1196 		if (so->so_state & SS_INCOMP) {
1197 			so->so_state &= ~SS_INCOMP;
1198 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
1199 			head->so_incqlen--;
1200 			head->so_qlen--;
1201 			so->so_head = NULL;
1202 
1203 			if (head->so_proto->pr_getlock != NULL) {
1204 				so_release_accept_list(head);
1205 				socket_unlock(head, 1);
1206 			}
1207 		} else if (so->so_state & SS_COMP) {
1208 			if (head->so_proto->pr_getlock != NULL) {
1209 				so_release_accept_list(head);
1210 				socket_unlock(head, 1);
1211 			}
1212 			/*
1213 			 * We must not decommission a socket that's
1214 			 * on the accept(2) queue.  If we do, then
1215 			 * accept(2) may hang after select(2) indicated
1216 			 * that the listening socket was ready.
1217 			 */
1218 			selthreadclear(&so->so_snd.sb_sel);
1219 			selthreadclear(&so->so_rcv.sb_sel);
1220 			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1221 			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1222 			so->so_event = sonullevent;
1223 			return;
1224 		} else {
1225 			if (head->so_proto->pr_getlock != NULL) {
1226 				so_release_accept_list(head);
1227 				socket_unlock(head, 1);
1228 			}
1229 			printf("sofree: not queued\n");
1230 		}
1231 	}
1232 	sowflush(so);
1233 	sorflush(so);
1234 
1235 	/* 3932268: disable upcall */
1236 	so->so_rcv.sb_flags &= ~SB_UPCALL;
1237 	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1238 	so->so_event = sonullevent;
1239 
1240 	if (dealloc) {
1241 		sodealloc(so);
1242 	}
1243 }
1244 
1245 void
1246 soclose_wait_locked(struct socket *so)
1247 {
1248 	lck_mtx_t *mutex_held;
1249 
1250 	if (so->so_proto->pr_getlock != NULL) {
1251 		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1252 	} else {
1253 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1254 	}
1255 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1256 
1257 	/*
1258 	 * Double check here and return if there's no outstanding upcall;
1259 	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1260 	 */
1261 	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1262 		return;
1263 	}
1264 	so->so_rcv.sb_flags &= ~SB_UPCALL;
1265 	so->so_snd.sb_flags &= ~SB_UPCALL;
1266 	so->so_flags |= SOF_CLOSEWAIT;
1267 
1268 	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1269 	    "soclose_wait_locked", NULL);
1270 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1271 	so->so_flags &= ~SOF_CLOSEWAIT;
1272 }
1273 
1274 /*
1275  * Close a socket on last file table reference removal.
1276  * Initiate disconnect if connected.
1277  * Free socket when disconnect complete.
1278  */
1279 int
1280 soclose_locked(struct socket *so)
1281 {
1282 	int error = 0;
1283 	struct timespec ts;
1284 
1285 	if (so->so_usecount == 0) {
1286 		panic("soclose: so=%p refcount=0", so);
1287 		/* NOTREACHED */
1288 	}
1289 
1290 	sflt_notify(so, sock_evt_closing, NULL);
1291 
1292 	if (so->so_upcallusecount) {
1293 		soclose_wait_locked(so);
1294 	}
1295 
1296 #if CONTENT_FILTER
1297 	/*
1298 	 * We have to wait until the content filters are done
1299 	 */
1300 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1301 		cfil_sock_close_wait(so);
1302 		cfil_sock_is_closed(so);
1303 		cfil_sock_detach(so);
1304 	}
1305 #endif /* CONTENT_FILTER */
1306 
1307 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1308 		soflow_detach(so);
1309 	}
1310 
1311 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1312 		soresume(current_proc(), so, 1);
1313 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1314 	}
1315 
1316 	if ((so->so_options & SO_ACCEPTCONN)) {
1317 		struct socket *sp, *sonext;
1318 		int persocklock = 0;
1319 		int incomp_overflow_only;
1320 
1321 		/*
1322		 * We do not want new connections to be added
1323 		 * to the connection queues
1324 		 */
1325 		so->so_options &= ~SO_ACCEPTCONN;
1326 
1327 		/*
1328 		 * We can drop the lock on the listener once
1329 		 * we've acquired the incoming list
1330 		 */
1331 		if (so->so_proto->pr_getlock != NULL) {
1332 			persocklock = 1;
1333 			so_acquire_accept_list(so, NULL);
1334 			socket_unlock(so, 0);
1335 		}
1336 again:
1337 		incomp_overflow_only = 1;
1338 
1339 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1340 			/*
1341 			 * Radar 5350314
1342			 * Skip sockets thrown away by tcp_dropdropablreq();
1343			 * they will get cleaned up by the garbage collection.
1344			 * Otherwise, remove the incomp socket from the queue
1345			 * and let soabort trigger the appropriate cleanup.
1346 			 */
1347 			if (sp->so_flags & SOF_OVERFLOW) {
1348 				continue;
1349 			}
1350 
1351 			if (persocklock != 0) {
1352 				socket_lock(sp, 1);
1353 			}
1354 
1355 			/*
1356 			 * Radar 27945981
1357			 * The extra reference taken for the list ensures the
1358			 * validity of the socket pointer when we perform the
1359			 * unlock of the head above.
1360 			 */
1361 			if (sp->so_state & SS_INCOMP) {
1362 				sp->so_state &= ~SS_INCOMP;
1363 				sp->so_head = NULL;
1364 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1365 				so->so_incqlen--;
1366 				so->so_qlen--;
1367 
1368 				(void) soabort(sp);
1369 			} else {
1370 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1371 				    __func__, sp);
1372 			}
1373 
1374 			if (persocklock != 0) {
1375 				socket_unlock(sp, 1);
1376 			}
1377 		}
1378 
1379 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1380 			/* Dequeue from so_comp since sofree() won't do it */
1381 			if (persocklock != 0) {
1382 				socket_lock(sp, 1);
1383 			}
1384 
1385 			if (sp->so_state & SS_COMP) {
1386 				sp->so_state &= ~SS_COMP;
1387 				sp->so_head = NULL;
1388 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1389 				so->so_qlen--;
1390 
1391 				(void) soabort(sp);
1392 			} else {
1393 				panic("%s sp %p in so_comp but !SS_COMP",
1394 				    __func__, sp);
1395 			}
1396 
1397 			if (persocklock) {
1398 				socket_unlock(sp, 1);
1399 			}
1400 		}
1401 
1402 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1403 #if (DEBUG || DEVELOPMENT)
1404			panic("%s head %p so_incomp not empty", __func__, so);
1405 #endif /* (DEBUG || DEVELOPMENT) */
1406 
1407 			goto again;
1408 		}
1409 
1410 		if (!TAILQ_EMPTY(&so->so_comp)) {
1411 #if (DEBUG || DEVELOPMENT)
1412			panic("%s head %p so_comp not empty", __func__, so);
1413 #endif /* (DEBUG || DEVELOPMENT) */
1414 
1415 			goto again;
1416 		}
1417 
1418 		if (persocklock) {
1419 			socket_lock(so, 0);
1420 			so_release_accept_list(so);
1421 		}
1422 	}
1423 	if (so->so_pcb == NULL) {
1424 		/* 3915887: mark the socket as ready for dealloc */
1425 		so->so_flags |= SOF_PCBCLEARING;
1426 		goto discard;
1427 	}
1428 
1429 	if (so->so_state & SS_ISCONNECTED) {
1430 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1431 			error = sodisconnectlocked(so);
1432 			if (error) {
1433 				goto drop;
1434 			}
1435 		}
1436 		if (so->so_options & SO_LINGER) {
1437 			if ((so->so_state & SS_ISDISCONNECTING) &&
1438 			    (so->so_state & SS_NBIO)) {
1439 				goto drop;
1440 			}
1441 			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1442 				lck_mtx_t *mutex_held;
1443 
1444 				if (so->so_proto->pr_getlock != NULL) {
1445 					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1446 				} else {
1447 					mutex_held = so->so_proto->pr_domain->dom_mtx;
1448 				}
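				/*
				 * so_linger is in 1/100ths of a second, as
				 * the conversion below assumes: e.g. 150
				 * yields 1 s + 50 * 10 ms = 1.5 s.
				 */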
1449 				ts.tv_sec = (so->so_linger / 100);
1450 				ts.tv_nsec = (so->so_linger % 100) *
1451 				    NSEC_PER_USEC * 1000 * 10;
1452 				error = msleep((caddr_t)&so->so_timeo,
1453 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1454 				if (error) {
1455 					/*
1456					 * It's OK when the timer fires;
1457					 * don't report an error
1458 					 */
1459 					if (error == EWOULDBLOCK) {
1460 						error = 0;
1461 					}
1462 					break;
1463 				}
1464 			}
1465 		}
1466 	}
1467 drop:
1468 	if (so->so_usecount == 0) {
1469 		panic("soclose: usecount is zero so=%p", so);
1470 		/* NOTREACHED */
1471 	}
1472 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1473 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1474 		if (error == 0) {
1475 			error = error2;
1476 		}
1477 	}
1478 	if (so->so_usecount <= 0) {
1479 		panic("soclose: usecount is zero so=%p", so);
1480 		/* NOTREACHED */
1481 	}
1482 discard:
1483 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1484 	    (so->so_state & SS_NOFDREF)) {
1485 		panic("soclose: NOFDREF");
1486 		/* NOTREACHED */
1487 	}
1488 	so->so_state |= SS_NOFDREF;
1489 
1490 	if ((so->so_flags & SOF_KNOTE) != 0) {
1491 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1492 	}
1493 
1494 	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1495 
1496 	VERIFY(so->so_usecount > 0);
1497 	so->so_usecount--;
1498 	sofree(so);
1499 	return error;
1500 }
1501 
1502 int
1503 soclose(struct socket *so)
1504 {
1505 	int error = 0;
1506 	socket_lock(so, 1);
1507 
1508 	if (so->so_retaincnt == 0) {
1509 		error = soclose_locked(so);
1510 	} else {
1511 		/*
1512		 * If the FD is going away but the socket is
1513		 * retained in the kernel, remove its reference
1514 		 */
1515 		so->so_usecount--;
1516 		if (so->so_usecount < 2) {
1517			panic("soclose: retaincnt non-zero and so=%p "
1518 			    "usecount=%d\n", so, so->so_usecount);
1519 		}
1520 	}
1521 	socket_unlock(so, 1);
1522 	return error;
1523 }
1524 
1525 /*
1526  * Must be called at splnet...
1527  */
1528 /* Should already be locked */
1529 int
1530 soabort(struct socket *so)
1531 {
1532 	int error;
1533 
1534 #ifdef MORE_LOCKING_DEBUG
1535 	lck_mtx_t *mutex_held;
1536 
1537 	if (so->so_proto->pr_getlock != NULL) {
1538 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1539 	} else {
1540 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1541 	}
1542 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1543 #endif
1544 
1545 	if ((so->so_flags & SOF_ABORTED) == 0) {
1546 		so->so_flags |= SOF_ABORTED;
1547 		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1548 		if (error) {
1549 			sofree(so);
1550 			return error;
1551 		}
1552 	}
1553 	return 0;
1554 }
1555 
1556 int
1557 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1558 {
1559 	int error;
1560 
1561 	if (dolock) {
1562 		socket_lock(so, 1);
1563 	}
1564 
1565 	so_update_last_owner_locked(so, PROC_NULL);
1566 	so_update_policy(so);
1567 #if NECP
1568 	so_update_necp_policy(so, NULL, NULL);
1569 #endif /* NECP */
1570 
1571 	if ((so->so_state & SS_NOFDREF) == 0) {
1572 		panic("soaccept: !NOFDREF");
1573 	}
1574 	so->so_state &= ~SS_NOFDREF;
1575 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1576 
1577 	if (dolock) {
1578 		socket_unlock(so, 1);
1579 	}
1580 	return error;
1581 }
1582 
1583 int
1584 soaccept(struct socket *so, struct sockaddr **nam)
1585 {
1586 	return soacceptlock(so, nam, 1);
1587 }
1588 
1589 int
1590 soacceptfilter(struct socket *so, struct socket *head)
1591 {
1592 	struct sockaddr *local = NULL, *remote = NULL;
1593 	int error = 0;
1594 
1595 	/*
1596 	 * Hold the lock even if this socket has not been made visible
1597 	 * to the filter(s).  For sockets with global locks, this protects
1598 	 * against the head or peer going away
1599 	 */
1600 	socket_lock(so, 1);
1601 	if (sogetaddr_locked(so, &remote, 1) != 0 ||
1602 	    sogetaddr_locked(so, &local, 0) != 0) {
1603 		so->so_state &= ~SS_NOFDREF;
1604 		socket_unlock(so, 1);
1605 		soclose(so);
1606 		/* Out of resources; try it again next time */
1607 		error = ECONNABORTED;
1608 		goto done;
1609 	}
1610 
1611 	error = sflt_accept(head, so, local, remote);
1612 
1613 	/*
1614 	 * If we get EJUSTRETURN from one of the filters, mark this socket
1615 	 * as inactive and return it anyway.  This newly accepted socket
1616 	 * will be disconnected later before we hand it off to the caller.
1617 	 */
1618 	if (error == EJUSTRETURN) {
1619 		error = 0;
1620 		(void) sosetdefunct(current_proc(), so,
1621 		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1622 	}
1623 
1624 	if (error != 0) {
1625 		/*
1626 		 * This may seem like a duplication to the above error
1627 		 * handling part when we return ECONNABORTED, except
1628 		 * the following is done while holding the lock since
1629 		 * the socket has been exposed to the filter(s) earlier.
1630 		 */
1631 		so->so_state &= ~SS_NOFDREF;
1632 		socket_unlock(so, 1);
1633 		soclose(so);
1634 		/* Propagate socket filter's error code to the caller */
1635 	} else {
1636 		socket_unlock(so, 1);
1637 	}
1638 done:
1639 	/* Callee checks for NULL pointer */
1640 	sock_freeaddr(remote);
1641 	sock_freeaddr(local);
1642 	return error;
1643 }
1644 
1645 /*
1646  * Returns:	0			Success
1647  *		EOPNOTSUPP		Operation not supported on socket
1648  *		EISCONN			Socket is connected
1649  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1650  *	<pru_connect>:EINVAL		Invalid argument
1651  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1652  *	<pru_connect>:EACCES		Permission denied
1653  *	<pru_connect>:EADDRINUSE	Address in use
1654  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1655  *	<pru_connect>:EPERM		Operation not permitted
1656  *	<sf_connect_out>:???		[anything a filter writer might set]
1657  */
1658 int
1659 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1660 {
1661 	int error;
1662 	struct proc *p = current_proc();
1663 	tracker_metadata_t metadata = { };
1664 
1665 	if (dolock) {
1666 		socket_lock(so, 1);
1667 	}
1668 
1669 	so_update_last_owner_locked(so, p);
1670 	so_update_policy(so);
1671 
1672 	/*
1673 	 * If this is a listening socket or if this is a previously-accepted
1674 	 * socket that has been marked as inactive, reject the connect request.
1675 	 */
1676 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1677 		error = EOPNOTSUPP;
1678 		if (so->so_flags & SOF_DEFUNCT) {
1679			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1680 			    "(%d)\n", __func__, proc_pid(p),
1681 			    proc_best_name(p),
1682 			    so->so_gencnt,
1683 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1684 		}
1685 		if (dolock) {
1686 			socket_unlock(so, 1);
1687 		}
1688 		return error;
1689 	}
1690 
1691 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1692 		if (dolock) {
1693 			socket_unlock(so, 1);
1694 		}
1695 		return EPERM;
1696 	}
1697 
1698 	/*
1699 	 * If protocol is connection-based, can only connect once.
1700 	 * Otherwise, if connected, try to disconnect first.
1701 	 * This allows user to disconnect by connecting to, e.g.,
1702 	 * a null address.
1703 	 */
1704 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1705 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1706 	    (error = sodisconnectlocked(so)))) {
1707 		error = EISCONN;
1708 	} else {
1709 		/*
1710		 * For connected v4/v6 sockets, check if the destination address is associated with a domain name and if it
1711		 * is a tracker domain.  Mark the socket accordingly.  Skip the lookup if the socket has already been marked a tracker.
1712 		 */
1713 		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1714 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1715 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1716 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1717 				}
1718 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1719 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1720 				}
1721 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1722 					printf("connect() - failed necp_set_socket_domain_attributes\n");
1723 				}
1724 			}
1725 		}
1726 
1727 #if NECP
1728 		/* Update NECP evaluation after setting any domain via the tracker checks */
1729 		so_update_necp_policy(so, NULL, nam);
1730 #endif /* NECP */
1731 
1732 		/*
1733 		 * Run connect filter before calling protocol:
1734 		 *  - non-blocking connect returns before completion;
1735 		 */
1736 		error = sflt_connectout(so, nam);
1737 		if (error != 0) {
1738 			if (error == EJUSTRETURN) {
1739 				error = 0;
1740 			}
1741 		} else {
1742 			error = (*so->so_proto->pr_usrreqs->pru_connect)
1743 			    (so, nam, p);
1744 			if (error != 0) {
1745 				so->so_state &= ~SS_ISCONNECTING;
1746 			}
1747 		}
1748 	}
1749 	if (dolock) {
1750 		socket_unlock(so, 1);
1751 	}
1752 	return error;
1753 }
1754 
1755 int
1756 soconnect(struct socket *so, struct sockaddr *nam)
1757 {
1758 	return soconnectlock(so, nam, 1);
1759 }
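
/*
 * Illustrative sketch, not part of this file: soconnect() above is
 * the dolock == 1 convenience wrapper; a caller that already holds
 * the socket lock uses soconnectlock() with dolock == 0 directly.
 * The surrounding helper is hypothetical.
 */
static int
example_connect_while_locked(struct socket *so, struct sockaddr *nam)
{
	int error;

	socket_lock(so, 1);
	/* ... other work that requires the socket lock ... */
	error = soconnectlock(so, nam, 0);	/* lock already held */
	socket_unlock(so, 1);
	return error;
}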
1760 
1761 /*
1762  * Returns:	0			Success
1763  *	<pru_connect2>:EINVAL[AF_UNIX]
1764  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1765  *	<pru_connect2>:???		[other protocol families]
1766  *
1767  * Notes:	<pru_connect2> is not supported by [TCP].
1768  */
1769 int
1770 soconnect2(struct socket *so1, struct socket *so2)
1771 {
1772 	int error;
1773 
1774 	socket_lock(so1, 1);
1775 	if (so2->so_proto->pr_lock) {
1776 		socket_lock(so2, 1);
1777 	}
1778 
1779 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1780 
1781 	socket_unlock(so1, 1);
1782 	if (so2->so_proto->pr_lock) {
1783 		socket_unlock(so2, 1);
1784 	}
1785 	return error;
1786 }
1787 
1788 int
1789 soconnectxlocked(struct socket *so, struct sockaddr *src,
1790     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1791     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1792     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1793 {
1794 	int error;
1795 	tracker_metadata_t metadata = { };
1796 
1797 	so_update_last_owner_locked(so, p);
1798 	so_update_policy(so);
1799 
1800 	/*
1801 	 * If this is a listening socket or if this is a previously-accepted
1802 	 * socket that has been marked as inactive, reject the connect request.
1803 	 */
1804 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1805 		error = EOPNOTSUPP;
1806 		if (so->so_flags & SOF_DEFUNCT) {
1807 			SODEFUNCTLOG("%s[%d, %s]: defunct so %llu [%d,%d] "
1808 			    "(%d)\n", __func__, proc_pid(p),
1809 			    proc_best_name(p),
1810 			    so->so_gencnt,
1811 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1812 		}
1813 		return error;
1814 	}
1815 
1816 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1817 		return EPERM;
1818 	}
1819 
1820 	/*
1821 	 * If protocol is connection-based, can only connect once
1822 	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1823 	 * try to disconnect first.  This allows user to disconnect
1824 	 * by connecting to, e.g., a null address.
1825 	 */
1826 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1827 	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1828 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1829 	    (error = sodisconnectlocked(so)) != 0)) {
1830 		error = EISCONN;
1831 	} else {
1832 		/*
1833 		 * For TCP, check if destination address is a tracker and mark the socket accordingly
1834 		 * (only if it hasn't been marked yet).
1835 		 */
1836 		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
1837 		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1838 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1839 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1840 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1841 				}
1842 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1843 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1844 				}
1845 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1846 					printf("connectx() - failed necp_set_socket_domain_attributes\n");
1847 				}
1848 			}
1849 		}
1850 
1851 		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1852 		    (flags & CONNECT_DATA_IDEMPOTENT)) {
1853 			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1854 
1855 			if (flags & CONNECT_DATA_AUTHENTICATED) {
1856 				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1857 			}
1858 		}
1859 
1860 		/*
1861 		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1862 		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1863 		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1864 		 * Case 3 allows the user to combine write with connect even if
1865 		 * they have no use for TFO (such as regular TCP and UDP).
1866 		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1867 		 */
1868 		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1869 		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1870 			so->so_flags1 |= SOF1_PRECONNECT_DATA;
1871 		}
1872 
1873 		/*
1874 		 * If a user sets data idempotent but neither passes an uio
1875 		 * nor sets CONNECT_RESUME_ON_READ_WRITE, this is an error;
1876 		 * reset SOF1_DATA_IDEMPOTENT.
1877 		 */
1878 		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1879 		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1880 			/* We should return EINVAL instead perhaps. */
1881 			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1882 		}
1883 
1884 		/*
1885 		 * Run connect filter before calling protocol:
1886 		 *  - non-blocking connect returns before completion;
1887 		 */
1888 		error = sflt_connectout(so, dst);
1889 		if (error != 0) {
1890 			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1891 			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1892 			if (error == EJUSTRETURN) {
1893 				error = 0;
1894 			}
1895 		} else {
1896 			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1897 			    (so, src, dst, p, ifscope, aid, pcid,
1898 			    flags, arg, arglen, auio, bytes_written);
1899 			if (error != 0) {
1900 				so->so_state &= ~SS_ISCONNECTING;
1901 				if (error != EINPROGRESS) {
1902 					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1903 				}
1904 			}
1905 		}
1906 	}
1907 
1908 	return error;
1909 }
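
/*
 * Illustrative userspace sketch, not part of this file: the
 * CONNECT_RESUME_ON_READ_WRITE / CONNECT_DATA_IDEMPOTENT flags
 * consumed above are typically supplied through connectx(2), e.g.
 * for TCP Fast Open.  Setup of "dst"/"dstlen" is assumed.
 */
#include <sys/socket.h>

static int
example_tfo_connectx(int fd, const struct sockaddr *dst, socklen_t dstlen)
{
	sa_endpoints_t sae = {
		.sae_dstaddr = dst,
		.sae_dstaddrlen = dstlen,
	};

	/*
	 * Defer the SYN until the first read/write; soconnectxlocked()
	 * then sets SOF1_PRECONNECT_DATA as seen above.
	 */
	return connectx(fd, &sae, SAE_ASSOCID_ANY,
	    CONNECT_RESUME_ON_READ_WRITE | CONNECT_DATA_IDEMPOTENT,
	    NULL, 0, NULL, NULL);
}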
1910 
1911 int
1912 sodisconnectlocked(struct socket *so)
1913 {
1914 	int error;
1915 
1916 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1917 		error = ENOTCONN;
1918 		goto bad;
1919 	}
1920 	if (so->so_state & SS_ISDISCONNECTING) {
1921 		error = EALREADY;
1922 		goto bad;
1923 	}
1924 
1925 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1926 	if (error == 0) {
1927 		sflt_notify(so, sock_evt_disconnected, NULL);
1928 	}
1929 
1930 bad:
1931 	return error;
1932 }
1933 
1934 /* Locking version */
1935 int
1936 sodisconnect(struct socket *so)
1937 {
1938 	int error;
1939 
1940 	socket_lock(so, 1);
1941 	error = sodisconnectlocked(so);
1942 	socket_unlock(so, 1);
1943 	return error;
1944 }
1945 
1946 int
1947 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1948 {
1949 	int error;
1950 
1951 	/*
1952 	 * Call the protocol disconnectx handler; let it handle all
1953 	 * matters related to the connection state of this session.
1954 	 */
1955 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1956 	if (error == 0) {
1957 		/*
1958 		 * The event applies only for the session, not for
1959 		 * the disconnection of individual subflows.
1960 		 */
1961 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1962 			sflt_notify(so, sock_evt_disconnected, NULL);
1963 		}
1964 	}
1965 	return error;
1966 }
1967 
1968 int
1969 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1970 {
1971 	int error;
1972 
1973 	socket_lock(so, 1);
1974 	error = sodisconnectxlocked(so, aid, cid);
1975 	socket_unlock(so, 1);
1976 	return error;
1977 }
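
/*
 * Illustrative userspace sketch, not part of this file: the
 * sodisconnectx() path above is reached via disconnectx(2);
 * SAE_ASSOCID_ALL / SAE_CONNID_ALL tear down the whole association.
 */
#include <sys/socket.h>

static int
example_disconnectx_all(int fd)
{
	return disconnectx(fd, SAE_ASSOCID_ALL, SAE_CONNID_ALL);
}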
1978 
1979 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1980 
1981 /*
1982  * sosendcheck will lock the socket buffer if it isn't locked and
1983  * verify that there is space for the data being inserted.
1984  *
1985  * Returns:	0			Success
1986  *		EPIPE
1987  *	sblock:EWOULDBLOCK
1988  *	sblock:EINTR
1989  *	sbwait:EBADF
1990  *	sbwait:EINTR
1991  *	[so_error]:???
1992  */
1993 int
1994 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1995     int32_t clen, int32_t atomic, int flags, int *sblocked)
1996 {
1997 	int     error = 0;
1998 	int32_t space;
1999 	int     assumelock = 0;
2000 
2001 restart:
2002 	if (*sblocked == 0) {
2003 		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2004 		    so->so_send_filt_thread != 0 &&
2005 		    so->so_send_filt_thread == current_thread()) {
2006 			/*
2007 			 * We're being called recursively from a filter,
2008 			 * allow this to continue. Radar 4150520.
2009 			 * Don't set sblocked because we don't want
2010 			 * to perform an unlock later.
2011 			 */
2012 			assumelock = 1;
2013 		} else {
2014 			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2015 			if (error) {
2016 				if (so->so_flags & SOF_DEFUNCT) {
2017 					goto defunct;
2018 				}
2019 				return error;
2020 			}
2021 			*sblocked = 1;
2022 		}
2023 	}
2024 
2025 	/*
2026 	 * If a send attempt is made on a socket that has been marked
2027 	 * as inactive (disconnected), reject the request.
2028 	 */
2029 	if (so->so_flags & SOF_DEFUNCT) {
2030 defunct:
2031 		error = EPIPE;
2032 		SODEFUNCTLOG("%s[%d, %s]: defunct so %llu [%d,%d] (%d)\n",
2033 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
2034 		    so->so_gencnt,
2035 		    SOCK_DOM(so), SOCK_TYPE(so), error);
2036 		return error;
2037 	}
2038 
2039 	if (so->so_state & SS_CANTSENDMORE) {
2040 #if CONTENT_FILTER
2041 		/*
2042 		 * Can re-inject data of half-closed connections
2043 		 */
2044 		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2045 		    so->so_snd.sb_cfil_thread == current_thread() &&
2046 		    cfil_sock_data_pending(&so->so_snd) != 0) {
2047 			CFIL_LOG(LOG_INFO,
2048 			    "so %llx ignore SS_CANTSENDMORE",
2049 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2050 		} else
2051 #endif /* CONTENT_FILTER */
2052 		return EPIPE;
2053 	}
2054 	if (so->so_error) {
2055 		error = so->so_error;
2056 		so->so_error = 0;
2057 		return error;
2058 	}
2059 
2060 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2061 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2062 			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2063 			    (resid != 0 || clen == 0) &&
2064 			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2065 				return ENOTCONN;
2066 			}
2067 		} else if (addr == 0) {
2068 			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2069 			       ENOTCONN : EDESTADDRREQ;
2070 		}
2071 	}
2072 
2073 	space = sbspace(&so->so_snd);
2074 
2075 	if (flags & MSG_OOB) {
2076 		space += 1024;
2077 	}
2078 	if ((atomic && resid > so->so_snd.sb_hiwat) ||
2079 	    clen > so->so_snd.sb_hiwat) {
2080 		return EMSGSIZE;
2081 	}
2082 
2083 	if ((space < resid + clen &&
2084 	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2085 	    space < clen)) ||
2086 	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2087 		/*
2088 		 * don't block the connectx call when there's more data
2089 		 * than can be copied.
2090 		 */
2091 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2092 			if (space == 0) {
2093 				return EWOULDBLOCK;
2094 			}
2095 			if (space < (int32_t)so->so_snd.sb_lowat) {
2096 				return 0;
2097 			}
2098 		}
2099 		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2100 		    assumelock) {
2101 			return EWOULDBLOCK;
2102 		}
2103 		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2104 		*sblocked = 0;
2105 		error = sbwait(&so->so_snd);
2106 		if (error) {
2107 			if (so->so_flags & SOF_DEFUNCT) {
2108 				goto defunct;
2109 			}
2110 			return error;
2111 		}
2112 		goto restart;
2113 	}
2114 	return 0;
2115 }
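
/*
 * Illustrative userspace sketch, not part of this file: the
 * EWOULDBLOCK returns in sosendcheck() above (SS_NBIO / MSG_NBIO /
 * MSG_DONTWAIT with a full send buffer) are what a non-blocking
 * sender observes; it should wait for writability rather than spin.
 */
#include <errno.h>
#include <sys/socket.h>

static ssize_t
example_send_nonblocking(int fd, const void *buf, size_t len)
{
	ssize_t n = send(fd, buf, len, MSG_DONTWAIT);

	if (n < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)) {
		/* Send buffer full: poll/kqueue for writability, then retry */
		return 0;
	}
	return n;
}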
2116 
2117 /*
2118  * Send on a socket.
2119  * If send must go all at once and message is larger than
2120  * send buffering, then hard error.
2121  * Lock against other senders.
2122  * If must go all at once and not enough room now, then
2123  * inform user that this would block and do nothing.
2124  * Otherwise, if nonblocking, send as much as possible.
2125  * The data to be sent is described by "uio" if nonzero,
2126  * otherwise by the mbuf chain "top" (which must be null
2127  * if uio is not).  Data provided in mbuf chain must be small
2128  * enough to send all at once.
2129  *
2130  * Returns nonzero on error, timeout or signal; callers
2131  * must check for short counts if EINTR/ERESTART are returned.
2132  * Data and control buffers are freed on return.
2133  *
2134  * Returns:	0			Success
2135  *		EOPNOTSUPP
2136  *		EINVAL
2137  *		ENOBUFS
2138  *	uiomove:EFAULT
2139  *	sosendcheck:EPIPE
2140  *	sosendcheck:EWOULDBLOCK
2141  *	sosendcheck:EINTR
2142  *	sosendcheck:EBADF
2143  *	sosendcheck:EINTR
2144  *	sosendcheck:???			[value from so_error]
2145  *	<pru_send>:ECONNRESET[TCP]
2146  *	<pru_send>:EINVAL[TCP]
2147  *	<pru_send>:ENOBUFS[TCP]
2148  *	<pru_send>:EADDRINUSE[TCP]
2149  *	<pru_send>:EADDRNOTAVAIL[TCP]
2150  *	<pru_send>:EAFNOSUPPORT[TCP]
2151  *	<pru_send>:EACCES[TCP]
2152  *	<pru_send>:EAGAIN[TCP]
2153  *	<pru_send>:EPERM[TCP]
2154  *	<pru_send>:EMSGSIZE[TCP]
2155  *	<pru_send>:EHOSTUNREACH[TCP]
2156  *	<pru_send>:ENETUNREACH[TCP]
2157  *	<pru_send>:ENETDOWN[TCP]
2158  *	<pru_send>:ENOMEM[TCP]
2159  *	<pru_send>:ENOBUFS[TCP]
2160  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2161  *	<pru_send>:EINVAL[AF_UNIX]
2162  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2163  *	<pru_send>:EPIPE[AF_UNIX]
2164  *	<pru_send>:ENOTCONN[AF_UNIX]
2165  *	<pru_send>:EISCONN[AF_UNIX]
2166  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2167  *	<sf_data_out>:???		[whatever a filter author chooses]
2168  *
2169  * Notes:	Other <pru_send> returns depend on the protocol family; all
2170  *		<sf_data_out> returns depend on what the filter author causes
2171  *		their filter to return.
2172  */
2173 int
2174 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2175     struct mbuf *top, struct mbuf *control, int flags)
2176 {
2177 	struct mbuf **mp;
2178 	struct mbuf *m, *freelist = NULL;
2179 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2180 	user_ssize_t space, len, resid, orig_resid;
2181 	int clen = 0, error, dontroute, sendflags;
2182 	int atomic = sosendallatonce(so) || top;
2183 	int sblocked = 0;
2184 	struct proc *p = current_proc();
2185 	uint16_t headroom = 0;
2186 	ssize_t mlen;
2187 	boolean_t en_tracing = FALSE;
2188 
2189 	if (uio != NULL) {
2190 		resid = uio_resid(uio);
2191 	} else {
2192 		resid = top->m_pkthdr.len;
2193 	}
2194 	orig_resid = resid;
2195 
2196 	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2197 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2198 
2199 	socket_lock(so, 1);
2200 
2201 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2202 		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2203 	}
2204 
2205 	/*
2206 	 * Trace if tracing is enabled, the socket is a network (vs.
2207 	 * unix) socket, and the traffic is non-loopback.
2208 	 */
2209 	if (ENTR_SHOULDTRACE &&
2210 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2211 		struct inpcb *inp = sotoinpcb(so);
2212 		if (inp->inp_last_outifp != NULL &&
2213 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2214 			en_tracing = TRUE;
2215 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2216 			    VM_KERNEL_ADDRPERM(so),
2217 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2218 			    (int64_t)resid);
2219 		}
2220 	}
2221 
2222 	/*
2223 	 * Re-injection should not affect process accounting
2224 	 */
2225 	if ((flags & MSG_SKIPCFIL) == 0) {
2226 		so_update_last_owner_locked(so, p);
2227 		so_update_policy(so);
2228 
2229 #if NECP
2230 		so_update_necp_policy(so, NULL, addr);
2231 #endif /* NECP */
2232 	}
2233 
2234 	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2235 		error = EOPNOTSUPP;
2236 		goto out_locked;
2237 	}
2238 
2239 	/*
2240 	 * In theory resid should be unsigned.
2241 	 * However, space must be signed, as it might be less than 0
2242 	 * if we over-committed, and we must use a signed comparison
2243 	 * of space and resid.  On the other hand, a negative resid
2244 	 * causes us to loop sending 0-length segments to the protocol.
2245 	 *
2246 	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2247 	 *
2248 	 * Note: We limit resid to be a positive int value as we use
2249 	 * imin() to set bytes_to_copy -- radr://14558484
2250 	 */
2251 	if (resid < 0 || resid > INT_MAX ||
2252 	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2253 		error = EINVAL;
2254 		goto out_locked;
2255 	}
2256 
2257 	dontroute = (flags & MSG_DONTROUTE) &&
2258 	    (so->so_options & SO_DONTROUTE) == 0 &&
2259 	    (so->so_proto->pr_flags & PR_ATOMIC);
2260 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2261 
2262 	if (control != NULL) {
2263 		clen = control->m_len;
2264 	}
2265 
2266 	if (soreserveheadroom != 0) {
2267 		headroom = so->so_pktheadroom;
2268 	}
2269 
2270 	do {
2271 		error = sosendcheck(so, addr, resid, clen, atomic, flags,
2272 		    &sblocked);
2273 		if (error) {
2274 			goto out_locked;
2275 		}
2276 
2277 		mp = &top;
2278 		space = sbspace(&so->so_snd) - clen;
2279 		space += ((flags & MSG_OOB) ? 1024 : 0);
2280 
2281 		do {
2282 			if (uio == NULL) {
2283 				/*
2284 				 * Data is prepackaged in "top".
2285 				 */
2286 				resid = 0;
2287 				if (flags & MSG_EOR) {
2288 					top->m_flags |= M_EOR;
2289 				}
2290 			} else {
2291 				int chainlength;
2292 				int bytes_to_copy;
2293 				boolean_t jumbocl;
2294 				boolean_t bigcl;
2295 				int bytes_to_alloc;
2296 
2297 				bytes_to_copy = imin((int)resid, (int)space);
2298 
2299 				bytes_to_alloc = bytes_to_copy;
2300 				if (top == NULL) {
2301 					bytes_to_alloc += headroom;
2302 				}
2303 
2304 				if (sosendminchain > 0) {
2305 					chainlength = 0;
2306 				} else {
2307 					chainlength = sosendmaxchain;
2308 				}
2309 
2310 				/*
2311 				 * Use big 4 KB cluster when the outgoing interface
2312 				 * does not prefer 2 KB clusters
2313 				 */
2314 				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2315 				    sosendbigcl_ignore_capab;
2316 
2317 				/*
2318 				 * Attempt to use larger than system page-size
2319 				 * clusters for large writes only if there is
2320 				 * a jumbo cluster pool and if the socket is
2321 				 * marked accordingly.
2322 				 */
2323 				jumbocl = sosendjcl && njcl > 0 &&
2324 				    ((so->so_flags & SOF_MULTIPAGES) ||
2325 				    sosendjcl_ignore_capab) &&
2326 				    bigcl;
2327 
2328 				socket_unlock(so, 0);
2329 
2330 				do {
2331 					int num_needed;
2332 					int hdrs_needed = (top == NULL) ? 1 : 0;
2333 
2334 					/*
2335 					 * Try to maintain a local cache of
2336 					 * mbuf clusters needed to complete
2337 					 * this write.  The list is further
2338 					 * limited to the number currently
2339 					 * needed to fill the socket.  This
2340 					 * mechanism allows a large number of
2341 					 * mbufs/clusters to be grabbed under
2342 					 * a single mbuf lock.  If we can't
2343 					 * get any clusters, then fall back to
2344 					 * trying for mbufs.  If we fail early
2345 					 * (or miscalculate the number needed),
2346 					 * make sure to release any clusters
2347 					 * we haven't yet consumed.
2348 					 */
2349 					if (freelist == NULL &&
2350 					    bytes_to_alloc > MBIGCLBYTES &&
2351 					    jumbocl) {
2352 						num_needed =
2353 						    bytes_to_alloc / M16KCLBYTES;
2354 
2355 						if ((bytes_to_alloc -
2356 						    (num_needed * M16KCLBYTES))
2357 						    >= MINCLSIZE) {
2358 							num_needed++;
2359 						}
2360 
2361 						freelist =
2362 						    m_getpackets_internal(
2363 							(unsigned int *)&num_needed,
2364 							hdrs_needed, M_WAIT, 0,
2365 							M16KCLBYTES);
2366 						/*
2367 						 * Fall back to 4K cluster size
2368 						 * if allocation failed
2369 						 */
2370 					}
2371 
2372 					if (freelist == NULL &&
2373 					    bytes_to_alloc > MCLBYTES &&
2374 					    bigcl) {
2375 						num_needed =
2376 						    bytes_to_alloc / MBIGCLBYTES;
2377 
2378 						if ((bytes_to_alloc -
2379 						    (num_needed * MBIGCLBYTES)) >=
2380 						    MINCLSIZE) {
2381 							num_needed++;
2382 						}
2383 
2384 						freelist =
2385 						    m_getpackets_internal(
2386 							(unsigned int *)&num_needed,
2387 							hdrs_needed, M_WAIT, 0,
2388 							MBIGCLBYTES);
2389 						/*
2390 						 * Fall back to cluster size
2391 						 * if allocation failed
2392 						 */
2393 					}
2394 
2395 					/*
2396 					 * Allocate a cluster as we want to
2397 					 * avoid splitting the data into more
2398 					 * than one segment, as using MINCLSIZE
2399 					 * would lead us to allocate two mbufs
2400 					 */
2401 					if (soreserveheadroom != 0 &&
2402 					    freelist == NULL &&
2403 					    ((top == NULL &&
2404 					    bytes_to_alloc > _MHLEN) ||
2405 					    bytes_to_alloc > _MLEN)) {
2406 						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2407 						    MCLBYTES;
2408 						freelist =
2409 						    m_getpackets_internal(
2410 							(unsigned int *)&num_needed,
2411 							hdrs_needed, M_WAIT, 0,
2412 							MCLBYTES);
2413 						/*
2414 						 * Fall back to a single mbuf
2415 						 * if allocation failed
2416 						 */
2417 					} else if (freelist == NULL &&
2418 					    bytes_to_alloc > MINCLSIZE) {
2419 						num_needed =
2420 						    bytes_to_alloc / MCLBYTES;
2421 
2422 						if ((bytes_to_alloc -
2423 						    (num_needed * MCLBYTES)) >=
2424 						    MINCLSIZE) {
2425 							num_needed++;
2426 						}
2427 
2428 						freelist =
2429 						    m_getpackets_internal(
2430 							(unsigned int *)&num_needed,
2431 							hdrs_needed, M_WAIT, 0,
2432 							MCLBYTES);
2433 						/*
2434 						 * Fall back to a single mbuf
2435 						 * if allocation failed
2436 						 */
2437 					}
2438 					/*
2439 					 * For datagram protocols, leave
2440 					 * headroom for protocol headers
2441 					 * in the first cluster of the chain
2442 					 */
2443 					if (freelist != NULL && atomic &&
2444 					    top == NULL && headroom > 0) {
2445 						freelist->m_data += headroom;
2446 					}
2447 
2448 					/*
2449 					 * Fall back to regular mbufs without
2450 					 * reserving the socket headroom
2451 					 */
2452 					if (freelist == NULL) {
2453 						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2454 							if (top == NULL) {
2455 								MGETHDR(freelist,
2456 								    M_WAIT, MT_DATA);
2457 							} else {
2458 								MGET(freelist,
2459 								    M_WAIT, MT_DATA);
2460 							}
2461 						}
2462 
2463 						if (freelist == NULL) {
2464 							error = ENOBUFS;
2465 							socket_lock(so, 0);
2466 							goto out_locked;
2467 						}
2468 						/*
2469 						 * For datagram protocols,
2470 						 * leave room for protocol
2471 						 * headers in first mbuf.
2472 						 */
2473 						if (atomic && top == NULL &&
2474 						    bytes_to_copy > 0 &&
2475 						    bytes_to_copy < MHLEN) {
2476 							MH_ALIGN(freelist,
2477 							    bytes_to_copy);
2478 						}
2479 					}
2480 					m = freelist;
2481 					freelist = m->m_next;
2482 					m->m_next = NULL;
2483 
2484 					if ((m->m_flags & M_EXT)) {
2485 						mlen = m->m_ext.ext_size -
2486 						    M_LEADINGSPACE(m);
2487 					} else if ((m->m_flags & M_PKTHDR)) {
2488 						mlen = MHLEN - M_LEADINGSPACE(m);
2489 						m_add_crumb(m, PKT_CRUMB_SOSEND);
2490 					} else {
2491 						mlen = MLEN - M_LEADINGSPACE(m);
2492 					}
2493 					len = imin((int)mlen, bytes_to_copy);
2494 
2495 					chainlength += len;
2496 
2497 					space -= len;
2498 
2499 					error = uiomove(mtod(m, caddr_t),
2500 					    (int)len, uio);
2501 
2502 					resid = uio_resid(uio);
2503 
2504 					m->m_len = (int32_t)len;
2505 					*mp = m;
2506 					top->m_pkthdr.len += len;
2507 					if (error) {
2508 						break;
2509 					}
2510 					mp = &m->m_next;
2511 					if (resid <= 0) {
2512 						if (flags & MSG_EOR) {
2513 							top->m_flags |= M_EOR;
2514 						}
2515 						break;
2516 					}
2517 					bytes_to_copy = imin((int)resid, (int)space);
2518 				} while (space > 0 &&
2519 				    (chainlength < sosendmaxchain || atomic ||
2520 				    resid < MINCLSIZE));
2521 
2522 				socket_lock(so, 0);
2523 
2524 				if (error) {
2525 					goto out_locked;
2526 				}
2527 			}
2528 
2529 			if (dontroute) {
2530 				so->so_options |= SO_DONTROUTE;
2531 			}
2532 
2533 			/*
2534 			 * Compute flags here, for pru_send and NKEs
2535 			 *
2536 			 * If the user set MSG_EOF, the protocol
2537 			 * understands this flag, and there is nothing left
2538 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2539 			 */
2540 			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2541 			    ((flags & MSG_EOF) &&
2542 			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2543 			    (resid <= 0)) ? PRUS_EOF :
2544 			    /* If there is more to send set PRUS_MORETOCOME */
2545 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2546 
2547 			if ((flags & MSG_SKIPCFIL) == 0) {
2548 				/*
2549 				 * Socket filter processing
2550 				 */
2551 				error = sflt_data_out(so, addr, &top,
2552 				    &control, (sendflags & MSG_OOB) ?
2553 				    sock_data_filt_flag_oob : 0);
2554 				if (error) {
2555 					if (error == EJUSTRETURN) {
2556 						error = 0;
2557 						goto packet_consumed;
2558 					}
2559 					goto out_locked;
2560 				}
2561 #if CONTENT_FILTER
2562 				/*
2563 				 * Content filter processing
2564 				 */
2565 				error = cfil_sock_data_out(so, addr, top,
2566 				    control, sendflags, dgram_flow_entry);
2567 				if (error) {
2568 					if (error == EJUSTRETURN) {
2569 						error = 0;
2570 						goto packet_consumed;
2571 					}
2572 					goto out_locked;
2573 				}
2574 #endif /* CONTENT_FILTER */
2575 			}
2576 			error = (*so->so_proto->pr_usrreqs->pru_send)
2577 			    (so, sendflags, top, addr, control, p);
2578 
2579 packet_consumed:
2580 			if (dontroute) {
2581 				so->so_options &= ~SO_DONTROUTE;
2582 			}
2583 
2584 			clen = 0;
2585 			control = NULL;
2586 			top = NULL;
2587 			mp = &top;
2588 			if (error) {
2589 				goto out_locked;
2590 			}
2591 		} while (resid && space > 0);
2592 	} while (resid);
2593 
2594 
2595 out_locked:
2596 	if (resid > orig_resid) {
2597 		char pname[MAXCOMLEN] = {};
2598 		pid_t current_pid = proc_pid(current_proc());
2599 		proc_name(current_pid, pname, sizeof(pname));
2600 
2601 		if (sosend_assert_panic != 0) {
2602 			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2603 			    so, resid, orig_resid, pname, current_pid);
2604 		} else {
2605 			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2606 			    so->so_gencnt, resid, orig_resid, pname, current_pid);
2607 		}
2608 	}
2609 
2610 	if (sblocked) {
2611 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2612 	} else {
2613 		socket_unlock(so, 1);
2614 	}
2615 	if (top != NULL) {
2616 		m_freem(top);
2617 	}
2618 	if (control != NULL) {
2619 		m_freem(control);
2620 	}
2621 	if (freelist != NULL) {
2622 		m_freem_list(freelist);
2623 	}
2624 
2625 	if (dgram_flow_entry != NULL) {
2626 		soflow_free_flow(dgram_flow_entry);
2627 	}
2628 
2629 	soclearfastopen(so);
2630 
2631 	if (en_tracing) {
2632 		/* resid passed here is the bytes left in uio */
2633 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2634 		    VM_KERNEL_ADDRPERM(so),
2635 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2636 		    (int64_t)(orig_resid - resid));
2637 	}
2638 	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2639 	    so->so_snd.sb_cc, space, error);
2640 
2641 	return error;
2642 }
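
/*
 * Illustrative sketch, not part of this file: in-kernel clients
 * normally reach sosend() through the socket KPI (kpi_socket.h),
 * e.g. sock_send(), rather than calling it directly.  The buffer
 * and socket are assumed valid; error handling is elided.
 */
static errno_t
example_kpi_send(socket_t so, void *buf, size_t len, size_t *sentlen)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };

	return sock_send(so, &msg, 0, sentlen);
}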
2643 
2644 int
2645 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2646 {
2647 	struct mbuf *m0 = NULL, *control_end = NULL;
2648 
2649 	socket_lock_assert_owned(so);
2650 
2651 	/*
2652 	 * top must point to the mbuf chain to be sent.
2653 	 * If control is not NULL, top must be a packet header.
2654 	 */
2655 	VERIFY(top != NULL &&
2656 	    (control == NULL || top->m_flags & M_PKTHDR));
2657 
2658 	/*
2659 	 * If control is not passed in, see if we can get it
2660 	 * from top.
2661 	 */
2662 	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2663 		// Locate start of control if present and start of data
2664 		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2665 			if (m0->m_flags & M_PKTHDR) {
2666 				top = m0;
2667 				break;
2668 			} else if (m0->m_type == MT_CONTROL) {
2669 				if (control == NULL) {
2670 					// Found start of control
2671 					control = m0;
2672 				}
2673 				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2674 					// Found end of control
2675 					control_end = m0;
2676 				}
2677 			}
2678 		}
2679 		if (control_end != NULL) {
2680 			control_end->m_next = NULL;
2681 		}
2682 	}
2683 
2684 	int error = (*so->so_proto->pr_usrreqs->pru_send)
2685 	    (so, sendflags, top, addr, control, current_proc());
2686 
2687 	return error;
2688 }
2689 
2690 static struct mbuf *
2691 mbuf_detach_control_from_list(struct mbuf **mp)
2692 {
2693 	struct mbuf *control = NULL;
2694 	struct mbuf *m = *mp;
2695 
2696 	if (m->m_type == MT_CONTROL) {
2697 		struct mbuf *control_end;
2698 		struct mbuf *n;
2699 
2700 		n = control_end = control = m;
2701 
2702 		/*
2703 		 * Break the chain per mbuf type
2704 		 */
2705 		while (n != NULL && n->m_type == MT_CONTROL) {
2706 			control_end = n;
2707 			n = n->m_next;
2708 		}
2709 		control_end->m_next = NULL;
2710 		*mp = n;
2711 	}
2712 	VERIFY(*mp != NULL);
2713 
2714 	return control;
2715 }
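
/*
 * Illustrative sketch, not part of this file: given a packet laid
 * out as CONTROL -> CONTROL -> DATA, the helper above splits it so
 * the caller receives the control chain while *mp advances to the
 * first data mbuf:
 *
 *	struct mbuf *pkt = ...;	// CONTROL -> CONTROL -> DATA
 *	struct mbuf *ctl = mbuf_detach_control_from_list(&pkt);
 *	// ctl: CONTROL -> CONTROL (tail's m_next is NULL)
 *	// pkt: DATA
 */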
2716 
2717 /*
2718  * Supports only connected sockets (no address) without ancillary data
2719  * (control mbuf), for atomic protocols.
2720  */
2721 int
2722 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2723 {
2724 	struct mbuf *m;
2725 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2726 	int error, dontroute;
2727 	int atomic = sosendallatonce(so);
2728 	int sblocked = 0;
2729 	struct proc *p = current_proc();
2730 	struct mbuf *top = pktlist;
2731 	bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2732 
2733 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, total_len,
2734 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2735 
2736 	if (so->so_type != SOCK_DGRAM) {
2737 		error = EINVAL;
2738 		os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2739 		    error);
2740 		goto out;
2741 	}
2742 	if (atomic == 0) {
2743 		error = EINVAL;
2744 		os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2745 		    error);
2746 		goto out;
2747 	}
2748 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2749 		error = ENOTCONN;
2750 		os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2751 		    error);
2752 		goto out;
2753 	}
2754 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2755 		error = EINVAL;
2756 		os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2757 		    flags, error);
2758 		goto out;
2759 	}
2760 
2761 	socket_lock(so, 1);
2762 	so_update_last_owner_locked(so, p);
2763 	so_update_policy(so);
2764 
2765 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2766 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0);
2767 	}
2768 
2769 #if NECP
2770 	so_update_necp_policy(so, NULL, NULL);
2771 #endif /* NECP */
2772 
2773 	dontroute = (flags & MSG_DONTROUTE) &&
2774 	    (so->so_options & SO_DONTROUTE) == 0 &&
2775 	    (so->so_proto->pr_flags & PR_ATOMIC);
2776 	if (dontroute) {
2777 		so->so_options |= SO_DONTROUTE;
2778 	}
2779 
2780 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2781 
2782 	error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2783 	if (error) {
2784 		os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2785 		    error);
2786 		goto release;
2787 	}
2788 
2789 	if (!skip_filt) {
2790 		struct mbuf **prevnextp = NULL;
2791 
2792 		for (m = top; m != NULL; m = m->m_nextpkt) {
2793 			struct mbuf *control = NULL;
2794 			struct mbuf *last_control = NULL;
2795 			struct mbuf *nextpkt;
2796 
2797 			/*
2798 			 * Remove packet from the list of packets
2799 			 */
2800 			nextpkt = m->m_nextpkt;
2801 			if (prevnextp != NULL) {
2802 				*prevnextp = nextpkt;
2803 			} else {
2804 				top = nextpkt;
2805 			}
2806 			m->m_nextpkt = NULL;
2807 
2808 			/*
2809 			 * Break the chain per mbuf type
2810 			 */
2811 			if (m->m_type == MT_CONTROL) {
2812 				control = mbuf_detach_control_from_list(&m);
2813 			}
2814 			/*
2815 			 * Socket filter processing
2816 			 */
2817 			error = sflt_data_out(so, NULL, &m,
2818 			    &control, 0);
2819 			if (error != 0 && error != EJUSTRETURN) {
2820 				os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2821 				    error);
2822 				goto release;
2823 			}
2824 
2825 #if CONTENT_FILTER
2826 			if (error == 0) {
2827 				/*
2828 				 * Content filter processing
2829 				 */
2830 				error = cfil_sock_data_out(so, NULL, m,
2831 				    control, 0, dgram_flow_entry);
2832 				if (error != 0 && error != EJUSTRETURN) {
2833 					os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2834 					    error);
2835 					goto release;
2836 				}
2837 			}
2838 #endif /* CONTENT_FILTER */
2839 			if (error == EJUSTRETURN) {
2840 				/*
2841 				 * When swallowed by a filter, the packet is not
2842 				 * in the list anymore
2843 				 */
2844 				error = 0;
2845 			} else {
2846 				/*
2847 				/* Rebuild the mbuf chain of the packet:
2848 				 * link the control tail to the data. */
2849 				if (control != NULL) {
2850 					last_control = m_last(control);
2851 					last_control->m_next = m;
2852 					m = control;
2853 				}
2854 				 * Reinsert the packet in the list of packets
2855 				 */
2856 				m->m_nextpkt = nextpkt;
2857 				if (prevnextp != NULL) {
2858 					*prevnextp = m;
2859 				} else {
2860 					top = m;
2861 				}
2862 				prevnextp = &m->m_nextpkt;
2863 			}
2864 		}
2865 	}
2866 
2867 	if (top != NULL) {
2868 		if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2869 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2870 			    (so, top, pktcnt, flags);
2871 			if (error != 0) {
2872 				os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2873 				    error);
2874 			}
2875 			top = NULL;
2876 		} else {
2877 			*pktcnt = 0;
2878 			for (m = top; m != NULL; m = top) {
2879 				struct mbuf *control = NULL;
2880 
2881 				top = m->m_nextpkt;
2882 				m->m_nextpkt = NULL;
2883 
2884 				/*
2885 				 * Break the chain per mbuf type
2886 				 */
2887 				if (m->m_type == MT_CONTROL) {
2888 					control = mbuf_detach_control_from_list(&m);
2889 				}
2890 
2891 				error = (*so->so_proto->pr_usrreqs->pru_send)
2892 				    (so, 0, m, NULL, control, current_proc());
2893 				if (error != 0) {
2894 					os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2895 					    error);
2896 					goto release;
2897 				}
2898 				*pktcnt += 1;
2899 			}
2900 		}
2901 	}
2902 
2903 release:
2904 	if (dontroute) {
2905 		so->so_options &= ~SO_DONTROUTE;
2906 	}
2907 	if (sblocked) {
2908 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2909 	} else {
2910 		socket_unlock(so, 1);
2911 	}
2912 out:
2913 	if (top != NULL) {
2914 		os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2915 		    error);
2916 		m_freem_list(top);
2917 	}
2918 
2919 	if (dgram_flow_entry != NULL) {
2920 		soflow_free_flow(dgram_flow_entry);
2921 	}
2922 
2923 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, total_len,
2924 	    so->so_snd.sb_cc, 0, error);
2925 
2926 	return error;
2927 }
2928 
2929 /*
2930  * May return ERESTART when packet is dropped by MAC policy check
2931  */
2932 static int
2933 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2934     struct mbuf **maddrp,
2935     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2936 {
2937 	int error = 0;
2938 	struct mbuf *m = *mp;
2939 	struct mbuf *nextrecord = *nextrecordp;
2940 
2941 	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2942 #if CONFIG_MACF_SOCKET_SUBSET
2943 	/*
2944 	 * Call the MAC framework for policy checking if we're in
2945 	 * the user process context and the socket isn't connected.
2946 	 */
2947 	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2948 		struct mbuf *m0 = m;
2949 		/*
2950 		 * Dequeue this record (temporarily) from the receive
2951 		 * list since we're about to drop the socket's lock
2952 		 * where a new record may arrive and be appended to
2953 		 * the list.  Upon MAC policy failure, the record
2954 		 * will be freed.  Otherwise, we'll add it back to
2955 		 * the head of the list.  We cannot rely on SB_LOCK
2956 		 * because the append operation uses the socket's lock.
2957 		 */
2958 		do {
2959 			m->m_nextpkt = NULL;
2960 			sbfree(&so->so_rcv, m);
2961 			m = m->m_next;
2962 		} while (m != NULL);
2963 		m = m0;
2964 		so->so_rcv.sb_mb = nextrecord;
2965 		SB_EMPTY_FIXUP(&so->so_rcv);
2966 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2967 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2968 		socket_unlock(so, 0);
2969 
2970 		error = mac_socket_check_received(kauth_cred_get(), so,
2971 		    mtod(m, struct sockaddr *));
2972 
2973 		if (error != 0) {
2974 			/*
2975 			 * MAC policy failure; free this record and
2976 			 * process the next record (or block until
2977 			 * one is available).  We have adjusted sb_cc
2978 			 * and sb_mbcnt above so there is no need to
2979 			 * call sbfree() again.
2980 			 */
2981 			m_freem(m);
2982 			/*
2983 			 * Clear SB_LOCK but don't unlock the socket.
2984 			 * Process the next record or wait for one.
2985 			 */
2986 			socket_lock(so, 0);
2987 			sbunlock(&so->so_rcv, TRUE); /* stay locked */
2988 			error = ERESTART;
2989 			goto done;
2990 		}
2991 		socket_lock(so, 0);
2992 		/*
2993 		 * If the socket has been defunct'd, drop it.
2994 		 */
2995 		if (so->so_flags & SOF_DEFUNCT) {
2996 			m_freem(m);
2997 			error = ENOTCONN;
2998 			goto done;
2999 		}
3000 		/*
3001 		 * Re-adjust the socket receive list and re-enqueue
3002 		 * the record in front of any packets which may have
3003 		 * been appended while we dropped the lock.
3004 		 */
3005 		for (m = m0; m->m_next != NULL; m = m->m_next) {
3006 			sballoc(&so->so_rcv, m);
3007 		}
3008 		sballoc(&so->so_rcv, m);
3009 		if (so->so_rcv.sb_mb == NULL) {
3010 			so->so_rcv.sb_lastrecord = m0;
3011 			so->so_rcv.sb_mbtail = m;
3012 		}
3013 		m = m0;
3014 		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3015 		so->so_rcv.sb_mb = m;
3016 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3017 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3018 	}
3019 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3020 	if (psa != NULL) {
3021 		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3022 		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3023 			error = EWOULDBLOCK;
3024 			goto done;
3025 		}
3026 	} else if (maddrp != NULL) {
3027 		*maddrp = m;
3028 	}
3029 	if (flags & MSG_PEEK) {
3030 		m = m->m_next;
3031 	} else {
3032 		sbfree(&so->so_rcv, m);
3033 		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3034 			panic("%s: about to create invalid socketbuf",
3035 			    __func__);
3036 			/* NOTREACHED */
3037 		}
3038 		if (maddrp == NULL) {
3039 			MFREE(m, so->so_rcv.sb_mb);
3040 		} else {
3041 			so->so_rcv.sb_mb = m->m_next;
3042 			m->m_next = NULL;
3043 		}
3044 		m = so->so_rcv.sb_mb;
3045 		if (m != NULL) {
3046 			m->m_nextpkt = nextrecord;
3047 		} else {
3048 			so->so_rcv.sb_mb = nextrecord;
3049 			SB_EMPTY_FIXUP(&so->so_rcv);
3050 		}
3051 	}
3052 done:
3053 	*mp = m;
3054 	*nextrecordp = nextrecord;
3055 
3056 	return error;
3057 }
3058 
3059 /*
3060  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3061  * so clear the data portion in order not to leak the file pointers.
3062  */
3063 static void
3064 sopeek_scm_rights(struct mbuf *rights)
3065 {
3066 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3067 
3068 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3069 		VERIFY(cm->cmsg_len <= rights->m_len);
3070 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3071 	}
3072 }
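
/*
 * Illustrative userspace sketch, not part of this file: because
 * sopeek_scm_rights() above zeroes the payload, a receiver peeking
 * an SCM_RIGHTS message on an AF_UNIX socket can detect pending
 * rights without seeing (or consuming) the descriptor values.
 */
#include <sys/socket.h>

static int
example_peek_rights(int fd)
{
	char data[1];
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_PEEK) < 0) {
		return -1;
	}
	cm = CMSG_FIRSTHDR(&msg);
	/* CMSG_DATA(cm) reads as zeroes until received without MSG_PEEK */
	return (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
	       cm->cmsg_type == SCM_RIGHTS) ? 1 : 0;
}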
3073 
3074 /*
3075  * Process one or more MT_CONTROL mbufs present before any data mbufs
3076  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3077  * just copy the data; if !MSG_PEEK, we call into the protocol to
3078  * perform externalization.
3079  */
3080 static int
3081 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3082     struct mbuf **mp, struct mbuf **nextrecordp)
3083 {
3084 	int error = 0;
3085 	struct mbuf *cm = NULL, *cmn;
3086 	struct mbuf **cme = &cm;
3087 	struct sockbuf *sb_rcv = &so->so_rcv;
3088 	struct mbuf **msgpcm = NULL;
3089 	struct mbuf *m = *mp;
3090 	struct mbuf *nextrecord = *nextrecordp;
3091 	struct protosw *pr = so->so_proto;
3092 
3093 	/*
3094 	 * Externalizing the control messages would require us to
3095 	 * drop the socket's lock below.  Once we re-acquire the
3096 	 * lock, the mbuf chain might change.  In order to preserve
3097 	 * consistency, we unlink all control messages from the
3098 	 * first mbuf chain in one shot and link them separately
3099 	 * onto a different chain.
3100 	 */
3101 	do {
3102 		if (flags & MSG_PEEK) {
3103 			if (controlp != NULL) {
3104 				if (*controlp == NULL) {
3105 					msgpcm = controlp;
3106 				}
3107 				*controlp = m_copy(m, 0, m->m_len);
3108 
3109 				/*
3110 				 * If we failed to allocate an mbuf,
3111 				 * release any previously allocated
3112 				 * mbufs for control data. Return
3113 				 * an error. Keep the mbufs in the
3114 				 * socket as this is using
3115 				 * the MSG_PEEK flag.
3116 				 */
3117 				if (*controlp == NULL) {
3118 					m_freem(*msgpcm);
3119 					error = ENOBUFS;
3120 					goto done;
3121 				}
3122 
3123 				if (pr->pr_domain->dom_externalize != NULL) {
3124 					sopeek_scm_rights(*controlp);
3125 				}
3126 
3127 				controlp = &(*controlp)->m_next;
3128 			}
3129 			m = m->m_next;
3130 		} else {
3131 			m->m_nextpkt = NULL;
3132 			sbfree(sb_rcv, m);
3133 			sb_rcv->sb_mb = m->m_next;
3134 			m->m_next = NULL;
3135 			*cme = m;
3136 			cme = &(*cme)->m_next;
3137 			m = sb_rcv->sb_mb;
3138 		}
3139 	} while (m != NULL && m->m_type == MT_CONTROL);
3140 
3141 	if (!(flags & MSG_PEEK)) {
3142 		if (sb_rcv->sb_mb != NULL) {
3143 			sb_rcv->sb_mb->m_nextpkt = nextrecord;
3144 		} else {
3145 			sb_rcv->sb_mb = nextrecord;
3146 			SB_EMPTY_FIXUP(sb_rcv);
3147 		}
3148 		if (nextrecord == NULL) {
3149 			sb_rcv->sb_lastrecord = m;
3150 		}
3151 	}
3152 
3153 	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3154 	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3155 
3156 	while (cm != NULL) {
3157 		int cmsg_level;
3158 		int cmsg_type;
3159 
3160 		cmn = cm->m_next;
3161 		cm->m_next = NULL;
3162 		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3163 		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3164 
3165 		/*
3166 		 * Call the protocol to externalize SCM_RIGHTS message
3167 		 * and return the modified message to the caller upon
3168 		 * success.  Otherwise, all other control messages are
3169 		 * returned unmodified to the caller.  Note that we
3170 		 * only get into this loop if MSG_PEEK is not set.
3171 		 */
3172 		if (pr->pr_domain->dom_externalize != NULL &&
3173 		    cmsg_level == SOL_SOCKET &&
3174 		    cmsg_type == SCM_RIGHTS) {
3175 			/*
3176 			 * Release socket lock: see 3903171.  This
3177 			 * would also allow more records to be appended
3178 			 * to the socket buffer.  We still have SB_LOCK
3179 			 * set on it, so we can be sure that the head
3180 			 * of the mbuf chain won't change.
3181 			 */
3182 			socket_unlock(so, 0);
3183 			error = (*pr->pr_domain->dom_externalize)(cm);
3184 			socket_lock(so, 0);
3185 		} else {
3186 			error = 0;
3187 		}
3188 
3189 		if (controlp != NULL && error == 0) {
3190 			*controlp = cm;
3191 			controlp = &(*controlp)->m_next;
3192 		} else {
3193 			(void) m_free(cm);
3194 		}
3195 		cm = cmn;
3196 	}
3197 	/*
3198 	 * Update the value of nextrecord in case we received new
3199 	 * records when the socket was unlocked above for
3200 	 * externalizing SCM_RIGHTS.
3201 	 */
3202 	if (m != NULL) {
3203 		nextrecord = sb_rcv->sb_mb->m_nextpkt;
3204 	} else {
3205 		nextrecord = sb_rcv->sb_mb;
3206 	}
3207 
3208 done:
3209 	*mp = m;
3210 	*nextrecordp = nextrecord;
3211 
3212 	return error;
3213 }
3214 
3215 /*
3216  * If we have less data than requested, block awaiting more
3217  * (subject to any timeout) if:
3218  *   1. the current count is less than the low water mark, or
3219  *   2. MSG_WAITALL is set, and it is possible to do the entire
3220  *	receive operation at once if we block (resid <= hiwat).
3221  *   3. MSG_DONTWAIT is not set
3222  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3223  * we have to do the receive in sections, and thus risk returning
3224  * a short count if a timeout or signal occurs after we start.
3225  */
3226 static boolean_t
3227 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3228 {
3229 	struct protosw *pr = so->so_proto;
3230 
3231 	/* No mbufs in the receive-queue? Wait! */
3232 	if (m == NULL) {
3233 		return true;
3234 	}
3235 
3236 	/* Not enough data in the receive socket-buffer - we may have to wait */
3237 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3238 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3239 		/*
3240 		 * Application did set the low-water mark, so we should wait for
3241 		 * this data to be present.
3242 		 */
3243 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3244 			return true;
3245 		}
3246 
3247 		/*
3248 		 * Application wants all the data - so let's try to do the
3249 		 * receive-operation at once by waiting for everything to
3250 		 * be there.
3251 		 */
3252 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3253 			return true;
3254 		}
3255 	}
3256 
3257 	return false;
3258 }
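
/*
 * Illustrative userspace sketch, not part of this file: the two
 * wait conditions evaluated by so_should_wait() above correspond to
 * SO_RCVLOWAT (condition 1) and MSG_WAITALL (condition 2) as seen
 * from user space.
 */
#include <sys/socket.h>

static ssize_t
example_lowat_recv(int fd, void *buf, size_t len)
{
	int lowat = 128;

	/* Block until at least 128 bytes are queued (condition 1) */
	(void)setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));

	/* Or request the full len in one call (condition 2) */
	return recv(fd, buf, len, MSG_WAITALL);
}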
3259 
3260 /*
3261  * Implement receive operations on a socket.
3262  * We depend on the way that records are added to the sockbuf
3263  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3264  * must begin with an address if the protocol so specifies,
3265  * followed by an optional mbuf or mbufs containing ancillary data,
3266  * and then zero or more mbufs of data.
3267  * In order to avoid blocking network interrupts for the entire time here,
3268  * we splx() while doing the actual copy to user space.
3269  * Although the sockbuf is locked, new data may still be appended,
3270  * and thus we must maintain consistency of the sockbuf during that time.
3271  *
3272  * The caller may receive the data as a single mbuf chain by supplying
3273  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3274  * only for the count in uio_resid.
3275  *
3276  * Returns:	0			Success
3277  *		ENOBUFS
3278  *		ENOTCONN
3279  *		EWOULDBLOCK
3280  *	uiomove:EFAULT
3281  *	sblock:EWOULDBLOCK
3282  *	sblock:EINTR
3283  *	sbwait:EBADF
3284  *	sbwait:EINTR
3285  *	sodelayed_copy:EFAULT
3286  *	<pru_rcvoob>:EINVAL[TCP]
3287  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3288  *	<pru_rcvoob>:???
3289  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3290  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3291  *	<pr_domain->dom_externalize>:???
3292  *
3293  * Notes:	Additional return values from calls through <pru_rcvoob> and
3294  *		<pr_domain->dom_externalize> depend on protocols other than
3295  *		TCP or AF_UNIX, which are documented above.
3296  */
3297 int
3298 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3299     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3300 {
3301 	struct mbuf *m, **mp, *ml = NULL;
3302 	struct mbuf *nextrecord, *free_list;
3303 	int flags, error, offset;
3304 	user_ssize_t len;
3305 	struct protosw *pr = so->so_proto;
3306 	int moff, type = 0;
3307 	user_ssize_t orig_resid = uio_resid(uio);
3308 	user_ssize_t delayed_copy_len;
3309 	int can_delay;
3310 	struct proc *p = current_proc();
3311 	boolean_t en_tracing = FALSE;
3312 
3313 	/*
3314 	 * Sanity check on the length passed by caller as we are making 'int'
3315 	 * comparisons
3316 	 */
3317 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3318 		return EINVAL;
3319 	}
3320 
3321 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3322 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3323 	    so->so_rcv.sb_hiwat);
3324 
3325 	socket_lock(so, 1);
3326 	so_update_last_owner_locked(so, p);
3327 	so_update_policy(so);
3328 
3329 #ifdef MORE_LOCKING_DEBUG
3330 	if (so->so_usecount == 1) {
3331 		panic("%s: so=%x no other reference on socket", __func__, so);
3332 		/* NOTREACHED */
3333 	}
3334 #endif
3335 	mp = mp0;
3336 	if (psa != NULL) {
3337 		*psa = NULL;
3338 	}
3339 	if (controlp != NULL) {
3340 		*controlp = NULL;
3341 	}
3342 	if (flagsp != NULL) {
3343 		flags = *flagsp & ~MSG_EOR;
3344 	} else {
3345 		flags = 0;
3346 	}
3347 
3348 	/*
3349 	 * If a recv attempt is made on a previously-accepted socket
3350 	 * that has been marked as inactive (disconnected), reject
3351 	 * the request.
3352 	 */
3353 	if (so->so_flags & SOF_DEFUNCT) {
3354 		struct sockbuf *sb = &so->so_rcv;
3355 
3356 		error = ENOTCONN;
3357 		SODEFUNCTLOG("%s[%d, %s]: defunct so %llu [%d,%d] (%d)\n",
3358 		    __func__, proc_pid(p), proc_best_name(p),
3359 		    so->so_gencnt,
3360 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3361 		/*
3362 		 * This socket should have been disconnected and flushed
3363 		 * prior to being returned from sodefunct(); there should
3364 		 * be no data on its receive list, so panic otherwise.
3365 		 */
3366 		if (so->so_state & SS_DEFUNCT) {
3367 			sb_empty_assert(sb, __func__);
3368 		}
3369 		socket_unlock(so, 1);
3370 		return error;
3371 	}
3372 
3373 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3374 	    pr->pr_usrreqs->pru_preconnect) {
3375 		/*
3376 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3377 		 * calling write() right after this. *If* the app calls a read
3378 	 * we do not want to block this read indefinitely. Thus,
3379 		 * we trigger a connect so that the session gets initiated.
3380 		 */
3381 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3382 
3383 		if (error) {
3384 			socket_unlock(so, 1);
3385 			return error;
3386 		}
3387 	}
3388 
3389 	if (ENTR_SHOULDTRACE &&
3390 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3391 		/*
3392 		 * enable energy tracing for inet sockets that go over
3393 		 * non-loopback interfaces only.
3394 		 */
3395 		struct inpcb *inp = sotoinpcb(so);
3396 		if (inp->inp_last_outifp != NULL &&
3397 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3398 			en_tracing = TRUE;
3399 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3400 			    VM_KERNEL_ADDRPERM(so),
3401 			    ((so->so_state & SS_NBIO) ?
3402 			    kEnTrFlagNonBlocking : 0),
3403 			    (int64_t)orig_resid);
3404 		}
3405 	}
3406 
3407 	/*
3408 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3409 	 * regardless of the flags argument. Here is the case where
3410 	 * out-of-band data is not inline.
3411 	 */
3412 	if ((flags & MSG_OOB) ||
3413 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3414 	    (so->so_options & SO_OOBINLINE) == 0 &&
3415 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3416 		m = m_get(M_WAIT, MT_DATA);
3417 		if (m == NULL) {
3418 			socket_unlock(so, 1);
3419 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3420 			    ENOBUFS, 0, 0, 0, 0);
3421 			return ENOBUFS;
3422 		}
3423 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3424 		if (error) {
3425 			goto bad;
3426 		}
3427 		socket_unlock(so, 0);
3428 		do {
3429 			error = uiomove(mtod(m, caddr_t),
3430 			    imin((int)uio_resid(uio), m->m_len), uio);
3431 			m = m_free(m);
3432 		} while (uio_resid(uio) && error == 0 && m != NULL);
3433 		socket_lock(so, 0);
3434 bad:
3435 		if (m != NULL) {
3436 			m_freem(m);
3437 		}
3438 
3439 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3440 			if (error == EWOULDBLOCK || error == EINVAL) {
3441 				/*
3442 				 * Let's try to get normal data:
3443 				 * EWOULDBLOCK: out-of-band data not
3444 				 * received yet. EINVAL: out-of-band data
3445 				 * already read.
3446 				 */
3447 				error = 0;
3448 				goto nooob;
3449 			} else if (error == 0 && flagsp != NULL) {
3450 				*flagsp |= MSG_OOB;
3451 			}
3452 		}
3453 		socket_unlock(so, 1);
3454 		if (en_tracing) {
3455 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3456 			    VM_KERNEL_ADDRPERM(so), 0,
3457 			    (int64_t)(orig_resid - uio_resid(uio)));
3458 		}
3459 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3460 		    0, 0, 0, 0);
3461 
3462 		return error;
3463 	}
3464 nooob:
3465 	if (mp != NULL) {
3466 		*mp = NULL;
3467 	}
3468 
3469 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3470 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3471 	}
3472 
3473 	free_list = NULL;
3474 	delayed_copy_len = 0;
3475 restart:
3476 #ifdef MORE_LOCKING_DEBUG
3477 	if (so->so_usecount <= 1) {
3478 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3479 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3480 	}
3481 #endif
3482 	/*
3483 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3484 	 * and if so just return to the caller.  This could happen when
3485 	 * soreceive() is called by a socket upcall function during the
3486 	 * time the socket is freed.  The socket buffer would have been
3487 	 * locked across the upcall, therefore we cannot put this thread
3488 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3489 	 * we may livelock), because the lock on the socket buffer will
3490 	 * only be released when the upcall routine returns to its caller.
3491 	 * Because the socket has been officially closed, there can be
3492 	 * no further read on it.
3493 	 *
3494 	 * A multipath subflow socket would have its SS_NOFDREF set by
3495 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3496 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3497 	 */
3498 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3499 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3500 		socket_unlock(so, 1);
3501 		return 0;
3502 	}
3503 
3504 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3505 	if (error) {
3506 		socket_unlock(so, 1);
3507 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3508 		    0, 0, 0, 0);
3509 		if (en_tracing) {
3510 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3511 			    VM_KERNEL_ADDRPERM(so), 0,
3512 			    (int64_t)(orig_resid - uio_resid(uio)));
3513 		}
3514 		return error;
3515 	}
3516 
3517 	m = so->so_rcv.sb_mb;
3518 	if (so_should_wait(so, uio, m, flags)) {
3519 		/*
3520 		 * Panic if we notice inconsistencies in the socket's
3521 		 * receive list; both sb_mb and sb_cc should correctly
3522 		 * reflect the contents of the list, otherwise we may
3523 		 * end up with false positives during select() or poll()
3524 		 * which could put the application in a bad state.
3525 		 */
3526 		SB_MB_CHECK(&so->so_rcv);
3527 
3528 		if (so->so_error) {
3529 			if (m != NULL) {
3530 				goto dontblock;
3531 			}
3532 			error = so->so_error;
3533 			if ((flags & MSG_PEEK) == 0) {
3534 				so->so_error = 0;
3535 			}
3536 			goto release;
3537 		}
3538 		if (so->so_state & SS_CANTRCVMORE) {
3539 #if CONTENT_FILTER
3540 			/*
3541 			 * Deal with half closed connections
3542 			 */
3543 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3544 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3545 				CFIL_LOG(LOG_INFO,
3546 				    "so %llx ignore SS_CANTRCVMORE",
3547 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3548 			} else
3549 #endif /* CONTENT_FILTER */
3550 			if (m != NULL) {
3551 				goto dontblock;
3552 			} else {
3553 				goto release;
3554 			}
3555 		}
3556 		for (; m != NULL; m = m->m_next) {
3557 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3558 				m = so->so_rcv.sb_mb;
3559 				goto dontblock;
3560 			}
3561 		}
3562 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3563 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3564 			error = ENOTCONN;
3565 			goto release;
3566 		}
3567 		if (uio_resid(uio) == 0) {
3568 			goto release;
3569 		}
3570 
3571 		if ((so->so_state & SS_NBIO) ||
3572 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3573 			error = EWOULDBLOCK;
3574 			goto release;
3575 		}
3576 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3577 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3578 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3579 #if EVEN_MORE_LOCKING_DEBUG
3580 		if (socket_debug) {
3581 			printf("Waiting for socket data\n");
3582 		}
3583 #endif
3584 
3585 		/*
3586 		 * Depending on the protocol (e.g. TCP), the following
3587 		 * might cause the socket lock to be dropped and later
3588 		 * be reacquired, and more data could have arrived and
3589 		 * have been appended to the receive socket buffer by
3590 		 * the time it returns.  Therefore, we sleep in sbwait()
3591 		 * below if and only if the wait-condition is still
3592 		 * true.
3593 		 */
3594 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3595 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3596 		}
3597 
3598 		error = 0;
3599 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3600 			error = sbwait(&so->so_rcv);
3601 		}
3602 
3603 #if EVEN_MORE_LOCKING_DEBUG
3604 		if (socket_debug) {
3605 			printf("SORECEIVE - sbwait returned %d\n", error);
3606 		}
3607 #endif
3608 		if (so->so_usecount < 1) {
3609 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3610 			    __func__, so, so->so_usecount);
3611 			/* NOTREACHED */
3612 		}
3613 		if (error) {
3614 			socket_unlock(so, 1);
3615 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3616 			    0, 0, 0, 0);
3617 			if (en_tracing) {
3618 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3619 				    VM_KERNEL_ADDRPERM(so), 0,
3620 				    (int64_t)(orig_resid - uio_resid(uio)));
3621 			}
3622 			return error;
3623 		}
3624 		goto restart;
3625 	}
3626 dontblock:
3627 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3628 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3629 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3630 	nextrecord = m->m_nextpkt;
3631 
3632 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3633 		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
3634 		    mp0 == NULL);
3635 		if (error == ERESTART) {
3636 			goto restart;
3637 		} else if (error != 0) {
3638 			goto release;
3639 		}
3640 		orig_resid = 0;
3641 	}
3642 
3643 	/*
3644 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3645 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3646 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3647 	 * perform externalization.
3648 	 */
3649 	if (m != NULL && m->m_type == MT_CONTROL) {
3650 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3651 		if (error != 0) {
3652 			goto release;
3653 		}
3654 		orig_resid = 0;
3655 	}
3656 
3657 	if (m != NULL) {
3658 		if (!(flags & MSG_PEEK)) {
3659 			/*
3660 			 * We get here because m points to an mbuf following
3661 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3662 			 * processed above.  In any case, m should be pointing
3663 			 * to the head of the mbuf chain, and the nextrecord
3664 			 * should be either NULL or equal to m->m_nextpkt.
3665 			 * See comments above about SB_LOCK.
3666 			 */
3667 			if (m != so->so_rcv.sb_mb ||
3668 			    m->m_nextpkt != nextrecord) {
3669 				panic("%s: post-control !sync so=%p m=%p "
3670 				    "nextrecord=%p\n", __func__, so, m,
3671 				    nextrecord);
3672 				/* NOTREACHED */
3673 			}
3674 			if (nextrecord == NULL) {
3675 				so->so_rcv.sb_lastrecord = m;
3676 			}
3677 		}
3678 		type = m->m_type;
3679 		if (type == MT_OOBDATA) {
3680 			flags |= MSG_OOB;
3681 		}
3682 	} else {
3683 		if (!(flags & MSG_PEEK)) {
3684 			SB_EMPTY_FIXUP(&so->so_rcv);
3685 		}
3686 	}
3687 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3688 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3689 
3690 	moff = 0;
3691 	offset = 0;
3692 
3693 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3694 		can_delay = 1;
3695 	} else {
3696 		can_delay = 0;
3697 	}
3698 
3699 	while (m != NULL &&
3700 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3701 		if (m->m_type == MT_OOBDATA) {
3702 			if (type != MT_OOBDATA) {
3703 				break;
3704 			}
3705 		} else if (type == MT_OOBDATA) {
3706 			break;
3707 		}
3708 
3709 		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
3710 			break;
3711 		}
3712 		/*
3713 	 * Make sure to always set the MSG_OOB flag when getting
3714 	 * out-of-band data inline.
3715 		 */
3716 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3717 		    (so->so_options & SO_OOBINLINE) != 0 &&
3718 		    (so->so_state & SS_RCVATMARK) != 0) {
3719 			flags |= MSG_OOB;
3720 		}
3721 		so->so_state &= ~SS_RCVATMARK;
3722 		len = uio_resid(uio) - delayed_copy_len;
3723 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3724 			len = so->so_oobmark - offset;
3725 		}
3726 		if (len > m->m_len - moff) {
3727 			len = m->m_len - moff;
3728 		}
3729 		/*
3730 		 * If mp is set, just pass back the mbufs.
3731 		 * Otherwise copy them out via the uio, then free.
3732 	 * Sockbuf must be consistent here (sb_mb points to the current
3733 	 * mbuf, m_nextpkt to the next record) when we drop priority;
3734 		 * we must note any additions to the sockbuf when we
3735 		 * block interrupts again.
3736 		 */
3737 		if (mp == NULL) {
3738 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3739 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3740 			if (can_delay && len == m->m_len) {
3741 				/*
3742 				 * only delay the copy if we're consuming the
3743 				 * mbuf and we're NOT in MSG_PEEK mode
3744 				 * and we have enough data to make it worthwhile
3745 				 * to drop and retake the lock... can_delay
3746 				 * reflects the state of the two latter
3747 				 * constraints; moff should always be zero
3748 				 * in these cases
3749 				 */
3750 				delayed_copy_len += len;
3751 			} else {
3752 				if (delayed_copy_len) {
3753 					error = sodelayed_copy(so, uio,
3754 					    &free_list, &delayed_copy_len);
3755 
3756 					if (error) {
3757 						goto release;
3758 					}
3759 					/*
3760 					 * can only get here if MSG_PEEK is not
3761 					 * set; therefore, m should point at the
3762 					 * head of the rcv queue; if it doesn't,
3763 					 * it means something drastically
3764 					 * changed while we were out from behind
3765 					 * the lock in sodelayed_copy. perhaps
3766 					 * a RST on the stream. in any event,
3767 					 * the stream has been interrupted. it's
3768 					 * probably best just to return whatever
3769 					 * data we've moved and let the caller
3770 					 * sort it out...
3771 					 */
3772 					if (m != so->so_rcv.sb_mb) {
3773 						break;
3774 					}
3775 				}
3776 				socket_unlock(so, 0);
3777 				error = uiomove(mtod(m, caddr_t) + moff,
3778 				    (int)len, uio);
3779 				socket_lock(so, 0);
3780 
3781 				if (error) {
3782 					goto release;
3783 				}
3784 			}
3785 		} else {
3786 			uio_setresid(uio, (uio_resid(uio) - len));
3787 		}
3788 		if (len == m->m_len - moff) {
3789 			if (m->m_flags & M_EOR) {
3790 				flags |= MSG_EOR;
3791 			}
3792 			if (flags & MSG_PEEK) {
3793 				m = m->m_next;
3794 				moff = 0;
3795 			} else {
3796 				nextrecord = m->m_nextpkt;
3797 				sbfree(&so->so_rcv, m);
3798 				m->m_nextpkt = NULL;
3799 
3800 				if (mp != NULL) {
3801 					*mp = m;
3802 					mp = &m->m_next;
3803 					so->so_rcv.sb_mb = m = m->m_next;
3804 					*mp = NULL;
3805 				} else {
3806 					if (free_list == NULL) {
3807 						free_list = m;
3808 					} else {
3809 						ml->m_next = m;
3810 					}
3811 					ml = m;
3812 					so->so_rcv.sb_mb = m = m->m_next;
3813 					ml->m_next = NULL;
3814 				}
3815 				if (m != NULL) {
3816 					m->m_nextpkt = nextrecord;
3817 					if (nextrecord == NULL) {
3818 						so->so_rcv.sb_lastrecord = m;
3819 					}
3820 				} else {
3821 					so->so_rcv.sb_mb = nextrecord;
3822 					SB_EMPTY_FIXUP(&so->so_rcv);
3823 				}
3824 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3825 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3826 			}
3827 		} else {
3828 			if (flags & MSG_PEEK) {
3829 				moff += len;
3830 			} else {
3831 				if (mp != NULL) {
3832 					int copy_flag;
3833 
3834 					if (flags & MSG_DONTWAIT) {
3835 						copy_flag = M_DONTWAIT;
3836 					} else {
3837 						copy_flag = M_WAIT;
3838 					}
3839 					*mp = m_copym(m, 0, (int)len, copy_flag);
3840 					/*
3841 					 * Failed to allocate an mbuf?
3842 					 * Adjust uio_resid back, it was
3843 					 * adjusted down by len bytes which
3844 					 * we didn't copy over.
3845 					 */
3846 					if (*mp == NULL) {
3847 						uio_setresid(uio,
3848 						    (uio_resid(uio) + len));
3849 						break;
3850 					}
3851 				}
3852 				m->m_data += len;
3853 				m->m_len -= len;
3854 				so->so_rcv.sb_cc -= len;
3855 			}
3856 		}
3857 		if (so->so_oobmark) {
3858 			if ((flags & MSG_PEEK) == 0) {
3859 				so->so_oobmark -= len;
3860 				if (so->so_oobmark == 0) {
3861 					so->so_state |= SS_RCVATMARK;
3862 					break;
3863 				}
3864 			} else {
3865 				offset += len;
3866 				if (offset == so->so_oobmark) {
3867 					break;
3868 				}
3869 			}
3870 		}
3871 		if (flags & MSG_EOR) {
3872 			break;
3873 		}
3874 		/*
3875 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3876 		 * (for non-atomic socket), we must not quit until
3877 		 * "uio->uio_resid == 0" or an error termination.
3878 		 * If a signal/timeout occurs, return with a short
3879 		 * count but without error.  Keep sockbuf locked
3880 		 * against other readers.
3881 		 */
3882 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3883 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3884 		    !sosendallatonce(so) && !nextrecord) {
3885 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3886 #if CONTENT_FILTER
3887 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3888 #endif /* CONTENT_FILTER */
3889 			    )) {
3890 				goto release;
3891 			}
3892 
3893 			/*
3894 			 * Depending on the protocol (e.g. TCP), the following
3895 			 * might cause the socket lock to be dropped and later
3896 			 * be reacquired, and more data could have arrived and
3897 			 * have been appended to the receive socket buffer by
3898 			 * the time it returns.  Therefore, we sleep in sbwait()
3899 			 * below if and only if the socket buffer is
3900 			 * empty, in order to avoid a false sleep.
3901 			 */
3902 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3903 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3904 			}
3905 
3906 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3907 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3908 
3909 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3910 				error = 0;
3911 				goto release;
3912 			}
3913 			/*
3914 			 * have to wait until after we get back from the sbwait
3915 			 * to do the copy because we will drop the lock if we
3916 			 * have enough data that has been delayed... by dropping
3917 			 * the lock we open up a window allowing the netisr
3918 			 * thread to process the incoming packets and to change
3919 			 * the state of this socket... we're issuing the sbwait
3920 			 * because the socket is empty and we're expecting the
3921 			 * netisr thread to wake us up when more packets arrive;
3922 			 * if we allow that processing to happen and then sbwait
3923 			 * we could stall forever with packets sitting in the
3924 			 * socket if no further packets arrive from the remote
3925 			 * side.
3926 			 *
3927 			 * we want to copy before we've collected all the data
3928 			 * to satisfy this request to allow the copy to overlap
3929 			 * the incoming packet processing on an MP system
3930 			 */
3931 			if (delayed_copy_len > sorecvmincopy &&
3932 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3933 				error = sodelayed_copy(so, uio,
3934 				    &free_list, &delayed_copy_len);
3935 
3936 				if (error) {
3937 					goto release;
3938 				}
3939 			}
3940 			m = so->so_rcv.sb_mb;
3941 			if (m != NULL) {
3942 				nextrecord = m->m_nextpkt;
3943 			}
3944 			SB_MB_CHECK(&so->so_rcv);
3945 		}
3946 	}
3947 #ifdef MORE_LOCKING_DEBUG
3948 	if (so->so_usecount <= 1) {
3949 		panic("%s: after big while so=%p ref=%d on socket",
3950 		    __func__, so, so->so_usecount);
3951 		/* NOTREACHED */
3952 	}
3953 #endif
3954 
3955 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3956 		if (so->so_options & SO_DONTTRUNC) {
3957 			flags |= MSG_RCVMORE;
3958 		} else {
3959 			flags |= MSG_TRUNC;
3960 			if ((flags & MSG_PEEK) == 0) {
3961 				(void) sbdroprecord(&so->so_rcv);
3962 			}
3963 		}
3964 	}
3965 
3966 	/*
3967 	 * pru_rcvd below (for TCP) may cause more data to be received
3968 	 * if the socket lock is dropped prior to sending the ACK; some
3969 	 * legacy OpenTransport applications don't handle this well
3970 	 * (if it receives less data than requested while MSG_HAVEMORE
3971 	 * is set), and so we set the flag now based on what we know
3972 	 * prior to calling pru_rcvd.
3973 	 */
3974 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3975 		flags |= MSG_HAVEMORE;
3976 	}
3977 
3978 	if ((flags & MSG_PEEK) == 0) {
3979 		if (m == NULL) {
3980 			so->so_rcv.sb_mb = nextrecord;
3981 			/*
3982 			 * First part is an inline SB_EMPTY_FIXUP().  Second
3983 			 * part makes sure sb_lastrecord is up-to-date if
3984 			 * there is still data in the socket buffer.
3985 			 */
3986 			if (so->so_rcv.sb_mb == NULL) {
3987 				so->so_rcv.sb_mbtail = NULL;
3988 				so->so_rcv.sb_lastrecord = NULL;
3989 			} else if (nextrecord->m_nextpkt == NULL) {
3990 				so->so_rcv.sb_lastrecord = nextrecord;
3991 			}
3992 			SB_MB_CHECK(&so->so_rcv);
3993 		}
3994 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3995 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3996 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3997 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3998 		}
3999 	}
4000 
4001 	if (delayed_copy_len) {
4002 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4003 		if (error) {
4004 			goto release;
4005 		}
4006 	}
4007 	if (free_list != NULL) {
4008 		m_freem_list(free_list);
4009 		free_list = NULL;
4010 	}
4011 
4012 	if (orig_resid == uio_resid(uio) && orig_resid &&
4013 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4014 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4015 		goto restart;
4016 	}
4017 
4018 	if (flagsp != NULL) {
4019 		*flagsp |= flags;
4020 	}
4021 release:
4022 #ifdef MORE_LOCKING_DEBUG
4023 	if (so->so_usecount <= 1) {
4024 		panic("%s: release so=%p ref=%d on socket", __func__,
4025 		    so, so->so_usecount);
4026 		/* NOTREACHED */
4027 	}
4028 #endif
4029 	if (delayed_copy_len) {
4030 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4031 	}
4032 
4033 	if (free_list != NULL) {
4034 		m_freem_list(free_list);
4035 	}
4036 
4037 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4038 
4039 	if (en_tracing) {
4040 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4041 		    VM_KERNEL_ADDRPERM(so),
4042 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4043 		    (int64_t)(orig_resid - uio_resid(uio)));
4044 	}
4045 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4046 	    so->so_rcv.sb_cc, 0, error);
4047 
4048 	return error;
4049 }
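/*
 * Illustrative sketch (standard sockets API, not part of the original
 * source): the MSG_WAITALL handling above is what lets a blocking caller
 * collect an exact byte count in a single call.
 */
#if 0
static ssize_t
read_exactly(int fd, void *buf, size_t len)
{
	/* soreceive() keeps looping in sbwait() until len bytes, EOF or error */
	return recv(fd, buf, len, MSG_WAITALL);
}
#endif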
4050 
4051 /*
4052  * Returns:	0			Success
4053  *	uiomove:EFAULT
4054  */
4055 static int
4056 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4057     user_ssize_t *resid)
4058 {
4059 	int error = 0;
4060 	struct mbuf *m;
4061 
4062 	m = *free_list;
4063 
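	/*
	 * Drop the socket lock across the (possibly faulting) uiomove()
	 * copies; the mbufs on free_list were already unlinked from the
	 * receive buffer by the caller, so no other thread can reach them
	 * while the lock is released.
	 */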
4064 	socket_unlock(so, 0);
4065 
4066 	while (m != NULL && error == 0) {
4067 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4068 		m = m->m_next;
4069 	}
4070 	m_freem_list(*free_list);
4071 
4072 	*free_list = NULL;
4073 	*resid = 0;
4074 
4075 	socket_lock(so, 0);
4076 
4077 	return error;
4078 }
4079 
4080 int
4081 soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
4082     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
4083 {
4084 	struct mbuf *m, **mp;
4085 	struct mbuf *nextrecord;
4086 	int flags, error;
4087 	struct protosw *pr = so->so_proto;
4088 	struct proc *p = current_proc();
4089 	u_int npkts = 0;
4090 	struct mbuf *free_list = NULL;
4091 	int sblocked = 0;
4092 
4093 	/*
4094 	 * Sanity check on the parameters passed by caller
4095 	 */
4096 	if (mp0 == NULL || pktcntp == NULL) {
4097 		return EINVAL;
4098 	}
4099 	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
4100 		return EINVAL;
4101 	}
4102 
4103 	mp = mp0;
4104 	*mp0 = NULL;
4105 	if (controlp != NULL) {
4106 		*controlp = NULL;
4107 	}
4108 	if (maddrp != NULL) {
4109 		*maddrp = NULL;
4110 	}
4111 	if (flagsp != NULL) {
4112 		flags = *flagsp;
4113 	} else {
4114 		flags = 0;
4115 	}
4116 
4117 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
4118 	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
4119 	    so->so_rcv.sb_hiwat);
4120 
4121 	socket_lock(so, 1);
4122 	so_update_last_owner_locked(so, p);
4123 	so_update_policy(so);
4124 
4125 #if NECP
4126 	so_update_necp_policy(so, NULL, NULL);
4127 #endif /* NECP */
4128 
4129 	/*
4130 	 * If a recv attempt is made on a previously-accepted socket
4131 	 * that has been marked as inactive (disconnected), reject
4132 	 * the request.
4133 	 */
4134 	if (so->so_flags & SOF_DEFUNCT) {
4135 		struct sockbuf *sb = &so->so_rcv;
4136 
4137 		error = ENOTCONN;
4138 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
4139 		    __func__, proc_pid(p), proc_best_name(p),
4140 		    so->so_gencnt,
4141 		    SOCK_DOM(so), SOCK_TYPE(so), error);
4142 		/*
4143 		 * This socket should have been disconnected and flushed
4144 		 * prior to being returned from sodefunct(); there should
4145 		 * be no data on its receive list, so panic otherwise.
4146 		 */
4147 		if (so->so_state & SS_DEFUNCT) {
4148 			sb_empty_assert(sb, __func__);
4149 		}
4150 		goto release;
4151 	}
4152 
4153 	*mp = NULL;
4154 
4155 restart:
4156 	/*
4157 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4158 	 * and if so just return to the caller.  This could happen when
4159 	 * soreceive() is called by a socket upcall function during the
4160 	 * time the socket is freed.  The socket buffer would have been
4161 	 * locked across the upcall, therefore we cannot put this thread
4162 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4163 	 * we may livelock), because the lock on the socket buffer will
4164 	 * only be released when the upcall routine returns to its caller.
4165 	 * Because the socket has been officially closed, there can be
4166 	 * no further read on it.
4167 	 */
4168 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4169 	    (SS_NOFDREF | SS_CANTRCVMORE)) {
4170 		error = 0;
4171 		goto out;
4172 	}
4173 
4174 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4175 	if (error) {
4176 		goto out;
4177 	}
4178 	sblocked = 1;
4179 
4180 	m = so->so_rcv.sb_mb;
4181 	/*
4182 	 * Block awaiting more datagram if needed
4183 	 * Block awaiting more datagrams if needed
4184 	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
4185 	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
4186 		/*
4187 		 * Panic if we notice inconsistencies in the socket's
4188 		 * receive list; both sb_mb and sb_cc should correctly
4189 		 * reflect the contents of the list, otherwise we may
4190 		 * end up with false positives during select() or poll()
4191 		 * which could put the application in a bad state.
4192 		 */
4193 		SB_MB_CHECK(&so->so_rcv);
4194 
4195 		if (so->so_error) {
4196 			if (m != NULL) {
4197 				goto dontblock;
4198 			}
4199 			error = so->so_error;
4200 			if ((flags & MSG_PEEK) == 0) {
4201 				so->so_error = 0;
4202 			}
4203 			goto release;
4204 		}
4205 		if (so->so_state & SS_CANTRCVMORE) {
4206 			if (m != NULL) {
4207 				goto dontblock;
4208 			} else {
4209 				goto release;
4210 			}
4211 		}
4212 		for (; m != NULL; m = m->m_next) {
4213 			if (m->m_flags & M_EOR) {
4214 				m = so->so_rcv.sb_mb;
4215 				goto dontblock;
4216 			}
4217 		}
4218 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4219 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4220 			error = ENOTCONN;
4221 			goto release;
4222 		}
4223 		if ((so->so_state & SS_NBIO) ||
4224 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4225 			error = EWOULDBLOCK;
4226 			goto release;
4227 		}
4228 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4229 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4230 
4231 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4232 		sblocked = 0;
4233 
4234 		error = sbwait(&so->so_rcv);
4235 		if (error != 0) {
4236 			goto release;
4237 		}
4238 		goto restart;
4239 	}
4240 dontblock:
4241 	m = so->so_rcv.sb_mb;
4242 	if (m == NULL) {
4243 		goto release;
4244 	}
4245 
4246 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4247 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4248 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4249 	nextrecord = m->m_nextpkt;
4250 
4251 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4252 		struct mbuf *maddr = NULL;
4253 
4254 		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
4255 		    &nextrecord, 1);
4256 		if (error == ERESTART) {
4257 			goto restart;
4258 		} else if (error != 0) {
4259 			goto release;
4260 		}
4261 
4262 		if (maddr != NULL) {
4263 			maddr->m_nextpkt = NULL;
4264 			maddr->m_next = NULL;
4265 			if (maddrp != NULL) {
4266 				*maddrp = maddr;
4267 				maddrp = &maddr->m_nextpkt;
4268 			} else {
4269 				maddr->m_next = free_list;
4270 				free_list = maddr;
4271 			}
4272 		}
4273 	}
4274 
4275 	/*
4276 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
4277 	 * in the first mbuf chain on the socket buffer.
4278 	 * We call into the protocol to perform externalization.
4279 	 */
4280 	if (m != NULL && m->m_type == MT_CONTROL) {
4281 		struct mbuf *control = NULL;
4282 
4283 		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
4284 		if (error != 0) {
4285 			goto release;
4286 		}
4287 		if (control != NULL) {
4288 			control->m_nextpkt = NULL;
4289 			control->m_next = NULL;
4290 			if (controlp != NULL) {
4291 				*controlp = control;
4292 				controlp = &control->m_nextpkt;
4293 			} else {
4294 				control->m_next = free_list;
4295 				free_list = control;
4296 			}
4297 		}
4298 	}
4299 
4300 	/*
4301 	 * Link the packet to the list
4302 	 */
4303 	if (m != NULL) {
4304 		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
4305 			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
4306 		}
4307 		m->m_nextpkt = NULL;
4308 		*mp = m;
4309 		mp = &m->m_nextpkt;
4310 	}
4311 	while (m != NULL) {
4312 		sbfree(&so->so_rcv, m);
4313 
4314 		m = m->m_next;
4315 	}
4316 
4317 	so->so_rcv.sb_mb = nextrecord;
4318 	/*
4319 	 * First part is an inline SB_EMPTY_FIXUP().  Second
4320 	 * part makes sure sb_lastrecord is up-to-date if
4321 	 * there is still data in the socket buffer.
4322 	 */
4323 	if (so->so_rcv.sb_mb == NULL) {
4324 		so->so_rcv.sb_mbtail = NULL;
4325 		so->so_rcv.sb_lastrecord = NULL;
4326 	} else if (nextrecord->m_nextpkt == NULL) {
4327 		so->so_rcv.sb_lastrecord = nextrecord;
4328 	}
4329 	SB_MB_CHECK(&so->so_rcv);
4330 
4331 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4332 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4333 
4334 	npkts += 1;
4335 
4336 	/*
4337 	 * We continue as long as we have fewer packets than requested
4338 	 * and the socket buffer is not empty
4339 	 */
4340 	if (npkts < *pktcntp) {
4341 		if (so->so_rcv.sb_mb != NULL) {
4342 			goto dontblock;
4343 		}
4344 		if ((flags & MSG_WAITALL) != 0) {
4345 			goto restart;
4346 		}
4347 	}
4348 
4349 	if (flagsp != NULL) {
4350 		*flagsp |= flags;
4351 	}
4352 
4353 release:
4354 	/*
4355 	 * pru_rcvd may cause more data to be received if the socket lock
4356 	 * is dropped so we set MSG_HAVEMORE now based on what we know.
4357 	 * That way the caller won't be surprised if it receives less data
4358 	 * than requested.
4359 	 */
4360 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4361 		flags |= MSG_HAVEMORE;
4362 	}
4363 
4364 	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
4365 		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4366 	}
4367 
4368 	if (sblocked) {
4369 		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4370 	} else {
4371 		socket_unlock(so, 1);
4372 	}
4373 
4374 out:
4375 	*pktcntp = npkts;
4376 	/*
4377 	 * Amortize the cost of freeing the mbufs
4378 	 */
4379 	if (free_list != NULL) {
4380 		m_freem_list(free_list);
4381 	}
4382 
4383 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4384 	    0, 0, 0, 0);
4385 	return error;
4386 }
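/*
 * Illustrative sketch (not part of the original source): soreceive_m_list()
 * backs the batched datagram receive path.  Assuming the private
 * recvmsg_x() interface (declared under PRIVATE in sys/socket.h; the exact
 * signature shown here is an assumption), a caller could drain several UDP
 * datagrams with one system call:
 */
#if 0
struct msghdr_x msgs[8] = { 0 };
/* ... point each msgs[i].msg_iov at its own buffer before the call ... */
ssize_t npkts = recvmsg_x(fd, msgs, 8, 0);
/* on success, npkts packets were delivered, one per msghdr_x entry */
#endif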
4387 
4388 static int
4389 so_statistics_event_to_nstat_event(int64_t *input_options,
4390     uint64_t *nstat_event)
4391 {
4392 	int error = 0;
4393 	switch (*input_options) {
4394 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4395 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4396 		break;
4397 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4398 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4399 		break;
4400 	case SO_STATISTICS_EVENT_ATTRIBUTION_CHANGE:
4401 		*nstat_event = NSTAT_EVENT_SRC_ATTRIBUTION_CHANGE;
4402 		break;
4403 #if (DEBUG || DEVELOPMENT)
4404 	case SO_STATISTICS_EVENT_RESERVED_2:
4405 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4406 		break;
4407 #endif /* (DEBUG || DEVELOPMENT) */
4408 	default:
4409 		error = EINVAL;
4410 		break;
4411 	}
4412 	return error;
4413 }
4414 
4415 /*
4416  * Returns:	0			Success
4417  *		EINVAL
4418  *		ENOTCONN
4419  *	<pru_shutdown>:EINVAL
4420  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4421  *	<pru_shutdown>:ENOBUFS[TCP]
4422  *	<pru_shutdown>:EMSGSIZE[TCP]
4423  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4424  *	<pru_shutdown>:ENETUNREACH[TCP]
4425  *	<pru_shutdown>:ENETDOWN[TCP]
4426  *	<pru_shutdown>:ENOMEM[TCP]
4427  *	<pru_shutdown>:EACCES[TCP]
4428  *	<pru_shutdown>:EMSGSIZE[TCP]
4429  *	<pru_shutdown>:ENOBUFS[TCP]
4430  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4431  *	<pru_shutdown>:???		[other protocol families]
4432  */
4433 int
4434 soshutdown(struct socket *so, int how)
4435 {
4436 	int error;
4437 
4438 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4439 
4440 	switch (how) {
4441 	case SHUT_RD:
4442 	case SHUT_WR:
4443 	case SHUT_RDWR:
4444 		socket_lock(so, 1);
4445 		if ((so->so_state &
4446 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4447 			error = ENOTCONN;
4448 		} else {
4449 			error = soshutdownlock(so, how);
4450 		}
4451 		socket_unlock(so, 1);
4452 		break;
4453 	default:
4454 		error = EINVAL;
4455 		break;
4456 	}
4457 
4458 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4459 
4460 	return error;
4461 }
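/*
 * Illustrative sketch (standard sockets API, not part of the original
 * source): soshutdown() services shutdown(2).  A typical half-close keeps
 * the read side open so the peer's response can still be received:
 */
#if 0
if (shutdown(fd, SHUT_WR) == -1 && errno == ENOTCONN) {
	/* matches the ENOTCONN return above for unconnected sockets */
}
#endif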
4462 
4463 int
4464 soshutdownlock_final(struct socket *so, int how)
4465 {
4466 	struct protosw *pr = so->so_proto;
4467 	int error = 0;
4468 
4469 	sflt_notify(so, sock_evt_shutdown, &how);
4470 
4471 	if (how != SHUT_WR) {
4472 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4473 			/* read already shut down */
4474 			error = ENOTCONN;
4475 			goto done;
4476 		}
4477 		sorflush(so);
4478 	}
4479 	if (how != SHUT_RD) {
4480 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4481 			/* write already shut down */
4482 			error = ENOTCONN;
4483 			goto done;
4484 		}
4485 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4486 	}
4487 done:
4488 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4489 	return error;
4490 }
4491 
4492 int
4493 soshutdownlock(struct socket *so, int how)
4494 {
4495 	int error = 0;
4496 
4497 #if CONTENT_FILTER
4498 	/*
4499 	 * A content filter may delay the actual shutdown until it
4500 	 * has processed the pending data
4501 	 */
4502 	if (so->so_flags & SOF_CONTENT_FILTER) {
4503 		error = cfil_sock_shutdown(so, &how);
4504 		if (error == EJUSTRETURN) {
4505 			error = 0;
4506 			goto done;
4507 		} else if (error != 0) {
4508 			goto done;
4509 		}
4510 	}
4511 #endif /* CONTENT_FILTER */
4512 
4513 	error = soshutdownlock_final(so, how);
4514 
4515 done:
4516 	return error;
4517 }
4518 
4519 void
4520 sowflush(struct socket *so)
4521 {
4522 	struct sockbuf *sb = &so->so_snd;
4523 
4524 	/*
4525 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4526 	 * to prevent the socket buffer from being unexpectedly altered
4527 	 * while it is used by another thread in socket send/receive.
4528 	 *
4529 	 * sblock() must not fail here, hence the assertion.
4530 	 */
4531 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4532 	VERIFY(sb->sb_flags & SB_LOCK);
4533 
4534 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4535 	sb->sb_flags            |= SB_DROP;
4536 	sb->sb_upcall           = NULL;
4537 	sb->sb_upcallarg        = NULL;
4538 
4539 	sbunlock(sb, TRUE);     /* keep socket locked */
4540 
4541 	selthreadclear(&sb->sb_sel);
4542 	sbrelease(sb);
4543 }
4544 
4545 void
4546 sorflush(struct socket *so)
4547 {
4548 	struct sockbuf *sb = &so->so_rcv;
4549 	struct protosw *pr = so->so_proto;
4550 	struct sockbuf asb;
4551 #ifdef notyet
4552 	lck_mtx_t *mutex_held;
4553 	/*
4554 	 * XXX: This code is currently commented out, because we may get here
4555 	 * as part of sofreelastref(), and at that time, pr_getlock() may no
4556 	 * longer be able to return us the lock; this will be fixed in future.
4557 	 */
4558 	if (so->so_proto->pr_getlock != NULL) {
4559 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4560 	} else {
4561 		mutex_held = so->so_proto->pr_domain->dom_mtx;
4562 	}
4563 
4564 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4565 #endif /* notyet */
4566 
4567 	sflt_notify(so, sock_evt_flush_read, NULL);
4568 
4569 	socantrcvmore(so);
4570 
4571 	/*
4572 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4573 	 * to prevent the socket buffer from being unexpectedly altered
4574 	 * while it is used by another thread in socket send/receive.
4575 	 *
4576 	 * sblock() must not fail here, hence the assertion.
4577 	 */
4578 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4579 	VERIFY(sb->sb_flags & SB_LOCK);
4580 
4581 	/*
4582 	 * Copy only the relevant fields from "sb" to "asb" which we
4583 	 * need for sbrelease() to function.  In particular, skip
4584 	 * sb_sel as it contains the wait queue linkage, which would
4585 	 * wreak havoc if we were to issue selthreadclear() on "asb".
4586 	 * Make sure to not carry over SB_LOCK in "asb", as we need
4587 	 * to acquire it later as part of sbrelease().
4588 	 */
4589 	bzero(&asb, sizeof(asb));
4590 	asb.sb_cc               = sb->sb_cc;
4591 	asb.sb_hiwat            = sb->sb_hiwat;
4592 	asb.sb_mbcnt            = sb->sb_mbcnt;
4593 	asb.sb_mbmax            = sb->sb_mbmax;
4594 	asb.sb_ctl              = sb->sb_ctl;
4595 	asb.sb_lowat            = sb->sb_lowat;
4596 	asb.sb_mb               = sb->sb_mb;
4597 	asb.sb_mbtail           = sb->sb_mbtail;
4598 	asb.sb_lastrecord       = sb->sb_lastrecord;
4599 	asb.sb_so               = sb->sb_so;
4600 	asb.sb_flags            = sb->sb_flags;
4601 	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4602 	asb.sb_flags            |= SB_DROP;
4603 
4604 	/*
4605 	 * Ideally we'd bzero() these and preserve the ones we need;
4606 	 * but to do that we'd need to shuffle things around in the
4607 	 * sockbuf, and we can't do it now because there are KEXTS
4608 	 * that are directly referring to the socket structure.
4609 	 *
4610 	 * Setting SB_DROP acts as a barrier to prevent further appends.
4611 	 * Clearing SB_SEL is done for selthreadclear() below.
4612 	 */
4613 	sb->sb_cc               = 0;
4614 	sb->sb_hiwat            = 0;
4615 	sb->sb_mbcnt            = 0;
4616 	sb->sb_mbmax            = 0;
4617 	sb->sb_ctl              = 0;
4618 	sb->sb_lowat            = 0;
4619 	sb->sb_mb               = NULL;
4620 	sb->sb_mbtail           = NULL;
4621 	sb->sb_lastrecord       = NULL;
4622 	sb->sb_timeo.tv_sec     = 0;
4623 	sb->sb_timeo.tv_usec    = 0;
4624 	sb->sb_upcall           = NULL;
4625 	sb->sb_upcallarg        = NULL;
4626 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4627 	sb->sb_flags            |= SB_DROP;
4628 
4629 	sbunlock(sb, TRUE);     /* keep socket locked */
4630 
4631 	/*
4632 	 * Note that selthreadclear() is called on the original "sb" and
4633 	 * not the local "asb" because of the way wait queue linkage is
4634 	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
4635 	 * should no longer be set (cleared above.)
4636 	 */
4637 	selthreadclear(&sb->sb_sel);
4638 
4639 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4640 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4641 	}
4642 
4643 	sbrelease(&asb);
4644 }
4645 
4646 /*
4647  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4648  * an additional variant to handle the case where the option value needs
4649  * to be some kind of integer, but not a specific size.
4650  * In addition to their use here, these functions are also called by the
4651  * protocol-level pr_ctloutput() routines.
4652  *
4653  * Returns:	0			Success
4654  *		EINVAL
4655  *	copyin:EFAULT
4656  */
4657 int
4658 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4659 {
4660 	size_t  valsize;
4661 
4662 	/*
4663 	 * If the user gives us more than we wanted, we ignore it,
4664 	 * but if we don't get the minimum length the caller
4665 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4666 	 * is set to however much we actually retrieved.
4667 	 */
4668 	if ((valsize = sopt->sopt_valsize) < minlen) {
4669 		return EINVAL;
4670 	}
4671 	if (valsize > len) {
4672 		sopt->sopt_valsize = valsize = len;
4673 	}
4674 
4675 	if (sopt->sopt_p != kernproc) {
4676 		return copyin(sopt->sopt_val, buf, valsize);
4677 	}
4678 
4679 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4680 	return 0;
4681 }
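/*
 * Illustrative sketch (not part of the original source): a typical
 * protocol-level pr_ctloutput() handler fetches a fixed-size integer
 * option with sooptcopyin(), just as the socket-level cases below do:
 */
#if 0
int optval;
error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
if (error == 0 && optval < 0) {
	error = EINVAL;         /* reject values that make no sense */
}
#endif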
4682 
4683 /*
4684  * sooptcopyin_timeval
4685  *   Copy in a timeval value into tv_p, taking into account whether the
4686  *   calling process is 64-bit or 32-bit.  Moved the sanity checking
4687  *   code here so that we can verify the 64-bit tv_sec value before we lose
4688  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4689  */
4690 static int
4691 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4692 {
4693 	int                     error;
4694 
4695 	if (proc_is64bit(sopt->sopt_p)) {
4696 		struct user64_timeval   tv64;
4697 
4698 		if (sopt->sopt_valsize < sizeof(tv64)) {
4699 			return EINVAL;
4700 		}
4701 
4702 		sopt->sopt_valsize = sizeof(tv64);
4703 		if (sopt->sopt_p != kernproc) {
4704 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4705 			if (error != 0) {
4706 				return error;
4707 			}
4708 		} else {
4709 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4710 			    sizeof(tv64));
4711 		}
4712 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4713 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4714 			return EDOM;
4715 		}
4716 
4717 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4718 		tv_p->tv_usec = tv64.tv_usec;
4719 	} else {
4720 		struct user32_timeval   tv32;
4721 
4722 		if (sopt->sopt_valsize < sizeof(tv32)) {
4723 			return EINVAL;
4724 		}
4725 
4726 		sopt->sopt_valsize = sizeof(tv32);
4727 		if (sopt->sopt_p != kernproc) {
4728 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4729 			if (error != 0) {
4730 				return error;
4731 			}
4732 		} else {
4733 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4734 			    sizeof(tv32));
4735 		}
4736 #ifndef __LP64__
4737 		/*
4738 		 * K64todo "comparison is always false due to
4739 		 * limited range of data type"
4740 		 */
4741 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4742 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4743 			return EDOM;
4744 		}
4745 #endif
4746 		tv_p->tv_sec = tv32.tv_sec;
4747 		tv_p->tv_usec = tv32.tv_usec;
4748 	}
4749 	return 0;
4750 }
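/*
 * Illustrative sketch (standard sockets API, not part of the original
 * source): the timeval parsed above arrives from a userland call such as
 * the following; out-of-range values are rejected with EDOM:
 */
#if 0
struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };      /* 5-second timeout */
setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
#endif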
4751 
4752 int
4753 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4754     boolean_t ignore_delegate)
4755 {
4756 	kauth_cred_t cred =  NULL;
4757 	proc_t ep = PROC_NULL;
4758 	uid_t uid;
4759 	int error = 0;
4760 
4761 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4762 		ep = proc_find(so->e_pid);
4763 		if (ep) {
4764 			cred = kauth_cred_proc_ref(ep);
4765 		}
4766 	}
4767 
4768 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4769 
4770 	/* uid is 0 for root */
4771 	if (uid != 0 || !allow_root) {
4772 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4773 	}
4774 	if (cred) {
4775 		kauth_cred_unref(&cred);
4776 	}
4777 	if (ep != PROC_NULL) {
4778 		proc_rele(ep);
4779 	}
4780 
4781 	return error;
4782 }
4783 
4784 /*
4785  * Returns:	0			Success
4786  *		EINVAL
4787  *		ENOPROTOOPT
4788  *		ENOBUFS
4789  *		EDOM
4790  *	sooptcopyin:EINVAL
4791  *	sooptcopyin:EFAULT
4792  *	sooptcopyin_timeval:EINVAL
4793  *	sooptcopyin_timeval:EFAULT
4794  *	sooptcopyin_timeval:EDOM
4795  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4796  *	<pr_ctloutput>:???
4797  *	sflt_attach_private:???		[whatever a filter author chooses]
4798  *	<sf_setoption>:???		[whatever a filter author chooses]
4799  *
4800  * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
4801  *		<sf_setoption> returns depend on what the filter author causes
4802  *		their filter to return.
4803  */
4804 int
4805 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4806 {
4807 	int     error, optval;
4808 	int64_t long_optval;
4809 	struct  linger l;
4810 	struct  timeval tv;
4811 
4812 	if (sopt->sopt_dir != SOPT_SET) {
4813 		sopt->sopt_dir = SOPT_SET;
4814 	}
4815 
4816 	if (dolock) {
4817 		socket_lock(so, 1);
4818 	}
4819 
4820 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4821 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4822 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4823 		/* the socket has been shutdown, no more sockopt's */
4824 		error = EINVAL;
4825 		goto out;
4826 	}
4827 
4828 	error = sflt_setsockopt(so, sopt);
4829 	if (error != 0) {
4830 		if (error == EJUSTRETURN) {
4831 			error = 0;
4832 		}
4833 		goto out;
4834 	}
4835 
4836 	if (sopt->sopt_level != SOL_SOCKET) {
4837 		if (so->so_proto != NULL &&
4838 		    so->so_proto->pr_ctloutput != NULL) {
4839 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4840 			goto out;
4841 		}
4842 		error = ENOPROTOOPT;
4843 	} else {
4844 		/*
4845 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4846 		 * the protocol layer, if needed.  A zero value returned from
4847 		 * the handler means use default socket-level processing as
4848 		 * done by the rest of this routine.  Otherwise, any other
4849 		 * return value indicates that the option is unsupported.
4850 		 */
4851 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4852 		    pru_socheckopt(so, sopt)) != 0) {
4853 			goto out;
4854 		}
4855 
4856 		error = 0;
4857 		switch (sopt->sopt_name) {
4858 		case SO_LINGER:
4859 		case SO_LINGER_SEC: {
4860 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4861 			if (error != 0) {
4862 				goto out;
4863 			}
4864 			/* Make sure to use sane values */
4865 			if (sopt->sopt_name == SO_LINGER) {
4866 				so->so_linger = (short)l.l_linger;
4867 			} else {
4868 				so->so_linger = (short)((long)l.l_linger * hz);
4869 			}
4870 			if (l.l_onoff != 0) {
4871 				so->so_options |= SO_LINGER;
4872 			} else {
4873 				so->so_options &= ~SO_LINGER;
4874 			}
4875 			break;
4876 		}
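		/*
		 * Illustrative sketch (standard sockets API): the userland
		 * form of the case above.  Note SO_LINGER_SEC takes l_linger
		 * in seconds (it is scaled by hz above), e.g.:
		 *
		 *	struct linger l = { .l_onoff = 1, .l_linger = 10 };
		 *	setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
		 */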
4877 		case SO_DEBUG:
4878 		case SO_KEEPALIVE:
4879 		case SO_DONTROUTE:
4880 		case SO_USELOOPBACK:
4881 		case SO_BROADCAST:
4882 		case SO_REUSEADDR:
4883 		case SO_REUSEPORT:
4884 		case SO_OOBINLINE:
4885 		case SO_TIMESTAMP:
4886 		case SO_TIMESTAMP_MONOTONIC:
4887 		case SO_TIMESTAMP_CONTINUOUS:
4888 		case SO_DONTTRUNC:
4889 		case SO_WANTMORE:
4890 		case SO_WANTOOBFLAG:
4891 		case SO_NOWAKEFROMSLEEP:
4892 		case SO_NOAPNFALLBK:
4893 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4894 			    sizeof(optval));
4895 			if (error != 0) {
4896 				goto out;
4897 			}
4898 			if (optval) {
4899 				so->so_options |= sopt->sopt_name;
4900 			} else {
4901 				so->so_options &= ~sopt->sopt_name;
4902 			}
4903 #if SKYWALK
4904 			inp_update_netns_flags(so);
4905 #endif /* SKYWALK */
4906 			break;
4907 
4908 		case SO_SNDBUF:
4909 		case SO_RCVBUF:
4910 		case SO_SNDLOWAT:
4911 		case SO_RCVLOWAT:
4912 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4913 			    sizeof(optval));
4914 			if (error != 0) {
4915 				goto out;
4916 			}
4917 
4918 			/*
4919 			 * Values < 1 make no sense for any of these
4920 			 * options, so disallow them.
4921 			 */
4922 			if (optval < 1) {
4923 				error = EINVAL;
4924 				goto out;
4925 			}
4926 
4927 			switch (sopt->sopt_name) {
4928 			case SO_SNDBUF:
4929 			case SO_RCVBUF: {
4930 				struct sockbuf *sb =
4931 				    (sopt->sopt_name == SO_SNDBUF) ?
4932 				    &so->so_snd : &so->so_rcv;
4933 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
4934 					error = ENOBUFS;
4935 					goto out;
4936 				}
4937 				sb->sb_flags |= SB_USRSIZE;
4938 				sb->sb_flags &= ~SB_AUTOSIZE;
4939 				sb->sb_idealsize = (u_int32_t)optval;
4940 				break;
4941 			}
4942 			/*
4943 			 * Make sure the low-water is never greater than
4944 			 * the high-water.
4945 			 */
4946 			case SO_SNDLOWAT: {
4947 				int space = sbspace(&so->so_snd);
4948 				uint32_t hiwat = so->so_snd.sb_hiwat;
4949 
4950 				if (so->so_snd.sb_flags & SB_UNIX) {
4951 					struct unpcb *unp =
4952 					    (struct unpcb *)(so->so_pcb);
4953 					if (unp != NULL &&
4954 					    unp->unp_conn != NULL) {
4955 						struct socket *so2 = unp->unp_conn->unp_socket;
4956 						hiwat += unp->unp_conn->unp_cc;
4957 						space = sbspace(&so2->so_rcv);
4958 					}
4959 				}
4960 
4961 				so->so_snd.sb_lowat =
4962 				    (optval > hiwat) ?
4963 				    hiwat : optval;
4964 
4965 				if (space >= so->so_snd.sb_lowat) {
4966 					sowwakeup(so);
4967 				}
4968 				break;
4969 			}
4970 			case SO_RCVLOWAT: {
4971 				int64_t data_len;
4972 				so->so_rcv.sb_lowat =
4973 				    (optval > so->so_rcv.sb_hiwat) ?
4974 				    so->so_rcv.sb_hiwat : optval;
4975 				if (so->so_rcv.sb_flags & SB_UNIX) {
4976 					struct unpcb *unp =
4977 					    (struct unpcb *)(so->so_pcb);
4978 					if (unp != NULL &&
4979 					    unp->unp_conn != NULL) {
4980 						struct socket *so2 = unp->unp_conn->unp_socket;
4981 						data_len = so2->so_snd.sb_cc
4982 						    - so2->so_snd.sb_ctl;
4983 					} else {
4984 						data_len = so->so_rcv.sb_cc
4985 						    - so->so_rcv.sb_ctl;
4986 					}
4987 				} else {
4988 					data_len = so->so_rcv.sb_cc
4989 					    - so->so_rcv.sb_ctl;
4990 				}
4991 
4992 				if (data_len >= so->so_rcv.sb_lowat) {
4993 					sorwakeup(so);
4994 				}
4995 				break;
4996 			}
4997 			}
4998 			break;
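		/*
		 * Illustrative sketch (standard sockets API): sizing the
		 * receive buffer and low-water mark from userland; values
		 * below 1 are rejected with EINVAL as enforced above:
		 *
		 *	int bufsz = 256 * 1024, lowat = 128;
		 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsz, sizeof(bufsz));
		 *	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
		 */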
4999 
5000 		case SO_SNDTIMEO:
5001 		case SO_RCVTIMEO:
5002 			error = sooptcopyin_timeval(sopt, &tv);
5003 			if (error != 0) {
5004 				goto out;
5005 			}
5006 
5007 			switch (sopt->sopt_name) {
5008 			case SO_SNDTIMEO:
5009 				so->so_snd.sb_timeo = tv;
5010 				break;
5011 			case SO_RCVTIMEO:
5012 				so->so_rcv.sb_timeo = tv;
5013 				break;
5014 			}
5015 			break;
5016 
5017 		case SO_NKE: {
5018 			struct so_nke nke;
5019 
5020 			error = sooptcopyin(sopt, &nke, sizeof(nke),
5021 			    sizeof(nke));
5022 			if (error != 0) {
5023 				goto out;
5024 			}
5025 
5026 			error = sflt_attach_internal(so, nke.nke_handle);
5027 			break;
5028 		}
5029 
5030 		case SO_NOSIGPIPE:
5031 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5032 			    sizeof(optval));
5033 			if (error != 0) {
5034 				goto out;
5035 			}
5036 			if (optval != 0) {
5037 				so->so_flags |= SOF_NOSIGPIPE;
5038 			} else {
5039 				so->so_flags &= ~SOF_NOSIGPIPE;
5040 			}
5041 			break;
5042 
5043 		case SO_NOADDRERR:
5044 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5045 			    sizeof(optval));
5046 			if (error != 0) {
5047 				goto out;
5048 			}
5049 			if (optval != 0) {
5050 				so->so_flags |= SOF_NOADDRAVAIL;
5051 			} else {
5052 				so->so_flags &= ~SOF_NOADDRAVAIL;
5053 			}
5054 			break;
5055 
5056 		case SO_REUSESHAREUID:
5057 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5058 			    sizeof(optval));
5059 			if (error != 0) {
5060 				goto out;
5061 			}
5062 			if (optval != 0) {
5063 				so->so_flags |= SOF_REUSESHAREUID;
5064 			} else {
5065 				so->so_flags &= ~SOF_REUSESHAREUID;
5066 			}
5067 			break;
5068 
5069 		case SO_NOTIFYCONFLICT:
5070 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5071 				error = EPERM;
5072 				goto out;
5073 			}
5074 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5075 			    sizeof(optval));
5076 			if (error != 0) {
5077 				goto out;
5078 			}
5079 			if (optval != 0) {
5080 				so->so_flags |= SOF_NOTIFYCONFLICT;
5081 			} else {
5082 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5083 			}
5084 			break;
5085 
5086 		case SO_RESTRICTIONS:
5087 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5088 			    sizeof(optval));
5089 			if (error != 0) {
5090 				goto out;
5091 			}
5092 
5093 			error = so_set_restrictions(so, optval);
5094 			break;
5095 
5096 		case SO_AWDL_UNRESTRICTED:
5097 			if (SOCK_DOM(so) != PF_INET &&
5098 			    SOCK_DOM(so) != PF_INET6) {
5099 				error = EOPNOTSUPP;
5100 				goto out;
5101 			}
5102 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5103 			    sizeof(optval));
5104 			if (error != 0) {
5105 				goto out;
5106 			}
5107 			if (optval != 0) {
5108 				error = soopt_cred_check(so,
5109 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5110 				if (error == 0) {
5111 					inp_set_awdl_unrestricted(
5112 						sotoinpcb(so));
5113 				}
5114 			} else {
5115 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5116 			}
5117 			break;
5118 		case SO_INTCOPROC_ALLOW:
5119 			if (SOCK_DOM(so) != PF_INET6) {
5120 				error = EOPNOTSUPP;
5121 				goto out;
5122 			}
5123 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5124 			    sizeof(optval));
5125 			if (error != 0) {
5126 				goto out;
5127 			}
5128 			if (optval != 0 &&
5129 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5130 				error = soopt_cred_check(so,
5131 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5132 				if (error == 0) {
5133 					inp_set_intcoproc_allowed(
5134 						sotoinpcb(so));
5135 				}
5136 			} else if (optval == 0) {
5137 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5138 			}
5139 			break;
5140 
5141 		case SO_LABEL:
5142 			error = EOPNOTSUPP;
5143 			break;
5144 
5145 		case SO_UPCALLCLOSEWAIT:
5146 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5147 			    sizeof(optval));
5148 			if (error != 0) {
5149 				goto out;
5150 			}
5151 			if (optval != 0) {
5152 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5153 			} else {
5154 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5155 			}
5156 			break;
5157 
5158 		case SO_RANDOMPORT:
5159 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5160 			    sizeof(optval));
5161 			if (error != 0) {
5162 				goto out;
5163 			}
5164 			if (optval != 0) {
5165 				so->so_flags |= SOF_BINDRANDOMPORT;
5166 			} else {
5167 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5168 			}
5169 			break;
5170 
5171 		case SO_NP_EXTENSIONS: {
5172 			struct so_np_extensions sonpx;
5173 
5174 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5175 			    sizeof(sonpx));
5176 			if (error != 0) {
5177 				goto out;
5178 			}
5179 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5180 				error = EINVAL;
5181 				goto out;
5182 			}
5183 			/*
5184 			 * Only one bit defined for now
5185 			 */
5186 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5187 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5188 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5189 				} else {
5190 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5191 				}
5192 			}
5193 			break;
5194 		}
5195 
5196 		case SO_TRAFFIC_CLASS: {
5197 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5198 			    sizeof(optval));
5199 			if (error != 0) {
5200 				goto out;
5201 			}
5202 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5203 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5204 				error = so_set_net_service_type(so, netsvc);
5205 				goto out;
5206 			}
5207 			error = so_set_traffic_class(so, optval);
5208 			if (error != 0) {
5209 				goto out;
5210 			}
5211 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5212 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5213 			break;
5214 		}
5215 
5216 		case SO_RECV_TRAFFIC_CLASS: {
5217 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 			    sizeof(optval));
5219 			if (error != 0) {
5220 				goto out;
5221 			}
5222 			if (optval == 0) {
5223 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5224 			} else {
5225 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5226 			}
5227 			break;
5228 		}
5229 
5230 #if (DEVELOPMENT || DEBUG)
5231 		case SO_TRAFFIC_CLASS_DBG: {
5232 			struct so_tcdbg so_tcdbg;
5233 
5234 			error = sooptcopyin(sopt, &so_tcdbg,
5235 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5236 			if (error != 0) {
5237 				goto out;
5238 			}
5239 			error = so_set_tcdbg(so, &so_tcdbg);
5240 			if (error != 0) {
5241 				goto out;
5242 			}
5243 			break;
5244 		}
5245 #endif /* (DEVELOPMENT || DEBUG) */
5246 
5247 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5248 			error = priv_check_cred(kauth_cred_get(),
5249 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5250 			if (error != 0) {
5251 				goto out;
5252 			}
5253 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5254 			    sizeof(optval));
5255 			if (error != 0) {
5256 				goto out;
5257 			}
5258 			if (optval == 0) {
5259 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5260 			} else {
5261 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5262 			}
5263 			break;
5264 
5265 #if (DEVELOPMENT || DEBUG)
5266 		case SO_DEFUNCTIT:
5267 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5268 			if (error == 0) {
5269 				error = sodefunct(current_proc(), so, 0);
5270 			}
5271 
5272 			break;
5273 #endif /* (DEVELOPMENT || DEBUG) */
5274 
5275 		case SO_DEFUNCTOK:
5276 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5277 			    sizeof(optval));
5278 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5279 				if (error == 0) {
5280 					error = EBADF;
5281 				}
5282 				goto out;
5283 			}
5284 			/*
5285 			 * Any process can set SO_DEFUNCTOK (clear
5286 			 * SOF_NODEFUNCT), but only root can clear
5287 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5288 			 */
5289 			if (optval == 0 &&
5290 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5291 				error = EPERM;
5292 				goto out;
5293 			}
5294 			if (optval) {
5295 				so->so_flags &= ~SOF_NODEFUNCT;
5296 			} else {
5297 				so->so_flags |= SOF_NODEFUNCT;
5298 			}
5299 
5300 			if (SOCK_DOM(so) == PF_INET ||
5301 			    SOCK_DOM(so) == PF_INET6) {
5302 				char s[MAX_IPv6_STR_LEN];
5303 				char d[MAX_IPv6_STR_LEN];
5304 				struct inpcb *inp = sotoinpcb(so);
5305 
5306 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5307 				    "[%s %s:%d -> %s:%d] is now marked "
5308 				    "as %seligible for "
5309 				    "defunct\n", __func__, proc_selfpid(),
5310 				    proc_best_name(current_proc()),
5311 				    so->so_gencnt,
5312 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5313 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5314 				    ((SOCK_DOM(so) == PF_INET) ?
5315 				    (void *)&inp->inp_laddr.s_addr :
5316 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5317 				    ntohs(inp->in6p_lport),
5318 				    inet_ntop(SOCK_DOM(so),
5319 				    (SOCK_DOM(so) == PF_INET) ?
5320 				    (void *)&inp->inp_faddr.s_addr :
5321 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5322 				    ntohs(inp->in6p_fport),
5323 				    (so->so_flags & SOF_NODEFUNCT) ?
5324 				    "not " : "");
5325 			} else {
5326 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5327 				    "is now marked as %seligible for "
5328 				    "defunct\n",
5329 				    __func__, proc_selfpid(),
5330 				    proc_best_name(current_proc()),
5331 				    so->so_gencnt,
5332 				    SOCK_DOM(so), SOCK_TYPE(so),
5333 				    (so->so_flags & SOF_NODEFUNCT) ?
5334 				    "not " : "");
5335 			}
5336 			break;
5337 
5338 		case SO_ISDEFUNCT:
5339 			/* This option is not settable */
5340 			error = EINVAL;
5341 			break;
5342 
5343 		case SO_OPPORTUNISTIC:
5344 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5345 			    sizeof(optval));
5346 			if (error == 0) {
5347 				error = so_set_opportunistic(so, optval);
5348 			}
5349 			break;
5350 
5351 		case SO_FLUSH:
5352 			/* This option is handled by lower layer(s) */
5353 			error = 0;
5354 			break;
5355 
5356 		case SO_RECV_ANYIF:
5357 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5358 			    sizeof(optval));
5359 			if (error == 0) {
5360 				error = so_set_recv_anyif(so, optval);
5361 			}
5362 			break;
5363 
5364 		case SO_TRAFFIC_MGT_BACKGROUND: {
5365 			/* This option is handled by lower layer(s) */
5366 			error = 0;
5367 			break;
5368 		}
5369 
5370 #if FLOW_DIVERT
5371 		case SO_FLOW_DIVERT_TOKEN:
5372 			error = flow_divert_token_set(so, sopt);
5373 			break;
5374 #endif  /* FLOW_DIVERT */
5375 
5376 
5377 		case SO_DELEGATED:
5378 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5379 			    sizeof(optval))) != 0) {
5380 				break;
5381 			}
5382 
5383 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5384 			break;
5385 
5386 		case SO_DELEGATED_UUID: {
5387 			uuid_t euuid;
5388 
5389 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5390 			    sizeof(euuid))) != 0) {
5391 				break;
5392 			}
5393 
5394 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5395 			break;
5396 		}
5397 
5398 #if NECP
5399 		case SO_NECP_ATTRIBUTES:
5400 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5401 				/* Handled by MPTCP itself */
5402 				break;
5403 			}
5404 
5405 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5406 				error = EINVAL;
5407 				goto out;
5408 			}
5409 
5410 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5411 			break;
5412 
5413 		case SO_NECP_CLIENTUUID: {
5414 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5415 				/* Handled by MPTCP itself */
5416 				break;
5417 			}
5418 
5419 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5420 				error = EINVAL;
5421 				goto out;
5422 			}
5423 
5424 			struct inpcb *inp = sotoinpcb(so);
5425 			if (!uuid_is_null(inp->necp_client_uuid)) {
5426 				// Clear out the old client UUID if present
5427 				necp_inpcb_remove_cb(inp);
5428 			}
5429 
5430 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5431 			    sizeof(uuid_t), sizeof(uuid_t));
5432 			if (error != 0) {
5433 				goto out;
5434 			}
5435 
5436 			if (uuid_is_null(inp->necp_client_uuid)) {
5437 				error = EINVAL;
5438 				goto out;
5439 			}
5440 
5441 			pid_t current_pid = proc_pid(current_proc());
5442 			error = necp_client_register_socket_flow(current_pid,
5443 			    inp->necp_client_uuid, inp);
5444 			if (error != 0) {
5445 				uuid_clear(inp->necp_client_uuid);
5446 				goto out;
5447 			}
5448 
5449 			if (inp->inp_lport != 0) {
5450 				// There is a bound local port, so this is not
5451 				// a fresh socket. Assign to the client.
5452 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5453 			}
5454 
5455 			break;
5456 		}
5457 		case SO_NECP_LISTENUUID: {
5458 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5459 				error = EINVAL;
5460 				goto out;
5461 			}
5462 
5463 			struct inpcb *inp = sotoinpcb(so);
5464 			if (!uuid_is_null(inp->necp_client_uuid)) {
5465 				error = EINVAL;
5466 				goto out;
5467 			}
5468 
5469 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5470 			    sizeof(uuid_t), sizeof(uuid_t));
5471 			if (error != 0) {
5472 				goto out;
5473 			}
5474 
5475 			if (uuid_is_null(inp->necp_client_uuid)) {
5476 				error = EINVAL;
5477 				goto out;
5478 			}
5479 
5480 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5481 			    inp->necp_client_uuid, inp);
5482 			if (error != 0) {
5483 				uuid_clear(inp->necp_client_uuid);
5484 				goto out;
5485 			}
5486 
5487 			// Mark that the port registration is held by NECP
5488 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5489 
5490 			break;
5491 		}
5492 
5493 		case SO_RESOLVER_SIGNATURE: {
5494 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5495 				error = EINVAL;
5496 				goto out;
5497 			}
5498 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5499 			break;
5500 		}
5501 #endif /* NECP */
5502 
5503 		case SO_EXTENDED_BK_IDLE:
5504 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5505 			    sizeof(optval));
5506 			if (error == 0) {
5507 				error = so_set_extended_bk_idle(so, optval);
5508 			}
5509 			break;
5510 
5511 		case SO_MARK_CELLFALLBACK:
5512 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5513 			    sizeof(optval));
5514 			if (error != 0) {
5515 				goto out;
5516 			}
5517 			if (optval < 0) {
5518 				error = EINVAL;
5519 				goto out;
5520 			}
5521 			if (optval == 0) {
5522 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5523 			} else {
5524 				so->so_flags1 |= SOF1_CELLFALLBACK;
5525 			}
5526 			break;
5527 
5528 		case SO_MARK_CELLFALLBACK_UUID:
5529 		{
5530 			struct so_mark_cellfallback_uuid_args args;
5531 
5532 			error = sooptcopyin(sopt, &args, sizeof(args),
5533 			    sizeof(args));
5534 			if (error != 0) {
5535 				goto out;
5536 			}
5537 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5538 			    args.flow_cellfallback);
5539 			break;
5540 		}
5541 
5542 		case SO_FALLBACK_MODE:
5543 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5544 			    sizeof(optval));
5545 			if (error != 0) {
5546 				goto out;
5547 			}
5548 			if (optval < SO_FALLBACK_MODE_NONE ||
5549 			    optval > SO_FALLBACK_MODE_PREFER) {
5550 				error = EINVAL;
5551 				goto out;
5552 			}
5553 			so->so_fallback_mode = (u_int8_t)optval;
5554 			break;
5555 
5556 		case SO_MARK_KNOWN_TRACKER: {
5557 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5558 			    sizeof(optval));
5559 			if (error != 0) {
5560 				goto out;
5561 			}
5562 			if (optval < 0) {
5563 				error = EINVAL;
5564 				goto out;
5565 			}
5566 			if (optval == 0) {
5567 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5568 			} else {
5569 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5570 			}
5571 			break;
5572 		}
5573 
5574 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5575 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5576 			    sizeof(optval));
5577 			if (error != 0) {
5578 				goto out;
5579 			}
5580 			if (optval < 0) {
5581 				error = EINVAL;
5582 				goto out;
5583 			}
5584 			if (optval == 0) {
5585 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5586 			} else {
5587 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5588 			}
5589 			break;
5590 		}
5591 
5592 		case SO_MARK_APPROVED_APP_DOMAIN: {
5593 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5594 			    sizeof(optval));
5595 			if (error != 0) {
5596 				goto out;
5597 			}
5598 			if (optval < 0) {
5599 				error = EINVAL;
5600 				goto out;
5601 			}
5602 			if (optval == 0) {
5603 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5604 			} else {
5605 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5606 			}
5607 			break;
5608 		}
5609 
5610 		case SO_STATISTICS_EVENT:
5611 			error = sooptcopyin(sopt, &long_optval,
5612 			    sizeof(long_optval), sizeof(long_optval));
5613 			if (error != 0) {
5614 				goto out;
5615 			}
5616 			u_int64_t nstat_event = 0;
5617 			error = so_statistics_event_to_nstat_event(
5618 				&long_optval, &nstat_event);
5619 			if (error != 0) {
5620 				goto out;
5621 			}
5622 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5623 			break;
5624 
5625 		case SO_NET_SERVICE_TYPE: {
5626 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5627 			    sizeof(optval));
5628 			if (error != 0) {
5629 				goto out;
5630 			}
5631 			error = so_set_net_service_type(so, optval);
5632 			break;
5633 		}
5634 
5635 		case SO_QOSMARKING_POLICY_OVERRIDE:
5636 			error = priv_check_cred(kauth_cred_get(),
5637 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5638 			if (error != 0) {
5639 				goto out;
5640 			}
5641 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5642 			    sizeof(optval));
5643 			if (error != 0) {
5644 				goto out;
5645 			}
5646 			if (optval == 0) {
5647 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5648 			} else {
5649 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5650 			}
5651 			break;
5652 
5653 		case SO_MPKL_SEND_INFO: {
5654 			struct so_mpkl_send_info so_mpkl_send_info;
5655 
5656 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5657 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5658 			if (error != 0) {
5659 				goto out;
5660 			}
5661 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5662 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5663 
5664 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5665 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5666 			} else {
5667 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5668 			}
5669 			break;
5670 		}
5671 		case SO_WANT_KEV_SOCKET_CLOSED: {
5672 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5673 			    sizeof(optval));
5674 			if (error != 0) {
5675 				goto out;
5676 			}
5677 			if (optval == 0) {
5678 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5679 			} else {
5680 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5681 			}
5682 			break;
5683 		}
5684 		case SO_MARK_WAKE_PKT: {
5685 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5686 			    sizeof(optval));
5687 			if (error != 0) {
5688 				goto out;
5689 			}
5690 			if (optval == 0) {
5691 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5692 			} else {
5693 				so->so_flags |= SOF_MARK_WAKE_PKT;
5694 			}
5695 			break;
5696 		}
5697 		case SO_RECV_WAKE_PKT: {
5698 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5699 			    sizeof(optval));
5700 			if (error != 0) {
5701 				goto out;
5702 			}
5703 			if (optval == 0) {
5704 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5705 			} else {
5706 				so->so_flags |= SOF_RECV_WAKE_PKT;
5707 			}
5708 			break;
5709 		}
5710 		case SO_APPLICATION_ID: {
5711 			so_application_id_t application_id = { 0 };
5712 
5713 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5714 				error = EINVAL;
5715 				goto out;
5716 			}
5717 			error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5718 			    sizeof(application_id));
5719 			if (error != 0) {
5720 				goto out;
5721 			}
5722 
5723 			// The supplied uid must match the socket owner's uid
5724 			if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5725 				error = EINVAL;
5726 				printf("setsockopt: SO_APPLICATION_ID - wrong uid\n");
5727 				goto out;
5728 			}
5729 			error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5730 			if (error != 0) {
5731 				printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid\n");
5732 				goto out;
5733 			}
5734 			if (application_id.persona_id != PERSONA_ID_NONE) {
5735 				so->so_persona_id = application_id.persona_id;
5736 			}
5737 			break;
5738 		}
5739 		default:
5740 			error = ENOPROTOOPT;
5741 			break;
5742 		}
5743 		if (error == 0 && so->so_proto != NULL &&
5744 		    so->so_proto->pr_ctloutput != NULL) {
5745 			(void) so->so_proto->pr_ctloutput(so, sopt);
5746 		}
5747 	}
5748 out:
5749 	if (dolock) {
5750 		socket_unlock(so, 1);
5751 	}
5752 	return error;
5753 }
5754 
5755 /* Helper routines for getsockopt */
5756 int
5757 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5758 {
5759 	int     error;
5760 	size_t  valsize;
5761 
5762 	error = 0;
5763 
5764 	/*
5765 	 * Documented get behavior is that we always return a value,
5766 	 * possibly truncated to fit in the user's buffer.
5767 	 * Traditional behavior is that we always tell the user
5768 	 * precisely how much we copied, rather than something useful
5769 	 * like the total amount we had available for her.
5770 	 * Note that this interface is not idempotent; the entire answer must
5771 	 * be generated ahead of time.
5772 	 */
5773 	valsize = MIN(len, sopt->sopt_valsize);
5774 	sopt->sopt_valsize = valsize;
5775 	if (sopt->sopt_val != USER_ADDR_NULL) {
5776 		if (sopt->sopt_p != kernproc) {
5777 			error = copyout(buf, sopt->sopt_val, valsize);
5778 		} else {
5779 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5780 		}
5781 	}
5782 	return error;
5783 }
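
/*
 * For example, a scalar-valued get case typically ends with this helper;
 * if the user passed a short buffer the value is silently truncated and
 * sopt_valsize is updated to the number of bytes actually copied.  A
 * hypothetical handler might read:
 *
 *	int optval = 1;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */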
5784 
5785 static int
5786 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5787 {
5788 	int                     error;
5789 	size_t                  len;
5790 	struct user64_timeval   tv64 = {};
5791 	struct user32_timeval   tv32 = {};
5792 	const void *            val;
5793 	size_t                  valsize;
5794 
5795 	error = 0;
5796 	if (proc_is64bit(sopt->sopt_p)) {
5797 		len = sizeof(tv64);
5798 		tv64.tv_sec = tv_p->tv_sec;
5799 		tv64.tv_usec = tv_p->tv_usec;
5800 		val = &tv64;
5801 	} else {
5802 		len = sizeof(tv32);
5803 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5804 		tv32.tv_usec = tv_p->tv_usec;
5805 		val = &tv32;
5806 	}
5807 	valsize = MIN(len, sopt->sopt_valsize);
5808 	sopt->sopt_valsize = valsize;
5809 	if (sopt->sopt_val != USER_ADDR_NULL) {
5810 		if (sopt->sopt_p != kernproc) {
5811 			error = copyout(val, sopt->sopt_val, valsize);
5812 		} else {
5813 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5814 		}
5815 	}
5816 	return error;
5817 }
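
/*
 * From userland the 32/64-bit split above is invisible; a plain
 * getsockopt() sketch, given a socket descriptor fd:
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0)
 *		printf("rcv timeout %ld.%06d\n", (long)tv.tv_sec,
 *		    (int)tv.tv_usec);
 */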
5818 
5819 /*
5820  * Return:	0			Success
5821  *		ENOPROTOOPT
5822  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5823  *	<pr_ctloutput>:???
5824  *	<sf_getoption>:???
5825  */
5826 int
5827 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5828 {
5829 	int     error, optval;
5830 	struct  linger l;
5831 	struct  timeval tv;
5832 
5833 	if (sopt->sopt_dir != SOPT_GET) {
5834 		sopt->sopt_dir = SOPT_GET;
5835 	}
5836 
5837 	if (dolock) {
5838 		socket_lock(so, 1);
5839 	}
5840 
5841 	error = sflt_getsockopt(so, sopt);
5842 	if (error != 0) {
5843 		if (error == EJUSTRETURN) {
5844 			error = 0;
5845 		}
5846 		goto out;
5847 	}
5848 
5849 	if (sopt->sopt_level != SOL_SOCKET) {
5850 		if (so->so_proto != NULL &&
5851 		    so->so_proto->pr_ctloutput != NULL) {
5852 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5853 			goto out;
5854 		}
5855 		error = ENOPROTOOPT;
5856 	} else {
5857 		/*
5858 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5859 		 * the protocol layer, if needed.  A zero value returned from
5860 		 * the handler means use default socket-level processing as
5861 		 * done by the rest of this routine.  Otherwise, any other
5862 		 * return value indicates that the option is unsupported.
5863 		 */
5864 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5865 		    pru_socheckopt(so, sopt)) != 0) {
5866 			goto out;
5867 		}
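		/*
		 * A protocol that wants to veto selected socket-level
		 * options could supply a hook along these (hypothetical)
		 * lines:
		 *
		 *	static int
		 *	proto_socheckopt(struct socket *so, struct sockopt *sopt)
		 *	{
		 *		// refuse SO_LINGER, defer everything else
		 *		return (sopt->sopt_name == SO_LINGER) ?
		 *		    EOPNOTSUPP : 0;
		 *	}
		 */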
5868 
5869 		error = 0;
5870 		switch (sopt->sopt_name) {
5871 		case SO_LINGER:
5872 		case SO_LINGER_SEC:
5873 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5874 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5875 			    so->so_linger : so->so_linger / hz;
5876 			error = sooptcopyout(sopt, &l, sizeof(l));
5877 			break;
5878 
5879 		case SO_USELOOPBACK:
5880 		case SO_DONTROUTE:
5881 		case SO_DEBUG:
5882 		case SO_KEEPALIVE:
5883 		case SO_REUSEADDR:
5884 		case SO_REUSEPORT:
5885 		case SO_BROADCAST:
5886 		case SO_OOBINLINE:
5887 		case SO_TIMESTAMP:
5888 		case SO_TIMESTAMP_MONOTONIC:
5889 		case SO_TIMESTAMP_CONTINUOUS:
5890 		case SO_DONTTRUNC:
5891 		case SO_WANTMORE:
5892 		case SO_WANTOOBFLAG:
5893 		case SO_NOWAKEFROMSLEEP:
5894 		case SO_NOAPNFALLBK:
5895 			optval = so->so_options & sopt->sopt_name;
5896 integer:
5897 			error = sooptcopyout(sopt, &optval, sizeof(optval));
5898 			break;
5899 
5900 		case SO_TYPE:
5901 			optval = so->so_type;
5902 			goto integer;
5903 
5904 		case SO_NREAD:
5905 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5906 				int pkt_total;
5907 				struct mbuf *m1;
5908 
5909 				pkt_total = 0;
5910 				m1 = so->so_rcv.sb_mb;
5911 				while (m1 != NULL) {
5912 					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
5913 						pkt_total += m1->m_len;
5914 					}
5915 					m1 = m1->m_next;
5916 				}
5917 				optval = pkt_total;
5918 			} else {
5919 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5920 			}
5921 			goto integer;
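			/*
			 * e.g. userland can size a read buffer up front from
			 * the count computed above, given a socket fd:
			 *
			 *	int nread = 0;
			 *	socklen_t len = sizeof(nread);
			 *	getsockopt(fd, SOL_SOCKET, SO_NREAD,
			 *	    &nread, &len);
			 */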
5922 
5923 		case SO_NUMRCVPKT:
5924 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5925 				int cnt = 0;
5926 				struct mbuf *m1;
5927 
5928 				m1 = so->so_rcv.sb_mb;
5929 				while (m1 != NULL) {
5930 					cnt += 1;
5931 					m1 = m1->m_nextpkt;
5932 				}
5933 				optval = cnt;
5934 				goto integer;
5935 			} else {
5936 				error = ENOPROTOOPT;
5937 				break;
5938 			}
5939 
5940 		case SO_NWRITE:
5941 			optval = so->so_snd.sb_cc;
5942 			goto integer;
5943 
5944 		case SO_ERROR:
5945 			optval = so->so_error;
5946 			so->so_error = 0;
5947 			goto integer;
5948 
5949 		case SO_SNDBUF: {
5950 			u_int32_t hiwat = so->so_snd.sb_hiwat;
5951 
5952 			if (so->so_snd.sb_flags & SB_UNIX) {
5953 				struct unpcb *unp =
5954 				    (struct unpcb *)(so->so_pcb);
5955 				if (unp != NULL && unp->unp_conn != NULL) {
5956 					hiwat += unp->unp_conn->unp_cc;
5957 				}
5958 			}
5959 
5960 			optval = hiwat;
5961 			goto integer;
5962 		}
5963 		case SO_RCVBUF:
5964 			optval = so->so_rcv.sb_hiwat;
5965 			goto integer;
5966 
5967 		case SO_SNDLOWAT:
5968 			optval = so->so_snd.sb_lowat;
5969 			goto integer;
5970 
5971 		case SO_RCVLOWAT:
5972 			optval = so->so_rcv.sb_lowat;
5973 			goto integer;
5974 
5975 		case SO_SNDTIMEO:
5976 		case SO_RCVTIMEO:
5977 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
5978 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5979 
5980 			error = sooptcopyout_timeval(sopt, &tv);
5981 			break;
5982 
5983 		case SO_NOSIGPIPE:
5984 			optval = (so->so_flags & SOF_NOSIGPIPE);
5985 			goto integer;
5986 
5987 		case SO_NOADDRERR:
5988 			optval = (so->so_flags & SOF_NOADDRAVAIL);
5989 			goto integer;
5990 
5991 		case SO_REUSESHAREUID:
5992 			optval = (so->so_flags & SOF_REUSESHAREUID);
5993 			goto integer;
5994 
5995 
5996 		case SO_NOTIFYCONFLICT:
5997 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5998 			goto integer;
5999 
6000 		case SO_RESTRICTIONS:
6001 			optval = so_get_restrictions(so);
6002 			goto integer;
6003 
6004 		case SO_AWDL_UNRESTRICTED:
6005 			if (SOCK_DOM(so) == PF_INET ||
6006 			    SOCK_DOM(so) == PF_INET6) {
6007 				optval = inp_get_awdl_unrestricted(
6008 					sotoinpcb(so));
6009 				goto integer;
6010 			} else {
6011 				error = EOPNOTSUPP;
6012 			}
6013 			break;
6014 
6015 		case SO_INTCOPROC_ALLOW:
6016 			if (SOCK_DOM(so) == PF_INET6) {
6017 				optval = inp_get_intcoproc_allowed(
6018 					sotoinpcb(so));
6019 				goto integer;
6020 			} else {
6021 				error = EOPNOTSUPP;
6022 			}
6023 			break;
6024 
6025 		case SO_LABEL:
6026 			error = EOPNOTSUPP;
6027 			break;
6028 
6029 		case SO_PEERLABEL:
6030 			error = EOPNOTSUPP;
6031 			break;
6032 
6033 #ifdef __APPLE_API_PRIVATE
6034 		case SO_UPCALLCLOSEWAIT:
6035 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6036 			goto integer;
6037 #endif
6038 		case SO_RANDOMPORT:
6039 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6040 			goto integer;
6041 
6042 		case SO_NP_EXTENSIONS: {
6043 			struct so_np_extensions sonpx = {};
6044 
6045 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6046 			    SONPX_SETOPTSHUT : 0;
6047 			sonpx.npx_mask = SONPX_MASK_VALID;
6048 
6049 			error = sooptcopyout(sopt, &sonpx,
6050 			    sizeof(struct so_np_extensions));
6051 			break;
6052 		}
6053 
6054 		case SO_TRAFFIC_CLASS:
6055 			optval = so->so_traffic_class;
6056 			goto integer;
6057 
6058 		case SO_RECV_TRAFFIC_CLASS:
6059 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6060 			goto integer;
6061 
6062 #if (DEVELOPMENT || DEBUG)
6063 		case SO_TRAFFIC_CLASS_DBG:
6064 			error = sogetopt_tcdbg(so, sopt);
6065 			break;
6066 #endif /* (DEVELOPMENT || DEBUG) */
6067 
6068 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6069 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6070 			goto integer;
6071 
6072 		case SO_DEFUNCTOK:
6073 			optval = !(so->so_flags & SOF_NODEFUNCT);
6074 			goto integer;
6075 
6076 		case SO_ISDEFUNCT:
6077 			optval = (so->so_flags & SOF_DEFUNCT);
6078 			goto integer;
6079 
6080 		case SO_OPPORTUNISTIC:
6081 			optval = so_get_opportunistic(so);
6082 			goto integer;
6083 
6084 		case SO_FLUSH:
6085 			/* This option is not gettable */
6086 			error = EINVAL;
6087 			break;
6088 
6089 		case SO_RECV_ANYIF:
6090 			optval = so_get_recv_anyif(so);
6091 			goto integer;
6092 
6093 		case SO_TRAFFIC_MGT_BACKGROUND:
6094 			/* This option is handled by lower layer(s) */
6095 			if (so->so_proto != NULL &&
6096 			    so->so_proto->pr_ctloutput != NULL) {
6097 				(void) so->so_proto->pr_ctloutput(so, sopt);
6098 			}
6099 			break;
6100 
6101 #if FLOW_DIVERT
6102 		case SO_FLOW_DIVERT_TOKEN:
6103 			error = flow_divert_token_get(so, sopt);
6104 			break;
6105 #endif  /* FLOW_DIVERT */
6106 
6107 #if NECP
6108 		case SO_NECP_ATTRIBUTES:
6109 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6110 				/* Handled by MPTCP itself */
6111 				break;
6112 			}
6113 
6114 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6115 				error = EINVAL;
6116 				goto out;
6117 			}
6118 
6119 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6120 			break;
6121 
6122 		case SO_NECP_CLIENTUUID: {
6123 			uuid_t *ncu;
6124 
6125 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6126 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6127 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6128 				ncu = &sotoinpcb(so)->necp_client_uuid;
6129 			} else {
6130 				error = EINVAL;
6131 				goto out;
6132 			}
6133 
6134 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6135 			break;
6136 		}
6137 
6138 		case SO_NECP_LISTENUUID: {
6139 			uuid_t *nlu;
6140 
6141 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6142 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6143 					nlu = &sotoinpcb(so)->necp_client_uuid;
6144 				} else {
6145 					error = ENOENT;
6146 					goto out;
6147 				}
6148 			} else {
6149 				error = EINVAL;
6150 				goto out;
6151 			}
6152 
6153 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6154 			break;
6155 		}
6156 
6157 		case SO_RESOLVER_SIGNATURE: {
6158 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6159 				error = EINVAL;
6160 				goto out;
6161 			}
6162 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6163 			break;
6164 		}
6165 
6166 #endif /* NECP */
6167 
6168 #if CONTENT_FILTER
6169 		case SO_CFIL_SOCK_ID: {
6170 			cfil_sock_id_t sock_id;
6171 
6172 			sock_id = cfil_sock_id_from_socket(so);
6173 
6174 			error = sooptcopyout(sopt, &sock_id,
6175 			    sizeof(cfil_sock_id_t));
6176 			break;
6177 		}
6178 #endif  /* CONTENT_FILTER */
6179 
6180 		case SO_EXTENDED_BK_IDLE:
6181 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6182 			goto integer;
6183 		case SO_MARK_CELLFALLBACK:
6184 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6185 			    ? 1 : 0;
6186 			goto integer;
6187 		case SO_FALLBACK_MODE:
6188 			optval = so->so_fallback_mode;
6189 			goto integer;
6190 		case SO_MARK_KNOWN_TRACKER: {
6191 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6192 			    ? 1 : 0;
6193 			goto integer;
6194 		}
6195 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6196 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6197 			    ? 1 : 0;
6198 			goto integer;
6199 		}
6200 		case SO_MARK_APPROVED_APP_DOMAIN: {
6201 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6202 			    ? 1 : 0;
6203 			goto integer;
6204 		}
6205 		case SO_NET_SERVICE_TYPE: {
6206 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6207 				optval = so->so_netsvctype;
6208 			} else {
6209 				optval = NET_SERVICE_TYPE_BE;
6210 			}
6211 			goto integer;
6212 		}
6213 		case SO_NETSVC_MARKING_LEVEL:
6214 			optval = so_get_netsvc_marking_level(so);
6215 			goto integer;
6216 
6217 		case SO_MPKL_SEND_INFO: {
6218 			struct so_mpkl_send_info so_mpkl_send_info;
6219 
6220 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6221 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6222 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6223 			    sizeof(struct so_mpkl_send_info));
6224 			break;
6225 		}
6226 		case SO_MARK_WAKE_PKT:
6227 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6228 			goto integer;
6229 		case SO_RECV_WAKE_PKT:
6230 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6231 			goto integer;
6232 		case SO_APPLICATION_ID: {
6233 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6234 				error = EINVAL;
6235 				goto out;
6236 			}
6237 			so_application_id_t application_id = { 0 };
6238 			application_id.uid = kauth_cred_getuid(so->so_cred);
6239 			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
6240 			application_id.persona_id = so->so_persona_id;
6241 			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
6242 			break;
6243 		}
6244 		default:
6245 			error = ENOPROTOOPT;
6246 			break;
6247 		}
6248 	}
6249 out:
6250 	if (dolock) {
6251 		socket_unlock(so, 1);
6252 	}
6253 	return error;
6254 }
6255 
6256 /*
6257  * The size limits on our soopt_getm are different from those on FreeBSD.
6258  * We limit the size of options to MCLBYTES. This will have to change
6259  * if we need to define options that need more space than MCLBYTES.
6260  */
6261 int
6262 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6263 {
6264 	struct mbuf *m, *m_prev;
6265 	int sopt_size = (int)sopt->sopt_valsize;
6266 	int how;
6267 
6268 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6269 		return EMSGSIZE;
6270 	}
6271 
6272 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6273 	MGET(m, how, MT_DATA);
6274 	if (m == NULL) {
6275 		return ENOBUFS;
6276 	}
6277 	if (sopt_size > MLEN) {
6278 		MCLGET(m, how);
6279 		if ((m->m_flags & M_EXT) == 0) {
6280 			m_free(m);
6281 			return ENOBUFS;
6282 		}
6283 		m->m_len = min(MCLBYTES, sopt_size);
6284 	} else {
6285 		m->m_len = min(MLEN, sopt_size);
6286 	}
6287 	sopt_size -= m->m_len;
6288 	*mp = m;
6289 	m_prev = m;
6290 
6291 	while (sopt_size > 0) {
6292 		MGET(m, how, MT_DATA);
6293 		if (m == NULL) {
6294 			m_freem(*mp);
6295 			return ENOBUFS;
6296 		}
6297 		if (sopt_size > MLEN) {
6298 			MCLGET(m, how);
6299 			if ((m->m_flags & M_EXT) == 0) {
6300 				m_freem(*mp);
6301 				m_freem(m);
6302 				return ENOBUFS;
6303 			}
6304 			m->m_len = min(MCLBYTES, sopt_size);
6305 		} else {
6306 			m->m_len = min(MLEN, sopt_size);
6307 		}
6308 		sopt_size -= m->m_len;
6309 		m_prev->m_next = m;
6310 		m_prev = m;
6311 	}
6312 	return 0;
6313 }
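
/*
 * soopt_getm() pairs with soopt_mcopyin() below when option data has to
 * reach lower layers as an mbuf chain; a sketch of the usual sequence:
 *
 *	struct mbuf *m = NULL;
 *	error = soopt_getm(sopt, &m);		// size and allocate the chain
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	// fill it from sopt_val
 *	// on copyin failure the chain has already been freed
 */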
6314 
6315 /* copyin sopt data into mbuf chain */
6316 int
6317 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6318 {
6319 	struct mbuf *m0 = m;
6320 
6321 	if (sopt->sopt_val == USER_ADDR_NULL) {
6322 		return 0;
6323 	}
6324 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6325 		if (sopt->sopt_p != kernproc) {
6326 			int error;
6327 
6328 			error = copyin(sopt->sopt_val, mtod(m, char *),
6329 			    m->m_len);
6330 			if (error != 0) {
6331 				m_freem(m0);
6332 				return error;
6333 			}
6334 		} else {
6335 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6336 			    mtod(m, char *), m->m_len);
6337 		}
6338 		sopt->sopt_valsize -= m->m_len;
6339 		sopt->sopt_val += m->m_len;
6340 		m = m->m_next;
6341 	}
6342 	/* the chain should have been allocated with enough space by ip6_sooptmcopyin() */
6343 	if (m != NULL) {
6344 		panic("soopt_mcopyin");
6345 		/* NOTREACHED */
6346 	}
6347 	return 0;
6348 }
6349 
6350 /* copyout mbuf chain data into soopt */
6351 int
6352 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6353 {
6354 	struct mbuf *m0 = m;
6355 	size_t valsize = 0;
6356 
6357 	if (sopt->sopt_val == USER_ADDR_NULL) {
6358 		return 0;
6359 	}
6360 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6361 		if (sopt->sopt_p != kernproc) {
6362 			int error;
6363 
6364 			error = copyout(mtod(m, char *), sopt->sopt_val,
6365 			    m->m_len);
6366 			if (error != 0) {
6367 				m_freem(m0);
6368 				return error;
6369 			}
6370 		} else {
6371 			bcopy(mtod(m, char *),
6372 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6373 		}
6374 		sopt->sopt_valsize -= m->m_len;
6375 		sopt->sopt_val += m->m_len;
6376 		valsize += m->m_len;
6377 		m = m->m_next;
6378 	}
6379 	if (m != NULL) {
6380 		/* user-land should have supplied a large enough buffer */
6381 		m_freem(m0);
6382 		return EINVAL;
6383 	}
6384 	sopt->sopt_valsize = valsize;
6385 	return 0;
6386 }
6387 
6388 void
6389 sohasoutofband(struct socket *so)
6390 {
6391 	if (so->so_pgid < 0) {
6392 		gsignal(-so->so_pgid, SIGURG);
6393 	} else if (so->so_pgid > 0) {
6394 		proc_signal(so->so_pgid, SIGURG);
6395 	}
6396 	selwakeup(&so->so_rcv.sb_sel);
6397 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6398 		KNOTE(&so->so_rcv.sb_sel.si_note,
6399 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6400 	}
6401 }
6402 
6403 int
6404 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6405 {
6406 #pragma unused(cred)
6407 	struct proc *p = current_proc();
6408 	int revents = 0;
6409 
6410 	socket_lock(so, 1);
6411 	so_update_last_owner_locked(so, PROC_NULL);
6412 	so_update_policy(so);
6413 
6414 	if (events & (POLLIN | POLLRDNORM)) {
6415 		if (soreadable(so)) {
6416 			revents |= events & (POLLIN | POLLRDNORM);
6417 		}
6418 	}
6419 
6420 	if (events & (POLLOUT | POLLWRNORM)) {
6421 		if (sowriteable(so)) {
6422 			revents |= events & (POLLOUT | POLLWRNORM);
6423 		}
6424 	}
6425 
6426 	if (events & (POLLPRI | POLLRDBAND)) {
6427 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6428 			revents |= events & (POLLPRI | POLLRDBAND);
6429 		}
6430 	}
6431 
6432 	if (revents == 0) {
6433 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6434 			/*
6435 			 * Darwin sets the flag first,
6436 			 * BSD calls selrecord first
6437 			 */
6438 			so->so_rcv.sb_flags |= SB_SEL;
6439 			selrecord(p, &so->so_rcv.sb_sel, wql);
6440 		}
6441 
6442 		if (events & (POLLOUT | POLLWRNORM)) {
6443 			/*
6444 			 * Darwin sets the flag first,
6445 			 * BSD calls selrecord first
6446 			 */
6447 			so->so_snd.sb_flags |= SB_SEL;
6448 			selrecord(p, &so->so_snd.sb_sel, wql);
6449 		}
6450 	}
6451 
6452 	socket_unlock(so, 1);
6453 	return revents;
6454 }
6455 
6456 int
6457 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6458 {
6459 	struct socket *so = (struct socket *)fp_get_data(fp);
6460 	int result;
6461 
6462 	socket_lock(so, 1);
6463 	so_update_last_owner_locked(so, PROC_NULL);
6464 	so_update_policy(so);
6465 
6466 	switch (kn->kn_filter) {
6467 	case EVFILT_READ:
6468 		kn->kn_filtid = EVFILTID_SOREAD;
6469 		break;
6470 	case EVFILT_WRITE:
6471 		kn->kn_filtid = EVFILTID_SOWRITE;
6472 		break;
6473 	case EVFILT_SOCK:
6474 		kn->kn_filtid = EVFILTID_SCK;
6475 		break;
6476 	case EVFILT_EXCEPT:
6477 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6478 		break;
6479 	default:
6480 		socket_unlock(so, 1);
6481 		knote_set_error(kn, EINVAL);
6482 		return 0;
6483 	}
6484 
6485 	/*
6486 	 * call the appropriate sub-filter attach
6487 	 * with the socket still locked
6488 	 */
6489 	result = knote_fops(kn)->f_attach(kn, kev);
6490 
6491 	socket_unlock(so, 1);
6492 
6493 	return result;
6494 }
6495 
6496 static int
6497 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6498 {
6499 	int retval = 0;
6500 	int64_t data = 0;
6501 
6502 	if (so->so_options & SO_ACCEPTCONN) {
6503 		/*
6504 		 * Radar 6615193: handle the listen case dynamically for the
6505 		 * kqueue read filter.  This allows listen() to be called
6506 		 * after the kqueue EVFILT_READ filter has been registered.
6507 		 */
6508 
6509 		retval = !TAILQ_EMPTY(&so->so_comp);
6510 		data = so->so_qlen;
6511 		goto out;
6512 	}
6513 
6514 	/* socket isn't a listener */
6515 	/*
6516 	 * NOTE_LOWAT specifies the new low water mark in data, i.e.
6517 	 * the bytes of protocol data. We therefore exclude any
6518 	 * control bytes.
6519 	 */
6520 	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6521 
6522 	if (kn->kn_sfflags & NOTE_OOB) {
6523 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6524 			kn->kn_fflags |= NOTE_OOB;
6525 			data -= so->so_oobmark;
6526 			retval = 1;
6527 			goto out;
6528 		}
6529 	}
6530 
6531 	if ((so->so_state & SS_CANTRCVMORE)
6532 #if CONTENT_FILTER
6533 	    && cfil_sock_data_pending(&so->so_rcv) == 0
6534 #endif /* CONTENT_FILTER */
6535 	    ) {
6536 		kn->kn_flags |= EV_EOF;
6537 		kn->kn_fflags = so->so_error;
6538 		retval = 1;
6539 		goto out;
6540 	}
6541 
6542 	if (so->so_error) {     /* temporary udp error */
6543 		retval = 1;
6544 		goto out;
6545 	}
6546 
6547 	int64_t lowwat = so->so_rcv.sb_lowat;
6548 	/*
6549 	 * Ensure that when NOTE_LOWAT is used, the derived
6550 	 * low water mark is bounded by the receive buffer's
6551 	 * high and low water mark values.
6552 	 */
6553 	if (kn->kn_sfflags & NOTE_LOWAT) {
6554 		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6555 			lowwat = so->so_rcv.sb_hiwat;
6556 		} else if (kn->kn_sdata > lowwat) {
6557 			lowwat = kn->kn_sdata;
6558 		}
6559 	}
6560 
6561 	/*
6562 	 * While the `data` field is the amount of data to read,
6563 	 * 0-sized packets need to wake up the kqueue, see 58140856,
6564 	 * so we need to take control bytes into account too.
6565 	 */
6566 	retval = (so->so_rcv.sb_cc >= lowwat);
6567 
6568 out:
6569 	if (retval && kev) {
6570 		knote_fill_kevent(kn, kev, data);
6571 	}
6572 	return retval;
6573 }
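
/*
 * Userland view of the NOTE_LOWAT clamping above, given a socket fd and
 * a kqueue kq: request EVFILT_READ that fires only once at least 512
 * bytes are buffered (the kernel bounds the threshold by the receive
 * buffer's water marks):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */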
6574 
6575 static int
6576 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6577 {
6578 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6579 
6580 	/* socket locked */
6581 
6582 	/*
6583 	 * If the caller explicitly asked for OOB results (e.g. poll())
6584 	 * from EVFILT_READ, then save that off in kn_hook32
6585 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6586 	 */
6587 	if (kn->kn_filter == EVFILT_READ &&
6588 	    kn->kn_flags & EV_OOBAND) {
6589 		kn->kn_flags &= ~EV_OOBAND;
6590 		kn->kn_hook32 = EV_OOBAND;
6591 	} else {
6592 		kn->kn_hook32 = 0;
6593 	}
6594 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6595 		so->so_rcv.sb_flags |= SB_KNOTE;
6596 	}
6597 
6598 	/* indicate whether the event has already fired */
6599 	return filt_soread_common(kn, NULL, so);
6600 }
6601 
6602 static void
6603 filt_sordetach(struct knote *kn)
6604 {
6605 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6606 
6607 	socket_lock(so, 1);
6608 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6609 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6610 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6611 		}
6612 	}
6613 	socket_unlock(so, 1);
6614 }
6615 
6616 /*ARGSUSED*/
6617 static int
6618 filt_soread(struct knote *kn, long hint)
6619 {
6620 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6621 	int retval;
6622 
6623 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6624 		socket_lock(so, 1);
6625 	}
6626 
6627 	retval = filt_soread_common(kn, NULL, so);
6628 
6629 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6630 		socket_unlock(so, 1);
6631 	}
6632 
6633 	return retval;
6634 }
6635 
6636 static int
6637 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6638 {
6639 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6640 	int retval;
6641 
6642 	socket_lock(so, 1);
6643 
6644 	/* save off the new input fflags and data */
6645 	kn->kn_sfflags = kev->fflags;
6646 	kn->kn_sdata = kev->data;
6647 
6648 	/* determine if changes result in fired events */
6649 	retval = filt_soread_common(kn, NULL, so);
6650 
6651 	socket_unlock(so, 1);
6652 
6653 	return retval;
6654 }
6655 
6656 static int
6657 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6658 {
6659 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6660 	int retval;
6661 
6662 	socket_lock(so, 1);
6663 	retval = filt_soread_common(kn, kev, so);
6664 	socket_unlock(so, 1);
6665 
6666 	return retval;
6667 }
6668 
6669 int
6670 so_wait_for_if_feedback(struct socket *so)
6671 {
6672 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6673 	    (so->so_state & SS_ISCONNECTED)) {
6674 		struct inpcb *inp = sotoinpcb(so);
6675 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6676 			return 1;
6677 		}
6678 	}
6679 	return 0;
6680 }
6681 
6682 static int
6683 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6684 {
6685 	int ret = 0;
6686 	int64_t data = sbspace(&so->so_snd);
6687 
6688 	if (so->so_state & SS_CANTSENDMORE) {
6689 		kn->kn_flags |= EV_EOF;
6690 		kn->kn_fflags = so->so_error;
6691 		ret = 1;
6692 		goto out;
6693 	}
6694 
6695 	if (so->so_error) {     /* temporary udp error */
6696 		ret = 1;
6697 		goto out;
6698 	}
6699 
6700 	if (!socanwrite(so)) {
6701 		ret = 0;
6702 		goto out;
6703 	}
6704 
6705 	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6706 		ret = 1;
6707 		goto out;
6708 	}
6709 
6710 	int64_t lowwat = so->so_snd.sb_lowat;
6711 	const int64_t hiwat = so->so_snd.sb_hiwat;
6712 	/*
6713 	 * Deal with connected UNIX domain sockets which
6714 	 * rely on the fact that the sender's socket buffer is
6715 	 * actually the receiver's socket buffer.
6716 	 */
6717 	if (SOCK_DOM(so) == PF_LOCAL) {
6718 		struct unpcb *unp = sotounpcb(so);
6719 		if (unp != NULL && unp->unp_conn != NULL &&
6720 		    unp->unp_conn->unp_socket != NULL) {
6721 			struct socket *so2 = unp->unp_conn->unp_socket;
6722 			/*
6723 			 * At this point we know that `so' is locked
6724 			 * and that `unp_conn` isn't going to change.
6725 			 * However, we don't lock `so2` because doing so
6726 			 * may require unlocking `so'
6727 			 * (see unp_get_locks_in_order()).
6728 			 *
6729 			 * Two cases can happen:
6730 			 *
6731 			 * 1) we return 1 and tell the application that
6732 			 *    it can write.  Meanwhile, another thread
6733 			 *    fills up the socket buffer.  This will either
6734 			 *    lead to a blocking send or EWOULDBLOCK
6735 			 *    which the application should deal with.
6736 			 * 2) we return 0 and tell the application that
6737 			 *    the socket is not writable.  Meanwhile,
6738 			 *    another thread depletes the receive socket
6739 			 *    buffer. In this case the application will
6740 			 *    be woken up by sb_notify().
6741 			 *
6742 			 * MIN() is required because otherwise sosendcheck()
6743 			 * may return EWOULDBLOCK since it only considers
6744 			 * so->so_snd.
6745 			 */
6746 			data = MIN(data, sbspace(&so2->so_rcv));
6747 		}
6748 	}
6749 
6750 	if (kn->kn_sfflags & NOTE_LOWAT) {
6751 		if (kn->kn_sdata > hiwat) {
6752 			lowwat = hiwat;
6753 		} else if (kn->kn_sdata > lowwat) {
6754 			lowwat = kn->kn_sdata;
6755 		}
6756 	}
6757 
6758 	if (data > 0 && data >= lowwat) {
6759 		if ((so->so_flags & SOF_NOTSENT_LOWAT)
6760 #if (DEBUG || DEVELOPMENT)
6761 		    && so_notsent_lowat_check == 1
6762 #endif /* DEBUG || DEVELOPMENT */
6763 		    ) {
6764 			if ((SOCK_DOM(so) == PF_INET ||
6765 			    SOCK_DOM(so) == PF_INET6) &&
6766 			    so->so_type == SOCK_STREAM) {
6767 				ret = tcp_notsent_lowat_check(so);
6768 			}
6769 #if MPTCP
6770 			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6771 			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
6772 				ret = mptcp_notsent_lowat_check(so);
6773 			}
6774 #endif
6775 			else {
6776 				ret = 1;
6777 				goto out;
6778 			}
6779 		} else {
6780 			ret = 1;
6781 		}
6782 	}
6783 	if (so_wait_for_if_feedback(so)) {
6784 		ret = 0;
6785 	}
6786 
6787 out:
6788 	if (ret && kev) {
6789 		knote_fill_kevent(kn, kev, data);
6790 	}
6791 	return ret;
6792 }
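
/*
 * The SOF_NOTSENT_LOWAT branch above backs the following userland
 * pattern for TCP, given a connected socket fd and a kqueue kq:
 * EVFILT_WRITE fires only once the amount of unsent data drops below
 * the configured threshold.
 *
 *	int lowat = 16 * 1024;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */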
6793 
6794 static int
6795 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6796 {
6797 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6798 
6799 	/* socket locked */
6800 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6801 		so->so_snd.sb_flags |= SB_KNOTE;
6802 	}
6803 
6804 	/* determine whether the event has already fired */
6805 	return filt_sowrite_common(kn, NULL, so);
6806 }
6807 
6808 static void
6809 filt_sowdetach(struct knote *kn)
6810 {
6811 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6812 	socket_lock(so, 1);
6813 
6814 	if (so->so_snd.sb_flags & SB_KNOTE) {
6815 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6816 			so->so_snd.sb_flags &= ~SB_KNOTE;
6817 		}
6818 	}
6819 	socket_unlock(so, 1);
6820 }
6821 
6822 /*ARGSUSED*/
6823 static int
6824 filt_sowrite(struct knote *kn, long hint)
6825 {
6826 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6827 	int ret;
6828 
6829 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6830 		socket_lock(so, 1);
6831 	}
6832 
6833 	ret = filt_sowrite_common(kn, NULL, so);
6834 
6835 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6836 		socket_unlock(so, 1);
6837 	}
6838 
6839 	return ret;
6840 }
6841 
6842 static int
6843 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6844 {
6845 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6846 	int ret;
6847 
6848 	socket_lock(so, 1);
6849 
6850 	/* save off the new input fflags and data */
6851 	kn->kn_sfflags = kev->fflags;
6852 	kn->kn_sdata = kev->data;
6853 
6854 	/* determine if these changes result in a triggered event */
6855 	ret = filt_sowrite_common(kn, NULL, so);
6856 
6857 	socket_unlock(so, 1);
6858 
6859 	return ret;
6860 }
6861 
6862 static int
6863 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6864 {
6865 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6866 	int ret;
6867 
6868 	socket_lock(so, 1);
6869 	ret = filt_sowrite_common(kn, kev, so);
6870 	socket_unlock(so, 1);
6871 
6872 	return ret;
6873 }
6874 
6875 static int
6876 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6877     struct socket *so, long ev_hint)
6878 {
6879 	int ret = 0;
6880 	int64_t data = 0;
6881 	uint32_t level_trigger = 0;
6882 
6883 	if (ev_hint & SO_FILT_HINT_CONNRESET) {
6884 		kn->kn_fflags |= NOTE_CONNRESET;
6885 	}
6886 	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6887 		kn->kn_fflags |= NOTE_TIMEOUT;
6888 	}
6889 	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6890 		kn->kn_fflags |= NOTE_NOSRCADDR;
6891 	}
6892 	if (ev_hint & SO_FILT_HINT_IFDENIED) {
6893 		kn->kn_fflags |= NOTE_IFDENIED;
6894 	}
6895 	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6896 		kn->kn_fflags |= NOTE_KEEPALIVE;
6897 	}
6898 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6899 		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6900 	}
6901 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6902 		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6903 	}
6904 	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6905 	    (so->so_state & SS_ISCONNECTED)) {
6906 		kn->kn_fflags |= NOTE_CONNECTED;
6907 		level_trigger |= NOTE_CONNECTED;
6908 	}
6909 	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6910 	    (so->so_state & SS_ISDISCONNECTED)) {
6911 		kn->kn_fflags |= NOTE_DISCONNECTED;
6912 		level_trigger |= NOTE_DISCONNECTED;
6913 	}
6914 	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6915 		if (so->so_proto != NULL &&
6916 		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6917 			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6918 		}
6919 	}
6920 	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6921 	    tcp_notify_ack_active(so)) {
6922 		kn->kn_fflags |= NOTE_NOTIFY_ACK;
6923 	}
6924 	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
6925 		kn->kn_fflags |= NOTE_WAKE_PKT;
6926 	}
6927 
6928 	if ((so->so_state & SS_CANTRCVMORE)
6929 #if CONTENT_FILTER
6930 	    && cfil_sock_data_pending(&so->so_rcv) == 0
6931 #endif /* CONTENT_FILTER */
6932 	    ) {
6933 		kn->kn_fflags |= NOTE_READCLOSED;
6934 		level_trigger |= NOTE_READCLOSED;
6935 	}
6936 
6937 	if (so->so_state & SS_CANTSENDMORE) {
6938 		kn->kn_fflags |= NOTE_WRITECLOSED;
6939 		level_trigger |= NOTE_WRITECLOSED;
6940 	}
6941 
6942 	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6943 	    (so->so_flags & SOF_SUSPENDED)) {
6944 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6945 
6946 		/* If resume event was delivered before, reset it */
6947 		kn->kn_hook32 &= ~NOTE_RESUME;
6948 
6949 		kn->kn_fflags |= NOTE_SUSPEND;
6950 		level_trigger |= NOTE_SUSPEND;
6951 	}
6952 
6953 	if ((ev_hint & SO_FILT_HINT_RESUME) ||
6954 	    (so->so_flags & SOF_SUSPENDED) == 0) {
6955 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6956 
6957 		/* If suspend event was delivered before, reset it */
6958 		kn->kn_hook32 &= ~NOTE_SUSPEND;
6959 
6960 		kn->kn_fflags |= NOTE_RESUME;
6961 		level_trigger |= NOTE_RESUME;
6962 	}
6963 
6964 	if (so->so_error != 0) {
6965 		ret = 1;
6966 		data = so->so_error;
6967 		kn->kn_flags |= EV_EOF;
6968 	} else {
6969 		u_int32_t data32 = 0;
6970 		get_sockev_state(so, &data32);
6971 		data = data32;
6972 	}
6973 
6974 	/* Reset any events that are not requested on this knote */
6975 	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6976 	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6977 
6978 	/* Find the level-triggered events that have already been delivered */
6979 	level_trigger &= kn->kn_hook32;
6980 	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6981 
6982 	/* Do not deliver level-triggered events more than once */
6983 	if ((kn->kn_fflags & ~level_trigger) != 0) {
6984 		ret = 1;
6985 	}
6986 
6987 	if (ret && kev) {
6988 		/*
6989 		 * Store the state of the events being delivered. This
6990 		 * state can be used to deliver level-triggered events
6991 		 * at least once and still avoid waking up the application
6992 		 * multiple times as long as the event is active.
6993 		 */
6994 		if (kn->kn_fflags != 0) {
6995 			kn->kn_hook32 |= (kn->kn_fflags &
6996 			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6997 		}
6998 
6999 		/*
7000 		 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7001 		 * only one of them, and remember which of the two was
7002 		 * delivered last
7003 		 */
7004 		if (kn->kn_fflags & NOTE_SUSPEND) {
7005 			kn->kn_hook32 &= ~NOTE_RESUME;
7006 		}
7007 		if (kn->kn_fflags & NOTE_RESUME) {
7008 			kn->kn_hook32 &= ~NOTE_SUSPEND;
7009 		}
7010 
7011 		knote_fill_kevent(kn, kev, data);
7012 	}
7013 	return ret;
7014 }
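
/*
 * A sketch of a consumer of the level-trigger bookkeeping above, given
 * a socket fd and a kqueue kq (and assuming the private EVFILT_SOCK
 * note constants are visible to the caller):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */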
7015 
7016 static int
7017 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7018 {
7019 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7020 
7021 	/* socket locked */
7022 	kn->kn_hook32 = 0;
7023 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7024 		so->so_flags |= SOF_KNOTE;
7025 	}
7026 
7027 	/* determine if event already fired */
7028 	return filt_sockev_common(kn, NULL, so, 0);
7029 }
7030 
7031 static void
7032 filt_sockdetach(struct knote *kn)
7033 {
7034 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7035 	socket_lock(so, 1);
7036 
7037 	if ((so->so_flags & SOF_KNOTE) != 0) {
7038 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7039 			so->so_flags &= ~SOF_KNOTE;
7040 		}
7041 	}
7042 	socket_unlock(so, 1);
7043 }
7044 
7045 static int
7046 filt_sockev(struct knote *kn, long hint)
7047 {
7048 	int ret = 0, locked = 0;
7049 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7050 	long ev_hint = (hint & SO_FILT_HINT_EV);
7051 
7052 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7053 		socket_lock(so, 1);
7054 		locked = 1;
7055 	}
7056 
7057 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7058 
7059 	if (locked) {
7060 		socket_unlock(so, 1);
7061 	}
7062 
7063 	return ret;
7064 }
7065 
7066 
7067 
7068 /*
7069  *	filt_socktouch - update event state
7070  */
7071 static int
7072 filt_socktouch(
7073 	struct knote *kn,
7074 	struct kevent_qos_s *kev)
7075 {
7076 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7077 	uint32_t changed_flags;
7078 	int ret;
7079 
7080 	socket_lock(so, 1);
7081 
7082 	/* compare the previous interest set with the delivered-state bits */
7083 	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7084 
7085 	/* save off the new input fflags and data */
7086 	kn->kn_sfflags = kev->fflags;
7087 	kn->kn_sdata = kev->data;
7088 
7089 	/* restrict the current results to the (smaller?) set of new interest */
7090 	/*
7091 	 * For compatibility with previous implementations, we leave kn_fflags
7092 	 * as they were before.
7093 	 */
7094 	//kn->kn_fflags &= kev->fflags;
7095 
7096 	/*
7097 	 * Since we keep track of events that are already
7098 	 * delivered, if any of those events are not requested
7099 	 * anymore the state related to them can be reset
7100 	 */
7101 	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7102 
7103 	/* determine if we have events to deliver */
7104 	ret = filt_sockev_common(kn, NULL, so, 0);
7105 
7106 	socket_unlock(so, 1);
7107 
7108 	return ret;
7109 }
7110 
7111 /*
7112  *	filt_sockprocess - query event fired state and return data
7113  */
7114 static int
7115 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7116 {
7117 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7118 	int ret = 0;
7119 
7120 	socket_lock(so, 1);
7121 
7122 	ret = filt_sockev_common(kn, kev, so, 0);
7123 
7124 	socket_unlock(so, 1);
7125 
7126 	return ret;
7127 }
7128 
7129 void
7130 get_sockev_state(struct socket *so, u_int32_t *statep)
7131 {
7132 	u_int32_t state = *(statep);
7133 
7134 	/*
7135 	 * If the state variable was already set by a previous event,
7136 	 * leave it untouched.
7137 	 */
7138 	if (state != 0) {
7139 		return;
7140 	}
7141 
7142 	if (so->so_state & SS_ISCONNECTED) {
7143 		state |= SOCKEV_CONNECTED;
7144 	} else {
7145 		state &= ~(SOCKEV_CONNECTED);
7146 	}
7147 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7148 	*(statep) = state;
7149 }
7150 
7151 #define SO_LOCK_HISTORY_STR_LEN \
7152 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7153 
7154 __private_extern__ const char *
7155 solockhistory_nr(struct socket *so)
7156 {
7157 	size_t n = 0;
7158 	int i;
7159 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7160 
7161 	bzero(lock_history_str, sizeof(lock_history_str));
7162 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7163 		n += scnprintf(lock_history_str + n,
7164 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7165 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7166 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7167 	}
7168 	return lock_history_str;
7169 }
7170 
7171 lck_mtx_t *
7172 socket_getlock(struct socket *so, int flags)
7173 {
7174 	if (so->so_proto->pr_getlock != NULL) {
7175 		return (*so->so_proto->pr_getlock)(so, flags);
7176 	} else {
7177 		return so->so_proto->pr_domain->dom_mtx;
7178 	}
7179 }
7180 
7181 void
7182 socket_lock(struct socket *so, int refcount)
7183 {
7184 	void *lr_saved;
7185 
7186 	lr_saved = __builtin_return_address(0);
7187 
7188 	if (so->so_proto->pr_lock) {
7189 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7190 	} else {
7191 #ifdef MORE_LOCKING_DEBUG
7192 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7193 		    LCK_MTX_ASSERT_NOTOWNED);
7194 #endif
7195 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7196 		if (refcount) {
7197 			so->so_usecount++;
7198 		}
7199 		so->lock_lr[so->next_lock_lr] = lr_saved;
7200 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7201 	}
7202 }
7203 
7204 void
7205 socket_lock_assert_owned(struct socket *so)
7206 {
7207 	lck_mtx_t *mutex_held;
7208 
7209 	if (so->so_proto->pr_getlock != NULL) {
7210 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7211 	} else {
7212 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7213 	}
7214 
7215 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7216 }
7217 
7218 int
7219 socket_try_lock(struct socket *so)
7220 {
7221 	lck_mtx_t *mtx;
7222 
7223 	if (so->so_proto->pr_getlock != NULL) {
7224 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7225 	} else {
7226 		mtx = so->so_proto->pr_domain->dom_mtx;
7227 	}
7228 
7229 	return lck_mtx_try_lock(mtx);
7230 }
7231 
7232 void
7233 socket_unlock(struct socket *so, int refcount)
7234 {
7235 	void *lr_saved;
7236 	lck_mtx_t *mutex_held;
7237 
7238 	lr_saved = __builtin_return_address(0);
7239 
7240 	if (so == NULL || so->so_proto == NULL) {
7241 		panic("%s: null so_proto so=%p", __func__, so);
7242 		/* NOTREACHED */
7243 	}
7244 
7245 	if (so->so_proto->pr_unlock) {
7246 		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7247 	} else {
7248 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7249 #ifdef MORE_LOCKING_DEBUG
7250 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7251 #endif
7252 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
7253 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7254 
7255 		if (refcount) {
7256 			if (so->so_usecount <= 0) {
7257 				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7258 				    "lrh=%s", __func__, so->so_usecount, so,
7259 				    SOCK_DOM(so), so->so_type,
7260 				    SOCK_PROTO(so), solockhistory_nr(so));
7261 				/* NOTREACHED */
7262 			}
7263 
7264 			so->so_usecount--;
7265 			if (so->so_usecount == 0) {
7266 				sofreelastref(so, 1);
7267 			}
7268 		}
7269 		lck_mtx_unlock(mutex_held);
7270 	}
7271 }
7272 
7273 /* Called with socket locked, will unlock socket */
7274 void
7275 sofree(struct socket *so)
7276 {
7277 	lck_mtx_t *mutex_held;
7278 
7279 	if (so->so_proto->pr_getlock != NULL) {
7280 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7281 	} else {
7282 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7283 	}
7284 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7285 
7286 	sofreelastref(so, 0);
7287 }
7288 
7289 void
7290 soreference(struct socket *so)
7291 {
7292 	socket_lock(so, 1);     /* lock and take one reference on the socket */
7293 	socket_unlock(so, 0);   /* unlock only */
7294 }
7295 
7296 void
7297 sodereference(struct socket *so)
7298 {
7299 	socket_lock(so, 0);
7300 	socket_unlock(so, 1);
7301 }
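
/*
 * The refcount arguments to socket_lock()/socket_unlock() pair up as in
 * the two helpers above; a sketch of a caller that must keep the socket
 * alive across an unlock:
 *
 *	socket_lock(so, 1);	// lock and take a use-count reference
 *	// ... work under the lock ...
 *	socket_unlock(so, 0);	// drop the lock, keep the reference
 *	// ... lock-free section ...
 *	sodereference(so);	// relock, then unlock dropping the reference
 */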
7302 
7303 /*
7304  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7305  * possibility of using jumbo clusters.  The caller must hold
7306  * the socket lock.
7307  */
7308 void
7309 somultipages(struct socket *so, boolean_t set)
7310 {
7311 	if (set) {
7312 		so->so_flags |= SOF_MULTIPAGES;
7313 	} else {
7314 		so->so_flags &= ~SOF_MULTIPAGES;
7315 	}
7316 }
7317 
7318 void
7319 soif2kcl(struct socket *so, boolean_t set)
7320 {
7321 	if (set) {
7322 		so->so_flags1 |= SOF1_IF_2KCL;
7323 	} else {
7324 		so->so_flags1 &= ~SOF1_IF_2KCL;
7325 	}
7326 }
7327 
7328 int
7329 so_isdstlocal(struct socket *so)
7330 {
7331 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7332 
7333 	if (SOCK_DOM(so) == PF_INET) {
7334 		return inaddr_local(inp->inp_faddr);
7335 	} else if (SOCK_DOM(so) == PF_INET6) {
7336 		return in6addr_local(&inp->in6p_faddr);
7337 	}
7338 
7339 	return 0;
7340 }
7341 
7342 int
7343 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7344 {
7345 	struct sockbuf *rcv, *snd;
7346 	int err = 0, defunct;
7347 
7348 	rcv = &so->so_rcv;
7349 	snd = &so->so_snd;
7350 
7351 	defunct = (so->so_flags & SOF_DEFUNCT);
7352 	if (defunct) {
7353 		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7354 			panic("%s: SB_DROP not set", __func__);
7355 			/* NOTREACHED */
7356 		}
7357 		goto done;
7358 	}
7359 
7360 	if (so->so_flags & SOF_NODEFUNCT) {
7361 		if (noforce) {
7362 			err = EOPNOTSUPP;
7363 			if (p != PROC_NULL) {
7364 				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7365 				    "name %s level %d) so 0x%llu [%d,%d] "
7366 				    "is not eligible for defunct "
7367 				    "(%d)\n", __func__, proc_selfpid(),
7368 				    proc_best_name(current_proc()), proc_pid(p),
7369 				    proc_best_name(p), level,
7370 				    so->so_gencnt,
7371 				    SOCK_DOM(so), SOCK_TYPE(so), err);
7372 			}
7373 			return err;
7374 		}
7375 		so->so_flags &= ~SOF_NODEFUNCT;
7376 		if (p != PROC_NULL) {
7377 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7378 			    "name %s level %d) so 0x%llu [%d,%d] "
7379 			    "defunct by force "
7380 			    "(%d)\n", __func__, proc_selfpid(),
7381 			    proc_best_name(current_proc()), proc_pid(p),
7382 			    proc_best_name(p), level,
7383 			    so->so_gencnt,
7384 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7385 		}
7386 	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7387 		struct inpcb *inp = (struct inpcb *)so->so_pcb;
7388 		struct ifnet *ifp = inp->inp_last_outifp;
7389 
7390 		if (ifp && IFNET_IS_CELLULAR(ifp)) {
7391 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7392 		} else if (so->so_flags & SOF_DELEGATED) {
7393 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7394 		} else if (soextbkidlestat.so_xbkidle_time == 0) {
7395 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7396 		} else if (noforce && p != PROC_NULL) {
7397 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7398 
7399 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7400 			so->so_extended_bk_start = net_uptime();
7401 			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7402 
7403 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7404 
7405 			err = EOPNOTSUPP;
7406 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7407 			    "name %s level %d) so 0x%llu [%d,%d] "
7408 			    "extend bk idle "
7409 			    "(%d)\n", __func__, proc_selfpid(),
7410 			    proc_best_name(current_proc()), proc_pid(p),
7411 			    proc_best_name(p), level,
7412 			    so->so_gencnt,
7413 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7414 			return err;
7415 		} else {
7416 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7417 		}
7418 	}
7419 
7420 	so->so_flags |= SOF_DEFUNCT;
7421 
7422 	/* Prevent further data from being appended to the socket buffers */
7423 	snd->sb_flags |= SB_DROP;
7424 	rcv->sb_flags |= SB_DROP;
7425 
7426 	/* Flush any existing data in the socket buffers */
7427 	if (rcv->sb_cc != 0) {
7428 		rcv->sb_flags &= ~SB_SEL;
7429 		selthreadclear(&rcv->sb_sel);
7430 		sbrelease(rcv);
7431 	}
7432 	if (snd->sb_cc != 0) {
7433 		snd->sb_flags &= ~SB_SEL;
7434 		selthreadclear(&snd->sb_sel);
7435 		sbrelease(snd);
7436 	}
7437 
7438 done:
7439 	if (p != PROC_NULL) {
7440 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7441 		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
7442 		    proc_selfpid(), proc_best_name(current_proc()),
7443 		    proc_pid(p), proc_best_name(p), level,
7444 		    so->so_gencnt, SOCK_DOM(so),
7445 		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
7446 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7447 		    " extbkidle" : "");
7448 	}
7449 	return err;
7450 }
7451 
7452 int
sodefunct(struct proc * p,struct socket * so,int level)7453 sodefunct(struct proc *p, struct socket *so, int level)
7454 {
7455 	struct sockbuf *rcv, *snd;
7456 
7457 	if (!(so->so_flags & SOF_DEFUNCT)) {
7458 		panic("%s improperly called", __func__);
7459 		/* NOTREACHED */
7460 	}
7461 	if (so->so_state & SS_DEFUNCT) {
7462 		goto done;
7463 	}
7464 
7465 	rcv = &so->so_rcv;
7466 	snd = &so->so_snd;
7467 
7468 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7469 		char s[MAX_IPv6_STR_LEN];
7470 		char d[MAX_IPv6_STR_LEN];
7471 		struct inpcb *inp = sotoinpcb(so);
7472 
7473 		if (p != PROC_NULL) {
7474 			SODEFUNCTLOG(
7475 				"%s[%d, %s]: (target pid %d name %s level %d) "
7476 				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
7477 				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7478 				" snd_fl 0x%x]\n", __func__,
7479 				proc_selfpid(), proc_best_name(current_proc()),
7480 				proc_pid(p), proc_best_name(p), level,
7481 				so->so_gencnt,
7482 				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7483 				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7484 				(void *)&inp->inp_laddr.s_addr :
7485 				(void *)&inp->in6p_laddr),
7486 				s, sizeof(s)), ntohs(inp->in6p_lport),
7487 				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7488 				(void *)&inp->inp_faddr.s_addr :
7489 				(void *)&inp->in6p_faddr,
7490 				d, sizeof(d)), ntohs(inp->in6p_fport),
7491 				(uint32_t)rcv->sb_sel.si_flags,
7492 				(uint32_t)snd->sb_sel.si_flags,
7493 				rcv->sb_flags, snd->sb_flags);
7494 		}
7495 	} else if (p != PROC_NULL) {
7496 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7497 		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
7498 		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7499 		    proc_selfpid(), proc_best_name(current_proc()),
7500 		    proc_pid(p), proc_best_name(p), level,
7501 		    so->so_gencnt,
7502 		    SOCK_DOM(so), SOCK_TYPE(so),
7503 		    (uint32_t)rcv->sb_sel.si_flags,
7504 		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7505 		    snd->sb_flags);
7506 	}
7507 
7508 	/*
7509 	 * First tell the protocol the flow is defunct
7510 	 */
7511 	(void)  (*so->so_proto->pr_usrreqs->pru_defunct)(so);
7512 
7513 	/*
7514 	 * Unwedge threads blocked on sbwait() and sb_lock().
7515 	 */
7516 	sbwakeup(rcv);
7517 	sbwakeup(snd);
7518 
7519 	so->so_flags1 |= SOF1_DEFUNCTINPROG;
7520 	if (rcv->sb_flags & SB_LOCK) {
7521 		sbunlock(rcv, TRUE);    /* keep socket locked */
7522 	}
7523 	if (snd->sb_flags & SB_LOCK) {
7524 		sbunlock(snd, TRUE);    /* keep socket locked */
7525 	}
7526 	/*
7527 	 * Flush the buffers and disconnect.  We explicitly call shutdown
7528 	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7529 	 * states are set for the socket.  This would also flush out data
7530 	 * hanging off the receive list of this socket.
7531 	 */
7532 	(void) soshutdownlock_final(so, SHUT_RD);
7533 	(void) soshutdownlock_final(so, SHUT_WR);
7534 	(void) sodisconnectlocked(so);
7535 
7536 	/*
7537 	 * Explicitly handle connectionless-protocol disconnection
7538 	 * and release any remaining data in the socket buffers.
7539 	 */
7540 	if (!(so->so_state & SS_ISDISCONNECTED)) {
7541 		(void) soisdisconnected(so);
7542 	}
7543 
7544 	if (so->so_error == 0) {
7545 		so->so_error = EBADF;
7546 	}
7547 
7548 	if (rcv->sb_cc != 0) {
7549 		rcv->sb_flags &= ~SB_SEL;
7550 		selthreadclear(&rcv->sb_sel);
7551 		sbrelease(rcv);
7552 	}
7553 	if (snd->sb_cc != 0) {
7554 		snd->sb_flags &= ~SB_SEL;
7555 		selthreadclear(&snd->sb_sel);
7556 		sbrelease(snd);
7557 	}
7558 	so->so_state |= SS_DEFUNCT;
7559 	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7560 
7561 done:
7562 	return 0;
7563 }
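
/*
 * Illustrative sketch, not part of the original source: sosetdefunct()
 * and sodefunct() are driven as a pair with the socket lock held; the
 * first marks the socket (and may refuse when noforce is TRUE), the
 * second performs the actual teardown.  See so_stop_extended_bk_idle()
 * below for an in-tree instance of this pattern:
 *
 *	sosetdefunct(p, so, level, FALSE);	// force-mark the socket
 *	if (so->so_flags & SOF_DEFUNCT) {
 *		sodefunct(p, so, level);	// tear it down
 *	}
 */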

int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process.
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		struct fileproc *fp;
		int count = 0;

		/*
		 * Unlock socket to avoid lock ordering issue with
		 * the proc fd table lock
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			struct socket *so2;

			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so2 = (struct socket *)fp_get_data(fp);
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
				count++;
			}
			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
				break;
			}
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return error;
}
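
/*
 * Illustrative user-space sketch, not part of the original source: the
 * path above is reached through setsockopt(2) with the private
 * SO_EXTENDED_BK_IDLE option on a TCP socket; the option name and its
 * availability outside the kernel are assumptions here, not API
 * guarantees:
 *
 *	int one = 1;
 *	if (setsockopt(s, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
 *	    &one, sizeof(one)) == -1) {
 *		// EOPNOTSUPP: not a PF_INET/PF_INET6 TCP socket
 *		// EBUSY: per-process limit reached, or delegated socket
 *	}
 */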

static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * The return value tells whether the socket is still in extended
 * background idle.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
#if SKYWALK
		inp_update_netns_flags(so);
#endif /* SKYWALK */
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}
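
/*
 * Illustrative user-space sketch, not part of the original source:
 * these accessors back the private SO_RECV_ANYIF option (the option
 * name is an assumption here, inferred from the INP_RECV_ANYIF flag
 * above), which lets a PF_INET/PF_INET6 socket receive on any
 * interface:
 *
 *	int on = 1;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RECV_ANYIF, &on, sizeof(on));
 */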

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}

uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
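
/*
 * Illustrative user-space sketch, not part of the original source: a
 * framework would typically apply a trapdoor restriction through the
 * private SO_RESTRICTIONS option (the option name is an assumption
 * here); per the comment above, the bits can be set but never cleared:
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS,
 *	    &deny, sizeof(deny));
 *	// a later setsockopt() with deny == 0 leaves the bit set
 */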

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
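
/*
 * Illustrative user-space sketch, not part of the original source:
 * so_set_effective_pid() is reached through the private SO_DELEGATED
 * option (and its UUID variant, SO_DELEGATED_UUID, for the function
 * that follows); the option names are assumptions here.  Unless a
 * process delegates to itself, the caller needs the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege:
 *
 *	pid_t epid = app_pid;		// hypothetical target pid
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof(epid)) == -1) {
 *		// EACCES: missing privilege; ESRCH: no such process
 *	}
 */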

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code       = ev_code;

	ev_msg.dv[0].data_ptr   = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
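
/*
 * Illustrative user-space sketch, not part of the original source: the
 * KEV_SOCKET_CLOSED event posted above can be observed from a kernel
 * event socket.  A minimal sketch using the interfaces from
 * <sys/kern_event.h>, with error handling elided:
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code = KEV_VENDOR_APPLE,
 *		.kev_class = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	(void) ioctl(fd, SIOCSKEVFILT, &req);	// subscribe to socket events
 *
 *	char buf[1024];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	struct kern_event_msg *msg = (struct kern_event_msg *)buf;
 *	if (n > 0 && msg->event_code == KEV_SOCKET_CLOSED) {
 *		// kev_socket_event_data (local/peer sockaddrs) follows
 *		// the kern_event_msg header
 *	}
 */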

__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}