xref: /xnu-8796.121.2/bsd/kern/uipc_socket.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120 
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125 
126 #include <os/log.h>
127 
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif /* MAC */
131 
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136 
137 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
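/*
 * For example, ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16; the second
 * argument must be a power of two for the mask arithmetic to work.
 */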
138 
139 #if DEBUG || DEVELOPMENT
140 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
141 #else
142 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 #endif
144 
145 /* TODO: this should be in a header file somewhere */
146 extern char *proc_name_address(void *p);
147 
148 static u_int32_t        so_cache_hw;    /* High water mark for socache */
149 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
150 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
151 static u_int32_t        cached_sock_count = 0;
152 STAILQ_HEAD(, socket)   so_cache_head;
153 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
154 static uint64_t        so_cache_time;
155 static int              socketinit_done;
156 static struct zone      *so_cache_zone;
157 
158 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
159 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
160 
161 #include <machine/limits.h>
162 
163 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void     filt_sordetach(struct knote *kn);
165 static int      filt_soread(struct knote *kn, long hint);
166 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168 
169 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void     filt_sowdetach(struct knote *kn);
171 static int      filt_sowrite(struct knote *kn, long hint);
172 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174 
175 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void     filt_sockdetach(struct knote *kn);
177 static int      filt_sockev(struct knote *kn, long hint);
178 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180 
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183 
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 	.f_isfd = 1,
186 	.f_attach = filt_sorattach,
187 	.f_detach = filt_sordetach,
188 	.f_event = filt_soread,
189 	.f_touch = filt_sortouch,
190 	.f_process = filt_sorprocess,
191 };
192 
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 	.f_isfd = 1,
195 	.f_attach = filt_sowattach,
196 	.f_detach = filt_sowdetach,
197 	.f_event = filt_sowrite,
198 	.f_touch = filt_sowtouch,
199 	.f_process = filt_sowprocess,
200 };
201 
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 	.f_isfd = 1,
204 	.f_attach = filt_sockattach,
205 	.f_detach = filt_sockdetach,
206 	.f_event = filt_sockev,
207 	.f_touch = filt_socktouch,
208 	.f_process = filt_sockprocess,
209 };
210 
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 	.f_isfd = 1,
213 	.f_attach = filt_sorattach,
214 	.f_detach = filt_sordetach,
215 	.f_event = filt_soread,
216 	.f_touch = filt_sortouch,
217 	.f_process = filt_sorprocess,
218 };
219 
220 SYSCTL_DECL(_kern_ipc);
221 
222 #define EVEN_MORE_LOCKING_DEBUG 0
223 
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227 
228 #if (DEBUG || DEVELOPMENT)
229 #define DEFAULT_SOSEND_ASSERT_PANIC 1
230 #else
231 #define DEFAULT_SOSEND_ASSERT_PANIC 0
232 #endif /* (DEBUG || DEVELOPMENT) */
233 
234 int sosend_assert_panic = 0;
235 SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
236     CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
237 
238 static unsigned long sodefunct_calls = 0;
239 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
240     &sodefunct_calls, "");
241 
242 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
243 so_gen_t        so_gencnt;      /* generation count for sockets */
244 
245 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
246 
247 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
248 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
249 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
250 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
251 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
252 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
253 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
254 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
255 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
256 
257 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
258 
259 int somaxconn = SOMAXCONN;
260 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
261     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
262 
263 /* Should we get a maximum also ??? */
264 static int sosendmaxchain = 65536;
265 static int sosendminchain = 16384;
266 static int sorecvmincopy  = 16384;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
268     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
269 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
270     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
271 
272 /*
273  * Set to enable jumbo clusters (if available) for large writes when
274  * the socket is marked with SOF_MULTIPAGES; see below.
275  */
276 int sosendjcl = 1;
277 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
278     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
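/*
 * Illustrative sketch (userland, not part of this file): the knob above
 * can be flipped at run time with sysctlbyname(3), given sufficient
 * privilege:
 *
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *
 *	int off = 0;
 *	if (sysctlbyname("kern.ipc.sosendjcl", NULL, NULL,
 *	    &off, sizeof(off)) == -1)
 *		perror("sysctlbyname");
 */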
279 
280 /*
281  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
282  * writes on the socket for all protocols on any network interfaces,
283  * depending upon sosendjcl above.  Be extra careful when setting this
284  * to 1, because sending packets that cross physical pages down to
285  * broken drivers (those that falsely assume that the physical pages
286  * are contiguous) might lead to system panics or silent data corruption.
287  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
288  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
289  * capable.  Set this to 1 only for testing/debugging purposes.
290  */
291 int sosendjcl_ignore_capab = 0;
292 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
293     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
294 
295 /*
296  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
297  * writes on the socket for all protocols on any network interfaces.
298  * Be extra careful when setting this to 1, because sending down packets with
299  * clusters larger than 2 KB might lead to system panics or data corruption.
300  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
301  * on the outgoing interface.
302  * Set this to 1 only for testing/debugging purposes.
303  */
304 int sosendbigcl_ignore_capab = 0;
305 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
306     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
307 
308 int sodefunctlog = 0;
309 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
310     &sodefunctlog, 0, "");
311 
312 int sothrottlelog = 0;
313 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
314     &sothrottlelog, 0, "");
315 
316 int sorestrictrecv = 1;
317 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
318     &sorestrictrecv, 0, "Enable inbound interface restrictions");
319 
320 int sorestrictsend = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
322     &sorestrictsend, 0, "Enable outbound interface restrictions");
323 
324 int soreserveheadroom = 1;
325 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
326     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
327 
328 #if (DEBUG || DEVELOPMENT)
329 int so_notsent_lowat_check = 1;
330 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
331     &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
332 #endif /* DEBUG || DEVELOPMENT */
333 
334 int so_accept_list_waits = 0;
335 #if (DEBUG || DEVELOPMENT)
336 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
337     &so_accept_list_waits, 0, "number of waits for listener incomp list");
338 #endif /* DEBUG || DEVELOPMENT */
339 
340 extern struct inpcbinfo tcbinfo;
341 
342 /* TODO: these should be in a header file */
343 extern int get_inpcb_str_size(void);
344 extern int get_tcp_str_size(void);
345 
346 vm_size_t       so_cache_zone_element_size;
347 
348 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
349     user_ssize_t *);
350 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
351 static void cached_sock_free(struct socket *);
352 
353 /*
354  * Maximum number of extended background idle sockets per process.
355  * Set to zero to disable further setting of the option.
356  */
357 
358 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
359 #define SO_IDLE_BK_IDLE_TIME            600
360 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
361 
362 struct soextbkidlestat soextbkidlestat;
363 
364 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
365     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
366     "Maximum of extended background idle sockets per process");
367 
368 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
369     &soextbkidlestat.so_xbkidle_time, 0,
370     "Time in seconds to keep extended background idle sockets");
371 
372 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
373     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
374     "High water mark for extended background idle sockets");
375 
376 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
377     &soextbkidlestat, soextbkidlestat, "");
378 
379 int so_set_extended_bk_idle(struct socket *, int);
380 
381 
382 /*
383  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
384  * setting the DSCP code on the packet based on the service class; see
385  * <rdar://problem/11277343> for details.
386  */
387 __private_extern__ u_int32_t sotcdb = 0;
388 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
389     &sotcdb, 0, "");
390 
391 void
392 socketinit(void)
393 {
394 	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
395 	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
396 
397 #ifdef __LP64__
398 	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
399 	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
400 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
401 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
402 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
403 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
404 #else
405 	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
406 	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
407 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
408 	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
409 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
410 	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
411 #endif
412 
413 	if (socketinit_done) {
414 		printf("socketinit: already called...\n");
415 		return;
416 	}
417 	socketinit_done = 1;
418 
419 	PE_parse_boot_argn("socket_debug", &socket_debug,
420 	    sizeof(socket_debug));
421 
422 	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
423 	    sizeof(sosend_assert_panic));
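	/*
	 * Illustrative: on builds where boot-args can be modified, either
	 * knob may also be set at boot time, e.g.:
	 *
	 *	sudo nvram boot-args="socket_debug=1"
	 */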
424 
425 	STAILQ_INIT(&so_cache_head);
426 
427 	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
428 	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
429 
430 	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
431 	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
432 
433 	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
434 	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
435 	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
436 	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
437 
438 	in_pcbinit();
439 }
440 
441 static void
442 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
443 {
444 	caddr_t temp;
445 	uintptr_t offset;
446 
447 	lck_mtx_lock(&so_cache_mtx);
448 
449 	if (!STAILQ_EMPTY(&so_cache_head)) {
450 		VERIFY(cached_sock_count > 0);
451 
452 		*so = STAILQ_FIRST(&so_cache_head);
453 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
454 		STAILQ_NEXT((*so), so_cache_ent) = NULL;
455 
456 		cached_sock_count--;
457 		lck_mtx_unlock(&so_cache_mtx);
458 
459 		temp = (*so)->so_saved_pcb;
460 		bzero((caddr_t)*so, sizeof(struct socket));
461 
462 		(*so)->so_saved_pcb = temp;
463 	} else {
464 		lck_mtx_unlock(&so_cache_mtx);
465 
466 		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);
467 
468 		/*
469 		 * Define offsets for extra structures into our
470 		 * single block of memory. Align extra structures
471 		 * on longword boundaries.
472 		 */
473 
474 		offset = (uintptr_t)*so;
475 		offset += sizeof(struct socket);
476 
477 		offset = ALIGN(offset);
478 
479 		(*so)->so_saved_pcb = (caddr_t)offset;
480 		offset += get_inpcb_str_size();
481 
482 		offset = ALIGN(offset);
483 
484 		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
485 		    (caddr_t)offset;
486 	}
487 
488 	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
489 }
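/*
 * Sketch of the single block carved up above (widths are illustrative;
 * ALIGN() provides the padding):
 *
 *	+---------------+-----+------------------+-----+-----------------+
 *	| struct socket | pad | inpcb storage    | pad | tcpcb storage   |
 *	+---------------+-----+------------------+-----+-----------------+
 *	^ *so                 ^ so_saved_pcb           ^ inp_saved_ppcb
 */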
490 
491 static void
492 cached_sock_free(struct socket *so)
493 {
494 	lck_mtx_lock(&so_cache_mtx);
495 
496 	so_cache_time = net_uptime();
497 	if (++cached_sock_count > max_cached_sock_count) {
498 		--cached_sock_count;
499 		lck_mtx_unlock(&so_cache_mtx);
500 		zfree(so_cache_zone, so);
501 	} else {
502 		if (so_cache_hw < cached_sock_count) {
503 			so_cache_hw = cached_sock_count;
504 		}
505 
506 		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
507 
508 		so->cache_timestamp = so_cache_time;
509 		lck_mtx_unlock(&so_cache_mtx);
510 	}
511 }
512 
513 void
514 so_update_last_owner_locked(struct socket *so, proc_t self)
515 {
516 	if (so->last_pid != 0) {
517 		/*
518 		 * last_pid and last_upid should remain zero for sockets
519 		 * created using sock_socket. The check above achieves that.
520 		 */
521 		if (self == PROC_NULL) {
522 			self = current_proc();
523 		}
524 
525 		if (so->last_upid != proc_uniqueid(self) ||
526 		    so->last_pid != proc_pid(self)) {
527 			so->last_upid = proc_uniqueid(self);
528 			so->last_pid = proc_pid(self);
529 			proc_getexecutableuuid(self, so->last_uuid,
530 			    sizeof(so->last_uuid));
531 			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
532 				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
533 			}
534 		}
535 		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
536 	}
537 }
538 
539 void
540 so_update_policy(struct socket *so)
541 {
542 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
543 		(void) inp_update_policy(sotoinpcb(so));
544 	}
545 }
546 
547 #if NECP
548 static void
549 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
550     struct sockaddr *override_remote_addr)
551 {
552 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
553 		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
554 		    override_remote_addr, 0);
555 	}
556 }
557 #endif /* NECP */
558 
559 boolean_t
560 so_cache_timer(void)
561 {
562 	struct socket   *p;
563 	int             n_freed = 0;
564 	boolean_t rc = FALSE;
565 
566 	lck_mtx_lock(&so_cache_mtx);
567 	so_cache_timeouts++;
568 	so_cache_time = net_uptime();
569 
570 	while (!STAILQ_EMPTY(&so_cache_head)) {
571 		VERIFY(cached_sock_count > 0);
572 		p = STAILQ_FIRST(&so_cache_head);
573 		if ((so_cache_time - p->cache_timestamp) <
574 		    SO_CACHE_TIME_LIMIT) {
575 			break;
576 		}
577 
578 		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
579 		--cached_sock_count;
580 
581 		zfree(so_cache_zone, p);
582 
583 		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
584 			so_cache_max_freed++;
585 			break;
586 		}
587 	}
588 
589 	/* Schedule again if there is more to clean up */
590 	if (!STAILQ_EMPTY(&so_cache_head)) {
591 		rc = TRUE;
592 	}
593 
594 	lck_mtx_unlock(&so_cache_mtx);
595 	return rc;
596 }
597 
598 /*
599  * Get a socket structure from our zone, and initialize it.
600  * We don't implement `waitok' yet (see comments in uipc_domain.c).
601  * Note that it would probably be better to allocate socket
602  * and PCB at the same time, but I'm not convinced that all
603  * the protocols can be easily modified to do this.
604  */
605 struct socket *
606 soalloc(int waitok, int dom, int type)
607 {
608 	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
609 	struct socket *so;
610 
611 	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
612 		cached_sock_alloc(&so, how);
613 	} else {
614 		so = zalloc_flags(socket_zone, how | Z_ZERO);
615 	}
616 	if (so != NULL) {
617 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
618 
619 		/*
620 		 * Increment the socket allocation statistics
621 		 */
622 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
623 	}
624 
625 	return so;
626 }
627 
628 int
629 socreate_internal(int dom, struct socket **aso, int type, int proto,
630     struct proc *p, uint32_t flags, struct proc *ep)
631 {
632 	struct protosw *prp;
633 	struct socket *so;
634 	int error = 0;
635 #if defined(XNU_TARGET_OS_OSX)
636 	pid_t rpid = -1;
637 #endif
638 
639 #if TCPDEBUG
640 	extern int tcpconsdebug;
641 #endif
642 
643 	VERIFY(aso != NULL);
644 	*aso = NULL;
645 
646 	if (proto != 0) {
647 		prp = pffindproto(dom, proto, type);
648 	} else {
649 		prp = pffindtype(dom, type);
650 	}
651 
652 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
653 		if (pffinddomain(dom) == NULL) {
654 			return EAFNOSUPPORT;
655 		}
656 		if (proto != 0) {
657 			if (pffindprotonotype(dom, proto) != NULL) {
658 				return EPROTOTYPE;
659 			}
660 		}
661 		return EPROTONOSUPPORT;
662 	}
663 	if (prp->pr_type != type) {
664 		return EPROTOTYPE;
665 	}
666 	so = soalloc(1, dom, type);
667 	if (so == NULL) {
668 		return ENOBUFS;
669 	}
670 
671 	switch (dom) {
672 	case PF_LOCAL:
673 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
674 		break;
675 	case PF_INET:
676 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
677 		if (type == SOCK_STREAM) {
678 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
679 		} else {
680 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
681 		}
682 		break;
683 	case PF_ROUTE:
684 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
685 		break;
686 	case PF_NDRV:
687 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
688 		break;
689 	case PF_KEY:
690 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
691 		break;
692 	case PF_INET6:
693 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
694 		if (type == SOCK_STREAM) {
695 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
696 		} else {
697 			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
698 		}
699 		break;
700 	case PF_SYSTEM:
701 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
702 		break;
703 	case PF_MULTIPATH:
704 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
705 		break;
706 	default:
707 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
708 		break;
709 	}
710 
711 	if (flags & SOCF_MPTCP) {
712 		so->so_state |= SS_NBIO;
713 	}
714 
715 	TAILQ_INIT(&so->so_incomp);
716 	TAILQ_INIT(&so->so_comp);
717 	so->so_type = (short)type;
718 	so->last_upid = proc_uniqueid(p);
719 	so->last_pid = proc_pid(p);
720 	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
721 	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
722 
723 	if (ep != PROC_NULL && ep != p) {
724 		so->e_upid = proc_uniqueid(ep);
725 		so->e_pid = proc_pid(ep);
726 		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
727 		so->so_flags |= SOF_DELEGATED;
728 #if defined(XNU_TARGET_OS_OSX)
729 		if (ep->p_responsible_pid != so->e_pid) {
730 			rpid = ep->p_responsible_pid;
731 		}
732 #endif
733 	}
734 
735 #if defined(XNU_TARGET_OS_OSX)
736 	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
737 		rpid = p->p_responsible_pid;
738 	}
739 
740 	so->so_rpid = -1;
741 	uuid_clear(so->so_ruuid);
742 	if (rpid >= 0) {
743 		proc_t rp = proc_find(rpid);
744 		if (rp != PROC_NULL) {
745 			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
746 			so->so_rpid = rpid;
747 			proc_rele(rp);
748 		}
749 	}
750 #endif
751 
752 	so->so_cred = kauth_cred_proc_ref(p);
753 	if (!suser(kauth_cred_get(), NULL)) {
754 		so->so_state |= SS_PRIV;
755 	}
756 
757 	so->so_proto = prp;
758 	so->so_rcv.sb_flags |= SB_RECV;
759 	so->so_rcv.sb_so = so->so_snd.sb_so = so;
760 	so->next_lock_lr = 0;
761 	so->next_unlock_lr = 0;
762 
763 	/*
764 	 * Attachment will create the per-pcb lock if necessary and
765 	 * increase the refcount for creation; make sure this is done
766 	 * before the socket is inserted in any lists.
767 	 */
768 	so->so_usecount++;
769 
770 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
771 	if (error != 0) {
772 		/*
773 		 * Warning:
774 		 * If so_pcb is not zero, the socket will be leaked,
775 		 * so the protocol attachment handler must be coded carefully.
776 		 */
777 		if (so->so_pcb != NULL) {
778 			os_log_error(OS_LOG_DEFAULT,
779 			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
780 			    error, dom, proto, type);
781 		}
782 		/*
783 		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
784 		 */
785 		so->so_state |= SS_NOFDREF;
786 		so->so_flags |= SOF_PCBCLEARING;
787 		VERIFY(so->so_usecount > 0);
788 		so->so_usecount--;
789 		sofreelastref(so, 1);   /* will deallocate the socket */
790 		return error;
791 	}
792 
793 	/*
794 	 * Note: needs so_pcb to be set after pru_attach
795 	 */
796 	if (prp->pr_update_last_owner != NULL) {
797 		(*prp->pr_update_last_owner)(so, p, ep);
798 	}
799 
800 	atomic_add_32(&prp->pr_domain->dom_refs, 1);
801 
802 	/* Attach socket filters for this protocol */
803 	sflt_initsock(so);
804 #if TCPDEBUG
805 	if (tcpconsdebug == 2) {
806 		so->so_options |= SO_DEBUG;
807 	}
808 #endif
809 	so_set_default_traffic_class(so);
810 
811 	/*
812 	 * If this thread or task is marked to create backgrounded sockets,
813 	 * mark the socket as background.
814 	 */
815 	if (!(flags & SOCF_MPTCP) &&
816 	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
817 		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
818 		so->so_background_thread = current_thread();
819 	}
820 
821 	switch (dom) {
822 	/*
823 	 * Don't mark Unix domain or system
824 	 * eligible for defunct by default.
825 	 */
826 	case PF_LOCAL:
827 	case PF_SYSTEM:
828 		so->so_flags |= SOF_NODEFUNCT;
829 		break;
830 	default:
831 		break;
832 	}
833 
834 	/*
835 	 * Entitlements can't be checked at socket creation time except if the
836 	 * application requested a feature guarded by a privilege (c.f., socket
837 	 * delegation).
838 	 * The priv(9) and the Sandboxing APIs are designed with the idea that
839 	 * a privilege check should only be triggered by a userland request.
840 	 * A privilege check at socket creation time is time consuming and
841 	 * could trigger many authorisation error messages from the security
842 	 * APIs.
843 	 */
844 
845 	*aso = so;
846 
847 	return 0;
848 }
849 
850 /*
851  * Returns:	0			Success
852  *		EAFNOSUPPORT
853  *		EPROTOTYPE
854  *		EPROTONOSUPPORT
855  *		ENOBUFS
856  *	<pru_attach>:ENOBUFS[AF_UNIX]
857  *	<pru_attach>:ENOBUFS[TCP]
858  *	<pru_attach>:ENOMEM[TCP]
859  *	<pru_attach>:???		[other protocol families, IPSEC]
860  */
861 int
862 socreate(int dom, struct socket **aso, int type, int proto)
863 {
864 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
865 	           PROC_NULL);
866 }
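/*
 * Illustrative in-kernel usage (hypothetical caller): create a TCP socket
 * owned by the current process and close it again:
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0)
 *		error = soclose(so);
 */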
867 
868 int
869 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
870 {
871 	int error = 0;
872 	struct proc *ep = PROC_NULL;
873 
874 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
875 		error = ESRCH;
876 		goto done;
877 	}
878 
879 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
880 
881 	/*
882 	 * It might not be wise to hold the proc reference when calling
883 	 * socreate_internal since it calls soalloc with Z_WAITOK.
884 	 */
885 done:
886 	if (ep != PROC_NULL) {
887 		proc_rele(ep);
888 	}
889 
890 	return error;
891 }
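/*
 * Illustrative usage (hypothetical caller): create a UDP socket delegated
 * to the process identified by epid; ESRCH is returned when that process
 * cannot be found:
 *
 *	struct socket *so = NULL;
 *	int error = socreate_delegate(PF_INET, &so, SOCK_DGRAM,
 *	    IPPROTO_UDP, epid);
 */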
892 
893 /*
894  * Returns:	0			Success
895  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
896  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
897  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
898  *	<pru_bind>:EINVAL		Invalid argument
899  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
900  *	<pru_bind>:EACCES		Permission denied
901  *	<pru_bind>:EADDRINUSE		Address in use
902  *	<pru_bind>:EAGAIN		Resource unavailable, try again
903  *	<pru_bind>:EPERM		Operation not permitted
904  *	<pru_bind>:???
905  *	<sf_bind>:???
906  *
907  * Notes:	It's not possible to fully enumerate the return codes above,
908  *		since socket filter authors and protocol family authors may
909  *		not choose to limit their error returns to those listed, even
910  *		though this may result in some software operating incorrectly.
911  *
912  *		The error codes which are enumerated above are those known to
913  *		be returned by the tcp_usr_bind function supplied.
914  */
915 int
916 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
917 {
918 	struct proc *p = current_proc();
919 	int error = 0;
920 
921 	if (dolock) {
922 		socket_lock(so, 1);
923 	}
924 
925 	so_update_last_owner_locked(so, p);
926 	so_update_policy(so);
927 
928 #if NECP
929 	so_update_necp_policy(so, nam, NULL);
930 #endif /* NECP */
931 
932 	/*
933 	 * If this is a bind request on a socket that has been marked
934 	 * as inactive, reject it now before we go any further.
935 	 */
936 	if (so->so_flags & SOF_DEFUNCT) {
937 		error = EINVAL;
938 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
939 		    __func__, proc_pid(p), proc_best_name(p),
940 		    so->so_gencnt,
941 		    SOCK_DOM(so), SOCK_TYPE(so), error);
942 		goto out;
943 	}
944 
945 	/* Socket filter */
946 	error = sflt_bind(so, nam);
947 
948 	if (error == 0) {
949 		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
950 	}
951 out:
952 	if (dolock) {
953 		socket_unlock(so, 1);
954 	}
955 
956 	if (error == EJUSTRETURN) {
957 		error = 0;
958 	}
959 
960 	return error;
961 }
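/*
 * Note on EJUSTRETURN: a socket filter's sf_bind callback (see
 * kpi_socketfilter.h) may return EJUSTRETURN to signal that it fully
 * handled the request; the mapping above then reports success to the
 * caller even though pru_bind was never invoked.
 */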
962 
963 void
964 sodealloc(struct socket *so)
965 {
966 	kauth_cred_unref(&so->so_cred);
967 
968 	/* Remove any filters */
969 	sflt_termsock(so);
970 
971 	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
972 
973 	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
974 		cached_sock_free(so);
975 	} else {
976 		zfree(socket_zone, so);
977 	}
978 }
979 
980 /*
981  * Returns:	0			Success
982  *		EINVAL
983  *		EOPNOTSUPP
984  *	<pru_listen>:EINVAL[AF_UNIX]
985  *	<pru_listen>:EINVAL[TCP]
986  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
987  *	<pru_listen>:EINVAL[TCP]	Invalid argument
988  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
989  *	<pru_listen>:EACCES[TCP]	Permission denied
990  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
991  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
992  *	<pru_listen>:EPERM[TCP]		Operation not permitted
993  *	<sf_listen>:???
994  *
995  * Notes:	Other <pru_listen> returns depend on the protocol family; all
996  *		<sf_listen> returns depend on what the filter author causes
997  *		their filter to return.
998  */
999 int
1000 solisten(struct socket *so, int backlog)
1001 {
1002 	struct proc *p = current_proc();
1003 	int error = 0;
1004 
1005 	socket_lock(so, 1);
1006 
1007 	so_update_last_owner_locked(so, p);
1008 	so_update_policy(so);
1009 
1010 	if (TAILQ_EMPTY(&so->so_comp)) {
1011 		so->so_options |= SO_ACCEPTCONN;
1012 	}
1013 
1014 #if NECP
1015 	so_update_necp_policy(so, NULL, NULL);
1016 #endif /* NECP */
1017 
1018 	if (so->so_proto == NULL) {
1019 		error = EINVAL;
1020 		so->so_options &= ~SO_ACCEPTCONN;
1021 		goto out;
1022 	}
1023 	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1024 		error = EOPNOTSUPP;
1025 		so->so_options &= ~SO_ACCEPTCONN;
1026 		goto out;
1027 	}
1028 
1029 	/*
1030 	 * If the listen request is made on a socket that is not fully
1031 	 * disconnected, or on a socket that has been marked as inactive,
1032 	 * reject the request now.
1033 	 */
1034 	if ((so->so_state &
1035 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1036 	    (so->so_flags & SOF_DEFUNCT)) {
1037 		error = EINVAL;
1038 		if (so->so_flags & SOF_DEFUNCT) {
1039 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1040 			    "(%d)\n", __func__, proc_pid(p),
1041 			    proc_best_name(p),
1042 			    so->so_gencnt,
1043 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1044 		}
1045 		so->so_options &= ~SO_ACCEPTCONN;
1046 		goto out;
1047 	}
1048 
1049 	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1050 		error = EPERM;
1051 		so->so_options &= ~SO_ACCEPTCONN;
1052 		goto out;
1053 	}
1054 
1055 	error = sflt_listen(so);
1056 	if (error == 0) {
1057 		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1058 	}
1059 
1060 	if (error) {
1061 		if (error == EJUSTRETURN) {
1062 			error = 0;
1063 		}
1064 		so->so_options &= ~SO_ACCEPTCONN;
1065 		goto out;
1066 	}
1067 
1068 	/*
1069 	 * POSIX: The implementation may have an upper limit on the length of
1070  * the listen queue, either global or per accepting socket. If backlog
1071 	 * exceeds this limit, the length of the listen queue is set to the
1072 	 * limit.
1073 	 *
1074 	 * If listen() is called with a backlog argument value that is less
1075 	 * than 0, the function behaves as if it had been called with a backlog
1076 	 * argument value of 0.
1077 	 *
1078 	 * A backlog argument of 0 may allow the socket to accept connections,
1079 	 * in which case the length of the listen queue may be set to an
1080 	 * implementation-defined minimum value.
1081 	 */
1082 	if (backlog <= 0 || backlog > somaxconn) {
1083 		backlog = somaxconn;
1084 	}
1085 
1086 	so->so_qlimit = (short)backlog;
1087 out:
1088 	socket_unlock(so, 1);
1089 	return error;
1090 }
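/*
 * Worked example of the clamping above, assuming the default somaxconn
 * of SOMAXCONN (128): listen(s, 0), listen(s, -5) and listen(s, 4096)
 * all yield so_qlimit == 128, while listen(s, 10) yields so_qlimit == 10.
 */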
1091 
1092 /*
1093  * The "accept list lock" protects the fields related to the listener queues
1094  * because we can unlock a socket to respect the lock ordering between
1095  * the listener socket and its client sockets. The lock ordering requires
1096  * acquiring a client socket's lock before the listener socket's lock.
1097  *
1098  * The accept list lock serializes access to the following fields:
1099  * - of the listener socket:
1100  *   - so_comp
1101  *   - so_incomp
1102  *   - so_qlen
1103  *   - so_inqlen
1104  * - of client sockets that are in so_comp or so_incomp:
1105  *   - so_head
1106  *   - so_list
1107  *
1108  * As one can see, the accept list lock protects the consistency of the
1109  * linkage of the client sockets.
1110  *
1111  * Note that those fields may be read without holding the accept list lock
1112  * for a preflight provided the accept list lock is taken when committing
1113  * to take an action based on the result of the preflight. The preflight
1114  * saves the cost of doing the unlock/lock dance.
1115  */
1116 void
1117 so_acquire_accept_list(struct socket *head, struct socket *so)
1118 {
1119 	lck_mtx_t *mutex_held;
1120 
1121 	if (head->so_proto->pr_getlock == NULL) {
1122 		return;
1123 	}
1124 	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1125 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1126 
1127 	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1128 		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1129 		return;
1130 	}
1131 	if (so != NULL) {
1132 		socket_unlock(so, 0);
1133 	}
1134 	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1135 		so_accept_list_waits += 1;
1136 		msleep((caddr_t)&head->so_incomp, mutex_held,
1137 		    PSOCK | PCATCH, __func__, NULL);
1138 	}
1139 	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1140 	if (so != NULL) {
1141 		socket_unlock(head, 0);
1142 		socket_lock(so, 0);
1143 		socket_lock(head, 0);
1144 	}
1145 }
1146 
1147 void
1148 so_release_accept_list(struct socket *head)
1149 {
1150 	if (head->so_proto->pr_getlock != NULL) {
1151 		lck_mtx_t *mutex_held;
1152 
1153 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1154 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1155 
1156 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1157 		wakeup((caddr_t)&head->so_incomp);
1158 	}
1159 }
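/*
 * Typical pairing of the two helpers above (illustrative):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);
 *	...inspect or edit head->so_comp / head->so_incomp...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 *
 * so_acquire_accept_list() may drop and retake both socket locks while
 * waiting, so any preflight result must be re-validated after it returns.
 */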
1160 
1161 void
1162 sofreelastref(struct socket *so, int dealloc)
1163 {
1164 	struct socket *head = so->so_head;
1165 
1166 	/* Assume socket is locked */
1167 
1168 #if FLOW_DIVERT
1169 	if (so->so_flags & SOF_FLOW_DIVERT) {
1170 		flow_divert_detach(so);
1171 	}
1172 #endif  /* FLOW_DIVERT */
1173 
1174 #if CONTENT_FILTER
1175 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1176 		cfil_sock_detach(so);
1177 	}
1178 #endif /* CONTENT_FILTER */
1179 
1180 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1181 		soflow_detach(so);
1182 	}
1183 
1184 	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1185 		selthreadclear(&so->so_snd.sb_sel);
1186 		selthreadclear(&so->so_rcv.sb_sel);
1187 		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1188 		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1189 		so->so_event = sonullevent;
1190 		return;
1191 	}
1192 	if (head != NULL) {
1193 		/*
1194 		 * Need to lock the listener when the protocol has
1195 		 * per socket locks
1196 		 */
1197 		if (head->so_proto->pr_getlock != NULL) {
1198 			socket_lock(head, 1);
1199 			so_acquire_accept_list(head, so);
1200 		}
1201 		if (so->so_state & SS_INCOMP) {
1202 			so->so_state &= ~SS_INCOMP;
1203 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
1204 			head->so_incqlen--;
1205 			head->so_qlen--;
1206 			so->so_head = NULL;
1207 
1208 			if (head->so_proto->pr_getlock != NULL) {
1209 				so_release_accept_list(head);
1210 				socket_unlock(head, 1);
1211 			}
1212 		} else if (so->so_state & SS_COMP) {
1213 			if (head->so_proto->pr_getlock != NULL) {
1214 				so_release_accept_list(head);
1215 				socket_unlock(head, 1);
1216 			}
1217 			/*
1218 			 * We must not decommission a socket that's
1219 			 * on the accept(2) queue.  If we do, then
1220 			 * accept(2) may hang after select(2) indicated
1221 			 * that the listening socket was ready.
1222 			 */
1223 			selthreadclear(&so->so_snd.sb_sel);
1224 			selthreadclear(&so->so_rcv.sb_sel);
1225 			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1226 			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1227 			so->so_event = sonullevent;
1228 			return;
1229 		} else {
1230 			if (head->so_proto->pr_getlock != NULL) {
1231 				so_release_accept_list(head);
1232 				socket_unlock(head, 1);
1233 			}
1234 			printf("sofree: not queued\n");
1235 		}
1236 	}
1237 	sowflush(so);
1238 	sorflush(so);
1239 
1240 	/* 3932268: disable upcall */
1241 	so->so_rcv.sb_flags &= ~SB_UPCALL;
1242 	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1243 	so->so_event = sonullevent;
1244 
1245 	if (dealloc) {
1246 		sodealloc(so);
1247 	}
1248 }
1249 
1250 void
1251 soclose_wait_locked(struct socket *so)
1252 {
1253 	lck_mtx_t *mutex_held;
1254 
1255 	if (so->so_proto->pr_getlock != NULL) {
1256 		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1257 	} else {
1258 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1259 	}
1260 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1261 
1262 	/*
1263 	 * Double check here and return if there's no outstanding upcall;
1264 	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1265 	 */
1266 	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1267 		return;
1268 	}
1269 	so->so_rcv.sb_flags &= ~SB_UPCALL;
1270 	so->so_snd.sb_flags &= ~SB_UPCALL;
1271 	so->so_flags |= SOF_CLOSEWAIT;
1272 
1273 	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1274 	    "soclose_wait_locked", NULL);
1275 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1276 	so->so_flags &= ~SOF_CLOSEWAIT;
1277 }
1278 
1279 /*
1280  * Close a socket on last file table reference removal.
1281  * Initiate disconnect if connected.
1282  * Free socket when disconnect complete.
1283  */
1284 int
1285 soclose_locked(struct socket *so)
1286 {
1287 	int error = 0;
1288 	struct timespec ts;
1289 
1290 	if (so->so_usecount == 0) {
1291 		panic("soclose: so=%p refcount=0", so);
1292 		/* NOTREACHED */
1293 	}
1294 
1295 	sflt_notify(so, sock_evt_closing, NULL);
1296 
1297 	if (so->so_upcallusecount) {
1298 		soclose_wait_locked(so);
1299 	}
1300 
1301 #if CONTENT_FILTER
1302 	/*
1303 	 * We have to wait until the content filters are done
1304 	 */
1305 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1306 		cfil_sock_close_wait(so);
1307 		cfil_sock_is_closed(so);
1308 		cfil_sock_detach(so);
1309 	}
1310 #endif /* CONTENT_FILTER */
1311 
1312 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1313 		soflow_detach(so);
1314 	}
1315 
1316 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1317 		soresume(current_proc(), so, 1);
1318 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1319 	}
1320 
1321 	if ((so->so_options & SO_ACCEPTCONN)) {
1322 		struct socket *sp, *sonext;
1323 		int persocklock = 0;
1324 		int incomp_overflow_only;
1325 
1326 		/*
1327 		 * We do not want new connections to be added
1328 		 * to the connection queues.
1329 		 */
1330 		so->so_options &= ~SO_ACCEPTCONN;
1331 
1332 		/*
1333 		 * We can drop the lock on the listener once
1334 		 * we've acquired the incoming list
1335 		 */
1336 		if (so->so_proto->pr_getlock != NULL) {
1337 			persocklock = 1;
1338 			so_acquire_accept_list(so, NULL);
1339 			socket_unlock(so, 0);
1340 		}
1341 again:
1342 		incomp_overflow_only = 1;
1343 
1344 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1345 			/*
1346 			 * Radar 5350314
1347 			 * Skip sockets thrown away by tcp_dropdropablreq();
1348 			 * they will get cleaned up by garbage collection.
1349 			 * Otherwise, remove the incomp socket from the queue
1350 			 * and let soabort() trigger the appropriate cleanup.
1351 			 */
1352 			if (sp->so_flags & SOF_OVERFLOW) {
1353 				continue;
1354 			}
1355 
1356 			if (persocklock != 0) {
1357 				socket_lock(sp, 1);
1358 			}
1359 
1360 			/*
1361 			 * Radar 27945981
1362 			 * The extra reference for the list ensures the
1363 			 * validity of the socket pointer when we perform the
1364 			 * unlock of the head above.
1365 			 */
1366 			if (sp->so_state & SS_INCOMP) {
1367 				sp->so_state &= ~SS_INCOMP;
1368 				sp->so_head = NULL;
1369 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1370 				so->so_incqlen--;
1371 				so->so_qlen--;
1372 
1373 				(void) soabort(sp);
1374 			} else {
1375 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1376 				    __func__, sp);
1377 			}
1378 
1379 			if (persocklock != 0) {
1380 				socket_unlock(sp, 1);
1381 			}
1382 		}
1383 
1384 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1385 			/* Dequeue from so_comp since sofree() won't do it */
1386 			if (persocklock != 0) {
1387 				socket_lock(sp, 1);
1388 			}
1389 
1390 			if (sp->so_state & SS_COMP) {
1391 				sp->so_state &= ~SS_COMP;
1392 				sp->so_head = NULL;
1393 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1394 				so->so_qlen--;
1395 
1396 				(void) soabort(sp);
1397 			} else {
1398 				panic("%s sp %p in so_comp but !SS_COMP",
1399 				    __func__, sp);
1400 			}
1401 
1402 			if (persocklock) {
1403 				socket_unlock(sp, 1);
1404 			}
1405 		}
1406 
1407 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1408 #if (DEBUG | DEVELOPMENT)
1409 			panic("%s head %p so_incomp not empty", __func__, so);
1410 #endif /* (DEVELOPMENT || DEBUG) */
1411 
1412 			goto again;
1413 		}
1414 
1415 		if (!TAILQ_EMPTY(&so->so_comp)) {
1416 #if (DEBUG | DEVELOPMENT)
1417 			panic("%s head %p so_comp not empty", __func__, so);
1418 #endif /* (DEVELOPMENT || DEBUG) */
1419 
1420 			goto again;
1421 		}
1422 
1423 		if (persocklock) {
1424 			socket_lock(so, 0);
1425 			so_release_accept_list(so);
1426 		}
1427 	}
1428 	if (so->so_pcb == NULL) {
1429 		/* 3915887: mark the socket as ready for dealloc */
1430 		so->so_flags |= SOF_PCBCLEARING;
1431 		goto discard;
1432 	}
1433 
1434 	if (so->so_state & SS_ISCONNECTED) {
1435 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1436 			error = sodisconnectlocked(so);
1437 			if (error) {
1438 				goto drop;
1439 			}
1440 		}
1441 		if (so->so_options & SO_LINGER) {
1442 			if ((so->so_state & SS_ISDISCONNECTING) &&
1443 			    (so->so_state & SS_NBIO)) {
1444 				goto drop;
1445 			}
1446 			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1447 				lck_mtx_t *mutex_held;
1448 
1449 				if (so->so_proto->pr_getlock != NULL) {
1450 					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1451 				} else {
1452 					mutex_held = so->so_proto->pr_domain->dom_mtx;
1453 				}
1454 				ts.tv_sec = (so->so_linger / 100);
1455 				ts.tv_nsec = (so->so_linger % 100) *
1456 				    NSEC_PER_USEC * 1000 * 10;
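				/*
				 * The arithmetic above treats so_linger as
				 * 1/100ths of a second, e.g. so_linger == 250
				 * yields ts = { .tv_sec = 2,
				 * .tv_nsec = 500000000 }.
				 */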
1457 				error = msleep((caddr_t)&so->so_timeo,
1458 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1459 				if (error) {
1460 					/*
1461 					 * It's OK if the timer fires;
1462 					 * don't report an error.
1463 					 */
1464 					if (error == EWOULDBLOCK) {
1465 						error = 0;
1466 					}
1467 					break;
1468 				}
1469 			}
1470 		}
1471 	}
1472 drop:
1473 	if (so->so_usecount == 0) {
1474 		panic("soclose: usecount is zero so=%p", so);
1475 		/* NOTREACHED */
1476 	}
1477 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1478 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1479 		if (error == 0) {
1480 			error = error2;
1481 		}
1482 	}
1483 	if (so->so_usecount <= 0) {
1484 		panic("soclose: usecount is zero so=%p", so);
1485 		/* NOTREACHED */
1486 	}
1487 discard:
1488 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1489 	    (so->so_state & SS_NOFDREF)) {
1490 		panic("soclose: NOFDREF");
1491 		/* NOTREACHED */
1492 	}
1493 	so->so_state |= SS_NOFDREF;
1494 
1495 	if ((so->so_flags & SOF_KNOTE) != 0) {
1496 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1497 	}
1498 
1499 	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1500 
1501 	VERIFY(so->so_usecount > 0);
1502 	so->so_usecount--;
1503 	sofree(so);
1504 	return error;
1505 }
1506 
1507 int
1508 soclose(struct socket *so)
1509 {
1510 	int error = 0;
1511 	socket_lock(so, 1);
1512 
1513 	if (so->so_retaincnt == 0) {
1514 		error = soclose_locked(so);
1515 	} else {
1516 		/*
1517 		 * If the FD is going away but the socket is
1518 		 * retained in the kernel, remove its reference.
1519 		 */
1520 		so->so_usecount--;
1521 		if (so->so_usecount < 2) {
1522 			panic("soclose: retaincnt non null and so=%p "
1523 			    "usecount=%d\n", so, so->so_usecount);
1524 		}
1525 	}
1526 	socket_unlock(so, 1);
1527 	return error;
1528 }
1529 
1530 /*
1531  * Must be called at splnet...
1532  */
1533 /* Should already be locked */
1534 int
1535 soabort(struct socket *so)
1536 {
1537 	int error;
1538 
1539 #ifdef MORE_LOCKING_DEBUG
1540 	lck_mtx_t *mutex_held;
1541 
1542 	if (so->so_proto->pr_getlock != NULL) {
1543 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1544 	} else {
1545 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1546 	}
1547 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1548 #endif
1549 
1550 	if ((so->so_flags & SOF_ABORTED) == 0) {
1551 		so->so_flags |= SOF_ABORTED;
1552 		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1553 		if (error) {
1554 			sofree(so);
1555 			return error;
1556 		}
1557 	}
1558 	return 0;
1559 }
1560 
1561 int
1562 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1563 {
1564 	int error;
1565 
1566 	if (dolock) {
1567 		socket_lock(so, 1);
1568 	}
1569 
1570 	so_update_last_owner_locked(so, PROC_NULL);
1571 	so_update_policy(so);
1572 #if NECP
1573 	so_update_necp_policy(so, NULL, NULL);
1574 #endif /* NECP */
1575 
1576 	if ((so->so_state & SS_NOFDREF) == 0) {
1577 		panic("soaccept: !NOFDREF");
1578 	}
1579 	so->so_state &= ~SS_NOFDREF;
1580 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1581 
1582 	if (dolock) {
1583 		socket_unlock(so, 1);
1584 	}
1585 	return error;
1586 }
1587 
1588 int
1589 soaccept(struct socket *so, struct sockaddr **nam)
1590 {
1591 	return soacceptlock(so, nam, 1);
1592 }
1593 
1594 int
1595 soacceptfilter(struct socket *so, struct socket *head)
1596 {
1597 	struct sockaddr *local = NULL, *remote = NULL;
1598 	int error = 0;
1599 
1600 	/*
1601 	 * Hold the lock even if this socket has not been made visible
1602 	 * to the filter(s).  For sockets with global locks, this protects
1603 	 * against the head or peer going away.
1604 	 */
1605 	socket_lock(so, 1);
1606 	if (sogetaddr_locked(so, &remote, 1) != 0 ||
1607 	    sogetaddr_locked(so, &local, 0) != 0) {
1608 		so->so_state &= ~SS_NOFDREF;
1609 		socket_unlock(so, 1);
1610 		soclose(so);
1611 		/* Out of resources; try it again next time */
1612 		error = ECONNABORTED;
1613 		goto done;
1614 	}
1615 
1616 	error = sflt_accept(head, so, local, remote);
1617 
1618 	/*
1619 	 * If we get EJUSTRETURN from one of the filters, mark this socket
1620 	 * as inactive and return it anyway.  This newly accepted socket
1621 	 * will be disconnected later before we hand it off to the caller.
1622 	 */
1623 	if (error == EJUSTRETURN) {
1624 		error = 0;
1625 		(void) sosetdefunct(current_proc(), so,
1626 		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1627 	}
1628 
1629 	if (error != 0) {
1630 		/*
1631 		 * This may seem like a duplication to the above error
1632 		 * handling part when we return ECONNABORTED, except
1633 		 * the following is done while holding the lock since
1634 		 * the socket has been exposed to the filter(s) earlier.
1635 		 */
1636 		so->so_state &= ~SS_NOFDREF;
1637 		socket_unlock(so, 1);
1638 		soclose(so);
1639 		/* Propagate socket filter's error code to the caller */
1640 	} else {
1641 		socket_unlock(so, 1);
1642 	}
1643 done:
1644 	/* Callee checks for NULL pointer */
1645 	sock_freeaddr(remote);
1646 	sock_freeaddr(local);
1647 	return error;
1648 }
1649 
1650 /*
1651  * Returns:	0			Success
1652  *		EOPNOTSUPP		Operation not supported on socket
1653  *		EISCONN			Socket is connected
1654  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1655  *	<pru_connect>:EINVAL		Invalid argument
1656  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1657  *	<pru_connect>:EACCES		Permission denied
1658  *	<pru_connect>:EADDRINUSE	Address in use
1659  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1660  *	<pru_connect>:EPERM		Operation not permitted
1661  *	<sf_connect_out>:???		[anything a filter writer might set]
1662  */
1663 int
1664 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1665 {
1666 	int error;
1667 	struct proc *p = current_proc();
1668 	tracker_metadata_t metadata = { };
1669 
1670 	if (dolock) {
1671 		socket_lock(so, 1);
1672 	}
1673 
1674 	so_update_last_owner_locked(so, p);
1675 	so_update_policy(so);
1676 
1677 #if NECP
1678 	so_update_necp_policy(so, NULL, nam);
1679 #endif /* NECP */
1680 
1681 	/*
1682 	 * If this is a listening socket or if this is a previously-accepted
1683 	 * socket that has been marked as inactive, reject the connect request.
1684 	 */
1685 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1686 		error = EOPNOTSUPP;
1687 		if (so->so_flags & SOF_DEFUNCT) {
1688 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1689 			    "(%d)\n", __func__, proc_pid(p),
1690 			    proc_best_name(p),
1691 			    so->so_gencnt,
1692 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1693 		}
1694 		if (dolock) {
1695 			socket_unlock(so, 1);
1696 		}
1697 		return error;
1698 	}
1699 
1700 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1701 		if (dolock) {
1702 			socket_unlock(so, 1);
1703 		}
1704 		return EPERM;
1705 	}
1706 
1707 	/*
1708 	 * If protocol is connection-based, can only connect once.
1709 	 * Otherwise, if connected, try to disconnect first.
1710 	 * This allows user to disconnect by connecting to, e.g.,
1711 	 * a null address.
1712 	 */
1713 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1714 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1715 	    (error = sodisconnectlocked(so)))) {
1716 		error = EISCONN;
1717 	} else {
1718 		/*
1719 		 * For connected v4/v6 sockets, check whether the destination address maps to a domain name and
1720 		 * whether that domain is a tracker.  Mark the socket accordingly.  Skip the lookup if the socket is already marked a tracker.
1721 		 */
1722 		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1723 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1724 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1725 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1726 				}
1727 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1728 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1729 				}
1730 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1731 					printf("connect() - failed necp_set_socket_domain_attributes\n");
1732 				}
1733 			}
1734 		}
1735 
1736 		/*
1737 		 * Run connect filter before calling protocol:
1738 		 *  - non-blocking connect returns before completion;
1739 		 */
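		/*
		 * Note: a filter returning EJUSTRETURN is treated as having
		 * claimed the connect; we report success below but skip the
		 * protocol's pru_connect entirely.
		 */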
1740 		error = sflt_connectout(so, nam);
1741 		if (error != 0) {
1742 			if (error == EJUSTRETURN) {
1743 				error = 0;
1744 			}
1745 		} else {
1746 			error = (*so->so_proto->pr_usrreqs->pru_connect)
1747 			    (so, nam, p);
1748 			if (error != 0) {
1749 				so->so_state &= ~SS_ISCONNECTING;
1750 			}
1751 		}
1752 	}
1753 	if (dolock) {
1754 		socket_unlock(so, 1);
1755 	}
1756 	return error;
1757 }
1758 
1759 int
1760 soconnect(struct socket *so, struct sockaddr *nam)
1761 {
1762 	return soconnectlock(so, nam, 1);
1763 }
1764 
1765 /*
1766  * Returns:	0			Success
1767  *	<pru_connect2>:EINVAL[AF_UNIX]
1768  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1769  *	<pru_connect2>:???		[other protocol families]
1770  *
1771  * Notes:	<pru_connect2> is not supported by [TCP].
1772  */
1773 int
1774 soconnect2(struct socket *so1, struct socket *so2)
1775 {
1776 	int error;
1777 
1778 	socket_lock(so1, 1);
1779 	if (so2->so_proto->pr_lock) {
1780 		socket_lock(so2, 1);
1781 	}
1782 
1783 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1784 
1785 	socket_unlock(so1, 1);
1786 	if (so2->so_proto->pr_lock) {
1787 		socket_unlock(so2, 1);
1788 	}
1789 	return error;
1790 }
1791 
1792 int
1793 soconnectxlocked(struct socket *so, struct sockaddr *src,
1794     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1795     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1796     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1797 {
1798 	int error;
1799 	tracker_metadata_t metadata = { };
1800 
1801 	so_update_last_owner_locked(so, p);
1802 	so_update_policy(so);
1803 
1804 	/*
1805 	 * If this is a listening socket or if this is a previously-accepted
1806 	 * socket that has been marked as inactive, reject the connect request.
1807 	 */
1808 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1809 		error = EOPNOTSUPP;
1810 		if (so->so_flags & SOF_DEFUNCT) {
1811 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1812 			    "(%d)\n", __func__, proc_pid(p),
1813 			    proc_best_name(p),
1814 			    so->so_gencnt,
1815 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1816 		}
1817 		return error;
1818 	}
1819 
1820 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1821 		return EPERM;
1822 	}
1823 
1824 	/*
1825 	 * If protocol is connection-based, can only connect once
1826 	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1827 	 * try to disconnect first.  This allows user to disconnect
1828 	 * by connecting to, e.g., a null address.
1829 	 */
1830 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1831 	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1832 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1833 	    (error = sodisconnectlocked(so)) != 0)) {
1834 		error = EISCONN;
1835 	} else {
1836 		/*
1837 		 * For TCP, check if destination address is a tracker and mark the socket accordingly
1838 		 * (only if it hasn't been marked yet).
1839 		 */
1840 		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1841 		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1842 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1843 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1844 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1845 				}
1846 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1847 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1848 				}
1849 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1850 					printf("connectx() - failed necp_set_socket_domain_attributes\n");
1851 				}
1852 			}
1853 		}
1854 
1855 		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1856 		    (flags & CONNECT_DATA_IDEMPOTENT)) {
1857 			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1858 
1859 			if (flags & CONNECT_DATA_AUTHENTICATED) {
1860 				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1861 			}
1862 		}
1863 
1864 		/*
1865 		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1866 		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1867 		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1868 		 * Case 3 allows the user to combine write with connect even if they have
1869 		 * no use for TFO (such as regular TCP or UDP).
1870 		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1871 		 */
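		/*
		 * Hedged userspace illustration of Case 1 (not kernel code),
		 * using the connectx(2) interface:
		 *
		 *	sa_endpoints_t sae = { .sae_dstaddr = sa, .sae_dstaddrlen = salen };
		 *	connectx(s, &sae, SAE_ASSOCID_ANY, CONNECT_RESUME_ON_READ_WRITE,
		 *	    NULL, 0, NULL, NULL);
		 *
		 * defers the actual connection attempt (and any TFO data)
		 * until the first read or write on the socket.
		 */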
1872 		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1873 		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1874 			so->so_flags1 |= SOF1_PRECONNECT_DATA;
1875 		}
1876 
1877 		/*
1878 		 * If a user sets data idempotent and does not pass an uio, or
1879 		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1880 		 * SOF1_DATA_IDEMPOTENT.
1881 		 */
1882 		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1883 		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1884 			/* We should return EINVAL instead perhaps. */
1885 			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1886 		}
1887 
1888 		/*
1889 		 * Run connect filter before calling protocol:
1890 		 *  - non-blocking connect returns before completion;
1891 		 */
1892 		error = sflt_connectout(so, dst);
1893 		if (error != 0) {
1894 			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1895 			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1896 			if (error == EJUSTRETURN) {
1897 				error = 0;
1898 			}
1899 		} else {
1900 			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1901 			    (so, src, dst, p, ifscope, aid, pcid,
1902 			    flags, arg, arglen, auio, bytes_written);
1903 			if (error != 0) {
1904 				so->so_state &= ~SS_ISCONNECTING;
1905 				if (error != EINPROGRESS) {
1906 					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1907 				}
1908 			}
1909 		}
1910 	}
1911 
1912 	return error;
1913 }
1914 
1915 int
1916 sodisconnectlocked(struct socket *so)
1917 {
1918 	int error;
1919 
1920 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1921 		error = ENOTCONN;
1922 		goto bad;
1923 	}
1924 	if (so->so_state & SS_ISDISCONNECTING) {
1925 		error = EALREADY;
1926 		goto bad;
1927 	}
1928 
1929 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1930 	if (error == 0) {
1931 		sflt_notify(so, sock_evt_disconnected, NULL);
1932 	}
1933 
1934 bad:
1935 	return error;
1936 }
1937 
1938 /* Locking version */
1939 int
1940 sodisconnect(struct socket *so)
1941 {
1942 	int error;
1943 
1944 	socket_lock(so, 1);
1945 	error = sodisconnectlocked(so);
1946 	socket_unlock(so, 1);
1947 	return error;
1948 }
1949 
1950 int
1951 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1952 {
1953 	int error;
1954 
1955 	/*
1956 	 * Call the protocol disconnectx handler; let it handle all
1957 	 * matters related to the connection state of this session.
1958 	 */
1959 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1960 	if (error == 0) {
1961 		/*
1962 		 * The event applies only for the session, not for
1963 		 * the disconnection of individual subflows.
1964 		 */
1965 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1966 			sflt_notify(so, sock_evt_disconnected, NULL);
1967 		}
1968 	}
1969 	return error;
1970 }
1971 
1972 int
1973 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1974 {
1975 	int error;
1976 
1977 	socket_lock(so, 1);
1978 	error = sodisconnectxlocked(so, aid, cid);
1979 	socket_unlock(so, 1);
1980 	return error;
1981 }
1982 
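/*
 * Map MSG_DONTWAIT onto the sblock() wait flags: non-blocking callers get an
 * immediate-failure lock attempt, everyone else may sleep (SBL_WAIT).
 */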
1983 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1984 
1985 /*
1986  * sosendcheck will lock the socket buffer if it isn't locked and
1987  * verify that there is space for the data being inserted.
1988  *
1989  * Returns:	0			Success
1990  *		EPIPE
1991  *	sblock:EWOULDBLOCK
1992  *	sblock:EINTR
1993  *	sbwait:EBADF
1994  *	sbwait:EINTR
1995  *	[so_error]:???
1996  */
1997 int
1998 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1999     int32_t clen, int32_t atomic, int flags, int *sblocked)
2000 {
2001 	int     error = 0;
2002 	int32_t space;
2003 	int     assumelock = 0;
2004 
2005 restart:
2006 	if (*sblocked == 0) {
2007 		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2008 		    so->so_send_filt_thread != 0 &&
2009 		    so->so_send_filt_thread == current_thread()) {
2010 			/*
2011 			 * We're being called recursively from a filter,
2012 			 * allow this to continue. Radar 4150520.
2013 			 * Don't set sblocked because we don't want
2014 			 * to perform an unlock later.
2015 			 */
2016 			assumelock = 1;
2017 		} else {
2018 			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2019 			if (error) {
2020 				if (so->so_flags & SOF_DEFUNCT) {
2021 					goto defunct;
2022 				}
2023 				return error;
2024 			}
2025 			*sblocked = 1;
2026 		}
2027 	}
2028 
2029 	/*
2030 	 * If a send attempt is made on a socket that has been marked
2031 	 * as inactive (disconnected), reject the request.
2032 	 */
2033 	if (so->so_flags & SOF_DEFUNCT) {
2034 defunct:
2035 		error = EPIPE;
2036 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2037 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
2038 		    so->so_gencnt,
2039 		    SOCK_DOM(so), SOCK_TYPE(so), error);
2040 		return error;
2041 	}
2042 
2043 	if (so->so_state & SS_CANTSENDMORE) {
2044 #if CONTENT_FILTER
2045 		/*
2046 		 * Can re-inject data of half-closed connections
2047 		 */
2048 		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2049 		    so->so_snd.sb_cfil_thread == current_thread() &&
2050 		    cfil_sock_data_pending(&so->so_snd) != 0) {
2051 			CFIL_LOG(LOG_INFO,
2052 			    "so %llx ignore SS_CANTSENDMORE",
2053 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2054 		} else
2055 #endif /* CONTENT_FILTER */
2056 		return EPIPE;
2057 	}
2058 	if (so->so_error) {
2059 		error = so->so_error;
2060 		so->so_error = 0;
2061 		return error;
2062 	}
2063 
2064 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2065 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2066 			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2067 			    (resid != 0 || clen == 0) &&
2068 			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2069 				return ENOTCONN;
2070 			}
2071 		} else if (addr == 0) {
2072 			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2073 			       ENOTCONN : EDESTADDRREQ;
2074 		}
2075 	}
2076 
2077 	space = sbspace(&so->so_snd);
2078 
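	/*
	 * Historical BSD behavior: out-of-band sends may overrun the send
	 * buffer limit by a small fudge factor (1024 bytes).
	 */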
2079 	if (flags & MSG_OOB) {
2080 		space += 1024;
2081 	}
2082 	if ((atomic && resid > so->so_snd.sb_hiwat) ||
2083 	    clen > so->so_snd.sb_hiwat) {
2084 		return EMSGSIZE;
2085 	}
2086 
2087 	if ((space < resid + clen &&
2088 	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2089 	    space < clen)) ||
2090 	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2091 		/*
2092 		 * don't block the connectx call when there's more data
2093 		 * than can be copied.
2094 		 */
2095 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2096 			if (space == 0) {
2097 				return EWOULDBLOCK;
2098 			}
2099 			if (space < (int32_t)so->so_snd.sb_lowat) {
2100 				return 0;
2101 			}
2102 		}
2103 		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2104 		    assumelock) {
2105 			return EWOULDBLOCK;
2106 		}
2107 		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2108 		*sblocked = 0;
2109 		error = sbwait(&so->so_snd);
2110 		if (error) {
2111 			if (so->so_flags & SOF_DEFUNCT) {
2112 				goto defunct;
2113 			}
2114 			return error;
2115 		}
2116 		goto restart;
2117 	}
2118 	return 0;
2119 }
2120 
2121 /*
2122  * Send on a socket.
2123  * If send must go all at once and message is larger than
2124  * send buffering, then hard error.
2125  * Lock against other senders.
2126  * If must go all at once and not enough room now, then
2127  * inform user that this would block and do nothing.
2128  * Otherwise, if nonblocking, send as much as possible.
2129  * The data to be sent is described by "uio" if nonzero,
2130  * otherwise by the mbuf chain "top" (which must be null
2131  * if uio is not).  Data provided in mbuf chain must be small
2132  * enough to send all at once.
2133  *
2134  * Returns nonzero on error, timeout or signal; callers
2135  * must check for short counts if EINTR/ERESTART are returned.
2136  * Data and control buffers are freed on return.
2137  *
2138  * Returns:	0			Success
2139  *		EOPNOTSUPP
2140  *		EINVAL
2141  *		ENOBUFS
2142  *	uiomove:EFAULT
2143  *	sosendcheck:EPIPE
2144  *	sosendcheck:EWOULDBLOCK
2145  *	sosendcheck:EINTR
2146  *	sosendcheck:EBADF
2147  *	sosendcheck:EINTR
2148  *	sosendcheck:???			[value from so_error]
2149  *	<pru_send>:ECONNRESET[TCP]
2150  *	<pru_send>:EINVAL[TCP]
2151  *	<pru_send>:ENOBUFS[TCP]
2152  *	<pru_send>:EADDRINUSE[TCP]
2153  *	<pru_send>:EADDRNOTAVAIL[TCP]
2154  *	<pru_send>:EAFNOSUPPORT[TCP]
2155  *	<pru_send>:EACCES[TCP]
2156  *	<pru_send>:EAGAIN[TCP]
2157  *	<pru_send>:EPERM[TCP]
2158  *	<pru_send>:EMSGSIZE[TCP]
2159  *	<pru_send>:EHOSTUNREACH[TCP]
2160  *	<pru_send>:ENETUNREACH[TCP]
2161  *	<pru_send>:ENETDOWN[TCP]
2162  *	<pru_send>:ENOMEM[TCP]
2163  *	<pru_send>:ENOBUFS[TCP]
2164  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2165  *	<pru_send>:EINVAL[AF_UNIX]
2166  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2167  *	<pru_send>:EPIPE[AF_UNIX]
2168  *	<pru_send>:ENOTCONN[AF_UNIX]
2169  *	<pru_send>:EISCONN[AF_UNIX]
2170  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2171  *	<sf_data_out>:???		[whatever a filter author chooses]
2172  *
2173  * Notes:	Other <pru_send> returns depend on the protocol family; all
2174  *		<sf_data_out> returns depend on what the filter author causes
2175  *		their filter to return.
2176  */
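/*
 * (sosend() is the common backend for the socket write paths: "uio" carries
 * user data for send(2)/sendto(2)/sendmsg(2)/write(2), while kernel callers
 * may instead pass a prebuilt mbuf chain in "top".)
 */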
2177 int
2178 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2179     struct mbuf *top, struct mbuf *control, int flags)
2180 {
2181 	struct mbuf **mp;
2182 	struct mbuf *m, *freelist = NULL;
2183 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2184 	user_ssize_t space, len, resid, orig_resid;
2185 	int clen = 0, error, dontroute, sendflags;
2186 	int atomic = sosendallatonce(so) || top;
2187 	int sblocked = 0;
2188 	struct proc *p = current_proc();
2189 	uint16_t headroom = 0;
2190 	ssize_t mlen;
2191 	boolean_t en_tracing = FALSE;
2192 
2193 	if (uio != NULL) {
2194 		resid = uio_resid(uio);
2195 	} else {
2196 		resid = top->m_pkthdr.len;
2197 	}
2198 	orig_resid = resid;
2199 
2200 	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2201 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2202 
2203 	socket_lock(so, 1);
2204 
2205 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2206 		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2207 	}
2208 
2209 	/*
2210 	 * trace if tracing is enabled, for network (vs. unix) sockets,
2211 	 * and non-loopback traffic only
2212 	 */
2213 	if (ENTR_SHOULDTRACE &&
2214 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2215 		struct inpcb *inp = sotoinpcb(so);
2216 		if (inp->inp_last_outifp != NULL &&
2217 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2218 			en_tracing = TRUE;
2219 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2220 			    VM_KERNEL_ADDRPERM(so),
2221 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2222 			    (int64_t)resid);
2223 		}
2224 	}
2225 
2226 	/*
2227 	 * Re-injection should not affect process accounting
2228 	 */
2229 	if ((flags & MSG_SKIPCFIL) == 0) {
2230 		so_update_last_owner_locked(so, p);
2231 		so_update_policy(so);
2232 
2233 #if NECP
2234 		so_update_necp_policy(so, NULL, addr);
2235 #endif /* NECP */
2236 	}
2237 
2238 	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2239 		error = EOPNOTSUPP;
2240 		goto out_locked;
2241 	}
2242 
2243 	/*
2244 	 * In theory resid should be unsigned.
2245 	 * However, space must be signed, as it might be less than 0
2246 	 * if we over-committed, and we must use a signed comparison
2247 	 * of space and resid.  On the other hand, a negative resid
2248 	 * causes us to loop sending 0-length segments to the protocol.
2249 	 *
2250 	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2251 	 *
2252 	 * Note: We limit resid to be a positive int value as we use
2253 	 * imin() to set bytes_to_copy -- radr://14558484
2254 	 */
2255 	if (resid < 0 || resid > INT_MAX ||
2256 	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2257 		error = EINVAL;
2258 		goto out_locked;
2259 	}
2260 
2261 	dontroute = (flags & MSG_DONTROUTE) &&
2262 	    (so->so_options & SO_DONTROUTE) == 0 &&
2263 	    (so->so_proto->pr_flags & PR_ATOMIC);
2264 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2265 
2266 	if (control != NULL) {
2267 		clen = control->m_len;
2268 	}
2269 
2270 	if (soreserveheadroom != 0) {
2271 		headroom = so->so_pktheadroom;
2272 	}
2273 
2274 	do {
2275 		error = sosendcheck(so, addr, resid, clen, atomic, flags,
2276 		    &sblocked);
2277 		if (error) {
2278 			goto out_locked;
2279 		}
2280 
2281 		mp = &top;
2282 		space = sbspace(&so->so_snd) - clen;
2283 		space += ((flags & MSG_OOB) ? 1024 : 0);
2284 
2285 		do {
2286 			if (uio == NULL) {
2287 				/*
2288 				 * Data is prepackaged in "top".
2289 				 */
2290 				resid = 0;
2291 				if (flags & MSG_EOR) {
2292 					top->m_flags |= M_EOR;
2293 				}
2294 			} else {
2295 				int chainlength;
2296 				int bytes_to_copy;
2297 				boolean_t jumbocl;
2298 				boolean_t bigcl;
2299 				int bytes_to_alloc;
2300 
2301 				bytes_to_copy = imin((int)resid, (int)space);
2302 
2303 				bytes_to_alloc = bytes_to_copy;
2304 				if (top == NULL) {
2305 					bytes_to_alloc += headroom;
2306 				}
2307 
2308 				if (sosendminchain > 0) {
2309 					chainlength = 0;
2310 				} else {
2311 					chainlength = sosendmaxchain;
2312 				}
2313 
2314 				/*
2315 				 * Use big 4 KB cluster when the outgoing interface
2316 				 * does not prefer 2 KB clusters
2317 				 */
2318 				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2319 				    sosendbigcl_ignore_capab;
2320 
2321 				/*
2322 				 * Attempt to use larger than system page-size
2323 				 * clusters for large writes only if there is
2324 				 * a jumbo cluster pool and if the socket is
2325 				 * marked accordingly.
2326 				 */
2327 				jumbocl = sosendjcl && njcl > 0 &&
2328 				    ((so->so_flags & SOF_MULTIPAGES) ||
2329 				    sosendjcl_ignore_capab) &&
2330 				    bigcl;
2331 
2332 				socket_unlock(so, 0);
2333 
2334 				do {
2335 					int num_needed;
2336 					int hdrs_needed = (top == NULL) ? 1 : 0;
2337 
2338 					/*
2339 					 * Try to maintain a local cache of mbuf
2340 					 * clusters needed to complete this
2341 					 * write.  The list is further limited to
2342 					 * the number that are currently needed
2343 					 * to fill the socket.  This mechanism
2344 					 * allows a large number of mbufs/
2345 					 * clusters to be grabbed under a single
2346 					 * mbuf lock... if we can't get any
2347 					 * clusters, then fall back to trying
2348 					 * for mbufs.  If we fail early (or
2349 					 * miscalculate the number needed), make
2350 					 * sure to release any clusters we
2351 					 * haven't yet consumed.
2352 					 */
2353 					if (freelist == NULL &&
2354 					    bytes_to_alloc > MBIGCLBYTES &&
2355 					    jumbocl) {
2356 						num_needed =
2357 						    bytes_to_alloc / M16KCLBYTES;
2358 
2359 						if ((bytes_to_alloc -
2360 						    (num_needed * M16KCLBYTES))
2361 						    >= MINCLSIZE) {
2362 							num_needed++;
2363 						}
2364 
2365 						freelist =
2366 						    m_getpackets_internal(
2367 							(unsigned int *)&num_needed,
2368 							hdrs_needed, M_WAIT, 0,
2369 							M16KCLBYTES);
2370 						/*
2371 						 * Fall back to 4K cluster size
2372 						 * if allocation failed
2373 						 */
2374 					}
2375 
2376 					if (freelist == NULL &&
2377 					    bytes_to_alloc > MCLBYTES &&
2378 					    bigcl) {
2379 						num_needed =
2380 						    bytes_to_alloc / MBIGCLBYTES;
2381 
2382 						if ((bytes_to_alloc -
2383 						    (num_needed * MBIGCLBYTES)) >=
2384 						    MINCLSIZE) {
2385 							num_needed++;
2386 						}
2387 
2388 						freelist =
2389 						    m_getpackets_internal(
2390 							(unsigned int *)&num_needed,
2391 							hdrs_needed, M_WAIT, 0,
2392 							MBIGCLBYTES);
2393 						/*
2394 						 * Fall back to cluster size
2395 						 * if allocation failed
2396 						 */
2397 					}
2398 
2399 					/*
2400 					 * Allocate a cluster as we want to
2401 					 * avoid splitting the data in more
2402 					 * than one segment; using MINCLSIZE
2403 					 * would lead us to allocate two mbufs
2404 					 */
2405 					if (soreserveheadroom != 0 &&
2406 					    freelist == NULL &&
2407 					    ((top == NULL &&
2408 					    bytes_to_alloc > _MHLEN) ||
2409 					    bytes_to_alloc > _MLEN)) {
2410 						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2411 						    MCLBYTES;
2412 						freelist =
2413 						    m_getpackets_internal(
2414 							(unsigned int *)&num_needed,
2415 							hdrs_needed, M_WAIT, 0,
2416 							MCLBYTES);
2417 						/*
2418 						 * Fall back to a single mbuf
2419 						 * if allocation failed
2420 						 */
2421 					} else if (freelist == NULL &&
2422 					    bytes_to_alloc > MINCLSIZE) {
2423 						num_needed =
2424 						    bytes_to_alloc / MCLBYTES;
2425 
2426 						if ((bytes_to_alloc -
2427 						    (num_needed * MCLBYTES)) >=
2428 						    MINCLSIZE) {
2429 							num_needed++;
2430 						}
2431 
2432 						freelist =
2433 						    m_getpackets_internal(
2434 							(unsigned int *)&num_needed,
2435 							hdrs_needed, M_WAIT, 0,
2436 							MCLBYTES);
2437 						/*
2438 						 * Fall back to a single mbuf
2439 						 * if allocation failed
2440 						 */
2441 					}
2442 					/*
2443 					 * For datagram protocols, leave
2444 					 * headroom for protocol headers
2445 					 * in the first cluster of the chain
2446 					 */
2447 					if (freelist != NULL && atomic &&
2448 					    top == NULL && headroom > 0) {
2449 						freelist->m_data += headroom;
2450 					}
2451 
2452 					/*
2453 					 * Fall back to regular mbufs without
2454 					 * reserving the socket headroom
2455 					 */
2456 					if (freelist == NULL) {
2457 						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2458 							if (top == NULL) {
2459 								MGETHDR(freelist,
2460 								    M_WAIT, MT_DATA);
2461 							} else {
2462 								MGET(freelist,
2463 								    M_WAIT, MT_DATA);
2464 							}
2465 						}
2466 
2467 						if (freelist == NULL) {
2468 							error = ENOBUFS;
2469 							socket_lock(so, 0);
2470 							goto out_locked;
2471 						}
2472 						/*
2473 						 * For datagram protocols,
2474 						 * leave room for protocol
2475 						 * headers in first mbuf.
2476 						 */
2477 						if (atomic && top == NULL &&
2478 						    bytes_to_copy > 0 &&
2479 						    bytes_to_copy < MHLEN) {
2480 							MH_ALIGN(freelist,
2481 							    bytes_to_copy);
2482 						}
2483 					}
2484 					m = freelist;
2485 					freelist = m->m_next;
2486 					m->m_next = NULL;
2487 
2488 					if ((m->m_flags & M_EXT)) {
2489 						mlen = m->m_ext.ext_size -
2490 						    M_LEADINGSPACE(m);
2491 					} else if ((m->m_flags & M_PKTHDR)) {
2492 						mlen = MHLEN - M_LEADINGSPACE(m);
2493 						m_add_crumb(m, PKT_CRUMB_SOSEND);
2494 					} else {
2495 						mlen = MLEN - M_LEADINGSPACE(m);
2496 					}
2497 					len = imin((int)mlen, bytes_to_copy);
2498 
2499 					chainlength += len;
2500 
2501 					space -= len;
2502 
2503 					error = uiomove(mtod(m, caddr_t),
2504 					    (int)len, uio);
2505 
2506 					resid = uio_resid(uio);
2507 
2508 					m->m_len = (int32_t)len;
2509 					*mp = m;
2510 					top->m_pkthdr.len += len;
2511 					if (error) {
2512 						break;
2513 					}
2514 					mp = &m->m_next;
2515 					if (resid <= 0) {
2516 						if (flags & MSG_EOR) {
2517 							top->m_flags |= M_EOR;
2518 						}
2519 						break;
2520 					}
2521 					bytes_to_copy = imin((int)resid, (int)space);
2522 				} while (space > 0 &&
2523 				    (chainlength < sosendmaxchain || atomic ||
2524 				    resid < MINCLSIZE));
2525 
2526 				socket_lock(so, 0);
2527 
2528 				if (error) {
2529 					goto out_locked;
2530 				}
2531 			}
2532 
2533 			if (dontroute) {
2534 				so->so_options |= SO_DONTROUTE;
2535 			}
2536 
2537 			/*
2538 			 * Compute flags here, for pru_send and NKEs
2539 			 *
2540 			 * If the user set MSG_EOF, the protocol
2541 			 * understands this flag and nothing left to
2542 			 * send then use PRU_SEND_EOF instead of PRU_SEND.
2543 			 */
2544 			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2545 			    ((flags & MSG_EOF) &&
2546 			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2547 			    (resid <= 0)) ? PRUS_EOF :
2548 			    /* If there is more to send set PRUS_MORETOCOME */
2549 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2550 
2551 			if ((flags & MSG_SKIPCFIL) == 0) {
2552 				/*
2553 				 * Socket filter processing
2554 				 */
2555 				error = sflt_data_out(so, addr, &top,
2556 				    &control, (sendflags & MSG_OOB) ?
2557 				    sock_data_filt_flag_oob : 0);
2558 				if (error) {
2559 					if (error == EJUSTRETURN) {
2560 						error = 0;
2561 						goto packet_consumed;
2562 					}
2563 					goto out_locked;
2564 				}
2565 #if CONTENT_FILTER
2566 				/*
2567 				 * Content filter processing
2568 				 */
2569 				error = cfil_sock_data_out(so, addr, top,
2570 				    control, sendflags, dgram_flow_entry);
2571 				if (error) {
2572 					if (error == EJUSTRETURN) {
2573 						error = 0;
2574 						goto packet_consumed;
2575 					}
2576 					goto out_locked;
2577 				}
2578 #endif /* CONTENT_FILTER */
2579 			}
2580 			error = (*so->so_proto->pr_usrreqs->pru_send)
2581 			    (so, sendflags, top, addr, control, p);
2582 
2583 packet_consumed:
2584 			if (dontroute) {
2585 				so->so_options &= ~SO_DONTROUTE;
2586 			}
2587 
2588 			clen = 0;
2589 			control = NULL;
2590 			top = NULL;
2591 			mp = &top;
2592 			if (error) {
2593 				goto out_locked;
2594 			}
2595 		} while (resid && space > 0);
2596 	} while (resid);
2597 
2598 
2599 out_locked:
2600 	if (resid > orig_resid) {
2601 		char pname[MAXCOMLEN] = {};
2602 		pid_t current_pid = proc_pid(current_proc());
2603 		proc_name(current_pid, pname, sizeof(pname));
2604 
2605 		if (sosend_assert_panic != 0) {
2606 			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2607 			    so, resid, orig_resid, pname, current_pid);
2608 		} else {
2609 			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2610 			    so->so_gencnt, resid, orig_resid, pname, current_pid);
2611 		}
2612 	}
2613 
2614 	if (sblocked) {
2615 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2616 	} else {
2617 		socket_unlock(so, 1);
2618 	}
2619 	if (top != NULL) {
2620 		m_freem(top);
2621 	}
2622 	if (control != NULL) {
2623 		m_freem(control);
2624 	}
2625 	if (freelist != NULL) {
2626 		m_freem_list(freelist);
2627 	}
2628 
2629 	if (dgram_flow_entry != NULL) {
2630 		soflow_free_flow(dgram_flow_entry);
2631 	}
2632 
2633 	soclearfastopen(so);
2634 
2635 	if (en_tracing) {
2636 		/* resid passed here is the bytes left in uio */
2637 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2638 		    VM_KERNEL_ADDRPERM(so),
2639 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2640 		    (int64_t)(orig_resid - resid));
2641 	}
2642 	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2643 	    so->so_snd.sb_cc, space, error);
2644 
2645 	return error;
2646 }
2647 
2648 int
2649 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2650 {
2651 	struct mbuf *m0 = NULL, *control_end = NULL;
2652 
2653 	socket_lock_assert_owned(so);
2654 
2655 	/*
2656 	 * top must point to the mbuf chain to be sent.
2657 	 * If control is not NULL, top must be a packet header.
2658 	 */
2659 	VERIFY(top != NULL &&
2660 	    (control == NULL || top->m_flags & M_PKTHDR));
2661 
2662 	/*
2663 	 * If control is not passed in, see if we can get it
2664 	 * from top.
2665 	 */
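	/*
	 * In that case any control mbufs (MT_CONTROL) are expected to precede
	 * the data, whose start is marked by the first M_PKTHDR mbuf, e.g.:
	 *
	 *	MT_CONTROL -> MT_CONTROL -> data (M_PKTHDR) -> data -> ...
	 */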
2666 	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2667 		// Locate start of control if present and start of data
2668 		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2669 			if (m0->m_flags & M_PKTHDR) {
2670 				top = m0;
2671 				break;
2672 			} else if (m0->m_type == MT_CONTROL) {
2673 				if (control == NULL) {
2674 					// Found start of control
2675 					control = m0;
2676 				}
2677 				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2678 					// Found end of control
2679 					control_end = m0;
2680 				}
2681 			}
2682 		}
2683 		if (control_end != NULL) {
2684 			control_end->m_next = NULL;
2685 		}
2686 	}
2687 
2688 	int error = (*so->so_proto->pr_usrreqs->pru_send)
2689 	    (so, sendflags, top, addr, control, current_proc());
2690 
2691 	return error;
2692 }
2693 
2694 /*
2695  * Supports only connected sockets (no address) without ancillary data
2696  * (control mbuf), for atomic protocols
2697  */
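/*
 * (This path backs the batched sendmsg_x(2) private syscall, which submits
 * an array of uios in a single call.)
 */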
2698 int
2699 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2700 {
2701 	struct mbuf *m, *freelist = NULL;
2702 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2703 	user_ssize_t len, resid;
2704 	int error, dontroute;
2705 	int atomic = sosendallatonce(so);
2706 	int sblocked = 0;
2707 	struct proc *p = current_proc();
2708 	u_int uiofirst = 0;
2709 	u_int uiolast = 0;
2710 	struct mbuf *top = NULL;
2711 	uint16_t headroom = 0;
2712 	ssize_t mlen;
2713 	boolean_t bigcl;
2714 
2715 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2716 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2717 
2718 	if (so->so_type != SOCK_DGRAM) {
2719 		error = EINVAL;
2720 		goto out;
2721 	}
2722 	if (atomic == 0) {
2723 		error = EINVAL;
2724 		goto out;
2725 	}
2726 	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2727 		error = EPROTONOSUPPORT;
2728 		goto out;
2729 	}
2730 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2731 		error = EINVAL;
2732 		goto out;
2733 	}
2734 	resid = uio_array_resid(uioarray, uiocnt);
2735 
2736 	/*
2737 	 * In theory resid should be unsigned.
2738 	 * However, space must be signed, as it might be less than 0
2739 	 * if we over-committed, and we must use a signed comparison
2740 	 * of space and resid.  On the other hand, a negative resid
2741 	 * causes us to loop sending 0-length segments to the protocol.
2742 	 *
2743 	 * Note: We limit resid to be a positive int value as we use
2744 	 * imin() to set bytes_to_copy -- radr://14558484
2745 	 */
2746 	if (resid < 0 || resid > INT_MAX) {
2747 		error = EINVAL;
2748 		goto out;
2749 	}
2750 
2751 	socket_lock(so, 1);
2752 	so_update_last_owner_locked(so, p);
2753 	so_update_policy(so);
2754 
2755 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2756 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2757 	}
2758 
2759 #if NECP
2760 	so_update_necp_policy(so, NULL, NULL);
2761 #endif /* NECP */
2762 
2763 	dontroute = (flags & MSG_DONTROUTE) &&
2764 	    (so->so_options & SO_DONTROUTE) == 0 &&
2765 	    (so->so_proto->pr_flags & PR_ATOMIC);
2766 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2767 
2768 	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2769 	if (error) {
2770 		goto release;
2771 	}
2772 
2773 	/*
2774 	 * Use big 4 KB clusters when the outgoing interface does not prefer
2775 	 * 2 KB clusters
2776 	 */
2777 	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2778 
2779 	if (soreserveheadroom != 0) {
2780 		headroom = so->so_pktheadroom;
2781 	}
2782 
2783 	do {
2784 		int i;
2785 		int num_needed = 0;
2786 		int chainlength;
2787 		size_t maxpktlen = 0;
2788 		int bytes_to_alloc;
2789 
2790 		if (sosendminchain > 0) {
2791 			chainlength = 0;
2792 		} else {
2793 			chainlength = sosendmaxchain;
2794 		}
2795 
2796 		socket_unlock(so, 0);
2797 
2798 		/*
2799 		 * Find a set of uios that fits in a reasonable number
2800 		 * of mbuf packets
2801 		 */
2802 		for (i = uiofirst; i < uiocnt; i++) {
2803 			struct uio *auio = uioarray[i];
2804 
2805 			len = uio_resid(auio);
2806 
2807 			/* Do nothing for empty messages */
2808 			if (len == 0) {
2809 				continue;
2810 			}
2811 
2812 			num_needed += 1;
2813 			uiolast += 1;
2814 
2815 			if (len > maxpktlen) {
2816 				maxpktlen = len;
2817 			}
2818 
2819 			chainlength += len;
2820 			if (chainlength > sosendmaxchain) {
2821 				break;
2822 			}
2823 		}
2824 		/*
2825 		 * Nothing left to send
2826 		 */
2827 		if (num_needed == 0) {
2828 			socket_lock(so, 0);
2829 			break;
2830 		}
2831 		/*
2832 		 * Allocate a buffer large enough to include headroom space for
2833 		 * the network and link headers
2834 		 *
2835 		 */
2836 		bytes_to_alloc = (int) maxpktlen + headroom;
2837 
2838 		/*
2839 		 * Allocate a single contiguous buffer of the smallest available
2840 		 * size when possible
2841 		 */
2842 		if (bytes_to_alloc > MCLBYTES &&
2843 		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2844 			freelist = m_getpackets_internal(
2845 				(unsigned int *)&num_needed,
2846 				num_needed, M_WAIT, 1,
2847 				MBIGCLBYTES);
2848 		} else if (bytes_to_alloc > _MHLEN &&
2849 		    bytes_to_alloc <= MCLBYTES) {
2850 			freelist = m_getpackets_internal(
2851 				(unsigned int *)&num_needed,
2852 				num_needed, M_WAIT, 1,
2853 				MCLBYTES);
2854 		} else {
2855 			freelist = m_allocpacket_internal(
2856 				(unsigned int *)&num_needed,
2857 				bytes_to_alloc, NULL, M_WAIT, 1, 0);
2858 		}
2859 
2860 		if (freelist == NULL) {
2861 			socket_lock(so, 0);
2862 			error = ENOMEM;
2863 			goto release;
2864 		}
2865 		/*
2866 		 * Copy each uio of the set into its own mbuf packet
2867 		 */
2868 		for (i = uiofirst, m = freelist;
2869 		    i < uiolast && m != NULL;
2870 		    i++) {
2871 			int bytes_to_copy;
2872 			struct mbuf *n;
2873 			struct uio *auio = uioarray[i];
2874 
2875 			bytes_to_copy = (int)uio_resid(auio);
2876 
2877 			/* Do nothing for empty messages */
2878 			if (bytes_to_copy == 0) {
2879 				continue;
2880 			}
2881 			/*
2882 			 * Leave headroom for protocol headers
2883 			 * in the first mbuf of the chain
2884 			 */
2885 			m->m_data += headroom;
2886 
2887 			for (n = m; n != NULL; n = n->m_next) {
2888 				if ((m->m_flags & M_EXT)) {
2889 					mlen = m->m_ext.ext_size -
2890 					    M_LEADINGSPACE(m);
2891 				} else if ((m->m_flags & M_PKTHDR)) {
2892 					mlen =
2893 					    MHLEN - M_LEADINGSPACE(m);
2894 				} else {
2895 					mlen = MLEN - M_LEADINGSPACE(m);
2896 				}
2897 				len = imin((int)mlen, bytes_to_copy);
2898 
2899 				/*
2900 				 * Note: uiomove() decrements the iovec
2901 				 * length
2902 				 */
2903 				error = uiomove(mtod(n, caddr_t),
2904 				    (int)len, auio);
2905 				if (error != 0) {
2906 					break;
2907 				}
2908 				n->m_len = (int32_t)len;
2909 				m->m_pkthdr.len += len;
2910 
2911 				VERIFY(m->m_pkthdr.len <= maxpktlen);
2912 
2913 				bytes_to_copy -= len;
2914 				resid -= len;
2915 			}
2916 			if (m->m_pkthdr.len == 0) {
2917 				printf(
2918 					"%s:%d so %llx pkt %llx type %u len null\n",
2919 					__func__, __LINE__,
2920 					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2921 					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2922 					m->m_type);
2923 			}
2924 			if (error != 0) {
2925 				break;
2926 			}
2927 			m = m->m_nextpkt;
2928 		}
2929 
2930 		socket_lock(so, 0);
2931 
2932 		if (error) {
2933 			goto release;
2934 		}
2935 		top = freelist;
2936 		freelist = NULL;
2937 
2938 		if (dontroute) {
2939 			so->so_options |= SO_DONTROUTE;
2940 		}
2941 
2942 		if ((flags & MSG_SKIPCFIL) == 0) {
2943 			struct mbuf **prevnextp = NULL;
2944 
2945 			for (i = uiofirst, m = top;
2946 			    i < uiolast && m != NULL;
2947 			    i++) {
2948 				struct mbuf *nextpkt = m->m_nextpkt;
2949 
2950 				/*
2951 				 * Socket filter processing
2952 				 */
2953 				error = sflt_data_out(so, NULL, &m,
2954 				    NULL, 0);
2955 				if (error != 0 && error != EJUSTRETURN) {
2956 					goto release;
2957 				}
2958 
2959 #if CONTENT_FILTER
2960 				if (error == 0) {
2961 					/*
2962 					 * Content filter processing
2963 					 */
2964 					error = cfil_sock_data_out(so, NULL, m,
2965 					    NULL, 0, dgram_flow_entry);
2966 					if (error != 0 && error != EJUSTRETURN) {
2967 						goto release;
2968 					}
2969 				}
2970 #endif /* CONTENT_FILTER */
2971 				/*
2972 				 * Remove packet from the list when
2973 				 * swallowed by a filter
2974 				 */
2975 				if (error == EJUSTRETURN) {
2976 					error = 0;
2977 					if (prevnextp != NULL) {
2978 						*prevnextp = nextpkt;
2979 					} else {
2980 						top = nextpkt;
2981 					}
2982 				}
2983 
2984 				m = nextpkt;
2985 				if (m != NULL) {
2986 					prevnextp = &m->m_nextpkt;
2987 				}
2988 			}
2989 		}
2990 		if (top != NULL) {
2991 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2992 			    (so, 0, top, NULL, NULL, p);
2993 		}
2994 
2995 		if (dontroute) {
2996 			so->so_options &= ~SO_DONTROUTE;
2997 		}
2998 
2999 		top = NULL;
3000 		uiofirst = uiolast;
3001 	} while (resid > 0 && error == 0);
3002 release:
3003 	if (sblocked) {
3004 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
3005 	} else {
3006 		socket_unlock(so, 1);
3007 	}
3008 out:
3009 	if (top != NULL) {
3010 		m_freem(top);
3011 	}
3012 	if (freelist != NULL) {
3013 		m_freem_list(freelist);
3014 	}
3015 
3016 	if (dgram_flow_entry != NULL) {
3017 		soflow_free_flow(dgram_flow_entry);
3018 	}
3019 
3020 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
3021 	    so->so_snd.sb_cc, 0, error);
3022 
3023 	return error;
3024 }
3025 
3026 /*
3027  * May return ERESTART when packet is dropped by MAC policy check
3028  */
3029 static int
3030 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
3031     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
3032 {
3033 	int error = 0;
3034 	struct mbuf *m = *mp;
3035 	struct mbuf *nextrecord = *nextrecordp;
3036 
3037 	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3038 #if CONFIG_MACF_SOCKET_SUBSET
3039 	/*
3040 	 * Call the MAC framework for policy checking if we're in
3041 	 * the user process context and the socket isn't connected.
3042 	 */
3043 	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3044 		struct mbuf *m0 = m;
3045 		/*
3046 		 * Dequeue this record (temporarily) from the receive
3047 		 * list since we're about to drop the socket's lock
3048 		 * where a new record may arrive and be appended to
3049 		 * the list.  Upon MAC policy failure, the record
3050 		 * will be freed.  Otherwise, we'll add it back to
3051 		 * the head of the list.  We cannot rely on SB_LOCK
3052 		 * because append operation uses the socket's lock.
3053 		 */
3054 		do {
3055 			m->m_nextpkt = NULL;
3056 			sbfree(&so->so_rcv, m);
3057 			m = m->m_next;
3058 		} while (m != NULL);
3059 		m = m0;
3060 		so->so_rcv.sb_mb = nextrecord;
3061 		SB_EMPTY_FIXUP(&so->so_rcv);
3062 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3063 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3064 		socket_unlock(so, 0);
3065 
3066 		error = mac_socket_check_received(kauth_cred_get(), so,
3067 		    mtod(m, struct sockaddr *));
3068 
3069 		if (error != 0) {
3070 			/*
3071 			 * MAC policy failure; free this record and
3072 			 * process the next record (or block until
3073 			 * one is available).  We have adjusted sb_cc
3074 			 * and sb_mbcnt above so there is no need to
3075 			 * call sbfree() again.
3076 			 */
3077 			m_freem(m);
3078 			/*
3079 			 * Clear SB_LOCK but don't unlock the socket.
3080 			 * Process the next record or wait for one.
3081 			 */
3082 			socket_lock(so, 0);
3083 			sbunlock(&so->so_rcv, TRUE); /* stay locked */
3084 			error = ERESTART;
3085 			goto done;
3086 		}
3087 		socket_lock(so, 0);
3088 		/*
3089 		 * If the socket has been defunct'd, drop it.
3090 		 */
3091 		if (so->so_flags & SOF_DEFUNCT) {
3092 			m_freem(m);
3093 			error = ENOTCONN;
3094 			goto done;
3095 		}
3096 		/*
3097 		 * Re-adjust the socket receive list and re-enqueue
3098 		 * the record in front of any packets which may have
3099 		 * been appended while we dropped the lock.
3100 		 */
3101 		for (m = m0; m->m_next != NULL; m = m->m_next) {
3102 			sballoc(&so->so_rcv, m);
3103 		}
3104 		sballoc(&so->so_rcv, m);
3105 		if (so->so_rcv.sb_mb == NULL) {
3106 			so->so_rcv.sb_lastrecord = m0;
3107 			so->so_rcv.sb_mbtail = m;
3108 		}
3109 		m = m0;
3110 		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3111 		so->so_rcv.sb_mb = m;
3112 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3113 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3114 	}
3115 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3116 	if (psa != NULL) {
3117 		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3118 		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3119 			error = EWOULDBLOCK;
3120 			goto done;
3121 		}
3122 	}
3123 	if (flags & MSG_PEEK) {
3124 		m = m->m_next;
3125 	} else {
3126 		sbfree(&so->so_rcv, m);
3127 		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3128 			panic("%s: about to create invalid socketbuf",
3129 			    __func__);
3130 			/* NOTREACHED */
3131 		}
3132 		MFREE(m, so->so_rcv.sb_mb);
3133 		m = so->so_rcv.sb_mb;
3134 		if (m != NULL) {
3135 			m->m_nextpkt = nextrecord;
3136 		} else {
3137 			so->so_rcv.sb_mb = nextrecord;
3138 			SB_EMPTY_FIXUP(&so->so_rcv);
3139 		}
3140 	}
3141 done:
3142 	*mp = m;
3143 	*nextrecordp = nextrecord;
3144 
3145 	return error;
3146 }
3147 
3148 /*
3149  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3150  * so clear the data portion in order not to leak the file pointers
3151  */
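/*
 * For example, a peeked SCM_RIGHTS message carrying three descriptors has
 * its payload (three ints) zeroed, while the cmsghdr itself is preserved.
 */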
3152 static void
3153 sopeek_scm_rights(struct mbuf *rights)
3154 {
3155 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3156 
3157 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3158 		VERIFY(cm->cmsg_len <= rights->m_len);
3159 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3160 	}
3161 }
3162 
3163 /*
3164  * Process one or more MT_CONTROL mbufs present before any data mbufs
3165  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3166  * just copy the data; if !MSG_PEEK, we call into the protocol to
3167  * perform externalization.
3168  */
3169 static int
3170 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3171     struct mbuf **mp, struct mbuf **nextrecordp)
3172 {
3173 	int error = 0;
3174 	struct mbuf *cm = NULL, *cmn;
3175 	struct mbuf **cme = &cm;
3176 	struct sockbuf *sb_rcv = &so->so_rcv;
3177 	struct mbuf **msgpcm = NULL;
3178 	struct mbuf *m = *mp;
3179 	struct mbuf *nextrecord = *nextrecordp;
3180 	struct protosw *pr = so->so_proto;
3181 
3182 	/*
3183 	 * Externalizing the control messages would require us to
3184 	 * drop the socket's lock below.  Once we re-acquire the
3185 	 * lock, the mbuf chain might change.  In order to preserve
3186 	 * consistency, we unlink all control messages from the
3187 	 * first mbuf chain in one shot and link them separately
3188 	 * onto a different chain.
3189 	 */
3190 	do {
3191 		if (flags & MSG_PEEK) {
3192 			if (controlp != NULL) {
3193 				if (*controlp == NULL) {
3194 					msgpcm = controlp;
3195 				}
3196 				*controlp = m_copy(m, 0, m->m_len);
3197 
3198 				/*
3199 				 * If we failed to allocate an mbuf,
3200 				 * release any previously allocated
3201 				 * mbufs for control data. Return
3202 				 * an error. Keep the mbufs in the
3203 				 * socket as this is using
3204 				 * MSG_PEEK flag.
3205 				 */
3206 				if (*controlp == NULL) {
3207 					m_freem(*msgpcm);
3208 					error = ENOBUFS;
3209 					goto done;
3210 				}
3211 
3212 				if (pr->pr_domain->dom_externalize != NULL) {
3213 					sopeek_scm_rights(*controlp);
3214 				}
3215 
3216 				controlp = &(*controlp)->m_next;
3217 			}
3218 			m = m->m_next;
3219 		} else {
3220 			m->m_nextpkt = NULL;
3221 			sbfree(sb_rcv, m);
3222 			sb_rcv->sb_mb = m->m_next;
3223 			m->m_next = NULL;
3224 			*cme = m;
3225 			cme = &(*cme)->m_next;
3226 			m = sb_rcv->sb_mb;
3227 		}
3228 	} while (m != NULL && m->m_type == MT_CONTROL);
3229 
3230 	if (!(flags & MSG_PEEK)) {
3231 		if (sb_rcv->sb_mb != NULL) {
3232 			sb_rcv->sb_mb->m_nextpkt = nextrecord;
3233 		} else {
3234 			sb_rcv->sb_mb = nextrecord;
3235 			SB_EMPTY_FIXUP(sb_rcv);
3236 		}
3237 		if (nextrecord == NULL) {
3238 			sb_rcv->sb_lastrecord = m;
3239 		}
3240 	}
3241 
3242 	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3243 	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3244 
3245 	while (cm != NULL) {
3246 		int cmsg_level;
3247 		int cmsg_type;
3248 
3249 		cmn = cm->m_next;
3250 		cm->m_next = NULL;
3251 		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3252 		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3253 
3254 		/*
3255 		 * Call the protocol to externalize SCM_RIGHTS message
3256 		 * and return the modified message to the caller upon
3257 		 * success.  Otherwise, all other control messages are
3258 		 * returned unmodified to the caller.  Note that we
3259 		 * only get into this loop if MSG_PEEK is not set.
3260 		 */
3261 		if (pr->pr_domain->dom_externalize != NULL &&
3262 		    cmsg_level == SOL_SOCKET &&
3263 		    cmsg_type == SCM_RIGHTS) {
3264 			/*
3265 			 * Release socket lock: see 3903171.  This
3266 			 * would also allow more records to be appended
3267 			 * to the socket buffer.  We still have SB_LOCK
3268 			 * set on it, so we can be sure that the head
3269 			 * of the mbuf chain won't change.
3270 			 */
3271 			socket_unlock(so, 0);
3272 			error = (*pr->pr_domain->dom_externalize)(cm);
3273 			socket_lock(so, 0);
3274 		} else {
3275 			error = 0;
3276 		}
3277 
3278 		if (controlp != NULL && error == 0) {
3279 			*controlp = cm;
3280 			controlp = &(*controlp)->m_next;
3281 		} else {
3282 			(void) m_free(cm);
3283 		}
3284 		cm = cmn;
3285 	}
3286 	/*
3287 	 * Update the value of nextrecord in case we received new
3288 	 * records when the socket was unlocked above for
3289 	 * externalizing SCM_RIGHTS.
3290 	 */
3291 	if (m != NULL) {
3292 		nextrecord = sb_rcv->sb_mb->m_nextpkt;
3293 	} else {
3294 		nextrecord = sb_rcv->sb_mb;
3295 	}
3296 
3297 done:
3298 	*mp = m;
3299 	*nextrecordp = nextrecord;
3300 
3301 	return error;
3302 }
3303 
3304 /*
3305  * If we have less data than requested, block awaiting more
3306  * (subject to any timeout) if:
3307  *   1. the current count is less than the low water mark, or
3308  *   2. MSG_WAITALL is set, and it is possible to do the entire
3309  *	receive operation at once if we block (resid <= hiwat).
3310  *   3. MSG_DONTWAIT is not set
3311  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3312  * we have to do the receive in sections, and thus risk returning
3313  * a short count if a timeout or signal occurs after we start.
3314  */
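/*
 * Worked example: with SO_RCVLOWAT set to 128 and only 64 bytes queued, a
 * blocking 256-byte read on a stream socket waits (case 1); with MSG_WAITALL
 * and sb_hiwat >= 256, the same read also waits for the full 256 bytes
 * (case 2).
 */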
3315 static boolean_t
3316 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3317 {
3318 	struct protosw *pr = so->so_proto;
3319 
3320 	/* No mbufs in the receive-queue? Wait! */
3321 	if (m == NULL) {
3322 		return true;
3323 	}
3324 
3325 	/* Not enough data in the receive socket-buffer - we may have to wait */
3326 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3327 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3328 		/*
3329 		 * Application has set the low-water mark, so we should wait for
3330 		 * this data to be present.
3331 		 */
3332 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3333 			return true;
3334 		}
3335 
3336 		/*
3337 		 * Application wants all the data - so let's try to do the
3338 		 * receive-operation at once by waiting for everything to
3339 		 * be there.
3340 		 */
3341 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3342 			return true;
3343 		}
3344 	}
3345 
3346 	return false;
3347 }
3348 
3349 /*
3350  * Implement receive operations on a socket.
3351  * We depend on the way that records are added to the sockbuf
3352  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3353  * must begin with an address if the protocol so specifies,
3354  * followed by an optional mbuf or mbufs containing ancillary data,
3355  * and then zero or more mbufs of data.
3356  * In order to avoid blocking network interrupts for the entire time here,
3357  * we splx() while doing the actual copy to user space.
3358  * Although the sockbuf is locked, new data may still be appended,
3359  * and thus we must maintain consistency of the sockbuf during that time.
3360  *
3361  * The caller may receive the data as a single mbuf chain by supplying
3362  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3363  * only for the count in uio_resid.
3364  *
3365  * Returns:	0			Success
3366  *		ENOBUFS
3367  *		ENOTCONN
3368  *		EWOULDBLOCK
3369  *	uiomove:EFAULT
3370  *	sblock:EWOULDBLOCK
3371  *	sblock:EINTR
3372  *	sbwait:EBADF
3373  *	sbwait:EINTR
3374  *	sodelayed_copy:EFAULT
3375  *	<pru_rcvoob>:EINVAL[TCP]
3376  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3377  *	<pru_rcvoob>:???
3378  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3379  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3380  *	<pr_domain->dom_externalize>:???
3381  *
3382  * Notes:	Additional return values from calls through <pru_rcvoob> and
3383  *		<pr_domain->dom_externalize> depend on protocols other than
3384  *		TCP or AF_UNIX, which are documented above.
3385  */
3386 int
3387 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3388     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3389 {
3390 	struct mbuf *m, **mp, *ml = NULL;
3391 	struct mbuf *nextrecord, *free_list;
3392 	int flags, error, offset;
3393 	user_ssize_t len;
3394 	struct protosw *pr = so->so_proto;
3395 	int moff, type = 0;
3396 	user_ssize_t orig_resid = uio_resid(uio);
3397 	user_ssize_t delayed_copy_len;
3398 	int can_delay;
3399 	struct proc *p = current_proc();
3400 	boolean_t en_tracing = FALSE;
3401 
3402 	/*
3403 	 * Sanity check on the length passed by caller as we are making 'int'
3404 	 * comparisons
3405 	 */
3406 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3407 		return EINVAL;
3408 	}
3409 
3410 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3411 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3412 	    so->so_rcv.sb_hiwat);
3413 
3414 	socket_lock(so, 1);
3415 	so_update_last_owner_locked(so, p);
3416 	so_update_policy(so);
3417 
3418 #ifdef MORE_LOCKING_DEBUG
3419 	if (so->so_usecount == 1) {
3420 		panic("%s: so=%p no other reference on socket", __func__, so);
3421 		/* NOTREACHED */
3422 	}
3423 #endif
3424 	mp = mp0;
3425 	if (psa != NULL) {
3426 		*psa = NULL;
3427 	}
3428 	if (controlp != NULL) {
3429 		*controlp = NULL;
3430 	}
3431 	if (flagsp != NULL) {
3432 		flags = *flagsp & ~MSG_EOR;
3433 	} else {
3434 		flags = 0;
3435 	}
3436 
3437 	/*
3438 	 * If a recv attempt is made on a previously-accepted socket
3439 	 * that has been marked as inactive (disconnected), reject
3440 	 * the request.
3441 	 */
3442 	if (so->so_flags & SOF_DEFUNCT) {
3443 		struct sockbuf *sb = &so->so_rcv;
3444 
3445 		error = ENOTCONN;
3446 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3447 		    __func__, proc_pid(p), proc_best_name(p),
3448 		    so->so_gencnt,
3449 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3450 		/*
3451 		 * This socket should have been disconnected and flushed
3452 		 * prior to being returned from sodefunct(); there should
3453 		 * be no data on its receive list, so panic otherwise.
3454 		 */
3455 		if (so->so_state & SS_DEFUNCT) {
3456 			sb_empty_assert(sb, __func__);
3457 		}
3458 		socket_unlock(so, 1);
3459 		return error;
3460 	}
3461 
3462 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3463 	    pr->pr_usrreqs->pru_preconnect) {
3464 		/*
3465 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3466 		 * call write() right after this.  *If* the app calls a read,
3467 		 * we do not want to block this read indefinitely.  Thus,
3468 		 * we trigger a connect so that the session gets initiated.
3469 		 */
3470 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3471 
3472 		if (error) {
3473 			socket_unlock(so, 1);
3474 			return error;
3475 		}
3476 	}
3477 
3478 	if (ENTR_SHOULDTRACE &&
3479 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3480 		/*
3481 		 * enable energy tracing for inet sockets that go over
3482 		 * non-loopback interfaces only.
3483 		 */
3484 		struct inpcb *inp = sotoinpcb(so);
3485 		if (inp->inp_last_outifp != NULL &&
3486 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3487 			en_tracing = TRUE;
3488 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3489 			    VM_KERNEL_ADDRPERM(so),
3490 			    ((so->so_state & SS_NBIO) ?
3491 			    kEnTrFlagNonBlocking : 0),
3492 			    (int64_t)orig_resid);
3493 		}
3494 	}
3495 
3496 	/*
3497 	 * When SO_WANTOOBFLAG is set, we try to get out-of-band data
3498 	 * regardless of the flags argument.  Here is the case where
3499 	 * out-of-band data is not inline.
3500 	 */
3501 	if ((flags & MSG_OOB) ||
3502 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3503 	    (so->so_options & SO_OOBINLINE) == 0 &&
3504 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3505 		m = m_get(M_WAIT, MT_DATA);
3506 		if (m == NULL) {
3507 			socket_unlock(so, 1);
3508 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3509 			    ENOBUFS, 0, 0, 0, 0);
3510 			return ENOBUFS;
3511 		}
3512 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3513 		if (error) {
3514 			goto bad;
3515 		}
3516 		socket_unlock(so, 0);
3517 		do {
3518 			error = uiomove(mtod(m, caddr_t),
3519 			    imin((int)uio_resid(uio), m->m_len), uio);
3520 			m = m_free(m);
3521 		} while (uio_resid(uio) && error == 0 && m != NULL);
3522 		socket_lock(so, 0);
3523 bad:
3524 		if (m != NULL) {
3525 			m_freem(m);
3526 		}
3527 
3528 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3529 			if (error == EWOULDBLOCK || error == EINVAL) {
3530 				/*
3531 				 * Let's try to get normal data:
3532 				 * EWOULDBLOCK: out-of-band data not
3533 				 * received yet.  EINVAL: out-of-band data
3534 				 * already read.
3535 				 */
3536 				error = 0;
3537 				goto nooob;
3538 			} else if (error == 0 && flagsp != NULL) {
3539 				*flagsp |= MSG_OOB;
3540 			}
3541 		}
3542 		socket_unlock(so, 1);
3543 		if (en_tracing) {
3544 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3545 			    VM_KERNEL_ADDRPERM(so), 0,
3546 			    (int64_t)(orig_resid - uio_resid(uio)));
3547 		}
3548 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3549 		    0, 0, 0, 0);
3550 
3551 		return error;
3552 	}
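	/*
	 * Illustrative note (not part of the original sources): the block
	 * above services a user-space out-of-band read on a TCP socket
	 * with SO_OOBINLINE off, e.g.
	 *
	 *	char oob;
	 *	ssize_t n = recv(s, &oob, 1, MSG_OOB);
	 *
	 * recv(2) fails with EINVAL once the urgent byte has been consumed
	 * and with EWOULDBLOCK when it has not arrived yet; those are
	 * exactly the two errors remapped to a normal read above when
	 * SO_WANTOOBFLAG is set.
	 */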
3553 nooob:
3554 	if (mp != NULL) {
3555 		*mp = NULL;
3556 	}
3557 
3558 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3559 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3560 	}
3561 
3562 	free_list = NULL;
3563 	delayed_copy_len = 0;
3564 restart:
3565 #ifdef MORE_LOCKING_DEBUG
3566 	if (so->so_usecount <= 1) {
3567 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3568 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3569 	}
3570 #endif
3571 	/*
3572 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3573 	 * and if so just return to the caller.  This could happen when
3574 	 * soreceive() is called by a socket upcall function while the
3575 	 * socket is being freed.  The socket buffer would have been
3576 	 * locked across the upcall, therefore we cannot put this thread
3577 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3578 	 * we may livelock), because the lock on the socket buffer will
3579 	 * only be released when the upcall routine returns to its caller.
3580 	 * Because the socket has been officially closed, there can be
3581 	 * no further read on it.
3582 	 *
3583 	 * A multipath subflow socket would have its SS_NOFDREF set by
3584 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3585 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3586 	 */
3587 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3588 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3589 		socket_unlock(so, 1);
3590 		return 0;
3591 	}
3592 
3593 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3594 	if (error) {
3595 		socket_unlock(so, 1);
3596 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3597 		    0, 0, 0, 0);
3598 		if (en_tracing) {
3599 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3600 			    VM_KERNEL_ADDRPERM(so), 0,
3601 			    (int64_t)(orig_resid - uio_resid(uio)));
3602 		}
3603 		return error;
3604 	}
3605 
3606 	m = so->so_rcv.sb_mb;
3607 	if (so_should_wait(so, uio, m, flags)) {
3608 		/*
3609 		 * Panic if we notice inconsistencies in the socket's
3610 		 * receive list; both sb_mb and sb_cc should correctly
3611 		 * reflect the contents of the list, otherwise we may
3612 		 * end up with false positives during select() or poll()
3613 		 * which could put the application in a bad state.
3614 		 */
3615 		SB_MB_CHECK(&so->so_rcv);
3616 
3617 		if (so->so_error) {
3618 			if (m != NULL) {
3619 				goto dontblock;
3620 			}
3621 			error = so->so_error;
3622 			if ((flags & MSG_PEEK) == 0) {
3623 				so->so_error = 0;
3624 			}
3625 			goto release;
3626 		}
3627 		if (so->so_state & SS_CANTRCVMORE) {
3628 #if CONTENT_FILTER
3629 			/*
3630 			 * Deal with half closed connections
3631 			 */
3632 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3633 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3634 				CFIL_LOG(LOG_INFO,
3635 				    "so %llx ignore SS_CANTRCVMORE",
3636 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3637 			} else
3638 #endif /* CONTENT_FILTER */
3639 			if (m != NULL) {
3640 				goto dontblock;
3641 			} else {
3642 				goto release;
3643 			}
3644 		}
3645 		for (; m != NULL; m = m->m_next) {
3646 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3647 				m = so->so_rcv.sb_mb;
3648 				goto dontblock;
3649 			}
3650 		}
3651 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3652 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3653 			error = ENOTCONN;
3654 			goto release;
3655 		}
3656 		if (uio_resid(uio) == 0) {
3657 			goto release;
3658 		}
3659 
3660 		if ((so->so_state & SS_NBIO) ||
3661 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3662 			error = EWOULDBLOCK;
3663 			goto release;
3664 		}
3665 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3666 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3667 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3668 #if EVEN_MORE_LOCKING_DEBUG
3669 		if (socket_debug) {
3670 			printf("Waiting for socket data\n");
3671 		}
3672 #endif
3673 
3674 		/*
3675 		 * Depending on the protocol (e.g. TCP), the following
3676 		 * might cause the socket lock to be dropped and later
3677 		 * be reacquired, and more data could have arrived and
3678 		 * have been appended to the receive socket buffer by
3679 	 * the time it returns.  Therefore, we sleep in
3680 	 * sbwait() below only if the wait condition is
3681 	 * still true.
3682 		 */
3683 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3684 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3685 		}
3686 
3687 		error = 0;
3688 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3689 			error = sbwait(&so->so_rcv);
3690 		}
3691 
3692 #if EVEN_MORE_LOCKING_DEBUG
3693 		if (socket_debug) {
3694 			printf("SORECEIVE - sbwait returned %d\n", error);
3695 		}
3696 #endif
3697 		if (so->so_usecount < 1) {
3698 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3699 			    __func__, so, so->so_usecount);
3700 			/* NOTREACHED */
3701 		}
3702 		if (error) {
3703 			socket_unlock(so, 1);
3704 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3705 			    0, 0, 0, 0);
3706 			if (en_tracing) {
3707 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3708 				    VM_KERNEL_ADDRPERM(so), 0,
3709 				    (int64_t)(orig_resid - uio_resid(uio)));
3710 			}
3711 			return error;
3712 		}
3713 		goto restart;
3714 	}
3715 dontblock:
3716 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3717 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3718 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3719 	nextrecord = m->m_nextpkt;
3720 
3721 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3722 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3723 		    mp0 == NULL);
3724 		if (error == ERESTART) {
3725 			goto restart;
3726 		} else if (error != 0) {
3727 			goto release;
3728 		}
3729 		orig_resid = 0;
3730 	}
3731 
3732 	/*
3733 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3734 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3735 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3736 	 * perform externalization.
3737 	 */
3738 	if (m != NULL && m->m_type == MT_CONTROL) {
3739 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3740 		if (error != 0) {
3741 			goto release;
3742 		}
3743 		orig_resid = 0;
3744 	}
3745 
3746 	if (m != NULL) {
3747 		if (!(flags & MSG_PEEK)) {
3748 			/*
3749 			 * We get here because m points to an mbuf following
3750 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3751 			 * processed above.  In any case, m should be pointing
3752 			 * to the head of the mbuf chain, and the nextrecord
3753 			 * should be either NULL or equal to m->m_nextpkt.
3754 			 * See comments above about SB_LOCK.
3755 			 */
3756 			if (m != so->so_rcv.sb_mb ||
3757 			    m->m_nextpkt != nextrecord) {
3758 				panic("%s: post-control !sync so=%p m=%p "
3759 				    "nextrecord=%p\n", __func__, so, m,
3760 				    nextrecord);
3761 				/* NOTREACHED */
3762 			}
3763 			if (nextrecord == NULL) {
3764 				so->so_rcv.sb_lastrecord = m;
3765 			}
3766 		}
3767 		type = m->m_type;
3768 		if (type == MT_OOBDATA) {
3769 			flags |= MSG_OOB;
3770 		}
3771 	} else {
3772 		if (!(flags & MSG_PEEK)) {
3773 			SB_EMPTY_FIXUP(&so->so_rcv);
3774 		}
3775 	}
3776 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3777 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3778 
3779 	moff = 0;
3780 	offset = 0;
3781 
3782 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3783 		can_delay = 1;
3784 	} else {
3785 		can_delay = 0;
3786 	}
3787 
3788 	while (m != NULL &&
3789 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3790 		if (m->m_type == MT_OOBDATA) {
3791 			if (type != MT_OOBDATA) {
3792 				break;
3793 			}
3794 		} else if (type == MT_OOBDATA) {
3795 			break;
3796 		}
3797 
3798 		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3799 		    m->m_type != MT_HEADER) {
3800 			break;
3801 		}
3802 		/*
3803 		 * Make sure to always set MSG_OOB when getting
3804 		 * out-of-band data inline.
3805 		 */
3806 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3807 		    (so->so_options & SO_OOBINLINE) != 0 &&
3808 		    (so->so_state & SS_RCVATMARK) != 0) {
3809 			flags |= MSG_OOB;
3810 		}
3811 		so->so_state &= ~SS_RCVATMARK;
3812 		len = uio_resid(uio) - delayed_copy_len;
3813 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3814 			len = so->so_oobmark - offset;
3815 		}
3816 		if (len > m->m_len - moff) {
3817 			len = m->m_len - moff;
3818 		}
3819 		/*
3820 		 * If mp is set, just pass back the mbufs.
3821 		 * Otherwise copy them out via the uio, then free.
3822 		 * The sockbuf must be consistent here (sb_mb points to the
3823 		 * current mbuf, m_nextpkt to the next record) when we drop
3824 		 * priority; we must note any additions to the sockbuf when
3825 		 * we block interrupts again.
3826 		 */
3827 		if (mp == NULL) {
3828 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3829 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3830 			if (can_delay && len == m->m_len) {
3831 				 * Only delay the copy if we're consuming the
3832 				 * mbuf, we're NOT in MSG_PEEK mode, and we
3833 				 * have enough data to make it worthwhile
3834 				 * to drop and retake the lock; can_delay
3835 				 * reflects the two latter
3836 				 * constraints.  moff should always be zero
3837 				 * in these cases.
3838 				 * in these cases
3839 				 */
3840 				delayed_copy_len += len;
3841 			} else {
3842 				if (delayed_copy_len) {
3843 					error = sodelayed_copy(so, uio,
3844 					    &free_list, &delayed_copy_len);
3845 
3846 					if (error) {
3847 						goto release;
3848 					}
3849 					/*
3850 					 * We can only get here if MSG_PEEK is
3851 					 * not set; therefore, m should point at
3852 					 * the head of the rcv queue.  If it
3853 					 * doesn't, something drastic
3854 					 * changed while we were out from behind
3855 					 * the lock in sodelayed_copy(), perhaps
3856 					 * a RST on the stream.  In any event,
3857 					 * the stream has been interrupted; it's
3858 					 * probably best just to return whatever
3859 					 * data we've moved and let the caller
3860 					 * sort it out.
3861 					 */
3862 					if (m != so->so_rcv.sb_mb) {
3863 						break;
3864 					}
3865 				}
3866 				socket_unlock(so, 0);
3867 				error = uiomove(mtod(m, caddr_t) + moff,
3868 				    (int)len, uio);
3869 				socket_lock(so, 0);
3870 
3871 				if (error) {
3872 					goto release;
3873 				}
3874 			}
3875 		} else {
3876 			uio_setresid(uio, (uio_resid(uio) - len));
3877 		}
3878 		if (len == m->m_len - moff) {
3879 			if (m->m_flags & M_EOR) {
3880 				flags |= MSG_EOR;
3881 			}
3882 			if (flags & MSG_PEEK) {
3883 				m = m->m_next;
3884 				moff = 0;
3885 			} else {
3886 				nextrecord = m->m_nextpkt;
3887 				sbfree(&so->so_rcv, m);
3888 				m->m_nextpkt = NULL;
3889 
3890 				if (mp != NULL) {
3891 					*mp = m;
3892 					mp = &m->m_next;
3893 					so->so_rcv.sb_mb = m = m->m_next;
3894 					*mp = NULL;
3895 				} else {
3896 					if (free_list == NULL) {
3897 						free_list = m;
3898 					} else {
3899 						ml->m_next = m;
3900 					}
3901 					ml = m;
3902 					so->so_rcv.sb_mb = m = m->m_next;
3903 					ml->m_next = NULL;
3904 				}
3905 				if (m != NULL) {
3906 					m->m_nextpkt = nextrecord;
3907 					if (nextrecord == NULL) {
3908 						so->so_rcv.sb_lastrecord = m;
3909 					}
3910 				} else {
3911 					so->so_rcv.sb_mb = nextrecord;
3912 					SB_EMPTY_FIXUP(&so->so_rcv);
3913 				}
3914 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3915 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3916 			}
3917 		} else {
3918 			if (flags & MSG_PEEK) {
3919 				moff += len;
3920 			} else {
3921 				if (mp != NULL) {
3922 					int copy_flag;
3923 
3924 					if (flags & MSG_DONTWAIT) {
3925 						copy_flag = M_DONTWAIT;
3926 					} else {
3927 						copy_flag = M_WAIT;
3928 					}
3929 					*mp = m_copym(m, 0, (int)len, copy_flag);
3930 					/*
3931 					 * Failed to allocate an mbuf?
3932 					 * Adjust uio_resid back, it was
3933 					 * adjusted down by len bytes which
3934 					 * we didn't copy over.
3935 					 */
3936 					if (*mp == NULL) {
3937 						uio_setresid(uio,
3938 						    (uio_resid(uio) + len));
3939 						break;
3940 					}
3941 				}
3942 				m->m_data += len;
3943 				m->m_len -= len;
3944 				so->so_rcv.sb_cc -= len;
3945 			}
3946 		}
3947 		if (so->so_oobmark) {
3948 			if ((flags & MSG_PEEK) == 0) {
3949 				so->so_oobmark -= len;
3950 				if (so->so_oobmark == 0) {
3951 					so->so_state |= SS_RCVATMARK;
3952 					break;
3953 				}
3954 			} else {
3955 				offset += len;
3956 				if (offset == so->so_oobmark) {
3957 					break;
3958 				}
3959 			}
3960 		}
3961 		if (flags & MSG_EOR) {
3962 			break;
3963 		}
3964 		/*
3965 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3966 		 * (for non-atomic socket), we must not quit until
3967 		 * "uio->uio_resid == 0" or an error termination.
3968 		 * If a signal/timeout occurs, return with a short
3969 		 * count but without error.  Keep sockbuf locked
3970 		 * against other readers.
3971 		 */
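		/*
		 * Illustrative note (not part of the original sources): this
		 * is the loop that gives a caller "fill the whole buffer"
		 * semantics, e.g.
		 *
		 *	uint8_t buf[4096];
		 *	ssize_t n = recv(s, buf, sizeof(buf), MSG_WAITALL);
		 *
		 * With MSG_WAITALL, n is short only on error, EOF, or a
		 * signal/timeout; otherwise soreceive() keeps sleeping in
		 * sbwait() below until sizeof(buf) bytes have been queued.
		 */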
3972 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3973 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3974 		    !sosendallatonce(so) && !nextrecord) {
3975 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3976 #if CONTENT_FILTER
3977 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3978 #endif /* CONTENT_FILTER */
3979 			    )) {
3980 				goto release;
3981 			}
3982 
3983 			/*
3984 			 * Depending on the protocol (e.g. TCP), the following
3985 			 * might cause the socket lock to be dropped and later
3986 			 * be reacquired, and more data could have arrived and
3987 			 * have been appended to the receive socket buffer by
3988 			 * the time it returns.  Therefore, we sleep in
3989 			 * sbwait() below only if the socket buffer is
3990 			 * empty, in order to avoid a false sleep.
3991 			 */
3992 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3993 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3994 			}
3995 
3996 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3997 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3998 
3999 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
4000 				error = 0;
4001 				goto release;
4002 			}
4003 			/*
4004 			 * We have to wait until after we get back from sbwait()
4005 			 * to do the copy, because we will drop the lock if we
4006 			 * have enough data that has been delayed.  By dropping
4007 			 * the lock we open up a window allowing the netisr
4008 			 * thread to process incoming packets and to change the
4009 			 * state of this socket.  We issue the sbwait() while
4010 			 * the socket is empty because we expect the netisr
4011 			 * thread to wake us up when more packets arrive; if we
4012 			 * allowed that processing to happen first and then
4013 			 * called sbwait(), we could stall forever with packets
4014 			 * sitting in the socket if no further packets arrive
4015 			 * from the remote side.
4016 			 *
4017 			 * We want to start copying before we've collected all
4018 			 * the data to satisfy this request, to allow the copy
4019 			 * to overlap incoming packet processing on an MP system.
4020 			 */
4021 			if (delayed_copy_len > sorecvmincopy &&
4022 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
4023 				error = sodelayed_copy(so, uio,
4024 				    &free_list, &delayed_copy_len);
4025 
4026 				if (error) {
4027 					goto release;
4028 				}
4029 			}
4030 			m = so->so_rcv.sb_mb;
4031 			if (m != NULL) {
4032 				nextrecord = m->m_nextpkt;
4033 			}
4034 			SB_MB_CHECK(&so->so_rcv);
4035 		}
4036 	}
4037 #ifdef MORE_LOCKING_DEBUG
4038 	if (so->so_usecount <= 1) {
4039 		panic("%s: after big while so=%p ref=%d on socket",
4040 		    __func__, so, so->so_usecount);
4041 		/* NOTREACHED */
4042 	}
4043 #endif
4044 
4045 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4046 		if (so->so_options & SO_DONTTRUNC) {
4047 			flags |= MSG_RCVMORE;
4048 		} else {
4049 			flags |= MSG_TRUNC;
4050 			if ((flags & MSG_PEEK) == 0) {
4051 				(void) sbdroprecord(&so->so_rcv);
4052 			}
4053 		}
4054 	}
4055 
4056 	/*
4057 	 * pru_rcvd below (for TCP) may cause more data to be received
4058 	 * if the socket lock is dropped prior to sending the ACK; some
4059 	 * legacy OpenTransport applications don't handle this well
4060 	 * (if they receive less data than requested while MSG_HAVEMORE
4061 	 * is set), and so we set the flag now based on what we know
4062 	 * prior to calling pru_rcvd.
4063 	 */
4064 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4065 		flags |= MSG_HAVEMORE;
4066 	}
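	/*
	 * Illustrative note (not part of the original sources): SO_WANTMORE
	 * is an Apple-specific option.  A caller that enables it, e.g.
	 *
	 *	int on = 1;
	 *	setsockopt(s, SOL_SOCKET, SO_WANTMORE, &on, sizeof(on));
	 *
	 * gets MSG_HAVEMORE back in msg_flags from recvmsg(2) whenever more
	 * data was already queued at the time of the copy, and can issue
	 * the next read immediately instead of polling.
	 */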
4067 
4068 	if ((flags & MSG_PEEK) == 0) {
4069 		if (m == NULL) {
4070 			so->so_rcv.sb_mb = nextrecord;
4071 			/*
4072 			 * First part is an inline SB_EMPTY_FIXUP().  Second
4073 			 * part makes sure sb_lastrecord is up-to-date if
4074 			 * there is still data in the socket buffer.
4075 			 */
4076 			if (so->so_rcv.sb_mb == NULL) {
4077 				so->so_rcv.sb_mbtail = NULL;
4078 				so->so_rcv.sb_lastrecord = NULL;
4079 			} else if (nextrecord->m_nextpkt == NULL) {
4080 				so->so_rcv.sb_lastrecord = nextrecord;
4081 			}
4082 			SB_MB_CHECK(&so->so_rcv);
4083 		}
4084 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4085 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4086 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4087 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4088 		}
4089 	}
4090 
4091 	if (delayed_copy_len) {
4092 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4093 		if (error) {
4094 			goto release;
4095 		}
4096 	}
4097 	if (free_list != NULL) {
4098 		m_freem_list(free_list);
4099 		free_list = NULL;
4100 	}
4101 
4102 	if (orig_resid == uio_resid(uio) && orig_resid &&
4103 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4104 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4105 		goto restart;
4106 	}
4107 
4108 	if (flagsp != NULL) {
4109 		*flagsp |= flags;
4110 	}
4111 release:
4112 #ifdef MORE_LOCKING_DEBUG
4113 	if (so->so_usecount <= 1) {
4114 		panic("%s: release so=%p ref=%d on socket", __func__,
4115 		    so, so->so_usecount);
4116 		/* NOTREACHED */
4117 	}
4118 #endif
4119 	if (delayed_copy_len) {
4120 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4121 	}
4122 
4123 	if (free_list != NULL) {
4124 		m_freem_list(free_list);
4125 	}
4126 
4127 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4128 
4129 	if (en_tracing) {
4130 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4131 		    VM_KERNEL_ADDRPERM(so),
4132 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4133 		    (int64_t)(orig_resid - uio_resid(uio)));
4134 	}
4135 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4136 	    so->so_rcv.sb_cc, 0, error);
4137 
4138 	return error;
4139 }
4140 
4141 /*
4142  * Returns:	0			Success
4143  *	uiomove:EFAULT
4144  */
4145 static int
4146 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4147     user_ssize_t *resid)
4148 {
4149 	int error = 0;
4150 	struct mbuf *m;
4151 
4152 	m = *free_list;
4153 
4154 	socket_unlock(so, 0);
4155 
4156 	while (m != NULL && error == 0) {
4157 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4158 		m = m->m_next;
4159 	}
4160 	m_freem_list(*free_list);
4161 
4162 	*free_list = NULL;
4163 	*resid = 0;
4164 
4165 	socket_lock(so, 0);
4166 
4167 	return error;
4168 }
4169 
4170 static int
4171 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4172     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4173 {
4174 #pragma unused(so)
4175 	int error = 0;
4176 	struct mbuf *ml, *m;
4177 	int i = 0;
4178 	struct uio *auio;
4179 
4180 	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4181 	    ml = ml->m_nextpkt, i++) {
4182 		auio = msgarray[i].uio;
4183 		for (m = ml; m != NULL; m = m->m_next) {
4184 			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4185 			if (error != 0) {
4186 				goto out;
4187 			}
4188 		}
4189 	}
4190 out:
4191 	m_freem_list(*free_list);
4192 
4193 	*free_list = NULL;
4194 	*resid = 0;
4195 
4196 	return error;
4197 }
4198 
4199 int
4200 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4201     int *flagsp)
4202 {
4203 	struct mbuf *m;
4204 	struct mbuf *nextrecord;
4205 	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4206 	int error;
4207 	user_ssize_t len, pktlen, delayed_copy_len = 0;
4208 	struct protosw *pr = so->so_proto;
4209 	user_ssize_t resid;
4210 	struct proc *p = current_proc();
4211 	struct uio *auio = NULL;
4212 	int npkts = 0;
4213 	int sblocked = 0;
4214 	struct sockaddr **psa = NULL;
4215 	struct mbuf **controlp = NULL;
4216 	int can_delay;
4217 	int flags;
4218 	struct mbuf *free_others = NULL;
4219 
4220 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4221 	    so, uiocnt,
4222 	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4223 
4224 	/*
4225 	 * Sanity checks:
4226 	 * - Only supports don't-wait flags
4227 	 * - Only supports datagram sockets (could be extended to raw)
4228 	 * - Must be atomic
4229 	 * - Protocol must support packet chains
4230 	 * - The uio array must not be NULL (a user-space sketch follows)
4231 	 */
4232 	if (flagsp != NULL) {
4233 		flags = *flagsp;
4234 	} else {
4235 		flags = 0;
4236 	}
4237 	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4238 	    MSG_NBIO)) {
4239 		printf("%s invalid flags 0x%x\n", __func__, flags);
4240 		error = EINVAL;
4241 		goto out;
4242 	}
4243 	if (so->so_type != SOCK_DGRAM) {
4244 		error = EINVAL;
4245 		goto out;
4246 	}
4247 	if (sosendallatonce(so) == 0) {
4248 		error = EINVAL;
4249 		goto out;
4250 	}
4251 	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4252 		error = EPROTONOSUPPORT;
4253 		goto out;
4254 	}
4255 	if (msgarray == NULL) {
4256 		printf("%s uioarray is NULL\n", __func__);
4257 		error = EINVAL;
4258 		goto out;
4259 	}
4260 	if (uiocnt == 0) {
4261 		printf("%s uiocnt is 0\n", __func__);
4262 		error = EINVAL;
4263 		goto out;
4264 	}
4265 	/*
4266 	 * Sanity check on the length passed by caller as we are making 'int'
4267 	 * comparisons
4268 	 */
4269 	resid = recv_msg_array_resid(msgarray, uiocnt);
4270 	if (resid < 0 || resid > INT_MAX) {
4271 		error = EINVAL;
4272 		goto out;
4273 	}
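	/*
	 * Illustrative note (not part of the original sources): this batch
	 * receive path backs the Darwin-private recvmsg_x(2) interface.  A
	 * hedged sketch of a caller draining up to 8 UDP datagrams in one
	 * system call:
	 *
	 *	struct msghdr_x msgs[8];
	 *	struct iovec iovs[8];
	 *	static char bufs[8][2048];
	 *	memset(msgs, 0, sizeof(msgs));
	 *	for (int i = 0; i < 8; i++) {
	 *		iovs[i].iov_base = bufs[i];
	 *		iovs[i].iov_len = sizeof(bufs[i]);
	 *		msgs[i].msg_iov = &iovs[i];
	 *		msgs[i].msg_iovlen = 1;
	 *	}
	 *	ssize_t n = recvmsg_x(s, msgs, 8, MSG_DONTWAIT);
	 *
	 * Each element of msgarray below corresponds to one msghdr_x entry;
	 * npkts tracks how many of them were filled.
	 */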
4274 
4275 	if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4276 		can_delay = 1;
4277 	} else {
4278 		can_delay = 0;
4279 	}
4280 
4281 	socket_lock(so, 1);
4282 	so_update_last_owner_locked(so, p);
4283 	so_update_policy(so);
4284 
4285 #if NECP
4286 	so_update_necp_policy(so, NULL, NULL);
4287 #endif /* NECP */
4288 
4289 	/*
4290 	 * If a recv attempt is made on a previously-accepted socket
4291 	 * that has been marked as inactive (disconnected), reject
4292 	 * the request.
4293 	 */
4294 	if (so->so_flags & SOF_DEFUNCT) {
4295 		struct sockbuf *sb = &so->so_rcv;
4296 
4297 		error = ENOTCONN;
4298 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4299 		    __func__, proc_pid(p), proc_best_name(p),
4300 		    so->so_gencnt,
4301 		    SOCK_DOM(so), SOCK_TYPE(so), error);
4302 		/*
4303 		 * This socket should have been disconnected and flushed
4304 		 * prior to being returned from sodefunct(); there should
4305 		 * be no data on its receive list, so panic otherwise.
4306 		 */
4307 		if (so->so_state & SS_DEFUNCT) {
4308 			sb_empty_assert(sb, __func__);
4309 		}
4310 		goto release;
4311 	}
4312 
4313 next:
4314 	/*
4315 	 * The uio may be empty
4316 	 */
4317 	if (npkts >= uiocnt) {
4318 		error = 0;
4319 		goto release;
4320 	}
4321 restart:
4322 	/*
4323 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4324 	 * and if so just return to the caller.  This could happen when
4325 	 * soreceive() is called by a socket upcall function while the
4326 	 * socket is being freed.  The socket buffer would have been
4327 	 * locked across the upcall, therefore we cannot put this thread
4328 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4329 	 * we may livelock), because the lock on the socket buffer will
4330 	 * only be released when the upcall routine returns to its caller.
4331 	 * Because the socket has been officially closed, there can be
4332 	 * no further read on it.
4333 	 */
4334 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4335 	    (SS_NOFDREF | SS_CANTRCVMORE)) {
4336 		error = 0;
4337 		goto release;
4338 	}
4339 
4340 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4341 	if (error) {
4342 		goto release;
4343 	}
4344 	sblocked = 1;
4345 
4346 	m = so->so_rcv.sb_mb;
4347 	/*
4348 	 * Block awaiting more datagrams if needed
4349 	 */
4350 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4351 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4352 	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4353 		/*
4354 		 * Panic if we notice inconsistencies in the socket's
4355 		 * receive list; both sb_mb and sb_cc should correctly
4356 		 * reflect the contents of the list, otherwise we may
4357 		 * end up with false positives during select() or poll()
4358 		 * which could put the application in a bad state.
4359 		 */
4360 		SB_MB_CHECK(&so->so_rcv);
4361 
4362 		if (so->so_error) {
4363 			error = so->so_error;
4364 			if ((flags & MSG_PEEK) == 0) {
4365 				so->so_error = 0;
4366 			}
4367 			goto release;
4368 		}
4369 		if (so->so_state & SS_CANTRCVMORE) {
4370 			goto release;
4371 		}
4372 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4373 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4374 			error = ENOTCONN;
4375 			goto release;
4376 		}
4377 		if ((so->so_state & SS_NBIO) ||
4378 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4379 			error = EWOULDBLOCK;
4380 			goto release;
4381 		}
4382 		/*
4383 		 * Do not block if we got some data
4384 		 */
4385 		if (free_list != NULL) {
4386 			error = 0;
4387 			goto release;
4388 		}
4389 
4390 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4391 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4392 
4393 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4394 		sblocked = 0;
4395 
4396 		error = sbwait(&so->so_rcv);
4397 		if (error) {
4398 			goto release;
4399 		}
4400 		goto restart;
4401 	}
4402 
4403 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4404 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4405 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4406 
4407 	/*
4408 	 * Consume the current uio index as we have a datagram
4409 	 */
4410 	auio = msgarray[npkts].uio;
4411 	resid = uio_resid(auio);
4412 	msgarray[npkts].which |= SOCK_MSG_DATA;
4413 	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4414 	    &msgarray[npkts].psa : NULL;
4415 	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4416 	    &msgarray[npkts].controlp : NULL;
4417 	npkts += 1;
4418 	nextrecord = m->m_nextpkt;
4419 
4420 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4421 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4422 		if (error == ERESTART) {
4423 			goto restart;
4424 		} else if (error != 0) {
4425 			goto release;
4426 		}
4427 	}
4428 
4429 	if (m != NULL && m->m_type == MT_CONTROL) {
4430 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4431 		if (error != 0) {
4432 			goto release;
4433 		}
4434 	}
4435 
4436 	if (m->m_pkthdr.len == 0) {
4437 		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4438 		    __func__, __LINE__,
4439 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4440 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4441 		    m->m_type);
4442 	}
4443 
4444 	/*
4445 	 * Loop to copy the mbufs of the current record
4446 	 * Support zero length packets
4447 	 */
4448 	ml = NULL;
4449 	pktlen = 0;
4450 	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4451 		if (m->m_len == 0) {
4452 			panic("%p m_len zero", m);
4453 		}
4454 		if (m->m_type == 0) {
4455 			panic("%p m_type zero", m);
4456 		}
4457 		/*
4458 		 * Clip to the residual length
4459 		 */
4460 		if (len > m->m_len) {
4461 			len = m->m_len;
4462 		}
4463 		pktlen += len;
4464 		/*
4465 		 * Copy the mbufs via the uio or delay the copy
4466 		 * Sockbuf must be consistent here (points to current mbuf,
4467 		 * it points to next record) when we drop priority;
4468 		 * we must note any additions to the sockbuf when we
4469 		 * block interrupts again.
4470 		 */
4471 		if (len > 0 && can_delay == 0) {
4472 			socket_unlock(so, 0);
4473 			error = uiomove(mtod(m, caddr_t), (int)len, auio);
4474 			socket_lock(so, 0);
4475 			if (error) {
4476 				goto release;
4477 			}
4478 		} else {
4479 			delayed_copy_len += len;
4480 		}
4481 
4482 		if (len == m->m_len) {
4483 			/*
4484 			 * m was entirely copied
4485 			 */
4486 			sbfree(&so->so_rcv, m);
4487 			nextrecord = m->m_nextpkt;
4488 			m->m_nextpkt = NULL;
4489 
4490 			/*
4491 			 * Set the first packet to the head of the free list
4492 			 */
4493 			if (free_list == NULL) {
4494 				free_list = m;
4495 			}
4496 			/*
4497 			 * Link current packet to tail of free list
4498 			 */
4499 			if (ml == NULL) {
4500 				if (free_tail != NULL) {
4501 					free_tail->m_nextpkt = m;
4502 				}
4503 				free_tail = m;
4504 			}
4505 			/*
4506 			 * Link current mbuf to last mbuf of current packet
4507 			 */
4508 			if (ml != NULL) {
4509 				ml->m_next = m;
4510 			}
4511 			ml = m;
4512 
4513 			/*
4514 			 * Move next buf to head of socket buffer
4515 			 */
4516 			so->so_rcv.sb_mb = m = ml->m_next;
4517 			ml->m_next = NULL;
4518 
4519 			if (m != NULL) {
4520 				m->m_nextpkt = nextrecord;
4521 				if (nextrecord == NULL) {
4522 					so->so_rcv.sb_lastrecord = m;
4523 				}
4524 			} else {
4525 				so->so_rcv.sb_mb = nextrecord;
4526 				SB_EMPTY_FIXUP(&so->so_rcv);
4527 			}
4528 			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4529 			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4530 		} else {
4531 			/*
4532 			 * Stop the loop on partial copy
4533 			 */
4534 			break;
4535 		}
4536 	}
4537 #ifdef MORE_LOCKING_DEBUG
4538 	if (so->so_usecount <= 1) {
4539 		panic("%s: after big while so=%llx ref=%d on socket",
4540 		    __func__,
4541 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4542 		/* NOTREACHED */
4543 	}
4544 #endif
4545 	/*
4546 	 * Tell the caller we made a partial copy
4547 	 */
4548 	if (m != NULL) {
4549 		if (so->so_options & SO_DONTTRUNC) {
4550 			/*
4551 			 * Copyout first the freelist then the partial mbuf
4552 			 */
4553 			socket_unlock(so, 0);
4554 			if (delayed_copy_len) {
4555 				error = sodelayed_copy_list(so, msgarray,
4556 				    uiocnt, &free_list, &delayed_copy_len);
4557 			}
4558 
4559 			if (error == 0) {
4560 				error = uiomove(mtod(m, caddr_t), (int)len,
4561 				    auio);
4562 			}
4563 			socket_lock(so, 0);
4564 			if (error) {
4565 				goto release;
4566 			}
4567 
4568 			m->m_data += len;
4569 			m->m_len -= len;
4570 			so->so_rcv.sb_cc -= len;
4571 			flags |= MSG_RCVMORE;
4572 		} else {
4573 			(void) sbdroprecord(&so->so_rcv);
4574 			nextrecord = so->so_rcv.sb_mb;
4575 			m = NULL;
4576 			flags |= MSG_TRUNC;
4577 		}
4578 	}
4579 
4580 	if (m == NULL) {
4581 		so->so_rcv.sb_mb = nextrecord;
4582 		/*
4583 		 * First part is an inline SB_EMPTY_FIXUP().  Second
4584 		 * part makes sure sb_lastrecord is up-to-date if
4585 		 * there is still data in the socket buffer.
4586 		 */
4587 		if (so->so_rcv.sb_mb == NULL) {
4588 			so->so_rcv.sb_mbtail = NULL;
4589 			so->so_rcv.sb_lastrecord = NULL;
4590 		} else if (nextrecord->m_nextpkt == NULL) {
4591 			so->so_rcv.sb_lastrecord = nextrecord;
4592 		}
4593 		SB_MB_CHECK(&so->so_rcv);
4594 	}
4595 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4596 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4597 
4598 	/*
4599 	 * We can continue to the next packet as long as:
4600 	 * - We haven't exhausted the uio array
4601 	 * - There was no error
4602 	 * - A packet was not truncated
4603 	 * - We can still receive more data
4604 	 */
4605 	if (npkts < uiocnt && error == 0 &&
4606 	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4607 	    (so->so_state & SS_CANTRCVMORE) == 0) {
4608 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4609 		sblocked = 0;
4610 
4611 		goto next;
4612 	}
4613 	if (flagsp != NULL) {
4614 		*flagsp |= flags;
4615 	}
4616 
4617 release:
4618 	/*
4619 	 * pru_rcvd may cause more data to be received if the socket lock
4620 	 * is dropped so we set MSG_HAVEMORE now based on what we know.
4621 	 * That way the caller won't be surprised if it receives less data
4622 	 * than requested.
4623 	 */
4624 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4625 		flags |= MSG_HAVEMORE;
4626 	}
4627 
4628 	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4629 		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4630 	}
4631 
4632 	if (sblocked) {
4633 		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4634 	} else {
4635 		socket_unlock(so, 1);
4636 	}
4637 
4638 	if (delayed_copy_len) {
4639 		error = sodelayed_copy_list(so, msgarray, uiocnt,
4640 		    &free_list, &delayed_copy_len);
4641 	}
4642 out:
4643 	/*
4644 	 * Amortize the cost of freeing the mbufs
4645 	 */
4646 	if (free_list != NULL) {
4647 		m_freem_list(free_list);
4648 	}
4649 	if (free_others != NULL) {
4650 		m_freem_list(free_others);
4651 	}
4652 
4653 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4654 	    0, 0, 0, 0);
4655 	return error;
4656 }
4657 
4658 static int
4659 so_statistics_event_to_nstat_event(int64_t *input_options,
4660     uint64_t *nstat_event)
4661 {
4662 	int error = 0;
4663 	switch (*input_options) {
4664 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4665 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4666 		break;
4667 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4668 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4669 		break;
4670 #if (DEBUG || DEVELOPMENT)
4671 	case SO_STATISTICS_EVENT_RESERVED_1:
4672 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4673 		break;
4674 	case SO_STATISTICS_EVENT_RESERVED_2:
4675 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4676 		break;
4677 #endif /* (DEBUG || DEVELOPMENT) */
4678 	default:
4679 		error = EINVAL;
4680 		break;
4681 	}
4682 	return error;
4683 }
4684 
4685 /*
4686  * Returns:	0			Success
4687  *		EINVAL
4688  *		ENOTCONN
4689  *	<pru_shutdown>:EINVAL
4690  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4691  *	<pru_shutdown>:ENOBUFS[TCP]
4692  *	<pru_shutdown>:EMSGSIZE[TCP]
4693  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4694  *	<pru_shutdown>:ENETUNREACH[TCP]
4695  *	<pru_shutdown>:ENETDOWN[TCP]
4696  *	<pru_shutdown>:ENOMEM[TCP]
4697  *	<pru_shutdown>:EACCES[TCP]
4698  *	<pru_shutdown>:EMSGSIZE[TCP]
4699  *	<pru_shutdown>:ENOBUFS[TCP]
4700  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4701  *	<pru_shutdown>:???		[other protocol families]
4702  */
4703 int
4704 soshutdown(struct socket *so, int how)
4705 {
4706 	int error;
4707 
4708 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4709 
4710 	switch (how) {
4711 	case SHUT_RD:
4712 	case SHUT_WR:
4713 	case SHUT_RDWR:
4714 		socket_lock(so, 1);
4715 		if ((so->so_state &
4716 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4717 			error = ENOTCONN;
4718 		} else {
4719 			error = soshutdownlock(so, how);
4720 		}
4721 		socket_unlock(so, 1);
4722 		break;
4723 	default:
4724 		error = EINVAL;
4725 		break;
4726 	}
4727 
4728 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4729 
4730 	return error;
4731 }
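/*
 * Illustrative note (not part of the original sources): the classic
 * user-space half-close pattern serviced by soshutdown() is
 *
 *	shutdown(s, SHUT_WR);			// send FIN, keep reading
 *	while ((n = read(s, buf, sizeof(buf))) > 0)
 *		consume(buf, n);		// hypothetical app logic
 *	close(s);
 *
 * SHUT_RD and SHUT_RDWR take the soshutdownlock_final() paths below;
 * any other value fails with EINVAL, and shutting down a socket that
 * is not connected (or connecting/disconnecting) fails with ENOTCONN,
 * matching the checks above.
 */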
4732 
4733 int
4734 soshutdownlock_final(struct socket *so, int how)
4735 {
4736 	struct protosw *pr = so->so_proto;
4737 	int error = 0;
4738 
4739 	sflt_notify(so, sock_evt_shutdown, &how);
4740 
4741 	if (how != SHUT_WR) {
4742 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4743 			/* read already shut down */
4744 			error = ENOTCONN;
4745 			goto done;
4746 		}
4747 		sorflush(so);
4748 	}
4749 	if (how != SHUT_RD) {
4750 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4751 			/* write already shut down */
4752 			error = ENOTCONN;
4753 			goto done;
4754 		}
4755 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4756 	}
4757 done:
4758 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4759 	return error;
4760 }
4761 
4762 int
4763 soshutdownlock(struct socket *so, int how)
4764 {
4765 	int error = 0;
4766 
4767 #if CONTENT_FILTER
4768 	/*
4769 	 * A content filter may delay the actual shutdown until it
4770 	 * has processed the pending data
4771 	 */
4772 	if (so->so_flags & SOF_CONTENT_FILTER) {
4773 		error = cfil_sock_shutdown(so, &how);
4774 		if (error == EJUSTRETURN) {
4775 			error = 0;
4776 			goto done;
4777 		} else if (error != 0) {
4778 			goto done;
4779 		}
4780 	}
4781 #endif /* CONTENT_FILTER */
4782 
4783 	error = soshutdownlock_final(so, how);
4784 
4785 done:
4786 	return error;
4787 }
4788 
4789 void
4790 sowflush(struct socket *so)
4791 {
4792 	struct sockbuf *sb = &so->so_snd;
4793 
4794 	/*
4795 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4796 	 * to prevent the socket buffer from being unexpectedly altered
4797 	 * while it is used by another thread in socket send/receive.
4798 	 *
4799 	 * sblock() must not fail here, hence the assertion.
4800 	 */
4801 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4802 	VERIFY(sb->sb_flags & SB_LOCK);
4803 
4804 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4805 	sb->sb_flags            |= SB_DROP;
4806 	sb->sb_upcall           = NULL;
4807 	sb->sb_upcallarg        = NULL;
4808 
4809 	sbunlock(sb, TRUE);     /* keep socket locked */
4810 
4811 	selthreadclear(&sb->sb_sel);
4812 	sbrelease(sb);
4813 }
4814 
4815 void
4816 sorflush(struct socket *so)
4817 {
4818 	struct sockbuf *sb = &so->so_rcv;
4819 	struct protosw *pr = so->so_proto;
4820 	struct sockbuf asb;
4821 #ifdef notyet
4822 	lck_mtx_t *mutex_held;
4823 	/*
4824 	 * XXX: This code is currently commented out, because we may get here
4825 	 * as part of sofreelastref(), and at that time, pr_getlock() may no
4826 	 * longer be able to return us the lock; this will be fixed in future.
4827 	 */
4828 	if (so->so_proto->pr_getlock != NULL) {
4829 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4830 	} else {
4831 		mutex_held = so->so_proto->pr_domain->dom_mtx;
4832 	}
4833 
4834 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4835 #endif /* notyet */
4836 
4837 	sflt_notify(so, sock_evt_flush_read, NULL);
4838 
4839 	socantrcvmore(so);
4840 
4841 	/*
4842 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4843 	 * to prevent the socket buffer from being unexpectedly altered
4844 	 * while it is used by another thread in socket send/receive.
4845 	 *
4846 	 * sblock() must not fail here, hence the assertion.
4847 	 */
4848 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4849 	VERIFY(sb->sb_flags & SB_LOCK);
4850 
4851 	/*
4852 	 * Copy only the relevant fields from "sb" to "asb" which we
4853 	 * need for sbrelease() to function.  In particular, skip
4854 	 * sb_sel as it contains the wait queue linkage, which would
4855 	 * wreak havoc if we were to issue selthreadclear() on "asb".
4856 	 * Make sure to not carry over SB_LOCK in "asb", as we need
4857 	 * to acquire it later as part of sbrelease().
4858 	 */
4859 	bzero(&asb, sizeof(asb));
4860 	asb.sb_cc               = sb->sb_cc;
4861 	asb.sb_hiwat            = sb->sb_hiwat;
4862 	asb.sb_mbcnt            = sb->sb_mbcnt;
4863 	asb.sb_mbmax            = sb->sb_mbmax;
4864 	asb.sb_ctl              = sb->sb_ctl;
4865 	asb.sb_lowat            = sb->sb_lowat;
4866 	asb.sb_mb               = sb->sb_mb;
4867 	asb.sb_mbtail           = sb->sb_mbtail;
4868 	asb.sb_lastrecord       = sb->sb_lastrecord;
4869 	asb.sb_so               = sb->sb_so;
4870 	asb.sb_flags            = sb->sb_flags;
4871 	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4872 	asb.sb_flags            |= SB_DROP;
4873 
4874 	/*
4875 	 * Ideally we'd bzero() these and preserve the ones we need;
4876 	 * but to do that we'd need to shuffle things around in the
4877 	 * sockbuf, and we can't do it now because there are KEXTS
4878 	 * that are directly referring to the socket structure.
4879 	 *
4880 	 * Setting SB_DROP acts as a barrier to prevent further appends.
4881 	 * Clearing SB_SEL is done for selthreadclear() below.
4882 	 */
4883 	sb->sb_cc               = 0;
4884 	sb->sb_hiwat            = 0;
4885 	sb->sb_mbcnt            = 0;
4886 	sb->sb_mbmax            = 0;
4887 	sb->sb_ctl              = 0;
4888 	sb->sb_lowat            = 0;
4889 	sb->sb_mb               = NULL;
4890 	sb->sb_mbtail           = NULL;
4891 	sb->sb_lastrecord       = NULL;
4892 	sb->sb_timeo.tv_sec     = 0;
4893 	sb->sb_timeo.tv_usec    = 0;
4894 	sb->sb_upcall           = NULL;
4895 	sb->sb_upcallarg        = NULL;
4896 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4897 	sb->sb_flags            |= SB_DROP;
4898 
4899 	sbunlock(sb, TRUE);     /* keep socket locked */
4900 
4901 	/*
4902 	 * Note that selthreadclear() is called on the original "sb" and
4903 	 * not the local "asb" because of the way wait queue linkage is
4904 	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
4905 	 * should no longer be set (cleared above.)
4906 	 */
4907 	selthreadclear(&sb->sb_sel);
4908 
4909 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4910 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4911 	}
4912 
4913 	sbrelease(&asb);
4914 }
4915 
4916 /*
4917  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4918  * an additional variant to handle the case where the option value needs
4919  * to be some kind of integer, but not a specific size.
4920  * In addition to their use here, these functions are also called by the
4921  * protocol-level pr_ctloutput() routines.
4922  *
4923  * Returns:	0			Success
4924  *		EINVAL
4925  *	copyin:EFAULT
4926  */
4927 int
4928 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4929 {
4930 	size_t  valsize;
4931 
4932 	/*
4933 	 * If the user gives us more than we wanted, we ignore it,
4934 	 * but if we don't get the minimum length the caller
4935 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4936 	 * is set to however much we actually retrieved.
4937 	 */
4938 	if ((valsize = sopt->sopt_valsize) < minlen) {
4939 		return EINVAL;
4940 	}
4941 	if (valsize > len) {
4942 		sopt->sopt_valsize = valsize = len;
4943 	}
4944 
4945 	if (sopt->sopt_p != kernproc) {
4946 		return copyin(sopt->sopt_val, buf, valsize);
4947 	}
4948 
4949 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4950 	return 0;
4951 }
4952 
4953 /*
4954  * sooptcopyin_timeval
4955  *   Copy a timeval value into tv_p, taking into account whether the
4956  *   calling process is 64-bit or 32-bit.  The sanity checking was moved
4957  *   here so that we can verify the 64-bit tv_sec value before we lose
4958  *   the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
4959  */
4960 static int
4961 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4962 {
4963 	int                     error;
4964 
4965 	if (proc_is64bit(sopt->sopt_p)) {
4966 		struct user64_timeval   tv64;
4967 
4968 		if (sopt->sopt_valsize < sizeof(tv64)) {
4969 			return EINVAL;
4970 		}
4971 
4972 		sopt->sopt_valsize = sizeof(tv64);
4973 		if (sopt->sopt_p != kernproc) {
4974 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4975 			if (error != 0) {
4976 				return error;
4977 			}
4978 		} else {
4979 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4980 			    sizeof(tv64));
4981 		}
4982 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4983 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4984 			return EDOM;
4985 		}
4986 
4987 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4988 		tv_p->tv_usec = tv64.tv_usec;
4989 	} else {
4990 		struct user32_timeval   tv32;
4991 
4992 		if (sopt->sopt_valsize < sizeof(tv32)) {
4993 			return EINVAL;
4994 		}
4995 
4996 		sopt->sopt_valsize = sizeof(tv32);
4997 		if (sopt->sopt_p != kernproc) {
4998 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4999 			if (error != 0) {
5000 				return error;
5001 			}
5002 		} else {
5003 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
5004 			    sizeof(tv32));
5005 		}
5006 #ifndef __LP64__
5007 		/*
5008 		 * K64todo "comparison is always false due to
5009 		 * limited range of data type"
5010 		 */
5011 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
5012 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
5013 			return EDOM;
5014 		}
5015 #endif
5016 		tv_p->tv_sec = tv32.tv_sec;
5017 		tv_p->tv_usec = tv32.tv_usec;
5018 	}
5019 	return 0;
5020 }
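/*
 * Illustrative note (not part of the original sources): this helper
 * services the timeout options SO_RCVTIMEO/SO_SNDTIMEO, e.g.
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A negative tv_sec/tv_usec, or tv_usec >= 1000000, is rejected with
 * EDOM before the 64-bit tv_sec could be silently truncated.
 */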
5021 
5022 int
5023 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
5024     boolean_t ignore_delegate)
5025 {
5026 	kauth_cred_t cred =  NULL;
5027 	proc_t ep = PROC_NULL;
5028 	uid_t uid;
5029 	int error = 0;
5030 
5031 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
5032 		ep = proc_find(so->e_pid);
5033 		if (ep) {
5034 			cred = kauth_cred_proc_ref(ep);
5035 		}
5036 	}
5037 
5038 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5039 
5040 	/* uid is 0 for root */
5041 	if (uid != 0 || !allow_root) {
5042 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5043 	}
5044 	if (cred) {
5045 		kauth_cred_unref(&cred);
5046 	}
5047 	if (ep != PROC_NULL) {
5048 		proc_rele(ep);
5049 	}
5050 
5051 	return error;
5052 }
5053 
5054 /*
5055  * Returns:	0			Success
5056  *		EINVAL
5057  *		ENOPROTOOPT
5058  *		ENOBUFS
5059  *		EDOM
5060  *	sooptcopyin:EINVAL
5061  *	sooptcopyin:EFAULT
5062  *	sooptcopyin_timeval:EINVAL
5063  *	sooptcopyin_timeval:EFAULT
5064  *	sooptcopyin_timeval:EDOM
5065  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5066  *	<pr_ctloutput>:???
5067  *	sflt_attach_private:???		[whatever a filter author chooses]
5068  *	<sf_setoption>:???		[whatever a filter author chooses]
5069  *
5070  * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
5071  *		<sf_setoption> returns depend on what the filter author causes
5072  *		their filter to return.
5073  */
5074 int
5075 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5076 {
5077 	int     error, optval;
5078 	int64_t long_optval;
5079 	struct  linger l;
5080 	struct  timeval tv;
5081 
5082 	if (sopt->sopt_dir != SOPT_SET) {
5083 		sopt->sopt_dir = SOPT_SET;
5084 	}
5085 
5086 	if (dolock) {
5087 		socket_lock(so, 1);
5088 	}
5089 
5090 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5091 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5092 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5093 		/* the socket has been shutdown, no more sockopt's */
5094 		error = EINVAL;
5095 		goto out;
5096 	}
5097 
5098 	error = sflt_setsockopt(so, sopt);
5099 	if (error != 0) {
5100 		if (error == EJUSTRETURN) {
5101 			error = 0;
5102 		}
5103 		goto out;
5104 	}
5105 
5106 	if (sopt->sopt_level != SOL_SOCKET) {
5107 		if (so->so_proto != NULL &&
5108 		    so->so_proto->pr_ctloutput != NULL) {
5109 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5110 			goto out;
5111 		}
5112 		error = ENOPROTOOPT;
5113 	} else {
5114 		/*
5115 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5116 		 * the protocol layer, if needed.  A zero value returned from
5117 		 * the handler means use default socket-level processing as
5118 		 * done by the rest of this routine.  Otherwise, any other
5119 		 * return value indicates that the option is unsupported.
5120 		 */
5121 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5122 		    pru_socheckopt(so, sopt)) != 0) {
5123 			goto out;
5124 		}
5125 
5126 		error = 0;
5127 		switch (sopt->sopt_name) {
5128 		case SO_LINGER:
5129 		case SO_LINGER_SEC: {
5130 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5131 			if (error != 0) {
5132 				goto out;
5133 			}
5134 			/* Make sure to use sane values */
5135 			if (sopt->sopt_name == SO_LINGER) {
5136 				so->so_linger = (short)l.l_linger;
5137 			} else {
5138 				so->so_linger = (short)((long)l.l_linger * hz);
5139 			}
5140 			if (l.l_onoff != 0) {
5141 				so->so_options |= SO_LINGER;
5142 			} else {
5143 				so->so_options &= ~SO_LINGER;
5144 			}
5145 			break;
5146 		}
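		/*
		 * Illustrative note (not part of the original sources):
		 * SO_LINGER_SEC is the Apple-specific variant that takes
		 * l_linger in seconds (converted to ticks via hz above), e.g.
		 *
		 *	struct linger l = { .l_onoff = 1, .l_linger = 2 };
		 *	setsockopt(s, SOL_SOCKET, SO_LINGER_SEC,
		 *	    &l, sizeof(l));
		 *
		 * which makes close(2) wait up to two seconds for unsent
		 * data to drain before the connection is torn down.
		 */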
5147 		case SO_DEBUG:
5148 		case SO_KEEPALIVE:
5149 		case SO_DONTROUTE:
5150 		case SO_USELOOPBACK:
5151 		case SO_BROADCAST:
5152 		case SO_REUSEADDR:
5153 		case SO_REUSEPORT:
5154 		case SO_OOBINLINE:
5155 		case SO_TIMESTAMP:
5156 		case SO_TIMESTAMP_MONOTONIC:
5157 		case SO_TIMESTAMP_CONTINUOUS:
5158 		case SO_DONTTRUNC:
5159 		case SO_WANTMORE:
5160 		case SO_WANTOOBFLAG:
5161 		case SO_NOWAKEFROMSLEEP:
5162 		case SO_NOAPNFALLBK:
5163 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5164 			    sizeof(optval));
5165 			if (error != 0) {
5166 				goto out;
5167 			}
5168 			if (optval) {
5169 				so->so_options |= sopt->sopt_name;
5170 			} else {
5171 				so->so_options &= ~sopt->sopt_name;
5172 			}
5173 #if SKYWALK
5174 			inp_update_netns_flags(so);
5175 #endif /* SKYWALK */
5176 			break;
5177 
5178 		case SO_SNDBUF:
5179 		case SO_RCVBUF:
5180 		case SO_SNDLOWAT:
5181 		case SO_RCVLOWAT:
5182 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5183 			    sizeof(optval));
5184 			if (error != 0) {
5185 				goto out;
5186 			}
5187 
5188 			/*
5189 			 * Values < 1 make no sense for any of these
5190 			 * options, so disallow them.
5191 			 */
5192 			if (optval < 1) {
5193 				error = EINVAL;
5194 				goto out;
5195 			}
5196 
5197 			switch (sopt->sopt_name) {
5198 			case SO_SNDBUF:
5199 			case SO_RCVBUF: {
5200 				struct sockbuf *sb =
5201 				    (sopt->sopt_name == SO_SNDBUF) ?
5202 				    &so->so_snd : &so->so_rcv;
5203 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
5204 					error = ENOBUFS;
5205 					goto out;
5206 				}
5207 				sb->sb_flags |= SB_USRSIZE;
5208 				sb->sb_flags &= ~SB_AUTOSIZE;
5209 				sb->sb_idealsize = (u_int32_t)optval;
5210 				break;
5211 			}
5212 			/*
5213 			 * Make sure the low-water is never greater than
5214 			 * the high-water.
5215 			 */
5216 			case SO_SNDLOWAT: {
5217 				int space = sbspace(&so->so_snd);
5218 				uint32_t hiwat = so->so_snd.sb_hiwat;
5219 
5220 				if (so->so_snd.sb_flags & SB_UNIX) {
5221 					struct unpcb *unp =
5222 					    (struct unpcb *)(so->so_pcb);
5223 					if (unp != NULL &&
5224 					    unp->unp_conn != NULL) {
5225 						struct socket *so2 = unp->unp_conn->unp_socket;
5226 						hiwat += unp->unp_conn->unp_cc;
5227 						space = sbspace(&so2->so_rcv);
5228 					}
5229 				}
5230 
5231 				so->so_snd.sb_lowat =
5232 				    (optval > hiwat) ?
5233 				    hiwat : optval;
5234 
5235 				if (space >= so->so_snd.sb_lowat) {
5236 					sowwakeup(so);
5237 				}
5238 				break;
5239 			}
5240 			case SO_RCVLOWAT: {
5241 				int64_t data_len;
5242 				so->so_rcv.sb_lowat =
5243 				    (optval > so->so_rcv.sb_hiwat) ?
5244 				    so->so_rcv.sb_hiwat : optval;
5245 				if (so->so_rcv.sb_flags & SB_UNIX) {
5246 					struct unpcb *unp =
5247 					    (struct unpcb *)(so->so_pcb);
5248 					if (unp != NULL &&
5249 					    unp->unp_conn != NULL) {
5250 						struct socket *so2 = unp->unp_conn->unp_socket;
5251 						data_len = so2->so_snd.sb_cc
5252 						    - so2->so_snd.sb_ctl;
5253 					} else {
5254 						data_len = so->so_rcv.sb_cc
5255 						    - so->so_rcv.sb_ctl;
5256 					}
5257 				} else {
5258 					data_len = so->so_rcv.sb_cc
5259 					    - so->so_rcv.sb_ctl;
5260 				}
5261 
5262 				if (data_len >= so->so_rcv.sb_lowat) {
5263 					sorwakeup(so);
5264 				}
5265 				break;
5266 			}
5267 			}
5268 			break;
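		/*
		 * Illustrative note (not part of the original sources): a
		 * typical pairing of the buffer options handled above is
		 *
		 *	int sz = 256 * 1024, lowat = 4096;
		 *	setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
		 *	setsockopt(s, SOL_SOCKET, SO_RCVLOWAT,
		 *	    &lowat, sizeof(lowat));
		 *
		 * Setting SO_SNDBUF/SO_RCVBUF also clears SB_AUTOSIZE, so the
		 * buffer stays pinned at the requested size instead of being
		 * grown or shrunk automatically.
		 */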
5269 
5270 		case SO_SNDTIMEO:
5271 		case SO_RCVTIMEO:
5272 			error = sooptcopyin_timeval(sopt, &tv);
5273 			if (error != 0) {
5274 				goto out;
5275 			}
5276 
5277 			switch (sopt->sopt_name) {
5278 			case SO_SNDTIMEO:
5279 				so->so_snd.sb_timeo = tv;
5280 				break;
5281 			case SO_RCVTIMEO:
5282 				so->so_rcv.sb_timeo = tv;
5283 				break;
5284 			}
5285 			break;
5286 
5287 		case SO_NKE: {
5288 			struct so_nke nke;
5289 
5290 			error = sooptcopyin(sopt, &nke, sizeof(nke),
5291 			    sizeof(nke));
5292 			if (error != 0) {
5293 				goto out;
5294 			}
5295 
5296 			error = sflt_attach_internal(so, nke.nke_handle);
5297 			break;
5298 		}
5299 
5300 		case SO_NOSIGPIPE:
5301 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5302 			    sizeof(optval));
5303 			if (error != 0) {
5304 				goto out;
5305 			}
5306 			if (optval != 0) {
5307 				so->so_flags |= SOF_NOSIGPIPE;
5308 			} else {
5309 				so->so_flags &= ~SOF_NOSIGPIPE;
5310 			}
5311 			break;
5312 
5313 		case SO_NOADDRERR:
5314 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5315 			    sizeof(optval));
5316 			if (error != 0) {
5317 				goto out;
5318 			}
5319 			if (optval != 0) {
5320 				so->so_flags |= SOF_NOADDRAVAIL;
5321 			} else {
5322 				so->so_flags &= ~SOF_NOADDRAVAIL;
5323 			}
5324 			break;
5325 
5326 		case SO_REUSESHAREUID:
5327 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5328 			    sizeof(optval));
5329 			if (error != 0) {
5330 				goto out;
5331 			}
5332 			if (optval != 0) {
5333 				so->so_flags |= SOF_REUSESHAREUID;
5334 			} else {
5335 				so->so_flags &= ~SOF_REUSESHAREUID;
5336 			}
5337 			break;
5338 
5339 		case SO_NOTIFYCONFLICT:
5340 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5341 				error = EPERM;
5342 				goto out;
5343 			}
5344 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5345 			    sizeof(optval));
5346 			if (error != 0) {
5347 				goto out;
5348 			}
5349 			if (optval != 0) {
5350 				so->so_flags |= SOF_NOTIFYCONFLICT;
5351 			} else {
5352 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5353 			}
5354 			break;
5355 
5356 		case SO_RESTRICTIONS:
5357 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5358 			    sizeof(optval));
5359 			if (error != 0) {
5360 				goto out;
5361 			}
5362 
5363 			error = so_set_restrictions(so, optval);
5364 			break;
5365 
5366 		case SO_AWDL_UNRESTRICTED:
5367 			if (SOCK_DOM(so) != PF_INET &&
5368 			    SOCK_DOM(so) != PF_INET6) {
5369 				error = EOPNOTSUPP;
5370 				goto out;
5371 			}
5372 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5373 			    sizeof(optval));
5374 			if (error != 0) {
5375 				goto out;
5376 			}
5377 			if (optval != 0) {
5378 				error = soopt_cred_check(so,
5379 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5380 				if (error == 0) {
5381 					inp_set_awdl_unrestricted(
5382 						sotoinpcb(so));
5383 				}
5384 			} else {
5385 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5386 			}
5387 			break;
5388 		case SO_INTCOPROC_ALLOW:
5389 			if (SOCK_DOM(so) != PF_INET6) {
5390 				error = EOPNOTSUPP;
5391 				goto out;
5392 			}
5393 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5394 			    sizeof(optval));
5395 			if (error != 0) {
5396 				goto out;
5397 			}
5398 			if (optval != 0 &&
5399 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5400 				error = soopt_cred_check(so,
5401 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5402 				if (error == 0) {
5403 					inp_set_intcoproc_allowed(
5404 						sotoinpcb(so));
5405 				}
5406 			} else if (optval == 0) {
5407 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5408 			}
5409 			break;
5410 
5411 		case SO_LABEL:
5412 			error = EOPNOTSUPP;
5413 			break;
5414 
5415 		case SO_UPCALLCLOSEWAIT:
5416 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5417 			    sizeof(optval));
5418 			if (error != 0) {
5419 				goto out;
5420 			}
5421 			if (optval != 0) {
5422 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5423 			} else {
5424 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5425 			}
5426 			break;
5427 
5428 		case SO_RANDOMPORT:
5429 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5430 			    sizeof(optval));
5431 			if (error != 0) {
5432 				goto out;
5433 			}
5434 			if (optval != 0) {
5435 				so->so_flags |= SOF_BINDRANDOMPORT;
5436 			} else {
5437 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5438 			}
5439 			break;
5440 
5441 		case SO_NP_EXTENSIONS: {
5442 			struct so_np_extensions sonpx;
5443 
5444 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5445 			    sizeof(sonpx));
5446 			if (error != 0) {
5447 				goto out;
5448 			}
5449 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5450 				error = EINVAL;
5451 				goto out;
5452 			}
5453 			/*
5454 			 * Only one bit defined for now
5455 			 */
5456 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5457 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5458 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5459 				} else {
5460 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5461 				}
5462 			}
5463 			break;
5464 		}
5465 
5466 		case SO_TRAFFIC_CLASS: {
5467 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5468 			    sizeof(optval));
5469 			if (error != 0) {
5470 				goto out;
5471 			}
5472 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5473 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5474 				error = so_set_net_service_type(so, netsvc);
5475 				goto out;
5476 			}
5477 			error = so_set_traffic_class(so, optval);
5478 			if (error != 0) {
5479 				goto out;
5480 			}
5481 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5482 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5483 			break;
5484 		}
5485 
5486 		case SO_RECV_TRAFFIC_CLASS: {
5487 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5488 			    sizeof(optval));
5489 			if (error != 0) {
5490 				goto out;
5491 			}
5492 			if (optval == 0) {
5493 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5494 			} else {
5495 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5496 			}
5497 			break;
5498 		}
5499 
5500 #if (DEVELOPMENT || DEBUG)
5501 		case SO_TRAFFIC_CLASS_DBG: {
5502 			struct so_tcdbg so_tcdbg;
5503 
5504 			error = sooptcopyin(sopt, &so_tcdbg,
5505 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5506 			if (error != 0) {
5507 				goto out;
5508 			}
5509 			error = so_set_tcdbg(so, &so_tcdbg);
5510 			if (error != 0) {
5511 				goto out;
5512 			}
5513 			break;
5514 		}
5515 #endif /* (DEVELOPMENT || DEBUG) */
5516 
5517 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5518 			error = priv_check_cred(kauth_cred_get(),
5519 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5520 			if (error != 0) {
5521 				goto out;
5522 			}
5523 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5524 			    sizeof(optval));
5525 			if (error != 0) {
5526 				goto out;
5527 			}
5528 			if (optval == 0) {
5529 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5530 			} else {
5531 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5532 			}
5533 			break;
5534 
5535 #if (DEVELOPMENT || DEBUG)
5536 		case SO_DEFUNCTIT:
5537 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5538 			if (error == 0) {
5539 				error = sodefunct(current_proc(), so, 0);
5540 			}
5541 
5542 			break;
5543 #endif /* (DEVELOPMENT || DEBUG) */
5544 
5545 		case SO_DEFUNCTOK:
5546 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5547 			    sizeof(optval));
5548 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5549 				if (error == 0) {
5550 					error = EBADF;
5551 				}
5552 				goto out;
5553 			}
5554 			/*
5555 			 * Any process can set SO_DEFUNCTOK (clear
5556 			 * SOF_NODEFUNCT), but only root can clear
5557 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5558 			 */
5559 			if (optval == 0 &&
5560 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5561 				error = EPERM;
5562 				goto out;
5563 			}
5564 			if (optval) {
5565 				so->so_flags &= ~SOF_NODEFUNCT;
5566 			} else {
5567 				so->so_flags |= SOF_NODEFUNCT;
5568 			}
5569 
5570 			if (SOCK_DOM(so) == PF_INET ||
5571 			    SOCK_DOM(so) == PF_INET6) {
5572 				char s[MAX_IPv6_STR_LEN];
5573 				char d[MAX_IPv6_STR_LEN];
5574 				struct inpcb *inp = sotoinpcb(so);
5575 
5576 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5577 				    "[%s %s:%d -> %s:%d] is now marked "
5578 				    "as %seligible for "
5579 				    "defunct\n", __func__, proc_selfpid(),
5580 				    proc_best_name(current_proc()),
5581 				    so->so_gencnt,
5582 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5583 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5584 				    ((SOCK_DOM(so) == PF_INET) ?
5585 				    (void *)&inp->inp_laddr.s_addr :
5586 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5587 				    ntohs(inp->in6p_lport),
5588 				    inet_ntop(SOCK_DOM(so),
5589 				    (SOCK_DOM(so) == PF_INET) ?
5590 				    (void *)&inp->inp_faddr.s_addr :
5591 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5592 				    ntohs(inp->in6p_fport),
5593 				    (so->so_flags & SOF_NODEFUNCT) ?
5594 				    "not " : "");
5595 			} else {
5596 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5597 				    "is now marked as %seligible for "
5598 				    "defunct\n",
5599 				    __func__, proc_selfpid(),
5600 				    proc_best_name(current_proc()),
5601 				    so->so_gencnt,
5602 				    SOCK_DOM(so), SOCK_TYPE(so),
5603 				    (so->so_flags & SOF_NODEFUNCT) ?
5604 				    "not " : "");
5605 			}
5606 			break;
5607 
5608 		case SO_ISDEFUNCT:
5609 			/* This option is not settable */
5610 			error = EINVAL;
5611 			break;
5612 
5613 		case SO_OPPORTUNISTIC:
5614 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 			    sizeof(optval));
5616 			if (error == 0) {
5617 				error = so_set_opportunistic(so, optval);
5618 			}
5619 			break;
5620 
5621 		case SO_FLUSH:
5622 			/* This option is handled by lower layer(s) */
5623 			error = 0;
5624 			break;
5625 
5626 		case SO_RECV_ANYIF:
5627 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5628 			    sizeof(optval));
5629 			if (error == 0) {
5630 				error = so_set_recv_anyif(so, optval);
5631 			}
5632 			break;
5633 
5634 		case SO_TRAFFIC_MGT_BACKGROUND: {
5635 			/* This option is handled by lower layer(s) */
5636 			error = 0;
5637 			break;
5638 		}
5639 
5640 #if FLOW_DIVERT
5641 		case SO_FLOW_DIVERT_TOKEN:
5642 			error = flow_divert_token_set(so, sopt);
5643 			break;
5644 #endif  /* FLOW_DIVERT */
5645 
5646 
5647 		case SO_DELEGATED:
5648 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5649 			    sizeof(optval))) != 0) {
5650 				break;
5651 			}
5652 
5653 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5654 			break;
5655 
5656 		case SO_DELEGATED_UUID: {
5657 			uuid_t euuid;
5658 
5659 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5660 			    sizeof(euuid))) != 0) {
5661 				break;
5662 			}
5663 
5664 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5665 			break;
5666 		}
5667 
5668 #if NECP
5669 		case SO_NECP_ATTRIBUTES:
5670 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5671 				/* Handled by MPTCP itself */
5672 				break;
5673 			}
5674 
5675 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5676 				error = EINVAL;
5677 				goto out;
5678 			}
5679 
5680 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5681 			break;
5682 
5683 		case SO_NECP_CLIENTUUID: {
5684 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5685 				/* Handled by MPTCP itself */
5686 				break;
5687 			}
5688 
5689 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5690 				error = EINVAL;
5691 				goto out;
5692 			}
5693 
5694 			struct inpcb *inp = sotoinpcb(so);
5695 			if (!uuid_is_null(inp->necp_client_uuid)) {
5696 				// Clear out the old client UUID if present
5697 				necp_inpcb_remove_cb(inp);
5698 			}
5699 
5700 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5701 			    sizeof(uuid_t), sizeof(uuid_t));
5702 			if (error != 0) {
5703 				goto out;
5704 			}
5705 
5706 			if (uuid_is_null(inp->necp_client_uuid)) {
5707 				error = EINVAL;
5708 				goto out;
5709 			}
5710 
5711 			pid_t current_pid = proc_pid(current_proc());
5712 			error = necp_client_register_socket_flow(current_pid,
5713 			    inp->necp_client_uuid, inp);
5714 			if (error != 0) {
5715 				uuid_clear(inp->necp_client_uuid);
5716 				goto out;
5717 			}
5718 
5719 			if (inp->inp_lport != 0) {
5720 				// There is a bound local port, so this is not
5721 				// a fresh socket. Assign to the client.
5722 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5723 			}
5724 
5725 			break;
5726 		}
5727 		case SO_NECP_LISTENUUID: {
5728 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5729 				error = EINVAL;
5730 				goto out;
5731 			}
5732 
5733 			struct inpcb *inp = sotoinpcb(so);
5734 			if (!uuid_is_null(inp->necp_client_uuid)) {
5735 				error = EINVAL;
5736 				goto out;
5737 			}
5738 
5739 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5740 			    sizeof(uuid_t), sizeof(uuid_t));
5741 			if (error != 0) {
5742 				goto out;
5743 			}
5744 
5745 			if (uuid_is_null(inp->necp_client_uuid)) {
5746 				error = EINVAL;
5747 				goto out;
5748 			}
5749 
5750 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5751 			    inp->necp_client_uuid, inp);
5752 			if (error != 0) {
5753 				uuid_clear(inp->necp_client_uuid);
5754 				goto out;
5755 			}
5756 
5757 			// Mark that the port registration is held by NECP
5758 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5759 
5760 			break;
5761 		}
5762 
5763 		case SO_RESOLVER_SIGNATURE: {
5764 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5765 				error = EINVAL;
5766 				goto out;
5767 			}
5768 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5769 			break;
5770 		}
5771 #endif /* NECP */
5772 
5773 		case SO_EXTENDED_BK_IDLE:
5774 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5775 			    sizeof(optval));
5776 			if (error == 0) {
5777 				error = so_set_extended_bk_idle(so, optval);
5778 			}
5779 			break;
5780 
5781 		case SO_MARK_CELLFALLBACK:
5782 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5783 			    sizeof(optval));
5784 			if (error != 0) {
5785 				goto out;
5786 			}
5787 			if (optval < 0) {
5788 				error = EINVAL;
5789 				goto out;
5790 			}
5791 			if (optval == 0) {
5792 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5793 			} else {
5794 				so->so_flags1 |= SOF1_CELLFALLBACK;
5795 			}
5796 			break;
5797 
5798 		case SO_MARK_CELLFALLBACK_UUID:
5799 		{
5800 			struct so_mark_cellfallback_uuid_args args;
5801 
5802 			error = sooptcopyin(sopt, &args, sizeof(args),
5803 			    sizeof(args));
5804 			if (error != 0) {
5805 				goto out;
5806 			}
5807 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5808 			    args.flow_cellfallback);
5809 			break;
5810 		}
5811 
5812 		case SO_FALLBACK_MODE:
5813 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5814 			    sizeof(optval));
5815 			if (error != 0) {
5816 				goto out;
5817 			}
5818 			if (optval < SO_FALLBACK_MODE_NONE ||
5819 			    optval > SO_FALLBACK_MODE_PREFER) {
5820 				error = EINVAL;
5821 				goto out;
5822 			}
5823 			so->so_fallback_mode = (u_int8_t)optval;
5824 			break;
5825 
5826 		case SO_MARK_KNOWN_TRACKER: {
5827 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5828 			    sizeof(optval));
5829 			if (error != 0) {
5830 				goto out;
5831 			}
5832 			if (optval < 0) {
5833 				error = EINVAL;
5834 				goto out;
5835 			}
5836 			if (optval == 0) {
5837 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5838 			} else {
5839 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5840 			}
5841 			break;
5842 		}
5843 
5844 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5845 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5846 			    sizeof(optval));
5847 			if (error != 0) {
5848 				goto out;
5849 			}
5850 			if (optval < 0) {
5851 				error = EINVAL;
5852 				goto out;
5853 			}
5854 			if (optval == 0) {
5855 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5856 			} else {
5857 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5858 			}
5859 			break;
5860 		}
5861 
5862 		case SO_MARK_APPROVED_APP_DOMAIN: {
5863 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5864 			    sizeof(optval));
5865 			if (error != 0) {
5866 				goto out;
5867 			}
5868 			if (optval < 0) {
5869 				error = EINVAL;
5870 				goto out;
5871 			}
5872 			if (optval == 0) {
5873 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5874 			} else {
5875 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5876 			}
5877 			break;
5878 		}
5879 
5880 		case SO_STATISTICS_EVENT:
5881 			error = sooptcopyin(sopt, &long_optval,
5882 			    sizeof(long_optval), sizeof(long_optval));
5883 			if (error != 0) {
5884 				goto out;
5885 			}
5886 			u_int64_t nstat_event = 0;
5887 			error = so_statistics_event_to_nstat_event(
5888 				&long_optval, &nstat_event);
5889 			if (error != 0) {
5890 				goto out;
5891 			}
5892 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5893 			break;
5894 
5895 		case SO_NET_SERVICE_TYPE: {
5896 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5897 			    sizeof(optval));
5898 			if (error != 0) {
5899 				goto out;
5900 			}
5901 			error = so_set_net_service_type(so, optval);
5902 			break;
5903 		}
5904 
5905 		case SO_QOSMARKING_POLICY_OVERRIDE:
5906 			error = priv_check_cred(kauth_cred_get(),
5907 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5908 			if (error != 0) {
5909 				goto out;
5910 			}
5911 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5912 			    sizeof(optval));
5913 			if (error != 0) {
5914 				goto out;
5915 			}
5916 			if (optval == 0) {
5917 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5918 			} else {
5919 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5920 			}
5921 			break;
5922 
5923 		case SO_MPKL_SEND_INFO: {
5924 			struct so_mpkl_send_info so_mpkl_send_info;
5925 
5926 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5927 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5928 			if (error != 0) {
5929 				goto out;
5930 			}
5931 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5932 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5933 
5934 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5935 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5936 			} else {
5937 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5938 			}
5939 			break;
5940 		}
5941 		case SO_WANT_KEV_SOCKET_CLOSED: {
5942 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5943 			    sizeof(optval));
5944 			if (error != 0) {
5945 				goto out;
5946 			}
5947 			if (optval == 0) {
5948 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5949 			} else {
5950 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5951 			}
5952 			break;
5953 		}
5954 		case SO_MARK_WAKE_PKT: {
5955 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5956 			    sizeof(optval));
5957 			if (error != 0) {
5958 				goto out;
5959 			}
5960 			if (optval == 0) {
5961 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5962 			} else {
5963 				so->so_flags |= SOF_MARK_WAKE_PKT;
5964 			}
5965 			break;
5966 		}
5967 		case SO_RECV_WAKE_PKT: {
5968 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5969 			    sizeof(optval));
5970 			if (error != 0) {
5971 				goto out;
5972 			}
5973 			if (optval == 0) {
5974 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5975 			} else {
5976 				so->so_flags |= SOF_RECV_WAKE_PKT;
5977 			}
5978 			break;
5979 		}
5980 		default:
5981 			error = ENOPROTOOPT;
5982 			break;
5983 		}
5984 		if (error == 0 && so->so_proto != NULL &&
5985 		    so->so_proto->pr_ctloutput != NULL) {
5986 			(void) so->so_proto->pr_ctloutput(so, sopt);
5987 		}
5988 	}
5989 out:
5990 	if (dolock) {
5991 		socket_unlock(so, 1);
5992 	}
5993 	return error;
5994 }
5995 
5996 /* Helper routines for getsockopt */
5997 int
5998 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5999 {
6000 	int     error;
6001 	size_t  valsize;
6002 
6003 	error = 0;
6004 
6005 	/*
6006 	 * Documented get behavior is that we always return a value,
6007 	 * possibly truncated to fit in the user's buffer.
6008 	 * Traditional behavior is that we always tell the user
6009 	 * precisely how much we copied, rather than something useful
6010 	 * like the total amount we had available for her.
6011 	 * Note that this interface is not idempotent; the entire answer must
6012 	 * be generated ahead of time.
6013 	 */
6014 	valsize = MIN(len, sopt->sopt_valsize);
6015 	sopt->sopt_valsize = valsize;
6016 	if (sopt->sopt_val != USER_ADDR_NULL) {
6017 		if (sopt->sopt_p != kernproc) {
6018 			error = copyout(buf, sopt->sopt_val, valsize);
6019 		} else {
6020 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6021 		}
6022 	}
6023 	return error;
6024 }
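/*
 * A minimal userland sketch of the truncation behavior described above
 * (assumes a valid socket descriptor `s`): getsockopt() succeeds even when
 * the caller's buffer is too small, and optlen is updated to the number of
 * bytes actually copied rather than the full size of the value.
 *
 *	char small_buf[1];
 *	socklen_t optlen = sizeof(small_buf);
 *	// Returns 0; small_buf holds the first byte of the int-sized
 *	// value and optlen comes back as 1.
 *	(void)getsockopt(s, SOL_SOCKET, SO_TYPE, small_buf, &optlen);
 */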
6025 
6026 static int
6027 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
6028 {
6029 	int                     error;
6030 	size_t                  len;
6031 	struct user64_timeval   tv64 = {};
6032 	struct user32_timeval   tv32 = {};
6033 	const void *            val;
6034 	size_t                  valsize;
6035 
6036 	error = 0;
6037 	if (proc_is64bit(sopt->sopt_p)) {
6038 		len = sizeof(tv64);
6039 		tv64.tv_sec = tv_p->tv_sec;
6040 		tv64.tv_usec = tv_p->tv_usec;
6041 		val = &tv64;
6042 	} else {
6043 		len = sizeof(tv32);
6044 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
6045 		tv32.tv_usec = tv_p->tv_usec;
6046 		val = &tv32;
6047 	}
6048 	valsize = MIN(len, sopt->sopt_valsize);
6049 	sopt->sopt_valsize = valsize;
6050 	if (sopt->sopt_val != USER_ADDR_NULL) {
6051 		if (sopt->sopt_p != kernproc) {
6052 			error = copyout(val, sopt->sopt_val, valsize);
6053 		} else {
6054 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6055 		}
6056 	}
6057 	return error;
6058 }
6059 
6060 /*
6061  * Return:	0			Success
6062  *		ENOPROTOOPT
6063  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
6064  *	<pr_ctloutput>:???
6065  *	<sf_getoption>:???
6066  */
6067 int
6068 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
6069 {
6070 	int     error, optval;
6071 	struct  linger l;
6072 	struct  timeval tv;
6073 
6074 	if (sopt->sopt_dir != SOPT_GET) {
6075 		sopt->sopt_dir = SOPT_GET;
6076 	}
6077 
6078 	if (dolock) {
6079 		socket_lock(so, 1);
6080 	}
6081 
6082 	error = sflt_getsockopt(so, sopt);
6083 	if (error != 0) {
6084 		if (error == EJUSTRETURN) {
6085 			error = 0;
6086 		}
6087 		goto out;
6088 	}
6089 
6090 	if (sopt->sopt_level != SOL_SOCKET) {
6091 		if (so->so_proto != NULL &&
6092 		    so->so_proto->pr_ctloutput != NULL) {
6093 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
6094 			goto out;
6095 		}
6096 		error = ENOPROTOOPT;
6097 	} else {
6098 		/*
6099 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
6100 		 * the protocol layer, if needed.  A zero value returned from
6101 		 * the handler means use default socket-level processing as
6102 		 * done by the rest of this routine.  Otherwise, any other
6103 		 * return value indicates that the option is unsupported.
6104 		 */
6105 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6106 		    pru_socheckopt(so, sopt)) != 0) {
6107 			goto out;
6108 		}
6109 
6110 		error = 0;
6111 		switch (sopt->sopt_name) {
6112 		case SO_LINGER:
6113 		case SO_LINGER_SEC:
6114 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6115 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6116 			    so->so_linger : so->so_linger / hz;
6117 			error = sooptcopyout(sopt, &l, sizeof(l));
6118 			break;
6119 
6120 		case SO_USELOOPBACK:
6121 		case SO_DONTROUTE:
6122 		case SO_DEBUG:
6123 		case SO_KEEPALIVE:
6124 		case SO_REUSEADDR:
6125 		case SO_REUSEPORT:
6126 		case SO_BROADCAST:
6127 		case SO_OOBINLINE:
6128 		case SO_TIMESTAMP:
6129 		case SO_TIMESTAMP_MONOTONIC:
6130 		case SO_TIMESTAMP_CONTINUOUS:
6131 		case SO_DONTTRUNC:
6132 		case SO_WANTMORE:
6133 		case SO_WANTOOBFLAG:
6134 		case SO_NOWAKEFROMSLEEP:
6135 		case SO_NOAPNFALLBK:
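			/*
			 * Each boolean option above is defined so that its
			 * option value doubles as its bit in so_options, so a
			 * single AND recovers the current on/off state.
			 */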
6136 			optval = so->so_options & sopt->sopt_name;
6137 integer:
6138 			error = sooptcopyout(sopt, &optval, sizeof(optval));
6139 			break;
6140 
6141 		case SO_TYPE:
6142 			optval = so->so_type;
6143 			goto integer;
6144 
6145 		case SO_NREAD:
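			/*
			 * For datagram-style (PR_ATOMIC) protocols, only the
			 * m_next chain of the first record is walked, so
			 * SO_NREAD reports the size of the next datagram
			 * rather than the total bytes buffered.
			 */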
6146 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6147 				int pkt_total;
6148 				struct mbuf *m1;
6149 
6150 				pkt_total = 0;
6151 				m1 = so->so_rcv.sb_mb;
6152 				while (m1 != NULL) {
6153 					if (m1->m_type == MT_DATA ||
6154 					    m1->m_type == MT_HEADER ||
6155 					    m1->m_type == MT_OOBDATA) {
6156 						pkt_total += m1->m_len;
6157 					}
6158 					m1 = m1->m_next;
6159 				}
6160 				optval = pkt_total;
6161 			} else {
6162 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6163 			}
6164 			goto integer;
6165 
6166 		case SO_NUMRCVPKT:
6167 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6168 				int cnt = 0;
6169 				struct mbuf *m1;
6170 
6171 				m1 = so->so_rcv.sb_mb;
6172 				while (m1 != NULL) {
6173 					cnt += 1;
6174 					m1 = m1->m_nextpkt;
6175 				}
6176 				optval = cnt;
6177 				goto integer;
6178 			} else {
6179 				error = ENOPROTOOPT;
6180 				break;
6181 			}
6182 
6183 		case SO_NWRITE:
6184 			optval = so->so_snd.sb_cc;
6185 			goto integer;
6186 
6187 		case SO_ERROR:
6188 			optval = so->so_error;
6189 			so->so_error = 0;
6190 			goto integer;
6191 
6192 		case SO_SNDBUF: {
6193 			u_int32_t hiwat = so->so_snd.sb_hiwat;
6194 
6195 			if (so->so_snd.sb_flags & SB_UNIX) {
6196 				struct unpcb *unp =
6197 				    (struct unpcb *)(so->so_pcb);
6198 				if (unp != NULL && unp->unp_conn != NULL) {
6199 					hiwat += unp->unp_conn->unp_cc;
6200 				}
6201 			}
6202 
6203 			optval = hiwat;
6204 			goto integer;
6205 		}
6206 		case SO_RCVBUF:
6207 			optval = so->so_rcv.sb_hiwat;
6208 			goto integer;
6209 
6210 		case SO_SNDLOWAT:
6211 			optval = so->so_snd.sb_lowat;
6212 			goto integer;
6213 
6214 		case SO_RCVLOWAT:
6215 			optval = so->so_rcv.sb_lowat;
6216 			goto integer;
6217 
6218 		case SO_SNDTIMEO:
6219 		case SO_RCVTIMEO:
6220 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
6221 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6222 
6223 			error = sooptcopyout_timeval(sopt, &tv);
6224 			break;
6225 
6226 		case SO_NOSIGPIPE:
6227 			optval = (so->so_flags & SOF_NOSIGPIPE);
6228 			goto integer;
6229 
6230 		case SO_NOADDRERR:
6231 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6232 			goto integer;
6233 
6234 		case SO_REUSESHAREUID:
6235 			optval = (so->so_flags & SOF_REUSESHAREUID);
6236 			goto integer;
6237 
6238 
6239 		case SO_NOTIFYCONFLICT:
6240 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6241 			goto integer;
6242 
6243 		case SO_RESTRICTIONS:
6244 			optval = so_get_restrictions(so);
6245 			goto integer;
6246 
6247 		case SO_AWDL_UNRESTRICTED:
6248 			if (SOCK_DOM(so) == PF_INET ||
6249 			    SOCK_DOM(so) == PF_INET6) {
6250 				optval = inp_get_awdl_unrestricted(
6251 					sotoinpcb(so));
6252 				goto integer;
6253 			} else {
6254 				error = EOPNOTSUPP;
6255 			}
6256 			break;
6257 
6258 		case SO_INTCOPROC_ALLOW:
6259 			if (SOCK_DOM(so) == PF_INET6) {
6260 				optval = inp_get_intcoproc_allowed(
6261 					sotoinpcb(so));
6262 				goto integer;
6263 			} else {
6264 				error = EOPNOTSUPP;
6265 			}
6266 			break;
6267 
6268 		case SO_LABEL:
6269 			error = EOPNOTSUPP;
6270 			break;
6271 
6272 		case SO_PEERLABEL:
6273 			error = EOPNOTSUPP;
6274 			break;
6275 
6276 #ifdef __APPLE_API_PRIVATE
6277 		case SO_UPCALLCLOSEWAIT:
6278 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6279 			goto integer;
6280 #endif
6281 		case SO_RANDOMPORT:
6282 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6283 			goto integer;
6284 
6285 		case SO_NP_EXTENSIONS: {
6286 			struct so_np_extensions sonpx = {};
6287 
6288 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6289 			    SONPX_SETOPTSHUT : 0;
6290 			sonpx.npx_mask = SONPX_MASK_VALID;
6291 
6292 			error = sooptcopyout(sopt, &sonpx,
6293 			    sizeof(struct so_np_extensions));
6294 			break;
6295 		}
6296 
6297 		case SO_TRAFFIC_CLASS:
6298 			optval = so->so_traffic_class;
6299 			goto integer;
6300 
6301 		case SO_RECV_TRAFFIC_CLASS:
6302 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6303 			goto integer;
6304 
6305 #if (DEVELOPMENT || DEBUG)
6306 		case SO_TRAFFIC_CLASS_DBG:
6307 			error = sogetopt_tcdbg(so, sopt);
6308 			break;
6309 #endif /* (DEVELOPMENT || DEBUG) */
6310 
6311 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6312 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6313 			goto integer;
6314 
6315 		case SO_DEFUNCTOK:
6316 			optval = !(so->so_flags & SOF_NODEFUNCT);
6317 			goto integer;
6318 
6319 		case SO_ISDEFUNCT:
6320 			optval = (so->so_flags & SOF_DEFUNCT);
6321 			goto integer;
6322 
6323 		case SO_OPPORTUNISTIC:
6324 			optval = so_get_opportunistic(so);
6325 			goto integer;
6326 
6327 		case SO_FLUSH:
6328 			/* This option is not gettable */
6329 			error = EINVAL;
6330 			break;
6331 
6332 		case SO_RECV_ANYIF:
6333 			optval = so_get_recv_anyif(so);
6334 			goto integer;
6335 
6336 		case SO_TRAFFIC_MGT_BACKGROUND:
6337 			/* This option is handled by lower layer(s) */
6338 			if (so->so_proto != NULL &&
6339 			    so->so_proto->pr_ctloutput != NULL) {
6340 				(void) so->so_proto->pr_ctloutput(so, sopt);
6341 			}
6342 			break;
6343 
6344 #if FLOW_DIVERT
6345 		case SO_FLOW_DIVERT_TOKEN:
6346 			error = flow_divert_token_get(so, sopt);
6347 			break;
6348 #endif  /* FLOW_DIVERT */
6349 
6350 #if NECP
6351 		case SO_NECP_ATTRIBUTES:
6352 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6353 				/* Handled by MPTCP itself */
6354 				break;
6355 			}
6356 
6357 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6358 				error = EINVAL;
6359 				goto out;
6360 			}
6361 
6362 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6363 			break;
6364 
6365 		case SO_NECP_CLIENTUUID: {
6366 			uuid_t *ncu;
6367 
6368 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6369 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6370 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6371 				ncu = &sotoinpcb(so)->necp_client_uuid;
6372 			} else {
6373 				error = EINVAL;
6374 				goto out;
6375 			}
6376 
6377 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6378 			break;
6379 		}
6380 
6381 		case SO_NECP_LISTENUUID: {
6382 			uuid_t *nlu;
6383 
6384 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6385 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6386 					nlu = &sotoinpcb(so)->necp_client_uuid;
6387 				} else {
6388 					error = ENOENT;
6389 					goto out;
6390 				}
6391 			} else {
6392 				error = EINVAL;
6393 				goto out;
6394 			}
6395 
6396 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6397 			break;
6398 		}
6399 
6400 		case SO_RESOLVER_SIGNATURE: {
6401 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6402 				error = EINVAL;
6403 				goto out;
6404 			}
6405 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6406 			break;
6407 		}
6408 
6409 #endif /* NECP */
6410 
6411 #if CONTENT_FILTER
6412 		case SO_CFIL_SOCK_ID: {
6413 			cfil_sock_id_t sock_id;
6414 
6415 			sock_id = cfil_sock_id_from_socket(so);
6416 
6417 			error = sooptcopyout(sopt, &sock_id,
6418 			    sizeof(cfil_sock_id_t));
6419 			break;
6420 		}
6421 #endif  /* CONTENT_FILTER */
6422 
6423 		case SO_EXTENDED_BK_IDLE:
6424 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6425 			goto integer;
6426 		case SO_MARK_CELLFALLBACK:
6427 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6428 			    ? 1 : 0;
6429 			goto integer;
6430 		case SO_FALLBACK_MODE:
6431 			optval = so->so_fallback_mode;
6432 			goto integer;
6433 		case SO_MARK_KNOWN_TRACKER: {
6434 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6435 			    ? 1 : 0;
6436 			goto integer;
6437 		}
6438 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6439 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6440 			    ? 1 : 0;
6441 			goto integer;
6442 		}
6443 		case SO_MARK_APPROVED_APP_DOMAIN: {
6444 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6445 			    ? 1 : 0;
6446 			goto integer;
6447 		}
6448 		case SO_NET_SERVICE_TYPE: {
6449 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6450 				optval = so->so_netsvctype;
6451 			} else {
6452 				optval = NET_SERVICE_TYPE_BE;
6453 			}
6454 			goto integer;
6455 		}
6456 		case SO_NETSVC_MARKING_LEVEL:
6457 			optval = so_get_netsvc_marking_level(so);
6458 			goto integer;
6459 
6460 		case SO_MPKL_SEND_INFO: {
6461 			struct so_mpkl_send_info so_mpkl_send_info;
6462 
6463 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6464 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6465 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6466 			    sizeof(struct so_mpkl_send_info));
6467 			break;
6468 		}
6469 		case SO_MARK_WAKE_PKT:
6470 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6471 			goto integer;
6472 		case SO_RECV_WAKE_PKT:
6473 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6474 			goto integer;
6475 		default:
6476 			error = ENOPROTOOPT;
6477 			break;
6478 		}
6479 	}
6480 out:
6481 	if (dolock) {
6482 		socket_unlock(so, 1);
6483 	}
6484 	return error;
6485 }
6486 
6487 /*
6488  * The size limits on our soopt_getm is different from that on FreeBSD.
6489  * We limit the size of options to MCLBYTES. This will have to change
6490  * if we need to define options that need more space than MCLBYTES.
6491  */
6492 int
6493 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6494 {
6495 	struct mbuf *m, *m_prev;
6496 	int sopt_size = (int)sopt->sopt_valsize;
6497 	int how;
6498 
6499 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6500 		return EMSGSIZE;
6501 	}
6502 
6503 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6504 	MGET(m, how, MT_DATA);
6505 	if (m == NULL) {
6506 		return ENOBUFS;
6507 	}
6508 	if (sopt_size > MLEN) {
6509 		MCLGET(m, how);
6510 		if ((m->m_flags & M_EXT) == 0) {
6511 			m_free(m);
6512 			return ENOBUFS;
6513 		}
6514 		m->m_len = min(MCLBYTES, sopt_size);
6515 	} else {
6516 		m->m_len = min(MLEN, sopt_size);
6517 	}
6518 	sopt_size -= m->m_len;
6519 	*mp = m;
6520 	m_prev = m;
6521 
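	/*
	 * Because sopt_size was capped at MCLBYTES above and the first mbuf
	 * (with a cluster attached when needed) holds up to MCLBYTES,
	 * sopt_size should be zero at this point; the loop below only runs
	 * if additional mbufs are ever needed.
	 */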
6522 	while (sopt_size > 0) {
6523 		MGET(m, how, MT_DATA);
6524 		if (m == NULL) {
6525 			m_freem(*mp);
6526 			return ENOBUFS;
6527 		}
6528 		if (sopt_size > MLEN) {
6529 			MCLGET(m, how);
6530 			if ((m->m_flags & M_EXT) == 0) {
6531 				m_freem(*mp);
6532 				m_freem(m);
6533 				return ENOBUFS;
6534 			}
6535 			m->m_len = min(MCLBYTES, sopt_size);
6536 		} else {
6537 			m->m_len = min(MLEN, sopt_size);
6538 		}
6539 		sopt_size -= m->m_len;
6540 		m_prev->m_next = m;
6541 		m_prev = m;
6542 	}
6543 	return 0;
6544 }
6545 
6546 /* copyin sopt data into mbuf chain */
6547 int
6548 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6549 {
6550 	struct mbuf *m0 = m;
6551 
6552 	if (sopt->sopt_val == USER_ADDR_NULL) {
6553 		return 0;
6554 	}
6555 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6556 		if (sopt->sopt_p != kernproc) {
6557 			int error;
6558 
6559 			error = copyin(sopt->sopt_val, mtod(m, char *),
6560 			    m->m_len);
6561 			if (error != 0) {
6562 				m_freem(m0);
6563 				return error;
6564 			}
6565 		} else {
6566 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6567 			    mtod(m, char *), m->m_len);
6568 		}
6569 		sopt->sopt_valsize -= m->m_len;
6570 		sopt->sopt_val += m->m_len;
6571 		m = m->m_next;
6572 	}
6573 	/* enough space should have been allocated at ip6_sooptmcopyin() */
6574 	if (m != NULL) {
6575 		panic("soopt_mcopyin");
6576 		/* NOTREACHED */
6577 	}
6578 	return 0;
6579 }
6580 
6581 /* copyout mbuf chain data into soopt */
6582 int
6583 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6584 {
6585 	struct mbuf *m0 = m;
6586 	size_t valsize = 0;
6587 
6588 	if (sopt->sopt_val == USER_ADDR_NULL) {
6589 		return 0;
6590 	}
6591 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6592 		if (sopt->sopt_p != kernproc) {
6593 			int error;
6594 
6595 			error = copyout(mtod(m, char *), sopt->sopt_val,
6596 			    m->m_len);
6597 			if (error != 0) {
6598 				m_freem(m0);
6599 				return error;
6600 			}
6601 		} else {
6602 			bcopy(mtod(m, char *),
6603 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6604 		}
6605 		sopt->sopt_valsize -= m->m_len;
6606 		sopt->sopt_val += m->m_len;
6607 		valsize += m->m_len;
6608 		m = m->m_next;
6609 	}
6610 	if (m != NULL) {
6611 		/* a large enough soopt buffer should have been supplied from user-land */
6612 		m_freem(m0);
6613 		return EINVAL;
6614 	}
6615 	sopt->sopt_valsize = valsize;
6616 	return 0;
6617 }
6618 
6619 void
6620 sohasoutofband(struct socket *so)
6621 {
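	/*
	 * Per the BSD fcntl(F_SETOWN) convention, a negative so_pgid holds
	 * the negated id of a process group to signal; a positive value
	 * names a single process.
	 */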
6622 	if (so->so_pgid < 0) {
6623 		gsignal(-so->so_pgid, SIGURG);
6624 	} else if (so->so_pgid > 0) {
6625 		proc_signal(so->so_pgid, SIGURG);
6626 	}
6627 	selwakeup(&so->so_rcv.sb_sel);
6628 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6629 		KNOTE(&so->so_rcv.sb_sel.si_note,
6630 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6631 	}
6632 }
6633 
6634 int
6635 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6636 {
6637 #pragma unused(cred)
6638 	struct proc *p = current_proc();
6639 	int revents = 0;
6640 
6641 	socket_lock(so, 1);
6642 	so_update_last_owner_locked(so, PROC_NULL);
6643 	so_update_policy(so);
6644 
6645 	if (events & (POLLIN | POLLRDNORM)) {
6646 		if (soreadable(so)) {
6647 			revents |= events & (POLLIN | POLLRDNORM);
6648 		}
6649 	}
6650 
6651 	if (events & (POLLOUT | POLLWRNORM)) {
6652 		if (sowriteable(so)) {
6653 			revents |= events & (POLLOUT | POLLWRNORM);
6654 		}
6655 	}
6656 
6657 	if (events & (POLLPRI | POLLRDBAND)) {
6658 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6659 			revents |= events & (POLLPRI | POLLRDBAND);
6660 		}
6661 	}
6662 
6663 	if (revents == 0) {
6664 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6665 			/*
6666 			 * Darwin sets the flag first,
6667 			 * BSD calls selrecord first
6668 			 */
6669 			so->so_rcv.sb_flags |= SB_SEL;
6670 			selrecord(p, &so->so_rcv.sb_sel, wql);
6671 		}
6672 
6673 		if (events & (POLLOUT | POLLWRNORM)) {
6674 			/*
6675 			 * Darwin sets the flag first,
6676 			 * BSD calls selrecord first
6677 			 */
6678 			so->so_snd.sb_flags |= SB_SEL;
6679 			selrecord(p, &so->so_snd.sb_sel, wql);
6680 		}
6681 	}
6682 
6683 	socket_unlock(so, 1);
6684 	return revents;
6685 }
6686 
6687 int
6688 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6689 {
6690 	struct socket *so = (struct socket *)fp_get_data(fp);
6691 	int result;
6692 
6693 	socket_lock(so, 1);
6694 	so_update_last_owner_locked(so, PROC_NULL);
6695 	so_update_policy(so);
6696 
6697 	switch (kn->kn_filter) {
6698 	case EVFILT_READ:
6699 		kn->kn_filtid = EVFILTID_SOREAD;
6700 		break;
6701 	case EVFILT_WRITE:
6702 		kn->kn_filtid = EVFILTID_SOWRITE;
6703 		break;
6704 	case EVFILT_SOCK:
6705 		kn->kn_filtid = EVFILTID_SCK;
6706 		break;
6707 	case EVFILT_EXCEPT:
6708 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6709 		break;
6710 	default:
6711 		socket_unlock(so, 1);
6712 		knote_set_error(kn, EINVAL);
6713 		return 0;
6714 	}
6715 
6716 	/*
6717 	 * call the appropriate sub-filter attach
6718 	 * with the socket still locked
6719 	 */
6720 	result = knote_fops(kn)->f_attach(kn, kev);
6721 
6722 	socket_unlock(so, 1);
6723 
6724 	return result;
6725 }
6726 
6727 static int
6728 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6729 {
6730 	int retval = 0;
6731 	int64_t data = 0;
6732 
6733 	if (so->so_options & SO_ACCEPTCONN) {
6734 		/*
6735 		 * Radar 6615193: handle the listen case dynamically
6736 		 * for the kqueue read filter. This allows listen() to be
6737 		 * called after registering the kqueue EVFILT_READ filter.
6738 		 */
6739 
6740 		retval = !TAILQ_EMPTY(&so->so_comp);
6741 		data = so->so_qlen;
6742 		goto out;
6743 	}
6744 
6745 	/* socket isn't a listener */
6746 	/*
6747 	 * NOTE_LOWAT specifies new low water mark in data, i.e.
6748 	 * the bytes of protocol data. We therefore exclude any
6749 	 * control bytes.
6750 	 */
6751 	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6752 
6753 	if (kn->kn_sfflags & NOTE_OOB) {
6754 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6755 			kn->kn_fflags |= NOTE_OOB;
6756 			data -= so->so_oobmark;
6757 			retval = 1;
6758 			goto out;
6759 		}
6760 	}
6761 
6762 	if ((so->so_state & SS_CANTRCVMORE)
6763 #if CONTENT_FILTER
6764 	    && cfil_sock_data_pending(&so->so_rcv) == 0
6765 #endif /* CONTENT_FILTER */
6766 	    ) {
6767 		kn->kn_flags |= EV_EOF;
6768 		kn->kn_fflags = so->so_error;
6769 		retval = 1;
6770 		goto out;
6771 	}
6772 
6773 	if (so->so_error) {     /* temporary udp error */
6774 		retval = 1;
6775 		goto out;
6776 	}
6777 
6778 	int64_t lowwat = so->so_rcv.sb_lowat;
6779 	/*
6780 	 * Ensure that when NOTE_LOWAT is used, the derived
6781 	 * low water mark is bounded by the receive buffer's
6782 	 * high and low water marks.
6783 	 */
6784 	if (kn->kn_sfflags & NOTE_LOWAT) {
6785 		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6786 			lowwat = so->so_rcv.sb_hiwat;
6787 		} else if (kn->kn_sdata > lowwat) {
6788 			lowwat = kn->kn_sdata;
6789 		}
6790 	}
6791 
6792 	/*
6793 	 * While the `data` field is the amount of data to read,
6794 	 * 0-sized packets need to wake up the kqueue, see 58140856,
6795 	 * so we need to take control bytes into account too.
6796 	 */
6797 	retval = (so->so_rcv.sb_cc >= lowwat);
6798 
6799 out:
6800 	if (retval && kev) {
6801 		knote_fill_kevent(kn, kev, data);
6802 	}
6803 	return retval;
6804 }
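/*
 * A minimal userland sketch of the NOTE_LOWAT clamping above (assumes a
 * connected socket descriptor `s` and a kqueue descriptor `kq`): the
 * requested low water mark rides in the kevent data field and is bounded
 * by the receive buffer's high water mark on the kernel side.
 *
 *	struct kevent ev;
 *	// Fire only once at least 4096 bytes are available to read.
 *	EV_SET(&ev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	(void)kevent(kq, &ev, 1, NULL, 0, NULL);
 */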
6805 
6806 static int
6807 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6808 {
6809 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6810 
6811 	/* socket locked */
6812 
6813 	/*
6814 	 * If the caller explicitly asked for OOB results (e.g. poll())
6815 	 * from EVFILT_READ, then save that off in the hookid field
6816 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6817 	 */
6818 	if (kn->kn_filter == EVFILT_READ &&
6819 	    kn->kn_flags & EV_OOBAND) {
6820 		kn->kn_flags &= ~EV_OOBAND;
6821 		kn->kn_hook32 = EV_OOBAND;
6822 	} else {
6823 		kn->kn_hook32 = 0;
6824 	}
6825 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6826 		so->so_rcv.sb_flags |= SB_KNOTE;
6827 	}
6828 
6829 	/* indicate if event is already fired */
6830 	return filt_soread_common(kn, NULL, so);
6831 }
6832 
6833 static void
6834 filt_sordetach(struct knote *kn)
6835 {
6836 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6837 
6838 	socket_lock(so, 1);
6839 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6840 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6841 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6842 		}
6843 	}
6844 	socket_unlock(so, 1);
6845 }
6846 
6847 /*ARGSUSED*/
6848 static int
6849 filt_soread(struct knote *kn, long hint)
6850 {
6851 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6852 	int retval;
6853 
6854 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6855 		socket_lock(so, 1);
6856 	}
6857 
6858 	retval = filt_soread_common(kn, NULL, so);
6859 
6860 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6861 		socket_unlock(so, 1);
6862 	}
6863 
6864 	return retval;
6865 }
6866 
6867 static int
6868 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6869 {
6870 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6871 	int retval;
6872 
6873 	socket_lock(so, 1);
6874 
6875 	/* save off the new input fflags and data */
6876 	kn->kn_sfflags = kev->fflags;
6877 	kn->kn_sdata = kev->data;
6878 
6879 	/* determine if changes result in fired events */
6880 	retval = filt_soread_common(kn, NULL, so);
6881 
6882 	socket_unlock(so, 1);
6883 
6884 	return retval;
6885 }
6886 
6887 static int
6888 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6889 {
6890 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6891 	int retval;
6892 
6893 	socket_lock(so, 1);
6894 	retval = filt_soread_common(kn, kev, so);
6895 	socket_unlock(so, 1);
6896 
6897 	return retval;
6898 }
6899 
6900 int
6901 so_wait_for_if_feedback(struct socket *so)
6902 {
6903 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6904 	    (so->so_state & SS_ISCONNECTED)) {
6905 		struct inpcb *inp = sotoinpcb(so);
6906 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6907 			return 1;
6908 		}
6909 	}
6910 	return 0;
6911 }
6912 
6913 static int
6914 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6915 {
6916 	int ret = 0;
6917 	int64_t data = sbspace(&so->so_snd);
6918 
6919 	if (so->so_state & SS_CANTSENDMORE) {
6920 		kn->kn_flags |= EV_EOF;
6921 		kn->kn_fflags = so->so_error;
6922 		ret = 1;
6923 		goto out;
6924 	}
6925 
6926 	if (so->so_error) {     /* temporary udp error */
6927 		ret = 1;
6928 		goto out;
6929 	}
6930 
6931 	if (!socanwrite(so)) {
6932 		ret = 0;
6933 		goto out;
6934 	}
6935 
6936 	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6937 		ret = 1;
6938 		goto out;
6939 	}
6940 
6941 	int64_t lowwat = so->so_snd.sb_lowat;
6942 	const int64_t hiwat = so->so_snd.sb_hiwat;
6943 	/*
6944 	 * Deal with connected UNIX domain sockets which
6945 	 * rely on the fact that the sender's socket buffer is
6946 	 * actually the receiver's socket buffer.
6947 	 */
6948 	if (SOCK_DOM(so) == PF_LOCAL) {
6949 		struct unpcb *unp = sotounpcb(so);
6950 		if (unp != NULL && unp->unp_conn != NULL &&
6951 		    unp->unp_conn->unp_socket != NULL) {
6952 			struct socket *so2 = unp->unp_conn->unp_socket;
6953 			/*
6954 			 * At this point we know that `so' is locked
6955 			 * and that `unp_conn` isn't going to change.
6956 			 * However, we don't lock `so2` because doing so
6957 			 * may require unlocking `so'
6958 			 * (see unp_get_locks_in_order()).
6959 			 *
6960 			 * Two cases can happen:
6961 			 *
6962 			 * 1) we return 1 and tell the application that
6963 			 *    it can write.  Meanwhile, another thread
6964 			 *    fills up the socket buffer.  This will either
6965 			 *    lead to a blocking send or EWOULDBLOCK
6966 			 *    which the application should deal with.
6967 			 * 2) we return 0 and tell the application that
6968 			 *    the socket is not writable.  Meanwhile,
6969 			 *    another thread depletes the receive socket
6970 			 *    buffer. In this case the application will
6971 			 *    be woken up by sb_notify().
6972 			 *
6973 			 * MIN() is required because otherwise sosendcheck()
6974 			 * may return EWOULDBLOCK since it only considers
6975 			 * so->so_snd.
6976 			 */
6977 			data = MIN(data, sbspace(&so2->so_rcv));
6978 		}
6979 	}
6980 
6981 	if (kn->kn_sfflags & NOTE_LOWAT) {
6982 		if (kn->kn_sdata > hiwat) {
6983 			lowwat = hiwat;
6984 		} else if (kn->kn_sdata > lowwat) {
6985 			lowwat = kn->kn_sdata;
6986 		}
6987 	}
6988 
6989 	if (data > 0 && data >= lowwat) {
6990 		if ((so->so_flags & SOF_NOTSENT_LOWAT)
6991 #if (DEBUG || DEVELOPMENT)
6992 		    && so_notsent_lowat_check == 1
6993 #endif /* DEBUG || DEVELOPMENT */
6994 		    ) {
6995 			if ((SOCK_DOM(so) == PF_INET ||
6996 			    SOCK_DOM(so) == PF_INET6) &&
6997 			    so->so_type == SOCK_STREAM) {
6998 				ret = tcp_notsent_lowat_check(so);
6999 			}
7000 #if MPTCP
7001 			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
7002 			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
7003 				ret = mptcp_notsent_lowat_check(so);
7004 			}
7005 #endif
7006 			else {
7007 				ret = 1;
7008 				goto out;
7009 			}
7010 		} else {
7011 			ret = 1;
7012 		}
7013 	}
7014 	if (so_wait_for_if_feedback(so)) {
7015 		ret = 0;
7016 	}
7017 
7018 out:
7019 	if (ret && kev) {
7020 		knote_fill_kevent(kn, kev, data);
7021 	}
7022 	return ret;
7023 }
7024 
7025 static int
7026 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7027 {
7028 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7029 
7030 	/* socket locked */
7031 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
7032 		so->so_snd.sb_flags |= SB_KNOTE;
7033 	}
7034 
7035 	/* determine if it's already fired */
7036 	return filt_sowrite_common(kn, NULL, so);
7037 }
7038 
7039 static void
7040 filt_sowdetach(struct knote *kn)
7041 {
7042 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7043 	socket_lock(so, 1);
7044 
7045 	if (so->so_snd.sb_flags & SB_KNOTE) {
7046 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
7047 			so->so_snd.sb_flags &= ~SB_KNOTE;
7048 		}
7049 	}
7050 	socket_unlock(so, 1);
7051 }
7052 
7053 /*ARGSUSED*/
7054 static int
7055 filt_sowrite(struct knote *kn, long hint)
7056 {
7057 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7058 	int ret;
7059 
7060 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7061 		socket_lock(so, 1);
7062 	}
7063 
7064 	ret = filt_sowrite_common(kn, NULL, so);
7065 
7066 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7067 		socket_unlock(so, 1);
7068 	}
7069 
7070 	return ret;
7071 }
7072 
7073 static int
7074 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
7075 {
7076 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7077 	int ret;
7078 
7079 	socket_lock(so, 1);
7080 
7081 	/* save off the new input fflags and data */
7082 	kn->kn_sfflags = kev->fflags;
7083 	kn->kn_sdata = kev->data;
7084 
7085 	/* determine if these changes result in a triggered event */
7086 	ret = filt_sowrite_common(kn, NULL, so);
7087 
7088 	socket_unlock(so, 1);
7089 
7090 	return ret;
7091 }
7092 
7093 static int
7094 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
7095 {
7096 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7097 	int ret;
7098 
7099 	socket_lock(so, 1);
7100 	ret = filt_sowrite_common(kn, kev, so);
7101 	socket_unlock(so, 1);
7102 
7103 	return ret;
7104 }
7105 
7106 static int
7107 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
7108     struct socket *so, long ev_hint)
7109 {
7110 	int ret = 0;
7111 	int64_t data = 0;
7112 	uint32_t level_trigger = 0;
7113 
7114 	if (ev_hint & SO_FILT_HINT_CONNRESET) {
7115 		kn->kn_fflags |= NOTE_CONNRESET;
7116 	}
7117 	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
7118 		kn->kn_fflags |= NOTE_TIMEOUT;
7119 	}
7120 	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
7121 		kn->kn_fflags |= NOTE_NOSRCADDR;
7122 	}
7123 	if (ev_hint & SO_FILT_HINT_IFDENIED) {
7124 		kn->kn_fflags |= NOTE_IFDENIED;
7125 	}
7126 	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
7127 		kn->kn_fflags |= NOTE_KEEPALIVE;
7128 	}
7129 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7130 		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7131 	}
7132 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7133 		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7134 	}
7135 	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7136 	    (so->so_state & SS_ISCONNECTED)) {
7137 		kn->kn_fflags |= NOTE_CONNECTED;
7138 		level_trigger |= NOTE_CONNECTED;
7139 	}
7140 	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7141 	    (so->so_state & SS_ISDISCONNECTED)) {
7142 		kn->kn_fflags |= NOTE_DISCONNECTED;
7143 		level_trigger |= NOTE_DISCONNECTED;
7144 	}
7145 	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7146 		if (so->so_proto != NULL &&
7147 		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7148 			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7149 		}
7150 	}
7151 	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7152 	    tcp_notify_ack_active(so)) {
7153 		kn->kn_fflags |= NOTE_NOTIFY_ACK;
7154 	}
7155 	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
7156 		kn->kn_fflags |= NOTE_WAKE_PKT;
7157 	}
7158 
7159 	if ((so->so_state & SS_CANTRCVMORE)
7160 #if CONTENT_FILTER
7161 	    && cfil_sock_data_pending(&so->so_rcv) == 0
7162 #endif /* CONTENT_FILTER */
7163 	    ) {
7164 		kn->kn_fflags |= NOTE_READCLOSED;
7165 		level_trigger |= NOTE_READCLOSED;
7166 	}
7167 
7168 	if (so->so_state & SS_CANTSENDMORE) {
7169 		kn->kn_fflags |= NOTE_WRITECLOSED;
7170 		level_trigger |= NOTE_WRITECLOSED;
7171 	}
7172 
7173 	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7174 	    (so->so_flags & SOF_SUSPENDED)) {
7175 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7176 
7177 		/* If resume event was delivered before, reset it */
7178 		kn->kn_hook32 &= ~NOTE_RESUME;
7179 
7180 		kn->kn_fflags |= NOTE_SUSPEND;
7181 		level_trigger |= NOTE_SUSPEND;
7182 	}
7183 
7184 	if ((ev_hint & SO_FILT_HINT_RESUME) ||
7185 	    (so->so_flags & SOF_SUSPENDED) == 0) {
7186 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7187 
7188 		/* If suspend event was delivered before, reset it */
7189 		kn->kn_hook32 &= ~NOTE_SUSPEND;
7190 
7191 		kn->kn_fflags |= NOTE_RESUME;
7192 		level_trigger |= NOTE_RESUME;
7193 	}
7194 
7195 	if (so->so_error != 0) {
7196 		ret = 1;
7197 		data = so->so_error;
7198 		kn->kn_flags |= EV_EOF;
7199 	} else {
7200 		u_int32_t data32 = 0;
7201 		get_sockev_state(so, &data32);
7202 		data = data32;
7203 	}
7204 
7205 	/* Reset any events that are not requested on this knote */
7206 	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7207 	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7208 
7209 	/* Find the level-triggered events that are already delivered */
7210 	level_trigger &= kn->kn_hook32;
7211 	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7212 
7213 	/* Do not deliver level-triggered events more than once */
7214 	if ((kn->kn_fflags & ~level_trigger) != 0) {
7215 		ret = 1;
7216 	}
7217 
7218 	if (ret && kev) {
7219 		/*
7220 		 * Store the state of the events being delivered. This
7221 		 * state can be used to deliver level-triggered events
7222 		 * at least once and still avoid waking up the application
7223 		 * multiple times as long as the event is active.
7224 		 */
7225 		if (kn->kn_fflags != 0) {
7226 			kn->kn_hook32 |= (kn->kn_fflags &
7227 			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7228 		}
7229 
7230 		/*
7231 		 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7232 		 * only one of them and remember which one was delivered
7233 		 * most recently.
7234 		 */
7235 		if (kn->kn_fflags & NOTE_SUSPEND) {
7236 			kn->kn_hook32 &= ~NOTE_RESUME;
7237 		}
7238 		if (kn->kn_fflags & NOTE_RESUME) {
7239 			kn->kn_hook32 &= ~NOTE_SUSPEND;
7240 		}
7241 
7242 		knote_fill_kevent(kn, kev, data);
7243 	}
7244 	return ret;
7245 }
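/*
 * A minimal userland sketch of registering for these events (assumes a
 * socket descriptor `s` and a kqueue descriptor `kq`; EVFILT_SOCK and its
 * NOTE_* flags are Apple-private interfaces).  Level-triggered events such
 * as NOTE_CONNECTED are delivered once and then suppressed while they stay
 * active, as tracked via kn_hook32 above.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, s, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED, 0, NULL);
 *	(void)kevent(kq, &ev, 1, NULL, 0, NULL);
 */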
7246 
7247 static int
7248 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7249 {
7250 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7251 
7252 	/* socket locked */
7253 	kn->kn_hook32 = 0;
7254 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7255 		so->so_flags |= SOF_KNOTE;
7256 	}
7257 
7258 	/* determine if event already fired */
7259 	return filt_sockev_common(kn, NULL, so, 0);
7260 }
7261 
7262 static void
7263 filt_sockdetach(struct knote *kn)
7264 {
7265 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7266 	socket_lock(so, 1);
7267 
7268 	if ((so->so_flags & SOF_KNOTE) != 0) {
7269 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7270 			so->so_flags &= ~SOF_KNOTE;
7271 		}
7272 	}
7273 	socket_unlock(so, 1);
7274 }
7275 
7276 static int
7277 filt_sockev(struct knote *kn, long hint)
7278 {
7279 	int ret = 0, locked = 0;
7280 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7281 	long ev_hint = (hint & SO_FILT_HINT_EV);
7282 
7283 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7284 		socket_lock(so, 1);
7285 		locked = 1;
7286 	}
7287 
7288 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7289 
7290 	if (locked) {
7291 		socket_unlock(so, 1);
7292 	}
7293 
7294 	return ret;
7295 }
7296 
7297 
7298 
7299 /*
7300  *	filt_socktouch - update event state
7301  */
7302 static int
7303 filt_socktouch(
7304 	struct knote *kn,
7305 	struct kevent_qos_s *kev)
7306 {
7307 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7308 	uint32_t changed_flags;
7309 	int ret;
7310 
7311 	socket_lock(so, 1);
7312 
7313 	/* save off the [result] data and fflags */
7314 	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7315 
7316 	/* save off the new input fflags and data */
7317 	kn->kn_sfflags = kev->fflags;
7318 	kn->kn_sdata = kev->data;
7319 
7320 	/* restrict the current results to the (smaller?) set of new interest */
7321 	/*
7322 	 * For compatibility with previous implementations, we leave kn_fflags
7323 	 * as they were before.
7324 	 */
7325 	//kn->kn_fflags &= kev->fflags;
7326 
7327 	/*
7328 	 * Since we keep track of events that are already
7329 	 * delivered, if any of those events are not requested
7330 	 * anymore the state related to them can be reset
7331 	 */
7332 	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7333 
7334 	/* determine if we have events to deliver */
7335 	ret = filt_sockev_common(kn, NULL, so, 0);
7336 
7337 	socket_unlock(so, 1);
7338 
7339 	return ret;
7340 }
7341 
7342 /*
7343  *	filt_sockprocess - query event fired state and return data
7344  */
7345 static int
7346 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7347 {
7348 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7349 	int ret = 0;
7350 
7351 	socket_lock(so, 1);
7352 
7353 	ret = filt_sockev_common(kn, kev, so, 0);
7354 
7355 	socket_unlock(so, 1);
7356 
7357 	return ret;
7358 }
7359 
7360 void
7361 get_sockev_state(struct socket *so, u_int32_t *statep)
7362 {
7363 	u_int32_t state = *(statep);
7364 
7365 	/*
7366 	 * If the state variable was already set by a previous event,
7367 	 * leave it untouched.
7368 	 */
7369 	if (state != 0) {
7370 		return;
7371 	}
7372 
7373 	if (so->so_state & SS_ISCONNECTED) {
7374 		state |= SOCKEV_CONNECTED;
7375 	} else {
7376 		state &= ~(SOCKEV_CONNECTED);
7377 	}
7378 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7379 	*(statep) = state;
7380 }
7381 
7382 #define SO_LOCK_HISTORY_STR_LEN \
7383 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7384 
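/*
 * Render the socket's lock/unlock return-address history as "lock:unlock "
 * pairs, most recent first: slot (next_*_lr - 1) mod SO_LCKDBG_MAX is the
 * most recently recorded caller, since next_*_lr points at the slot that
 * will be overwritten next.
 */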
7385 __private_extern__ const char *
7386 solockhistory_nr(struct socket *so)
7387 {
7388 	size_t n = 0;
7389 	int i;
7390 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7391 
7392 	bzero(lock_history_str, sizeof(lock_history_str));
7393 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7394 		n += scnprintf(lock_history_str + n,
7395 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7396 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7397 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7398 	}
7399 	return lock_history_str;
7400 }
7401 
7402 lck_mtx_t *
7403 socket_getlock(struct socket *so, int flags)
7404 {
7405 	if (so->so_proto->pr_getlock != NULL) {
7406 		return (*so->so_proto->pr_getlock)(so, flags);
7407 	} else {
7408 		return so->so_proto->pr_domain->dom_mtx;
7409 	}
7410 }
7411 
7412 void
7413 socket_lock(struct socket *so, int refcount)
7414 {
7415 	void *lr_saved;
7416 
7417 	lr_saved = __builtin_return_address(0);
7418 
7419 	if (so->so_proto->pr_lock) {
7420 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7421 	} else {
7422 #ifdef MORE_LOCKING_DEBUG
7423 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7424 		    LCK_MTX_ASSERT_NOTOWNED);
7425 #endif
7426 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7427 		if (refcount) {
7428 			so->so_usecount++;
7429 		}
7430 		so->lock_lr[so->next_lock_lr] = lr_saved;
7431 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7432 	}
7433 }
7434 
7435 void
socket_lock_assert_owned(struct socket * so)7436 socket_lock_assert_owned(struct socket *so)
7437 {
7438 	lck_mtx_t *mutex_held;
7439 
7440 	if (so->so_proto->pr_getlock != NULL) {
7441 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7442 	} else {
7443 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7444 	}
7445 
7446 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7447 }
7448 
7449 int
socket_try_lock(struct socket * so)7450 socket_try_lock(struct socket *so)
7451 {
7452 	lck_mtx_t *mtx;
7453 
7454 	if (so->so_proto->pr_getlock != NULL) {
7455 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7456 	} else {
7457 		mtx = so->so_proto->pr_domain->dom_mtx;
7458 	}
7459 
7460 	return lck_mtx_try_lock(mtx);
7461 }
7462 
7463 void
socket_unlock(struct socket * so,int refcount)7464 socket_unlock(struct socket *so, int refcount)
7465 {
7466 	void *lr_saved;
7467 	lck_mtx_t *mutex_held;
7468 
7469 	lr_saved = __builtin_return_address(0);
7470 
7471 	if (so == NULL || so->so_proto == NULL) {
7472 		panic("%s: null so_proto so=%p", __func__, so);
7473 		/* NOTREACHED */
7474 	}
7475 
7476 	if (so->so_proto->pr_unlock) {
7477 		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7478 	} else {
7479 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7480 #ifdef MORE_LOCKING_DEBUG
7481 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7482 #endif
7483 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
7484 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7485 
7486 		if (refcount) {
7487 			if (so->so_usecount <= 0) {
7488 				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7489 				    "lrh=%s", __func__, so->so_usecount, so,
7490 				    SOCK_DOM(so), so->so_type,
7491 				    SOCK_PROTO(so), solockhistory_nr(so));
7492 				/* NOTREACHED */
7493 			}
7494 
7495 			so->so_usecount--;
7496 			if (so->so_usecount == 0) {
7497 				sofreelastref(so, 1);
7498 			}
7499 		}
7500 		lck_mtx_unlock(mutex_held);
7501 	}
7502 }
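
/*
 * Illustrative usage of the two primitives above (sketch only): a
 * caller takes the lock and a use-count reference together so the
 * socket cannot be freed underneath it, then releases both the same
 * way.
 *
 *	socket_lock(so, 1);	// lock and so_usecount++
 *	// ... operate on the locked socket ...
 *	socket_unlock(so, 1);	// so_usecount--; the last reference
 *				// calls sofreelastref(), then unlocks
 */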

/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* lock & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
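
/*
 * Together these give a hold/release pattern across regions where the
 * socket lock cannot be kept (sketch, hypothetical caller):
 *
 *	soreference(so);	// lock, so_usecount++, unlock
 *	// ... work while holding only the use-count reference ...
 *	sodereference(so);	// lock, so_usecount--, unlock;
 *				// may trigger sofreelastref()
 */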

/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  The caller must hold the
 * socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags |= SOF_MULTIPAGES;
	} else {
		so->so_flags &= ~SOF_MULTIPAGES;
	}
}
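
/*
 * Example call (illustrative only, hypothetical caller): enabling
 * jumbo clusters for a socket, under the socket lock as required.
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);		// sets SOF_MULTIPAGES
 *	socket_unlock(so, 1);
 */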

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags1 |= SOF1_IF_2KCL;
	} else {
		so->so_flags1 &= ~SOF1_IF_2KCL;
	}
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (SOCK_DOM(so) == PF_INET6) {
		return in6addr_local(&inp->in6p_faddr);
	}

	return 0;
}

int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}

int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
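
/*
 * The two phases are normally driven back to back, as
 * so_stop_extended_bk_idle() does further below (sketch):
 *
 *	sosetdefunct(p, so, level, FALSE);	// mark SOF_DEFUNCT, SB_DROP
 *	if (so->so_flags & SOF_DEFUNCT) {
 *		sodefunct(p, so, level);	// shut down, flush, SS_DEFUNCT
 *	}
 */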

int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		struct fileproc *fp;
		int count = 0;

		/*
		 * Unlock socket to avoid lock ordering issue with
		 * the proc fd table lock
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			struct socket *so2;

			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so2 = (struct socket *)fp_get_data(fp);
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
				count++;
			}
			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
				break;
			}
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return error;
}

static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * Return value tells whether the socket is still in extended background idle
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}
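
/*
 * Example timeline (numbers hypothetical): with
 * soextbkidlestat.so_xbkidle_time = 600 seconds and
 * so_extended_bk_start = net_uptime() - 700, the grace period has
 * expired, so the socket is defuncted via so_stop_extended_bk_idle()
 * and 0 is returned; with an elapsed time of, say, 100 seconds the
 * lazy INPCB timer is rescheduled instead and 1 is returned.
 */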

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
#if SKYWALK
		inp_update_netns_flags(so);
#endif /* SKYWALK */
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, a SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
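
/*
 * Trapdoor behavior, concretely (illustrative): because vals is only
 * ever OR-ed into so_restrictions, a later call cannot clear a deny
 * bit.
 *
 *	so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);  // now denied
 *	so_set_restrictions(so, 0);  // no-op; the cellular deny persists
 */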

uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
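
/*
 * Userland sketch of the path above (assumes the private SO_DELEGATED
 * socket option, which funnels into this function with check_cred set
 * for non-kernel callers; the pid is hypothetical):
 *
 *	int epid = 1234;
 *	setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 *
 * A caller that is neither the recorded owner nor delegating to itself
 * needs the PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege for this to
 * succeed.
 */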

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code       = ev_code;

	ev_msg.dv[0].data_ptr   = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
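
/*
 * Caller sketch (illustrative; the event code and any trailing payload
 * depend on the specific policy event being reported):
 *
 *	struct netpolicy_event_data ev_data = {};
 *	// ... fill in the common fields for the event ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_data,
 *	    sizeof(ev_data));
 */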

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}