/*
 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <os/log.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t        so_cache_hw;    /* High water mark for socache */
static u_int32_t        so_cache_timeouts;      /* number of timeouts */
static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
static u_int32_t        cached_sock_count = 0;
STAILQ_HEAD(, socket)   so_cache_head;
int     max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t        so_cache_time;
static int              socketinit_done;
static struct zone      *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);
static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t        so_gencnt;      /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
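
/*
 * Illustrative sketch (not part of this file's build): these kern.ipc
 * tunables are plain sysctl OIDs, so e.g. somaxconn can be inspected or
 * raised from userspace with sysctlbyname(3).  The OID string follows
 * from the SYSCTL_INT declaration above (_kern_ipc + somaxconn); the
 * helper below is hypothetical.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_and_raise_somaxconn(void)
{
	int val = 0;
	size_t len = sizeof(val);

	/* Read the current global listen-queue limit. */
	if (sysctlbyname("kern.ipc.somaxconn", &val, &len, NULL, 0) == 0) {
		printf("kern.ipc.somaxconn = %d\n", val);
	}

	/* Raise it (requires root); solisten() clamps backlogs to this. */
	val = 1024;
	(void) sysctlbyname("kern.ipc.somaxconn", NULL, NULL,
	    &val, sizeof(val));
}
#endif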

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy  = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption.  When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.  Set this to 1 for
 * testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in a header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t       so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum number of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}

static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
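
/*
 * Illustrative sketch of how cached_sock_alloc() above carves up the single
 * so_cache_zone element (sizes come from get_inpcb_str_size() and
 * get_tcp_str_size(), plus the 4-byte alignment slack counted into
 * so_cache_zone_element_size):
 *
 *   +------------------------+  <- *so
 *   | struct socket          |
 *   +------------------------+  <- ALIGN(so + 1) == so_saved_pcb
 *   | inpcb storage          |
 *   +------------------------+  <- ALIGN(...) == inp_saved_ppcb
 *   | tcpcb storage          |
 *   +------------------------+
 *
 * This is why only PF_INET/SOCK_STREAM sockets take the cached path (see
 * soalloc() below): the trailing storage is sized for TCP.
 */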

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket   *p;
	int             n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, how);
	} else {
		so = zalloc_flags(socket_zone, how | Z_ZERO);
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per-pcb lock if necessary and
	 * increase the refcount for creation; make sure it's done before
	 * the socket is inserted in the lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}
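
/*
 * Illustrative sketch (not built): minimal in-kernel creation and teardown
 * of a TCP socket through socreate()/soclose().  Real in-kernel users
 * normally go through the sock_socket() KPI; this hypothetical helper just
 * shows the calling convention of the functions in this file.
 */
#if 0
static int
example_create_tcp_socket(struct socket **out)
{
	struct socket *so = NULL;
	int error;

	/* PF_INET + SOCK_STREAM takes the cached_sock_alloc() fast path. */
	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
	if (error != 0) {
		return error;   /* EAFNOSUPPORT, EPROTOTYPE, ENOBUFS, ... */
	}

	*out = so;
	return 0;
	/* Teardown is soclose(so) once the last file reference is dropped. */
}
#endif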

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
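
/*
 * Illustrative sketch (not built): bringing up an in-kernel TCP listener
 * with sobindlock() and solisten().  This hypothetical helper assumes the
 * caller holds a socket reference but not the socket lock (hence
 * dolock == 1); backlog values outside (0, somaxconn] are clamped to
 * somaxconn as described above.
 */
#if 0
static int
example_listen_on_port(struct socket *so, in_port_t port)
{
	struct sockaddr_in sin;
	int error;

	bzero(&sin, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);

	error = sobindlock(so, (struct sockaddr *)&sin, 1);
	if (error == 0) {
		error = solisten(so, somaxconn);
	}
	return error;
}
#endif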

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}
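
/*
 * Illustrative sketch (not built) of the preflight/commit pattern the
 * comment above describes: peek at the completed-connection queue without
 * the accept list lock, then take the lock before acting on the result.
 * This hypothetical helper assumes the caller already holds the listener's
 * socket lock.
 */
#if 0
static struct socket *
example_dequeue_completed(struct socket *head)
{
	struct socket *sp;

	/* Preflight: cheap check without the accept list lock. */
	if (TAILQ_EMPTY(&head->so_comp)) {
		return NULL;
	}

	/* Commit: serialize against other users of the listener queues. */
	so_acquire_accept_list(head, NULL);
	sp = TAILQ_FIRST(&head->so_comp);
	if (sp != NULL) {
		TAILQ_REMOVE(&head->so_comp, sp, so_list);
		sp->so_state &= ~SS_COMP;
		sp->so_head = NULL;
		head->so_qlen--;
	}
	so_release_accept_list(head);

	return sp;
}
#endif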

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above.
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_incomp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			lck_mtx_t *mutex_held;

			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the timer fires;
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away but the socket is
		 * retained in the kernel, remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if the destination address
		 * is associated with a domain name and if it is a tracker
		 * domain.  Mark the socket accordingly.  Skip the lookup if the
		 * socket has already been marked as a tracker.
1699 		 */
1700 		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1701 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1702 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1703 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1704 				}
1705 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1706 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1707 				}
1708 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1709 					printf("connect() - failed necp_set_socket_domain_attributes");
1710 				}
1711 			}
1712 		}
1713 
1714 		/*
1715 		 * Run connect filter before calling protocol:
1716 		 *  - non-blocking connect returns before completion;
1717 		 */
1718 		error = sflt_connectout(so, nam);
1719 		if (error != 0) {
1720 			if (error == EJUSTRETURN) {
1721 				error = 0;
1722 			}
1723 		} else {
1724 			error = (*so->so_proto->pr_usrreqs->pru_connect)
1725 			    (so, nam, p);
1726 			if (error != 0) {
1727 				so->so_state &= ~SS_ISCONNECTING;
1728 			}
1729 		}
1730 	}
1731 	if (dolock) {
1732 		socket_unlock(so, 1);
1733 	}
1734 	return error;
1735 }
1736 
1737 int
1738 soconnect(struct socket *so, struct sockaddr *nam)
1739 {
1740 	return soconnectlock(so, nam, 1);
1741 }
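
/*
 * Illustration: soconnectlock() is what a connect(2) call ultimately
 * reaches for this socket.  The userspace sketch below is excluded from
 * the build and the helper name connect_nb is hypothetical; it shows a
 * non-blocking connect surfacing the errors tabulated above, with the
 * deferred result harvested via SO_ERROR once the socket is writable.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>

static int
connect_nb(int s, const struct sockaddr *sa, socklen_t len, int timeout_ms)
{
	(void)fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK);
	if (connect(s, sa, len) == 0) {
		return 0;                       /* connected immediately */
	}
	if (errno != EINPROGRESS) {
		return -1;                      /* e.g. EISCONN, EOPNOTSUPP above */
	}
	struct pollfd pfd = { .fd = s, .events = POLLOUT };
	if (poll(&pfd, 1, timeout_ms) <= 0) {
		return -1;                      /* timeout or poll failure */
	}
	int err = 0;
	socklen_t elen = sizeof(err);
	/* The pending connect result is parked in so_error; fetch it. */
	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &elen) < 0) {
		return -1;
	}
	if (err != 0) {
		errno = err;
		return -1;
	}
	return 0;
}
#endif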
1742 
1743 /*
1744  * Returns:	0			Success
1745  *	<pru_connect2>:EINVAL[AF_UNIX]
1746  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1747  *	<pru_connect2>:???		[other protocol families]
1748  *
1749  * Notes:	<pru_connect2> is not supported by [TCP].
1750  */
1751 int
1752 soconnect2(struct socket *so1, struct socket *so2)
1753 {
1754 	int error;
1755 
1756 	socket_lock(so1, 1);
1757 	if (so2->so_proto->pr_lock) {
1758 		socket_lock(so2, 1);
1759 	}
1760 
1761 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1762 
1763 	socket_unlock(so1, 1);
1764 	if (so2->so_proto->pr_lock) {
1765 		socket_unlock(so2, 1);
1766 	}
1767 	return error;
1768 }
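
/*
 * Illustration: soconnect2() is reached via socketpair(2), which joins
 * two freshly created local sockets; per the note above, <pru_connect2>
 * is not supported by TCP.  A minimal userspace sketch (excluded from
 * the build), assuming AF_UNIX:
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static int
make_connected_pair(void)
{
	int sv[2];
	char buf[4];

	/* Each half comes back already connected to the other. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
		return -1;
	}
	(void)write(sv[0], "ping", 4);
	(void)read(sv[1], buf, sizeof(buf));
	close(sv[0]);
	close(sv[1]);
	return 0;
}
#endif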
1769 
1770 int
1771 soconnectxlocked(struct socket *so, struct sockaddr *src,
1772     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1773     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1774     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1775 {
1776 	int error;
1777 	tracker_metadata_t metadata = { };
1778 
1779 	so_update_last_owner_locked(so, p);
1780 	so_update_policy(so);
1781 
1782 	/*
1783 	 * If this is a listening socket or if this is a previously-accepted
1784 	 * socket that has been marked as inactive, reject the connect request.
1785 	 */
1786 	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1787 		error = EOPNOTSUPP;
1788 		if (so->so_flags & SOF_DEFUNCT) {
1789 			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1790 			    "(%d)\n", __func__, proc_pid(p),
1791 			    proc_best_name(p),
1792 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1793 			    SOCK_DOM(so), SOCK_TYPE(so), error);
1794 		}
1795 		return error;
1796 	}
1797 
1798 	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1799 		return EPERM;
1800 	}
1801 
1802 	/*
1803 	 * If protocol is connection-based, can only connect once
1804 	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1805 	 * try to disconnect first.  This allows user to disconnect
1806 	 * by connecting to, e.g., a null address.
1807 	 */
1808 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1809 	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1810 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1811 	    (error = sodisconnectlocked(so)) != 0)) {
1812 		error = EISCONN;
1813 	} else {
1814 		/*
1815 		 * For TCP, check if destination address is a tracker and mark the socket accordingly
1816 		 * (only if it hasn't been marked yet).
1817 		 */
1818 		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1819 		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1820 			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1821 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1822 					so->so_flags1 |= SOF1_KNOWN_TRACKER;
1823 				}
1824 				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1825 					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1826 				}
1827 				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1828 					printf("connectx() - failed necp_set_socket_domain_attributes\n");
1829 				}
1830 			}
1831 		}
1832 
1833 		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1834 		    (flags & CONNECT_DATA_IDEMPOTENT)) {
1835 			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1836 
1837 			if (flags & CONNECT_DATA_AUTHENTICATED) {
1838 				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1839 			}
1840 		}
1841 
1842 		/*
1843 		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1844 		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1845 		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1846 		 * Case 3 allows the user to combine write with connect even if they
1847 		 * have no use for TFO (such as regular TCP or UDP).
1848 		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1849 		 */
1850 		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1851 		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1852 			so->so_flags1 |= SOF1_PRECONNECT_DATA;
1853 		}
1854 
1855 		/*
1856 		 * If a user sets data idempotent and does not pass an uio, or
1857 		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1858 		 * SOF1_DATA_IDEMPOTENT.
1859 		 */
1860 		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1861 		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1862 			/* We should return EINVAL instead perhaps. */
1863 			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1864 		}
1865 
1866 		/*
1867 		 * Run connect filter before calling protocol:
1868 		 *  - non-blocking connect returns before completion;
1869 		 */
1870 		error = sflt_connectout(so, dst);
1871 		if (error != 0) {
1872 			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1873 			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1874 			if (error == EJUSTRETURN) {
1875 				error = 0;
1876 			}
1877 		} else {
1878 			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1879 			    (so, src, dst, p, ifscope, aid, pcid,
1880 			    flags, arg, arglen, auio, bytes_written);
1881 			if (error != 0) {
1882 				so->so_state &= ~SS_ISCONNECTING;
1883 				if (error != EINPROGRESS) {
1884 					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1885 				}
1886 			}
1887 		}
1888 	}
1889 
1890 	return error;
1891 }
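
/*
 * Illustration: soconnectxlocked() backs connectx(2).  The userspace
 * sketch below is excluded from the build and connectx_tfo is a
 * hypothetical helper; it shows CONNECT_DATA_IDEMPOTENT asking for the
 * supplied bytes to ride on the initial SYN (TFO), which corresponds to
 * the SOF1_DATA_IDEMPOTENT/SOF1_PRECONNECT_DATA handling above.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
connectx_tfo(int s, const struct sockaddr *dst, socklen_t dlen,
    void *data, size_t datalen)
{
	sa_endpoints_t eps;
	struct iovec iov;
	size_t sent = 0;

	memset(&eps, 0, sizeof(eps));
	eps.sae_dstaddr = dst;
	eps.sae_dstaddrlen = dlen;

	iov.iov_base = data;
	iov.iov_len = datalen;

	/* 'sent' reports how many of the preconnect bytes were taken. */
	return connectx(s, &eps, SAE_ASSOCID_ANY, CONNECT_DATA_IDEMPOTENT,
	    &iov, 1, &sent, NULL);
}
#endif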
1892 
1893 int
1894 sodisconnectlocked(struct socket *so)
1895 {
1896 	int error;
1897 
1898 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1899 		error = ENOTCONN;
1900 		goto bad;
1901 	}
1902 	if (so->so_state & SS_ISDISCONNECTING) {
1903 		error = EALREADY;
1904 		goto bad;
1905 	}
1906 
1907 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1908 	if (error == 0) {
1909 		sflt_notify(so, sock_evt_disconnected, NULL);
1910 	}
1911 
1912 bad:
1913 	return error;
1914 }
1915 
1916 /* Locking version */
1917 int
1918 sodisconnect(struct socket *so)
1919 {
1920 	int error;
1921 
1922 	socket_lock(so, 1);
1923 	error = sodisconnectlocked(so);
1924 	socket_unlock(so, 1);
1925 	return error;
1926 }
1927 
1928 int
1929 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1930 {
1931 	int error;
1932 
1933 	/*
1934 	 * Call the protocol disconnectx handler; let it handle all
1935 	 * matters related to the connection state of this session.
1936 	 */
1937 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1938 	if (error == 0) {
1939 		/*
1940 		 * The event applies only for the session, not for
1941 		 * the disconnection of individual subflows.
1942 		 */
1943 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1944 			sflt_notify(so, sock_evt_disconnected, NULL);
1945 		}
1946 	}
1947 	return error;
1948 }
1949 
1950 int
1951 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1952 {
1953 	int error;
1954 
1955 	socket_lock(so, 1);
1956 	error = sodisconnectxlocked(so, aid, cid);
1957 	socket_unlock(so, 1);
1958 	return error;
1959 }
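
/*
 * Illustration: the userspace entry point for the two routines above is
 * disconnectx(2).  A minimal sketch (excluded from the build); for a
 * plain single-association socket the wildcard identifiers are used:
 */
#if 0
#include <sys/socket.h>

static int
drop_connection(int s)
{
	return disconnectx(s, SAE_ASSOCID_ANY, SAE_CONNID_ANY);
}
#endif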
1960 
1961 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1962 
1963 /*
1964  * sosendcheck will lock the socket buffer if it isn't locked and
1965  * verify that there is space for the data being inserted.
1966  *
1967  * Returns:	0			Success
1968  *		EPIPE
1969  *	sblock:EWOULDBLOCK
1970  *	sblock:EINTR
1971  *	sbwait:EBADF
1972  *	sbwait:EINTR
1973  *	[so_error]:???
1974  */
1975 int
1976 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1977     int32_t clen, int32_t atomic, int flags, int *sblocked)
1978 {
1979 	int     error = 0;
1980 	int32_t space;
1981 	int     assumelock = 0;
1982 
1983 restart:
1984 	if (*sblocked == 0) {
1985 		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1986 		    so->so_send_filt_thread != 0 &&
1987 		    so->so_send_filt_thread == current_thread()) {
1988 			/*
1989 			 * We're being called recursively from a filter,
1990 			 * allow this to continue. Radar 4150520.
1991 			 * Don't set sblocked because we don't want
1992 			 * to perform an unlock later.
1993 			 */
1994 			assumelock = 1;
1995 		} else {
1996 			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1997 			if (error) {
1998 				if (so->so_flags & SOF_DEFUNCT) {
1999 					goto defunct;
2000 				}
2001 				return error;
2002 			}
2003 			*sblocked = 1;
2004 		}
2005 	}
2006 
2007 	/*
2008 	 * If a send attempt is made on a socket that has been marked
2009 	 * as inactive (disconnected), reject the request.
2010 	 */
2011 	if (so->so_flags & SOF_DEFUNCT) {
2012 defunct:
2013 		error = EPIPE;
2014 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2015 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
2016 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2017 		    SOCK_DOM(so), SOCK_TYPE(so), error);
2018 		return error;
2019 	}
2020 
2021 	if (so->so_state & SS_CANTSENDMORE) {
2022 #if CONTENT_FILTER
2023 		/*
2024 		 * Can re-inject data on half-closed connections
2025 		 */
2026 		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2027 		    so->so_snd.sb_cfil_thread == current_thread() &&
2028 		    cfil_sock_data_pending(&so->so_snd) != 0) {
2029 			CFIL_LOG(LOG_INFO,
2030 			    "so %llx ignore SS_CANTSENDMORE",
2031 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2032 		} else
2033 #endif /* CONTENT_FILTER */
2034 		return EPIPE;
2035 	}
2036 	if (so->so_error) {
2037 		error = so->so_error;
2038 		so->so_error = 0;
2039 		return error;
2040 	}
2041 
2042 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2043 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2044 			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2045 			    (resid != 0 || clen == 0) &&
2046 			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2047 				return ENOTCONN;
2048 			}
2049 		} else if (addr == 0) {
2050 			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2051 			       ENOTCONN : EDESTADDRREQ;
2052 		}
2053 	}
2054 
2055 	space = sbspace(&so->so_snd);
2056 
2057 	if (flags & MSG_OOB) {
2058 		space += 1024;
2059 	}
2060 	if ((atomic && resid > so->so_snd.sb_hiwat) ||
2061 	    clen > so->so_snd.sb_hiwat) {
2062 		return EMSGSIZE;
2063 	}
2064 
2065 	if ((space < resid + clen &&
2066 	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2067 	    space < clen)) ||
2068 	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2069 		/*
2070 		 * don't block the connectx call when there's more data
2071 		 * than can be copied.
2072 		 */
2073 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2074 			if (space == 0) {
2075 				return EWOULDBLOCK;
2076 			}
2077 			if (space < (int32_t)so->so_snd.sb_lowat) {
2078 				return 0;
2079 			}
2080 		}
2081 		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2082 		    assumelock) {
2083 			return EWOULDBLOCK;
2084 		}
2085 		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2086 		*sblocked = 0;
2087 		error = sbwait(&so->so_snd);
2088 		if (error) {
2089 			if (so->so_flags & SOF_DEFUNCT) {
2090 				goto defunct;
2091 			}
2092 			return error;
2093 		}
2094 		goto restart;
2095 	}
2096 	return 0;
2097 }
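
/*
 * Illustration: the space test above, restated as a standalone predicate
 * (excluded from the build; send_would_block is a hypothetical name).  A
 * send blocks when the pending bytes do not fit and either the send is
 * atomic, free space sits below the low-water mark, or the control data
 * alone does not fit.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
send_would_block(int32_t space, int64_t resid, int32_t clen, bool atomic,
    int32_t lowat)
{
	if (space >= resid + clen) {
		return false;           /* everything fits right now */
	}
	return atomic || space < lowat || space < clen;
}
#endif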
2098 
2099 /*
2100  * Send on a socket.
2101  * If send must go all at once and message is larger than
2102  * send buffering, then hard error.
2103  * Lock against other senders.
2104  * If must go all at once and not enough room now, then
2105  * inform user that this would block and do nothing.
2106  * Otherwise, if nonblocking, send as much as possible.
2107  * The data to be sent is described by "uio" if nonzero,
2108  * otherwise by the mbuf chain "top" (which must be null
2109  * if uio is not).  Data provided in mbuf chain must be small
2110  * enough to send all at once.
2111  *
2112  * Returns nonzero on error, timeout or signal; callers
2113  * must check for short counts if EINTR/ERESTART are returned.
2114  * Data and control buffers are freed on return.
2115  *
2116  * Returns:	0			Success
2117  *		EOPNOTSUPP
2118  *		EINVAL
2119  *		ENOBUFS
2120  *	uiomove:EFAULT
2121  *	sosendcheck:EPIPE
2122  *	sosendcheck:EWOULDBLOCK
2123  *	sosendcheck:EINTR
2124  *	sosendcheck:EBADF
2125  *	sosendcheck:EINTR
2126  *	sosendcheck:???			[value from so_error]
2127  *	<pru_send>:ECONNRESET[TCP]
2128  *	<pru_send>:EINVAL[TCP]
2129  *	<pru_send>:ENOBUFS[TCP]
2130  *	<pru_send>:EADDRINUSE[TCP]
2131  *	<pru_send>:EADDRNOTAVAIL[TCP]
2132  *	<pru_send>:EAFNOSUPPORT[TCP]
2133  *	<pru_send>:EACCES[TCP]
2134  *	<pru_send>:EAGAIN[TCP]
2135  *	<pru_send>:EPERM[TCP]
2136  *	<pru_send>:EMSGSIZE[TCP]
2137  *	<pru_send>:EHOSTUNREACH[TCP]
2138  *	<pru_send>:ENETUNREACH[TCP]
2139  *	<pru_send>:ENETDOWN[TCP]
2140  *	<pru_send>:ENOMEM[TCP]
2141  *	<pru_send>:ENOBUFS[TCP]
2142  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2143  *	<pru_send>:EINVAL[AF_UNIX]
2144  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2145  *	<pru_send>:EPIPE[AF_UNIX]
2146  *	<pru_send>:ENOTCONN[AF_UNIX]
2147  *	<pru_send>:EISCONN[AF_UNIX]
2148  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2149  *	<sf_data_out>:???		[whatever a filter author chooses]
2150  *
2151  * Notes:	Other <pru_send> returns depend on the protocol family; all
2152  *		<sf_data_out> returns depend on what the filter author causes
2153  *		their filter to return.
2154  */
2155 int
2156 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2157     struct mbuf *top, struct mbuf *control, int flags)
2158 {
2159 	struct mbuf **mp;
2160 	struct mbuf *m, *freelist = NULL;
2161 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2162 	user_ssize_t space, len, resid, orig_resid;
2163 	int clen = 0, error, dontroute, sendflags;
2164 	int atomic = sosendallatonce(so) || top;
2165 	int sblocked = 0;
2166 	struct proc *p = current_proc();
2167 	uint16_t headroom = 0;
2168 	ssize_t mlen;
2169 	boolean_t en_tracing = FALSE;
2170 
2171 	if (uio != NULL) {
2172 		resid = uio_resid(uio);
2173 	} else {
2174 		resid = top->m_pkthdr.len;
2175 	}
2176 
2177 	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2178 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2179 
2180 	socket_lock(so, 1);
2181 
2182 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2183 		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2184 	}
2185 
2186 	/*
2187 	 * Trace only if tracing is enabled, for network (vs. unix)
2188 	 * sockets, and for non-loopback traffic
2189 	 */
2190 	if (ENTR_SHOULDTRACE &&
2191 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2192 		struct inpcb *inp = sotoinpcb(so);
2193 		if (inp->inp_last_outifp != NULL &&
2194 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2195 			en_tracing = TRUE;
2196 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2197 			    VM_KERNEL_ADDRPERM(so),
2198 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2199 			    (int64_t)resid);
2200 			orig_resid = resid;
2201 		}
2202 	}
2203 
2204 	/*
2205 	 * Re-injection should not affect process accounting
2206 	 */
2207 	if ((flags & MSG_SKIPCFIL) == 0) {
2208 		so_update_last_owner_locked(so, p);
2209 		so_update_policy(so);
2210 
2211 #if NECP
2212 		so_update_necp_policy(so, NULL, addr);
2213 #endif /* NECP */
2214 	}
2215 
2216 	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2217 		error = EOPNOTSUPP;
2218 		goto out_locked;
2219 	}
2220 
2221 	/*
2222 	 * In theory resid should be unsigned.
2223 	 * However, space must be signed, as it might be less than 0
2224 	 * if we over-committed, and we must use a signed comparison
2225 	 * of space and resid.  On the other hand, a negative resid
2226 	 * causes us to loop sending 0-length segments to the protocol.
2227 	 *
2228 	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2229 	 *
2230 	 * Note: We limit resid to be a positive int value as we use
2231 	 * imin() to set bytes_to_copy -- radr://14558484
2232 	 */
2233 	if (resid < 0 || resid > INT_MAX ||
2234 	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2235 		error = EINVAL;
2236 		goto out_locked;
2237 	}
2238 
2239 	dontroute = (flags & MSG_DONTROUTE) &&
2240 	    (so->so_options & SO_DONTROUTE) == 0 &&
2241 	    (so->so_proto->pr_flags & PR_ATOMIC);
2242 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2243 
2244 	if (control != NULL) {
2245 		clen = control->m_len;
2246 	}
2247 
2248 	if (soreserveheadroom != 0) {
2249 		headroom = so->so_pktheadroom;
2250 	}
2251 
2252 	do {
2253 		error = sosendcheck(so, addr, resid, clen, atomic, flags,
2254 		    &sblocked);
2255 		if (error) {
2256 			goto out_locked;
2257 		}
2258 
2259 		mp = &top;
2260 		space = sbspace(&so->so_snd) - clen;
2261 		space += ((flags & MSG_OOB) ? 1024 : 0);
2262 
2263 		do {
2264 			if (uio == NULL) {
2265 				/*
2266 				 * Data is prepackaged in "top".
2267 				 */
2268 				resid = 0;
2269 				if (flags & MSG_EOR) {
2270 					top->m_flags |= M_EOR;
2271 				}
2272 			} else {
2273 				int chainlength;
2274 				int bytes_to_copy;
2275 				boolean_t jumbocl;
2276 				boolean_t bigcl;
2277 				int bytes_to_alloc;
2278 
2279 				bytes_to_copy = imin((int)resid, (int)space);
2280 
2281 				bytes_to_alloc = bytes_to_copy;
2282 				if (top == NULL) {
2283 					bytes_to_alloc += headroom;
2284 				}
2285 
2286 				if (sosendminchain > 0) {
2287 					chainlength = 0;
2288 				} else {
2289 					chainlength = sosendmaxchain;
2290 				}
2291 
2292 				/*
2293 				 * Use big 4 KB clusters when the outgoing interface
2294 				 * does not prefer 2 KB clusters
2295 				 */
2296 				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2297 				    sosendbigcl_ignore_capab;
2298 
2299 				/*
2300 				 * Attempt to use larger than system page-size
2301 				 * clusters for large writes only if there is
2302 				 * a jumbo cluster pool and if the socket is
2303 				 * marked accordingly.
2304 				 */
2305 				jumbocl = sosendjcl && njcl > 0 &&
2306 				    ((so->so_flags & SOF_MULTIPAGES) ||
2307 				    sosendjcl_ignore_capab) &&
2308 				    bigcl;
2309 
2310 				socket_unlock(so, 0);
2311 
2312 				do {
2313 					int num_needed;
2314 					int hdrs_needed = (top == NULL) ? 1 : 0;
2315 
2316 					/*
2317 					 * Try to maintain a local cache of mbuf
2318 					 * clusters needed to complete this
2319 					 * write; the list is further limited to
2320 					 * the number that are currently needed
2321 					 * to fill the socket.  This mechanism
2322 					 * allows a large number of mbufs/
2323 					 * clusters to be grabbed under a single
2324 					 * mbuf lock.  If we can't get any
2325 					 * clusters, then fall back to trying
2326 					 * for mbufs.  If we fail early (or
2327 					 * miscalculate the number needed), make
2328 					 * sure to release any clusters we
2329 					 * haven't yet consumed.
2330 					 */
2331 					if (freelist == NULL &&
2332 					    bytes_to_alloc > MBIGCLBYTES &&
2333 					    jumbocl) {
2334 						num_needed =
2335 						    bytes_to_alloc / M16KCLBYTES;
2336 
2337 						if ((bytes_to_alloc -
2338 						    (num_needed * M16KCLBYTES))
2339 						    >= MINCLSIZE) {
2340 							num_needed++;
2341 						}
2342 
2343 						freelist =
2344 						    m_getpackets_internal(
2345 							(unsigned int *)&num_needed,
2346 							hdrs_needed, M_WAIT, 0,
2347 							M16KCLBYTES);
2348 						/*
2349 						 * Fall back to 4K cluster size
2350 						 * if allocation failed
2351 						 */
2352 					}
2353 
2354 					if (freelist == NULL &&
2355 					    bytes_to_alloc > MCLBYTES &&
2356 					    bigcl) {
2357 						num_needed =
2358 						    bytes_to_alloc / MBIGCLBYTES;
2359 
2360 						if ((bytes_to_alloc -
2361 						    (num_needed * MBIGCLBYTES)) >=
2362 						    MINCLSIZE) {
2363 							num_needed++;
2364 						}
2365 
2366 						freelist =
2367 						    m_getpackets_internal(
2368 							(unsigned int *)&num_needed,
2369 							hdrs_needed, M_WAIT, 0,
2370 							MBIGCLBYTES);
2371 						/*
2372 						 * Fall back to cluster size
2373 						 * if allocation failed
2374 						 */
2375 					}
2376 
2377 					/*
2378 					 * Allocate a cluster, as we want to
2379 					 * avoid splitting the data into more
2380 					 * than one segment; using MINCLSIZE
2381 					 * would lead us to allocate two mbufs.
2382 					 */
2383 					if (soreserveheadroom != 0 &&
2384 					    freelist == NULL &&
2385 					    ((top == NULL &&
2386 					    bytes_to_alloc > _MHLEN) ||
2387 					    bytes_to_alloc > _MLEN)) {
2388 						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2389 						    MCLBYTES;
2390 						freelist =
2391 						    m_getpackets_internal(
2392 							(unsigned int *)&num_needed,
2393 							hdrs_needed, M_WAIT, 0,
2394 							MCLBYTES);
2395 						/*
2396 						 * Fall back to a single mbuf
2397 						 * if allocation failed
2398 						 */
2399 					} else if (freelist == NULL &&
2400 					    bytes_to_alloc > MINCLSIZE) {
2401 						num_needed =
2402 						    bytes_to_alloc / MCLBYTES;
2403 
2404 						if ((bytes_to_alloc -
2405 						    (num_needed * MCLBYTES)) >=
2406 						    MINCLSIZE) {
2407 							num_needed++;
2408 						}
2409 
2410 						freelist =
2411 						    m_getpackets_internal(
2412 							(unsigned int *)&num_needed,
2413 							hdrs_needed, M_WAIT, 0,
2414 							MCLBYTES);
2415 						/*
2416 						 * Fall back to a single mbuf
2417 						 * if allocation failed
2418 						 */
2419 					}
2420 					/*
2421 					 * For datagram protocols, leave
2422 					 * headroom for protocol headers
2423 					 * in the first cluster of the chain
2424 					 */
2425 					if (freelist != NULL && atomic &&
2426 					    top == NULL && headroom > 0) {
2427 						freelist->m_data += headroom;
2428 					}
2429 
2430 					/*
2431 					 * Fall back to regular mbufs without
2432 					 * reserving the socket headroom
2433 					 */
2434 					if (freelist == NULL) {
2435 						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2436 							if (top == NULL) {
2437 								MGETHDR(freelist,
2438 								    M_WAIT, MT_DATA);
2439 							} else {
2440 								MGET(freelist,
2441 								    M_WAIT, MT_DATA);
2442 							}
2443 						}
2444 
2445 						if (freelist == NULL) {
2446 							error = ENOBUFS;
2447 							socket_lock(so, 0);
2448 							goto out_locked;
2449 						}
2450 						/*
2451 						 * For datagram protocols,
2452 						 * leave room for protocol
2453 						 * headers in first mbuf.
2454 						 */
2455 						if (atomic && top == NULL &&
2456 						    bytes_to_copy > 0 &&
2457 						    bytes_to_copy < MHLEN) {
2458 							MH_ALIGN(freelist,
2459 							    bytes_to_copy);
2460 						}
2461 					}
2462 					m = freelist;
2463 					freelist = m->m_next;
2464 					m->m_next = NULL;
2465 
2466 					if ((m->m_flags & M_EXT)) {
2467 						mlen = m->m_ext.ext_size -
2468 						    M_LEADINGSPACE(m);
2469 					} else if ((m->m_flags & M_PKTHDR)) {
2470 						mlen = MHLEN - M_LEADINGSPACE(m);
2471 						m_add_crumb(m, PKT_CRUMB_SOSEND);
2472 					} else {
2473 						mlen = MLEN - M_LEADINGSPACE(m);
2474 					}
2475 					len = imin((int)mlen, bytes_to_copy);
2476 
2477 					chainlength += len;
2478 
2479 					space -= len;
2480 
2481 					error = uiomove(mtod(m, caddr_t),
2482 					    (int)len, uio);
2483 
2484 					resid = uio_resid(uio);
2485 
2486 					m->m_len = (int32_t)len;
2487 					*mp = m;
2488 					top->m_pkthdr.len += len;
2489 					if (error) {
2490 						break;
2491 					}
2492 					mp = &m->m_next;
2493 					if (resid <= 0) {
2494 						if (flags & MSG_EOR) {
2495 							top->m_flags |= M_EOR;
2496 						}
2497 						break;
2498 					}
2499 					bytes_to_copy = imin((int)resid, (int)space);
2500 				} while (space > 0 &&
2501 				    (chainlength < sosendmaxchain || atomic ||
2502 				    resid < MINCLSIZE));
2503 
2504 				socket_lock(so, 0);
2505 
2506 				if (error) {
2507 					goto out_locked;
2508 				}
2509 			}
2510 
2511 			if (dontroute) {
2512 				so->so_options |= SO_DONTROUTE;
2513 			}
2514 
2515 			/*
2516 			 * Compute flags here, for pru_send and NKEs
2517 			 *
2518 			 * If the user set MSG_EOF, the protocol
2519 			 * understands this flag, and there is nothing
2520 			 * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
2521 			 */
2522 			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2523 			    ((flags & MSG_EOF) &&
2524 			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2525 			    (resid <= 0)) ? PRUS_EOF :
2526 			    /* If there is more to send set PRUS_MORETOCOME */
2527 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2528 
2529 			if ((flags & MSG_SKIPCFIL) == 0) {
2530 				/*
2531 				 * Socket filter processing
2532 				 */
2533 				error = sflt_data_out(so, addr, &top,
2534 				    &control, (sendflags & MSG_OOB) ?
2535 				    sock_data_filt_flag_oob : 0);
2536 				if (error) {
2537 					if (error == EJUSTRETURN) {
2538 						error = 0;
2539 						goto packet_consumed;
2540 					}
2541 					goto out_locked;
2542 				}
2543 #if CONTENT_FILTER
2544 				/*
2545 				 * Content filter processing
2546 				 */
2547 				error = cfil_sock_data_out(so, addr, top,
2548 				    control, sendflags, dgram_flow_entry);
2549 				if (error) {
2550 					if (error == EJUSTRETURN) {
2551 						error = 0;
2552 						goto packet_consumed;
2553 					}
2554 					goto out_locked;
2555 				}
2556 #endif /* CONTENT_FILTER */
2557 			}
2558 			error = (*so->so_proto->pr_usrreqs->pru_send)
2559 			    (so, sendflags, top, addr, control, p);
2560 
2561 packet_consumed:
2562 			if (dontroute) {
2563 				so->so_options &= ~SO_DONTROUTE;
2564 			}
2565 
2566 			clen = 0;
2567 			control = NULL;
2568 			top = NULL;
2569 			mp = &top;
2570 			if (error) {
2571 				goto out_locked;
2572 			}
2573 		} while (resid && space > 0);
2574 	} while (resid);
2575 
2576 out_locked:
2577 	if (sblocked) {
2578 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2579 	} else {
2580 		socket_unlock(so, 1);
2581 	}
2582 	if (top != NULL) {
2583 		m_freem(top);
2584 	}
2585 	if (control != NULL) {
2586 		m_freem(control);
2587 	}
2588 	if (freelist != NULL) {
2589 		m_freem_list(freelist);
2590 	}
2591 
2592 	if (dgram_flow_entry != NULL) {
2593 		soflow_free_flow(dgram_flow_entry);
2594 	}
2595 
2596 	soclearfastopen(so);
2597 
2598 	if (en_tracing) {
2599 		/* resid passed here is the bytes left in uio */
2600 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2601 		    VM_KERNEL_ADDRPERM(so),
2602 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2603 		    (int64_t)(orig_resid - resid));
2604 	}
2605 	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2606 	    so->so_snd.sb_cc, space, error);
2607 
2608 	return error;
2609 }
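
/*
 * Illustration: per the contract above, callers must check for short
 * counts when EINTR/ERESTART come back.  A userspace sketch of the
 * canonical retry loop (excluded from the build; send_all is a
 * hypothetical helper):
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

static ssize_t
send_all(int s, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(s, buf + off, len - off, 0);
		if (n < 0) {
			if (errno == EINTR) {
				continue;       /* retry after a signal */
			}
			return -1;              /* e.g. EPIPE, EMSGSIZE above */
		}
		off += (size_t)n;
	}
	return (ssize_t)off;
}
#endif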
2610 
2611 int
2612 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2613 {
2614 	struct mbuf *m0 = NULL, *control_end = NULL;
2615 
2616 	socket_lock_assert_owned(so);
2617 
2618 	/*
2619 	 * top must point to the mbuf chain to be sent.
2620 	 * If control is not NULL, top must be a packet header.
2621 	 */
2622 	VERIFY(top != NULL &&
2623 	    (control == NULL || top->m_flags & M_PKTHDR));
2624 
2625 	/*
2626 	 * If control is not passed in, see if we can get it
2627 	 * from top.
2628 	 */
2629 	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2630 		// Locate start of control if present and start of data
2631 		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2632 			if (m0->m_flags & M_PKTHDR) {
2633 				top = m0;
2634 				break;
2635 			} else if (m0->m_type == MT_CONTROL) {
2636 				if (control == NULL) {
2637 					// Found start of control
2638 					control = m0;
2639 				}
2640 				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2641 					// Found end of control
2642 					control_end = m0;
2643 				}
2644 			}
2645 		}
2646 		if (control_end != NULL) {
2647 			control_end->m_next = NULL;
2648 		}
2649 	}
2650 
2651 	int error = (*so->so_proto->pr_usrreqs->pru_send)
2652 	    (so, sendflags, top, addr, control, current_proc());
2653 
2654 	return error;
2655 }
2656 
2657 /*
2658  * Supported only for connected sockets (no address) without ancillary
2659  * data (control mbuf), for atomic protocols
2660  */
2661 int
2662 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2663 {
2664 	struct mbuf *m, *freelist = NULL;
2665 	struct soflow_hash_entry *dgram_flow_entry = NULL;
2666 	user_ssize_t len, resid;
2667 	int error, dontroute;
2668 	int atomic = sosendallatonce(so);
2669 	int sblocked = 0;
2670 	struct proc *p = current_proc();
2671 	u_int uiofirst = 0;
2672 	u_int uiolast = 0;
2673 	struct mbuf *top = NULL;
2674 	uint16_t headroom = 0;
2675 	ssize_t mlen;
2676 	boolean_t bigcl;
2677 
2678 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2679 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2680 
2681 	if (so->so_type != SOCK_DGRAM) {
2682 		error = EINVAL;
2683 		goto out;
2684 	}
2685 	if (atomic == 0) {
2686 		error = EINVAL;
2687 		goto out;
2688 	}
2689 	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2690 		error = EPROTONOSUPPORT;
2691 		goto out;
2692 	}
2693 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2694 		error = EINVAL;
2695 		goto out;
2696 	}
2697 	resid = uio_array_resid(uioarray, uiocnt);
2698 
2699 	/*
2700 	 * In theory resid should be unsigned.
2701 	 * However, space must be signed, as it might be less than 0
2702 	 * if we over-committed, and we must use a signed comparison
2703 	 * of space and resid.  On the other hand, a negative resid
2704 	 * causes us to loop sending 0-length segments to the protocol.
2705 	 *
2706 	 * Note: We limit resid to be a positive int value as we use
2707 	 * imin() to set bytes_to_copy -- radr://14558484
2708 	 */
2709 	if (resid < 0 || resid > INT_MAX) {
2710 		error = EINVAL;
2711 		goto out;
2712 	}
2713 
2714 	socket_lock(so, 1);
2715 	so_update_last_owner_locked(so, p);
2716 	so_update_policy(so);
2717 
2718 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2719 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2720 	}
2721 
2722 #if NECP
2723 	so_update_necp_policy(so, NULL, NULL);
2724 #endif /* NECP */
2725 
2726 	dontroute = (flags & MSG_DONTROUTE) &&
2727 	    (so->so_options & SO_DONTROUTE) == 0 &&
2728 	    (so->so_proto->pr_flags & PR_ATOMIC);
2729 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2730 
2731 	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2732 	if (error) {
2733 		goto release;
2734 	}
2735 
2736 	/*
2737 	 * Use big 4 KB clusters when the outgoing interface does not prefer
2738 	 * 2 KB clusters
2739 	 */
2740 	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2741 
2742 	if (soreserveheadroom != 0) {
2743 		headroom = so->so_pktheadroom;
2744 	}
2745 
2746 	do {
2747 		int i;
2748 		int num_needed = 0;
2749 		int chainlength;
2750 		size_t maxpktlen = 0;
2751 		int bytes_to_alloc;
2752 
2753 		if (sosendminchain > 0) {
2754 			chainlength = 0;
2755 		} else {
2756 			chainlength = sosendmaxchain;
2757 		}
2758 
2759 		socket_unlock(so, 0);
2760 
2761 		/*
2762 		 * Find a set of uios that fits in a reasonable number
2763 		 * of mbuf packets
2764 		 */
2765 		for (i = uiofirst; i < uiocnt; i++) {
2766 			struct uio *auio = uioarray[i];
2767 
2768 			len = uio_resid(auio);
2769 
2770 			/* Do nothing for empty messages */
2771 			if (len == 0) {
2772 				continue;
2773 			}
2774 
2775 			num_needed += 1;
2776 			uiolast += 1;
2777 
2778 			if (len > maxpktlen) {
2779 				maxpktlen = len;
2780 			}
2781 
2782 			chainlength += len;
2783 			if (chainlength > sosendmaxchain) {
2784 				break;
2785 			}
2786 		}
2787 		/*
2788 		 * Nothing left to send
2789 		 */
2790 		if (num_needed == 0) {
2791 			socket_lock(so, 0);
2792 			break;
2793 		}
2794 		/*
2795 		 * Allocate buffer large enough to include headroom space for
2796 		 * network and link headers
2797 		 *
2798 		 */
2799 		bytes_to_alloc = (int) maxpktlen + headroom;
2800 
2801 		/*
2802 		 * Allocate a single contiguous buffer of the smallest available
2803 		 * size when possible
2804 		 */
2805 		if (bytes_to_alloc > MCLBYTES &&
2806 		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2807 			freelist = m_getpackets_internal(
2808 				(unsigned int *)&num_needed,
2809 				num_needed, M_WAIT, 1,
2810 				MBIGCLBYTES);
2811 		} else if (bytes_to_alloc > _MHLEN &&
2812 		    bytes_to_alloc <= MCLBYTES) {
2813 			freelist = m_getpackets_internal(
2814 				(unsigned int *)&num_needed,
2815 				num_needed, M_WAIT, 1,
2816 				MCLBYTES);
2817 		} else {
2818 			freelist = m_allocpacket_internal(
2819 				(unsigned int *)&num_needed,
2820 				bytes_to_alloc, NULL, M_WAIT, 1, 0);
2821 		}
2822 
2823 		if (freelist == NULL) {
2824 			socket_lock(so, 0);
2825 			error = ENOMEM;
2826 			goto release;
2827 		}
2828 		/*
2829 		 * Copy each uio of the set into its own mbuf packet
2830 		 */
2831 		for (i = uiofirst, m = freelist;
2832 		    i < uiolast && m != NULL;
2833 		    i++) {
2834 			int bytes_to_copy;
2835 			struct mbuf *n;
2836 			struct uio *auio = uioarray[i];
2837 
2838 			bytes_to_copy = (int)uio_resid(auio);
2839 
2840 			/* Do nothing for empty messages */
2841 			if (bytes_to_copy == 0) {
2842 				continue;
2843 			}
2844 			/*
2845 			 * Leave headroom for protocol headers
2846 			 * in the first mbuf of the chain
2847 			 */
2848 			m->m_data += headroom;
2849 
2850 			for (n = m; n != NULL; n = n->m_next) {
2851 				if ((m->m_flags & M_EXT)) {
2852 					mlen = m->m_ext.ext_size -
2853 					    M_LEADINGSPACE(m);
2854 				} else if ((m->m_flags & M_PKTHDR)) {
2855 					mlen =
2856 					    MHLEN - M_LEADINGSPACE(m);
2857 				} else {
2858 					mlen = MLEN - M_LEADINGSPACE(m);
2859 				}
2860 				len = imin((int)mlen, bytes_to_copy);
2861 
2862 				/*
2863 				 * Note: uiomove() decrements the iovec
2864 				 * length
2865 				 */
2866 				error = uiomove(mtod(n, caddr_t),
2867 				    (int)len, auio);
2868 				if (error != 0) {
2869 					break;
2870 				}
2871 				n->m_len = (int32_t)len;
2872 				m->m_pkthdr.len += len;
2873 
2874 				VERIFY(m->m_pkthdr.len <= maxpktlen);
2875 
2876 				bytes_to_copy -= len;
2877 				resid -= len;
2878 			}
2879 			if (m->m_pkthdr.len == 0) {
2880 				printf(
2881 					"%s:%d so %llx pkt %llx type %u len null\n",
2882 					__func__, __LINE__,
2883 					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2884 					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2885 					m->m_type);
2886 			}
2887 			if (error != 0) {
2888 				break;
2889 			}
2890 			m = m->m_nextpkt;
2891 		}
2892 
2893 		socket_lock(so, 0);
2894 
2895 		if (error) {
2896 			goto release;
2897 		}
2898 		top = freelist;
2899 		freelist = NULL;
2900 
2901 		if (dontroute) {
2902 			so->so_options |= SO_DONTROUTE;
2903 		}
2904 
2905 		if ((flags & MSG_SKIPCFIL) == 0) {
2906 			struct mbuf **prevnextp = NULL;
2907 
2908 			for (i = uiofirst, m = top;
2909 			    i < uiolast && m != NULL;
2910 			    i++) {
2911 				struct mbuf *nextpkt = m->m_nextpkt;
2912 
2913 				/*
2914 				 * Socket filter processing
2915 				 */
2916 				error = sflt_data_out(so, NULL, &m,
2917 				    NULL, 0);
2918 				if (error != 0 && error != EJUSTRETURN) {
2919 					goto release;
2920 				}
2921 
2922 #if CONTENT_FILTER
2923 				if (error == 0) {
2924 					/*
2925 					 * Content filter processing
2926 					 */
2927 					error = cfil_sock_data_out(so, NULL, m,
2928 					    NULL, 0, dgram_flow_entry);
2929 					if (error != 0 && error != EJUSTRETURN) {
2930 						goto release;
2931 					}
2932 				}
2933 #endif /* CONTENT_FILTER */
2934 				/*
2935 				 * Remove packet from the list when
2936 				 * swallowed by a filter
2937 				 */
2938 				if (error == EJUSTRETURN) {
2939 					error = 0;
2940 					if (prevnextp != NULL) {
2941 						*prevnextp = nextpkt;
2942 					} else {
2943 						top = nextpkt;
2944 					}
2945 				}
2946 
2947 				m = nextpkt;
2948 				if (m != NULL) {
2949 					prevnextp = &m->m_nextpkt;
2950 				}
2951 			}
2952 		}
2953 		if (top != NULL) {
2954 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2955 			    (so, 0, top, NULL, NULL, p);
2956 		}
2957 
2958 		if (dontroute) {
2959 			so->so_options &= ~SO_DONTROUTE;
2960 		}
2961 
2962 		top = NULL;
2963 		uiofirst = uiolast;
2964 	} while (resid > 0 && error == 0);
2965 release:
2966 	if (sblocked) {
2967 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2968 	} else {
2969 		socket_unlock(so, 1);
2970 	}
2971 out:
2972 	if (top != NULL) {
2973 		m_freem(top);
2974 	}
2975 	if (freelist != NULL) {
2976 		m_freem_list(freelist);
2977 	}
2978 
2979 	if (dgram_flow_entry != NULL) {
2980 		soflow_free_flow(dgram_flow_entry);
2981 	}
2982 
2983 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2984 	    so->so_snd.sb_cc, 0, error);
2985 
2986 	return error;
2987 }
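
/*
 * Illustration: sosend_list() services batched datagram sends; the
 * batched system call it backs is private, so the portable userspace
 * equivalent is simply a loop over send(2).  A sketch (excluded from the
 * build; send_batch is a hypothetical helper):
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t
send_batch(int s, const struct iovec *msgs, unsigned int cnt)
{
	ssize_t total = 0;

	for (unsigned int i = 0; i < cnt; i++) {
		/* One datagram per iovec entry, mirroring one mbuf packet per uio. */
		ssize_t n = send(s, msgs[i].iov_base, msgs[i].iov_len, 0);
		if (n < 0) {
			return total > 0 ? total : -1;
		}
		total += n;
	}
	return total;
}
#endif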
2988 
2989 /*
2990  * May return ERESTART when packet is dropped by MAC policy check
2991  */
2992 static int
2993 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2994     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2995 {
2996 	int error = 0;
2997 	struct mbuf *m = *mp;
2998 	struct mbuf *nextrecord = *nextrecordp;
2999 
3000 	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3001 #if CONFIG_MACF_SOCKET_SUBSET
3002 	/*
3003 	 * Call the MAC framework for policy checking if we're in
3004 	 * the user process context and the socket isn't connected.
3005 	 */
3006 	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3007 		struct mbuf *m0 = m;
3008 		/*
3009 		 * Dequeue this record (temporarily) from the receive
3010 		 * list since we're about to drop the socket's lock
3011 		 * where a new record may arrive and be appended to
3012 		 * the list.  Upon MAC policy failure, the record
3013 		 * will be freed.  Otherwise, we'll add it back to
3014 		 * the head of the list.  We cannot rely on SB_LOCK
3015 		 * because the append operation uses the socket's lock.
3016 		 */
3017 		do {
3018 			m->m_nextpkt = NULL;
3019 			sbfree(&so->so_rcv, m);
3020 			m = m->m_next;
3021 		} while (m != NULL);
3022 		m = m0;
3023 		so->so_rcv.sb_mb = nextrecord;
3024 		SB_EMPTY_FIXUP(&so->so_rcv);
3025 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3026 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3027 		socket_unlock(so, 0);
3028 
3029 		error = mac_socket_check_received(kauth_cred_get(), so,
3030 		    mtod(m, struct sockaddr *));
3031 
3032 		if (error != 0) {
3033 			/*
3034 			 * MAC policy failure; free this record and
3035 			 * process the next record (or block until
3036 			 * one is available).  We have adjusted sb_cc
3037 			 * and sb_mbcnt above so there is no need to
3038 			 * call sbfree() again.
3039 			 */
3040 			m_freem(m);
3041 			/*
3042 			 * Clear SB_LOCK but don't unlock the socket.
3043 			 * Process the next record or wait for one.
3044 			 */
3045 			socket_lock(so, 0);
3046 			sbunlock(&so->so_rcv, TRUE); /* stay locked */
3047 			error = ERESTART;
3048 			goto done;
3049 		}
3050 		socket_lock(so, 0);
3051 		/*
3052 		 * If the socket has been defunct'd, drop it.
3053 		 */
3054 		if (so->so_flags & SOF_DEFUNCT) {
3055 			m_freem(m);
3056 			error = ENOTCONN;
3057 			goto done;
3058 		}
3059 		/*
3060 		 * Re-adjust the socket receive list and re-enqueue
3061 		 * the record in front of any packets which may have
3062 		 * been appended while we dropped the lock.
3063 		 */
3064 		for (m = m0; m->m_next != NULL; m = m->m_next) {
3065 			sballoc(&so->so_rcv, m);
3066 		}
3067 		sballoc(&so->so_rcv, m);
3068 		if (so->so_rcv.sb_mb == NULL) {
3069 			so->so_rcv.sb_lastrecord = m0;
3070 			so->so_rcv.sb_mbtail = m;
3071 		}
3072 		m = m0;
3073 		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3074 		so->so_rcv.sb_mb = m;
3075 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3076 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3077 	}
3078 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3079 	if (psa != NULL) {
3080 		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3081 		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3082 			error = EWOULDBLOCK;
3083 			goto done;
3084 		}
3085 	}
3086 	if (flags & MSG_PEEK) {
3087 		m = m->m_next;
3088 	} else {
3089 		sbfree(&so->so_rcv, m);
3090 		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3091 			panic("%s: about to create invalid socketbuf",
3092 			    __func__);
3093 			/* NOTREACHED */
3094 		}
3095 		MFREE(m, so->so_rcv.sb_mb);
3096 		m = so->so_rcv.sb_mb;
3097 		if (m != NULL) {
3098 			m->m_nextpkt = nextrecord;
3099 		} else {
3100 			so->so_rcv.sb_mb = nextrecord;
3101 			SB_EMPTY_FIXUP(&so->so_rcv);
3102 		}
3103 	}
3104 done:
3105 	*mp = m;
3106 	*nextrecordp = nextrecord;
3107 
3108 	return error;
3109 }
3110 
3111 /*
3112  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3113  * so clear the data portion in order not to leak the file pointers.
3114  */
3115 static void
3116 sopeek_scm_rights(struct mbuf *rights)
3117 {
3118 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3119 
3120 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3121 		VERIFY(cm->cmsg_len <= rights->m_len);
3122 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3123 	}
3124 }
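
/*
 * Illustration: from userspace, a recvmsg(2) with MSG_PEEK on a socket
 * carrying SCM_RIGHTS therefore observes a zeroed payload; only the real
 * (non-peek) receive externalizes usable descriptors.  A minimal sketch
 * (excluded from the build):
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>

static void
peek_then_receive(int s)
{
	char data[1];
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};

	/* Peeked copy: the SCM_RIGHTS data portion arrives cleared. */
	(void)recvmsg(s, &msg, MSG_PEEK);

	/* Real receive: descriptors get installed in this process. */
	(void)recvmsg(s, &msg, 0);
}
#endif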
3125 
3126 /*
3127  * Process one or more MT_CONTROL mbufs present before any data mbufs
3128  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3129  * just copy the data; if !MSG_PEEK, we call into the protocol to
3130  * perform externalization.
3131  */
3132 static int
3133 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3134     struct mbuf **mp, struct mbuf **nextrecordp)
3135 {
3136 	int error = 0;
3137 	struct mbuf *cm = NULL, *cmn;
3138 	struct mbuf **cme = &cm;
3139 	struct sockbuf *sb_rcv = &so->so_rcv;
3140 	struct mbuf **msgpcm = NULL;
3141 	struct mbuf *m = *mp;
3142 	struct mbuf *nextrecord = *nextrecordp;
3143 	struct protosw *pr = so->so_proto;
3144 
3145 	/*
3146 	 * Externalizing the control messages would require us to
3147 	 * drop the socket's lock below.  Once we re-acquire the
3148 	 * lock, the mbuf chain might change.  In order to preserve
3149 	 * consistency, we unlink all control messages from the
3150 	 * first mbuf chain in one shot and link them separately
3151 	 * onto a different chain.
3152 	 */
3153 	do {
3154 		if (flags & MSG_PEEK) {
3155 			if (controlp != NULL) {
3156 				if (*controlp == NULL) {
3157 					msgpcm = controlp;
3158 				}
3159 				*controlp = m_copy(m, 0, m->m_len);
3160 
3161 				/*
3162 				 * If we failed to allocate an mbuf,
3163 				 * release any previously allocated
3164 				 * mbufs for control data. Return
3165 				 * an error. Keep the mbufs in the
3166 				 * socket as this is using
3167 				 * MSG_PEEK flag.
3168 				 */
3169 				if (*controlp == NULL) {
3170 					m_freem(*msgpcm);
3171 					error = ENOBUFS;
3172 					goto done;
3173 				}
3174 
3175 				if (pr->pr_domain->dom_externalize != NULL) {
3176 					sopeek_scm_rights(*controlp);
3177 				}
3178 
3179 				controlp = &(*controlp)->m_next;
3180 			}
3181 			m = m->m_next;
3182 		} else {
3183 			m->m_nextpkt = NULL;
3184 			sbfree(sb_rcv, m);
3185 			sb_rcv->sb_mb = m->m_next;
3186 			m->m_next = NULL;
3187 			*cme = m;
3188 			cme = &(*cme)->m_next;
3189 			m = sb_rcv->sb_mb;
3190 		}
3191 	} while (m != NULL && m->m_type == MT_CONTROL);
3192 
3193 	if (!(flags & MSG_PEEK)) {
3194 		if (sb_rcv->sb_mb != NULL) {
3195 			sb_rcv->sb_mb->m_nextpkt = nextrecord;
3196 		} else {
3197 			sb_rcv->sb_mb = nextrecord;
3198 			SB_EMPTY_FIXUP(sb_rcv);
3199 		}
3200 		if (nextrecord == NULL) {
3201 			sb_rcv->sb_lastrecord = m;
3202 		}
3203 	}
3204 
3205 	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3206 	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3207 
3208 	while (cm != NULL) {
3209 		int cmsg_level;
3210 		int cmsg_type;
3211 
3212 		cmn = cm->m_next;
3213 		cm->m_next = NULL;
3214 		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3215 		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3216 
3217 		/*
3218 		 * Call the protocol to externalize SCM_RIGHTS message
3219 		 * and return the modified message to the caller upon
3220 		 * success.  Otherwise, all other control messages are
3221 		 * returned unmodified to the caller.  Note that we
3222 		 * only get into this loop if MSG_PEEK is not set.
3223 		 */
3224 		if (pr->pr_domain->dom_externalize != NULL &&
3225 		    cmsg_level == SOL_SOCKET &&
3226 		    cmsg_type == SCM_RIGHTS) {
3227 			/*
3228 			 * Release socket lock: see 3903171.  This
3229 			 * would also allow more records to be appended
3230 			 * to the socket buffer.  We still have SB_LOCK
3231 			 * set on it, so we can be sure that the head
3232 			 * of the mbuf chain won't change.
3233 			 */
3234 			socket_unlock(so, 0);
3235 			error = (*pr->pr_domain->dom_externalize)(cm);
3236 			socket_lock(so, 0);
3237 		} else {
3238 			error = 0;
3239 		}
3240 
3241 		if (controlp != NULL && error == 0) {
3242 			*controlp = cm;
3243 			controlp = &(*controlp)->m_next;
3244 		} else {
3245 			(void) m_free(cm);
3246 		}
3247 		cm = cmn;
3248 	}
3249 	/*
3250 	 * Update the value of nextrecord in case we received new
3251 	 * records when the socket was unlocked above for
3252 	 * externalizing SCM_RIGHTS.
3253 	 */
3254 	if (m != NULL) {
3255 		nextrecord = sb_rcv->sb_mb->m_nextpkt;
3256 	} else {
3257 		nextrecord = sb_rcv->sb_mb;
3258 	}
3259 
3260 done:
3261 	*mp = m;
3262 	*nextrecordp = nextrecord;
3263 
3264 	return error;
3265 }
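
/*
 * Illustration: once dom_externalize has run, userspace walks the
 * returned control chain with the CMSG macros.  A minimal sketch
 * (excluded from the build; first_received_fd is a hypothetical name):
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
first_received_fd(struct msghdr *msg)
{
	struct cmsghdr *cm;

	for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS) {
			int fd;

			/* By this point the payload holds a live fd. */
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
			return fd;
		}
	}
	return -1;
}
#endif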
3266 
3267 /*
3268  * If we have less data than requested, block awaiting more
3269  * (subject to any timeout) if:
3270  *   1. the current count is less than the low water mark, or
3271  *   2. MSG_WAITALL is set, and it is possible to do the entire
3272  *	receive operation at once if we block (resid <= hiwat).
3273  *   3. MSG_DONTWAIT is not set
3274  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3275  * we have to do the receive in sections, and thus risk returning
3276  * a short count if a timeout or signal occurs after we start.
3277  */
3278 static boolean_t
3279 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3280 {
3281 	struct protosw *pr = so->so_proto;
3282 
3283 	/* No mbufs in the receive-queue? Wait! */
3284 	if (m == NULL) {
3285 		return true;
3286 	}
3287 
3288 	/* Not enough data in the receive socket-buffer - we may have to wait */
3289 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3290 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3291 		/*
3292 		 * Application did set the low-water mark, so we should wait for
3293 		 * this data to be present.
3294 		 */
3295 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3296 			return true;
3297 		}
3298 
3299 		/*
3300 		 * Application wants all the data - so let's try to do the
3301 		 * receive-operation at once by waiting for everything to
3302 		 * be there.
3303 		 */
3304 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3305 			return true;
3306 		}
3307 	}
3308 
3309 	return false;
3310 }
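
/*
 * Illustration: the two userspace knobs this predicate honors are
 * SO_RCVLOWAT (the low-water mark) and MSG_WAITALL.  A minimal sketch
 * (excluded from the build):
 */
#if 0
#include <sys/socket.h>

static void
tune_receive(int s)
{
	/* Low-water mark: do not wake the reader until 512 bytes queue up. */
	int lowat = 512;
	(void)setsockopt(s, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));

	/* MSG_WAITALL: ask for the full request if it can fit (resid <= hiwat). */
	char buf[1024];
	(void)recv(s, buf, sizeof(buf), MSG_WAITALL);
}
#endif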
3311 
3312 /*
3313  * Implement receive operations on a socket.
3314  * We depend on the way that records are added to the sockbuf
3315  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3316  * must begin with an address if the protocol so specifies,
3317  * followed by an optional mbuf or mbufs containing ancillary data,
3318  * and then zero or more mbufs of data.
3319  * In order to avoid blocking network interrupts for the entire time here,
3320  * we splx() while doing the actual copy to user space.
3321  * Although the sockbuf is locked, new data may still be appended,
3322  * and thus we must maintain consistency of the sockbuf during that time.
3323  *
3324  * The caller may receive the data as a single mbuf chain by supplying
3325  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3326  * only for the count in uio_resid.
3327  *
3328  * Returns:	0			Success
3329  *		ENOBUFS
3330  *		ENOTCONN
3331  *		EWOULDBLOCK
3332  *	uiomove:EFAULT
3333  *	sblock:EWOULDBLOCK
3334  *	sblock:EINTR
3335  *	sbwait:EBADF
3336  *	sbwait:EINTR
3337  *	sodelayed_copy:EFAULT
3338  *	<pru_rcvoob>:EINVAL[TCP]
3339  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3340  *	<pru_rcvoob>:???
3341  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3342  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3343  *	<pr_domain->dom_externalize>:???
3344  *
3345  * Notes:	Additional return values from calls through <pru_rcvoob> and
3346  *		<pr_domain->dom_externalize> depend on protocols other than
3347  *		TCP or AF_UNIX, which are documented above.
3348  */
3349 int
3350 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3351     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3352 {
3353 	struct mbuf *m, **mp, *ml = NULL;
3354 	struct mbuf *nextrecord, *free_list;
3355 	int flags, error, offset;
3356 	user_ssize_t len;
3357 	struct protosw *pr = so->so_proto;
3358 	int moff, type = 0;
3359 	user_ssize_t orig_resid = uio_resid(uio);
3360 	user_ssize_t delayed_copy_len;
3361 	int can_delay;
3362 	struct proc *p = current_proc();
3363 	boolean_t en_tracing = FALSE;
3364 
3365 	/*
3366 	 * Sanity check on the length passed by caller as we are making 'int'
3367 	 * comparisons
3368 	 */
3369 	if (orig_resid < 0 || orig_resid > INT_MAX) {
3370 		return EINVAL;
3371 	}
3372 
3373 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3374 	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3375 	    so->so_rcv.sb_hiwat);
3376 
3377 	socket_lock(so, 1);
3378 	so_update_last_owner_locked(so, p);
3379 	so_update_policy(so);
3380 
3381 #ifdef MORE_LOCKING_DEBUG
3382 	if (so->so_usecount == 1) {
3383 		panic("%s: so=%x no other reference on socket", __func__, so);
3384 		/* NOTREACHED */
3385 	}
3386 #endif
3387 	mp = mp0;
3388 	if (psa != NULL) {
3389 		*psa = NULL;
3390 	}
3391 	if (controlp != NULL) {
3392 		*controlp = NULL;
3393 	}
3394 	if (flagsp != NULL) {
3395 		flags = *flagsp & ~MSG_EOR;
3396 	} else {
3397 		flags = 0;
3398 	}
3399 
3400 	/*
3401 	 * If a recv attempt is made on a previously-accepted socket
3402 	 * that has been marked as inactive (disconnected), reject
3403 	 * the request.
3404 	 */
3405 	if (so->so_flags & SOF_DEFUNCT) {
3406 		struct sockbuf *sb = &so->so_rcv;
3407 
3408 		error = ENOTCONN;
3409 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3410 		    __func__, proc_pid(p), proc_best_name(p),
3411 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3412 		    SOCK_DOM(so), SOCK_TYPE(so), error);
3413 		/*
3414 		 * This socket should have been disconnected and flushed
3415 		 * prior to being returned from sodefunct(); there should
3416 		 * be no data on its receive list, so panic otherwise.
3417 		 */
3418 		if (so->so_state & SS_DEFUNCT) {
3419 			sb_empty_assert(sb, __func__);
3420 		}
3421 		socket_unlock(so, 1);
3422 		return error;
3423 	}
3424 
3425 	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3426 	    pr->pr_usrreqs->pru_preconnect) {
3427 		/*
3428 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3429 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3430 		 * call write() right after this. *If* the app calls a read,
3431 		 * we do not want to block this read indefinitely. Thus,
3432 		 */
3433 		error = (*pr->pr_usrreqs->pru_preconnect)(so);
3434 
3435 		if (error) {
3436 			socket_unlock(so, 1);
3437 			return error;
3438 		}
3439 	}
3440 
3441 	if (ENTR_SHOULDTRACE &&
3442 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3443 		/*
3444 		 * enable energy tracing for inet sockets that go over
3445 		 * non-loopback interfaces only.
3446 		 */
3447 		struct inpcb *inp = sotoinpcb(so);
3448 		if (inp->inp_last_outifp != NULL &&
3449 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3450 			en_tracing = TRUE;
3451 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3452 			    VM_KERNEL_ADDRPERM(so),
3453 			    ((so->so_state & SS_NBIO) ?
3454 			    kEnTrFlagNonBlocking : 0),
3455 			    (int64_t)orig_resid);
3456 		}
3457 	}
3458 
3459 	/*
3460 	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3461 	 * regardless of the flags argument. Here is the case where
3462 	 * out-of-band data is not inline.
3463 	 */
3464 	if ((flags & MSG_OOB) ||
3465 	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3466 	    (so->so_options & SO_OOBINLINE) == 0 &&
3467 	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3468 		m = m_get(M_WAIT, MT_DATA);
3469 		if (m == NULL) {
3470 			socket_unlock(so, 1);
3471 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3472 			    ENOBUFS, 0, 0, 0, 0);
3473 			return ENOBUFS;
3474 		}
3475 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3476 		if (error) {
3477 			goto bad;
3478 		}
3479 		socket_unlock(so, 0);
3480 		do {
3481 			error = uiomove(mtod(m, caddr_t),
3482 			    imin((int)uio_resid(uio), m->m_len), uio);
3483 			m = m_free(m);
3484 		} while (uio_resid(uio) && error == 0 && m != NULL);
3485 		socket_lock(so, 0);
3486 bad:
3487 		if (m != NULL) {
3488 			m_freem(m);
3489 		}
3490 
3491 		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3492 			if (error == EWOULDBLOCK || error == EINVAL) {
3493 				/*
3494 				 * Let's try to get normal data:
3495 				 * EWOULDBLOCK: out-of-band data not
3496 				 * received yet. EINVAL: out-of-band data
3497 				 * already read.
3498 				 */
3499 				error = 0;
3500 				goto nooob;
3501 			} else if (error == 0 && flagsp != NULL) {
3502 				*flagsp |= MSG_OOB;
3503 			}
3504 		}
3505 		socket_unlock(so, 1);
3506 		if (en_tracing) {
3507 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3508 			    VM_KERNEL_ADDRPERM(so), 0,
3509 			    (int64_t)(orig_resid - uio_resid(uio)));
3510 		}
3511 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3512 		    0, 0, 0, 0);
3513 
3514 		return error;
3515 	}
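
	/*
	 * Illustration (excluded from the build; 's' is a hypothetical
	 * connected TCP descriptor): the userspace pairing for the
	 * out-of-band path above.  The receiver sees EWOULDBLOCK before the
	 * urgent byte arrives and EINVAL once it has already been read.
	 */
#if 0
	/* Sender marks a single byte as urgent... */
	(void)send(s, "!", 1, MSG_OOB);

	/* ...receiver pulls it out-of-band unless SO_OOBINLINE is set. */
	char c;
	(void)recv(s, &c, 1, MSG_OOB);
#endif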
3516 nooob:
3517 	if (mp != NULL) {
3518 		*mp = NULL;
3519 	}
3520 
3521 	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3522 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
3523 	}
3524 
3525 	free_list = NULL;
3526 	delayed_copy_len = 0;
3527 restart:
3528 #ifdef MORE_LOCKING_DEBUG
3529 	if (so->so_usecount <= 1) {
3530 		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3531 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3532 	}
3533 #endif
3534 	/*
3535 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3536 	 * and if so just return to the caller.  This could happen when
3537 	 * soreceive() is called by a socket upcall function during the
3538 	 * time the socket is freed.  The socket buffer would have been
3539 	 * locked across the upcall, therefore we cannot put this thread
3540 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3541 	 * we may livelock), because the lock on the socket buffer will
3542 	 * only be released when the upcall routine returns to its caller.
3543 	 * Because the socket has been officially closed, there can be
3544 	 * no further read on it.
3545 	 *
3546 	 * A multipath subflow socket would have its SS_NOFDREF set by
3547 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3548 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3549 	 */
3550 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3551 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3552 		socket_unlock(so, 1);
3553 		return 0;
3554 	}
3555 
3556 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3557 	if (error) {
3558 		socket_unlock(so, 1);
3559 		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3560 		    0, 0, 0, 0);
3561 		if (en_tracing) {
3562 			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3563 			    VM_KERNEL_ADDRPERM(so), 0,
3564 			    (int64_t)(orig_resid - uio_resid(uio)));
3565 		}
3566 		return error;
3567 	}
3568 
3569 	m = so->so_rcv.sb_mb;
3570 	if (so_should_wait(so, uio, m, flags)) {
3571 		/*
3572 		 * Panic if we notice inconsistencies in the socket's
3573 		 * receive list; both sb_mb and sb_cc should correctly
3574 		 * reflect the contents of the list, otherwise we may
3575 		 * end up with false positives during select() or poll()
3576 		 * which could put the application in a bad state.
3577 		 */
3578 		SB_MB_CHECK(&so->so_rcv);
3579 
3580 		if (so->so_error) {
3581 			if (m != NULL) {
3582 				goto dontblock;
3583 			}
3584 			error = so->so_error;
3585 			if ((flags & MSG_PEEK) == 0) {
3586 				so->so_error = 0;
3587 			}
3588 			goto release;
3589 		}
3590 		if (so->so_state & SS_CANTRCVMORE) {
3591 #if CONTENT_FILTER
3592 			/*
3593 			 * Deal with half closed connections
3594 			 */
3595 			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3596 			    cfil_sock_data_pending(&so->so_rcv) != 0) {
3597 				CFIL_LOG(LOG_INFO,
3598 				    "so %llx ignore SS_CANTRCVMORE",
3599 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3600 			} else
3601 #endif /* CONTENT_FILTER */
3602 			if (m != NULL) {
3603 				goto dontblock;
3604 			} else {
3605 				goto release;
3606 			}
3607 		}
3608 		for (; m != NULL; m = m->m_next) {
3609 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3610 				m = so->so_rcv.sb_mb;
3611 				goto dontblock;
3612 			}
3613 		}
3614 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3615 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3616 			error = ENOTCONN;
3617 			goto release;
3618 		}
3619 		if (uio_resid(uio) == 0) {
3620 			goto release;
3621 		}
3622 
3623 		if ((so->so_state & SS_NBIO) ||
3624 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3625 			error = EWOULDBLOCK;
3626 			goto release;
3627 		}
3628 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3629 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3630 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3631 #if EVEN_MORE_LOCKING_DEBUG
3632 		if (socket_debug) {
3633 			printf("Waiting for socket data\n");
3634 		}
3635 #endif
3636 
3637 		/*
3638 		 * Depending on the protocol (e.g. TCP), the following
3639 		 * might cause the socket lock to be dropped and later
3640 		 * be reacquired, and more data could have arrived and
3641 		 * have been appended to the receive socket buffer by
3642 		 * the time it returns.  Therefore, we only sleep in
3643 		 * the time it returns.  Therefore, we sleep in
3644 		 * sbwait() below only if the wait-condition is still
3645 		 * true.
3646 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3647 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3648 		}
3649 
3650 		error = 0;
3651 		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3652 			error = sbwait(&so->so_rcv);
3653 		}
3654 
3655 #if EVEN_MORE_LOCKING_DEBUG
3656 		if (socket_debug) {
3657 			printf("SORECEIVE - sbwait returned %d\n", error);
3658 		}
3659 #endif
3660 		if (so->so_usecount < 1) {
3661 			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3662 			    __func__, so, so->so_usecount);
3663 			/* NOTREACHED */
3664 		}
3665 		if (error) {
3666 			socket_unlock(so, 1);
3667 			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3668 			    0, 0, 0, 0);
3669 			if (en_tracing) {
3670 				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3671 				    VM_KERNEL_ADDRPERM(so), 0,
3672 				    (int64_t)(orig_resid - uio_resid(uio)));
3673 			}
3674 			return error;
3675 		}
3676 		goto restart;
3677 	}
3678 dontblock:
3679 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3680 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3681 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3682 	nextrecord = m->m_nextpkt;
3683 
3684 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3685 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3686 		    mp0 == NULL);
3687 		if (error == ERESTART) {
3688 			goto restart;
3689 		} else if (error != 0) {
3690 			goto release;
3691 		}
3692 		orig_resid = 0;
3693 	}
3694 
3695 	/*
3696 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3697 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3698 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3699 	 * perform externalization.
3700 	 */
3701 	if (m != NULL && m->m_type == MT_CONTROL) {
3702 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3703 		if (error != 0) {
3704 			goto release;
3705 		}
3706 		orig_resid = 0;
3707 	}
3708 
3709 	if (m != NULL) {
3710 		if (!(flags & MSG_PEEK)) {
3711 			/*
3712 			 * We get here because m points to an mbuf following
3713 			 * any MT_SONAME or MT_CONTROL mbufs which have been
3714 			 * processed above.  In any case, m should be pointing
3715 			 * to the head of the mbuf chain, and the nextrecord
3716 			 * should be either NULL or equal to m->m_nextpkt.
3717 			 * See comments above about SB_LOCK.
3718 			 */
3719 			if (m != so->so_rcv.sb_mb ||
3720 			    m->m_nextpkt != nextrecord) {
3721 				panic("%s: post-control !sync so=%p m=%p "
3722 				    "nextrecord=%p\n", __func__, so, m,
3723 				    nextrecord);
3724 				/* NOTREACHED */
3725 			}
3726 			if (nextrecord == NULL) {
3727 				so->so_rcv.sb_lastrecord = m;
3728 			}
3729 		}
3730 		type = m->m_type;
3731 		if (type == MT_OOBDATA) {
3732 			flags |= MSG_OOB;
3733 		}
3734 	} else {
3735 		if (!(flags & MSG_PEEK)) {
3736 			SB_EMPTY_FIXUP(&so->so_rcv);
3737 		}
3738 	}
3739 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3740 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3741 
3742 	moff = 0;
3743 	offset = 0;
3744 
3745 	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3746 		can_delay = 1;
3747 	} else {
3748 		can_delay = 0;
3749 	}
3750 
3751 	while (m != NULL &&
3752 	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3753 		if (m->m_type == MT_OOBDATA) {
3754 			if (type != MT_OOBDATA) {
3755 				break;
3756 			}
3757 		} else if (type == MT_OOBDATA) {
3758 			break;
3759 		}
3760 
3761 		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3762 		    m->m_type != MT_HEADER) {
3763 			break;
3764 		}
3765 		/*
3766 		 * Make sure to always set the MSG_OOB flag when receiving
3767 		 * out-of-band data inline.
3768 		 */
3769 		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3770 		    (so->so_options & SO_OOBINLINE) != 0 &&
3771 		    (so->so_state & SS_RCVATMARK) != 0) {
3772 			flags |= MSG_OOB;
3773 		}
3774 		so->so_state &= ~SS_RCVATMARK;
3775 		len = uio_resid(uio) - delayed_copy_len;
3776 		if (so->so_oobmark && len > so->so_oobmark - offset) {
3777 			len = so->so_oobmark - offset;
3778 		}
3779 		if (len > m->m_len - moff) {
3780 			len = m->m_len - moff;
3781 		}
3782 		/*
3783 		 * If mp is set, just pass back the mbufs.
3784 		 * Otherwise copy them out via the uio, then free.
3785 		 * Sockbuf must be consistent here (sb_mb points to the current
3786 		 * mbuf, and m_nextpkt to the next record) when we drop priority;
3787 		 * we must note any additions to the sockbuf when we
3788 		 * block interrupts again.
3789 		 */
3790 		if (mp == NULL) {
3791 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3792 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3793 			if (can_delay && len == m->m_len) {
3794 				/*
3795 				 * Only delay the copy if we're consuming the
3796 				 * mbuf and we're NOT in MSG_PEEK mode
3797 				 * and we have enough data to make it worthwhile
3798 				 * to drop and retake the lock; can_delay
3799 				 * reflects the state of the two latter
3800 				 * constraints.  moff should always be zero
3801 				 * in these cases.
3802 				 */
3803 				delayed_copy_len += len;
3804 			} else {
3805 				if (delayed_copy_len) {
3806 					error = sodelayed_copy(so, uio,
3807 					    &free_list, &delayed_copy_len);
3808 
3809 					if (error) {
3810 						goto release;
3811 					}
3812 					/*
3813 					 * We can only get here if MSG_PEEK is
3814 					 * not set; therefore, m should point at
3815 					 * the head of the rcv queue.  If it
3816 					 * doesn't, something drastic changed
3817 					 * while we were out from behind the
3818 					 * lock in sodelayed_copy(), perhaps a
3819 					 * RST on the stream.  In any event, the
3820 					 * stream has been interrupted; it's
3821 					 * probably best just to return whatever
3822 					 * data we've moved and let the caller
3823 					 * sort it out.
3824 					 */
3825 					if (m != so->so_rcv.sb_mb) {
3826 						break;
3827 					}
3828 				}
3829 				socket_unlock(so, 0);
3830 				error = uiomove(mtod(m, caddr_t) + moff,
3831 				    (int)len, uio);
3832 				socket_lock(so, 0);
3833 
3834 				if (error) {
3835 					goto release;
3836 				}
3837 			}
3838 		} else {
3839 			uio_setresid(uio, (uio_resid(uio) - len));
3840 		}
3841 		if (len == m->m_len - moff) {
3842 			if (m->m_flags & M_EOR) {
3843 				flags |= MSG_EOR;
3844 			}
3845 			if (flags & MSG_PEEK) {
3846 				m = m->m_next;
3847 				moff = 0;
3848 			} else {
3849 				nextrecord = m->m_nextpkt;
3850 				sbfree(&so->so_rcv, m);
3851 				m->m_nextpkt = NULL;
3852 
3853 				if (mp != NULL) {
3854 					*mp = m;
3855 					mp = &m->m_next;
3856 					so->so_rcv.sb_mb = m = m->m_next;
3857 					*mp = NULL;
3858 				} else {
3859 					if (free_list == NULL) {
3860 						free_list = m;
3861 					} else {
3862 						ml->m_next = m;
3863 					}
3864 					ml = m;
3865 					so->so_rcv.sb_mb = m = m->m_next;
3866 					ml->m_next = NULL;
3867 				}
3868 				if (m != NULL) {
3869 					m->m_nextpkt = nextrecord;
3870 					if (nextrecord == NULL) {
3871 						so->so_rcv.sb_lastrecord = m;
3872 					}
3873 				} else {
3874 					so->so_rcv.sb_mb = nextrecord;
3875 					SB_EMPTY_FIXUP(&so->so_rcv);
3876 				}
3877 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3878 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3879 			}
3880 		} else {
3881 			if (flags & MSG_PEEK) {
3882 				moff += len;
3883 			} else {
3884 				if (mp != NULL) {
3885 					int copy_flag;
3886 
3887 					if (flags & MSG_DONTWAIT) {
3888 						copy_flag = M_DONTWAIT;
3889 					} else {
3890 						copy_flag = M_WAIT;
3891 					}
3892 					*mp = m_copym(m, 0, (int)len, copy_flag);
3893 					/*
3894 					 * Failed to allocate an mbuf?
3895 					 * Adjust uio_resid back, it was
3896 					 * adjusted down by len bytes which
3897 					 * we didn't copy over.
3898 					 */
3899 					if (*mp == NULL) {
3900 						uio_setresid(uio,
3901 						    (uio_resid(uio) + len));
3902 						break;
3903 					}
3904 				}
3905 				m->m_data += len;
3906 				m->m_len -= len;
3907 				so->so_rcv.sb_cc -= len;
3908 			}
3909 		}
3910 		if (so->so_oobmark) {
3911 			if ((flags & MSG_PEEK) == 0) {
3912 				so->so_oobmark -= len;
3913 				if (so->so_oobmark == 0) {
3914 					so->so_state |= SS_RCVATMARK;
3915 					break;
3916 				}
3917 			} else {
3918 				offset += len;
3919 				if (offset == so->so_oobmark) {
3920 					break;
3921 				}
3922 			}
3923 		}
3924 		if (flags & MSG_EOR) {
3925 			break;
3926 		}
3927 		/*
3928 		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3929 		 * (for non-atomic socket), we must not quit until
3930 		 * "uio->uio_resid == 0" or an error termination.
3931 		 * If a signal/timeout occurs, return with a short
3932 		 * count but without error.  Keep sockbuf locked
3933 		 * against other readers.
3934 		 */
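		/*
		 * Userland sketch (hypothetical 'fd' and 'buf') of a caller
		 * relying on this loop: MSG_WAITALL keeps us here until the
		 * request is filled, EOF, a signal, or an error:
		 *
		 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
		 */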
3935 		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3936 		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3937 		    !sosendallatonce(so) && !nextrecord) {
3938 			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3939 #if CONTENT_FILTER
3940 			    && cfil_sock_data_pending(&so->so_rcv) == 0
3941 #endif /* CONTENT_FILTER */
3942 			    )) {
3943 				goto release;
3944 			}
3945 
3946 			/*
3947 			 * Depending on the protocol (e.g. TCP), the following
3948 			 * might cause the socket lock to be dropped and later
3949 			 * be reacquired, and more data could have arrived and
3950 			 * have been appended to the receive socket buffer by
3951 			 * the time it returns.  Therefore, we sleep in
3952 			 * sbwait() below only if the socket buffer is
3953 			 * empty, in order to avoid a false sleep.
3954 			 */
3955 			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3956 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3957 			}
3958 
3959 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3960 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3961 
3962 			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3963 				error = 0;
3964 				goto release;
3965 			}
3966 			/*
3967 			 * We have to wait until after we get back from sbwait()
3968 			 * to do the copy because we will drop the lock if we
3969 			 * have enough data that has been delayed.  By dropping
3970 			 * the lock we open up a window allowing the netisr
3971 			 * thread to process the incoming packets and to change
3972 			 * the state of this socket.  We're issuing the sbwait()
3973 			 * because the socket is empty and we're expecting the
3974 			 * netisr thread to wake us up when more packets arrive;
3975 			 * if we allowed that processing to happen first and then
3976 			 * called sbwait(), we could stall forever with packets
3977 			 * sitting in the socket if no further packets arrive
3978 			 * from the remote side.
3979 			 *
3980 			 * We want to copy before we've collected all the data
3981 			 * to satisfy this request, to allow the copy to overlap
3982 			 * the incoming packet processing on an MP system.
3983 			 */
3984 			if (delayed_copy_len > sorecvmincopy &&
3985 			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3986 				error = sodelayed_copy(so, uio,
3987 				    &free_list, &delayed_copy_len);
3988 
3989 				if (error) {
3990 					goto release;
3991 				}
3992 			}
3993 			m = so->so_rcv.sb_mb;
3994 			if (m != NULL) {
3995 				nextrecord = m->m_nextpkt;
3996 			}
3997 			SB_MB_CHECK(&so->so_rcv);
3998 		}
3999 	}
4000 #ifdef MORE_LOCKING_DEBUG
4001 	if (so->so_usecount <= 1) {
4002 		panic("%s: after big while so=%p ref=%d on socket",
4003 		    __func__, so, so->so_usecount);
4004 		/* NOTREACHED */
4005 	}
4006 #endif
4007 
4008 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4009 		if (so->so_options & SO_DONTTRUNC) {
4010 			flags |= MSG_RCVMORE;
4011 		} else {
4012 			flags |= MSG_TRUNC;
4013 			if ((flags & MSG_PEEK) == 0) {
4014 				(void) sbdroprecord(&so->so_rcv);
4015 			}
4016 		}
4017 	}
4018 
4019 	/*
4020 	 * pru_rcvd below (for TCP) may cause more data to be received
4021 	 * if the socket lock is dropped prior to sending the ACK; some
4022 	 * legacy OpenTransport applications don't handle this well
4023 	 * (if they receive less data than requested while MSG_HAVEMORE
4024 	 * is set), and so we set the flag now based on what we know
4025 	 * prior to calling pru_rcvd.
4026 	 */
4027 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4028 		flags |= MSG_HAVEMORE;
4029 	}
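	/*
	 * Userland sketch (Apple-specific output flag; hypothetical 'fd' and
	 * 'iov'): a caller that set SO_WANTMORE can observe MSG_HAVEMORE in
	 * msg_flags after recvmsg(2) to learn that more data was queued:
	 *
	 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	 *	if (recvmsg(fd, &msg, 0) >= 0 &&
	 *	    (msg.msg_flags & MSG_HAVEMORE))
	 *		more_buffered = 1;
	 */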
4030 
4031 	if ((flags & MSG_PEEK) == 0) {
4032 		if (m == NULL) {
4033 			so->so_rcv.sb_mb = nextrecord;
4034 			/*
4035 			 * First part is an inline SB_EMPTY_FIXUP().  Second
4036 			 * part makes sure sb_lastrecord is up-to-date if
4037 			 * there is still data in the socket buffer.
4038 			 */
4039 			if (so->so_rcv.sb_mb == NULL) {
4040 				so->so_rcv.sb_mbtail = NULL;
4041 				so->so_rcv.sb_lastrecord = NULL;
4042 			} else if (nextrecord->m_nextpkt == NULL) {
4043 				so->so_rcv.sb_lastrecord = nextrecord;
4044 			}
4045 			SB_MB_CHECK(&so->so_rcv);
4046 		}
4047 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4048 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4049 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4050 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4051 		}
4052 	}
4053 
4054 	if (delayed_copy_len) {
4055 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4056 		if (error) {
4057 			goto release;
4058 		}
4059 	}
4060 	if (free_list != NULL) {
4061 		m_freem_list(free_list);
4062 		free_list = NULL;
4063 	}
4064 
4065 	if (orig_resid == uio_resid(uio) && orig_resid &&
4066 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4067 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4068 		goto restart;
4069 	}
4070 
4071 	if (flagsp != NULL) {
4072 		*flagsp |= flags;
4073 	}
4074 release:
4075 #ifdef MORE_LOCKING_DEBUG
4076 	if (so->so_usecount <= 1) {
4077 		panic("%s: release so=%p ref=%d on socket", __func__,
4078 		    so, so->so_usecount);
4079 		/* NOTREACHED */
4080 	}
4081 #endif
4082 	if (delayed_copy_len) {
4083 		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4084 	}
4085 
4086 	if (free_list != NULL) {
4087 		m_freem_list(free_list);
4088 	}
4089 
4090 	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4091 
4092 	if (en_tracing) {
4093 		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4094 		    VM_KERNEL_ADDRPERM(so),
4095 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4096 		    (int64_t)(orig_resid - uio_resid(uio)));
4097 	}
4098 	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4099 	    so->so_rcv.sb_cc, 0, error);
4100 
4101 	return error;
4102 }
4103 
4104 /*
4105  * Returns:	0			Success
4106  *	uiomove:EFAULT
4107  */
4108 static int
4109 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4110     user_ssize_t *resid)
4111 {
4112 	int error = 0;
4113 	struct mbuf *m;
4114 
4115 	m = *free_list;
4116 
4117 	socket_unlock(so, 0);
4118 
4119 	while (m != NULL && error == 0) {
4120 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4121 		m = m->m_next;
4122 	}
4123 	m_freem_list(*free_list);
4124 
4125 	*free_list = NULL;
4126 	*resid = 0;
4127 
4128 	socket_lock(so, 0);
4129 
4130 	return error;
4131 }
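/*
 * A sketch of the amortized pattern sodelayed_copy() enables for its caller
 * (this mirrors the soreceive() code above; it is not standalone code):
 *
 *	under the socket lock, chain consumed mbufs instead of copying each:
 *		delayed_copy_len += len;
 *	later, a single unlock/uiomove/relock covers the entire chain:
 *		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
 */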
4132 
4133 static int
4134 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4135     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4136 {
4137 #pragma unused(so)
4138 	int error = 0;
4139 	struct mbuf *ml, *m;
4140 	int i = 0;
4141 	struct uio *auio;
4142 
4143 	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4144 	    ml = ml->m_nextpkt, i++) {
4145 		auio = msgarray[i].uio;
4146 		for (m = ml; m != NULL; m = m->m_next) {
4147 			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4148 			if (error != 0) {
4149 				goto out;
4150 			}
4151 		}
4152 	}
4153 out:
4154 	m_freem_list(*free_list);
4155 
4156 	*free_list = NULL;
4157 	*resid = 0;
4158 
4159 	return error;
4160 }
4161 
4162 int
4163 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4164     int *flagsp)
4165 {
4166 	struct mbuf *m;
4167 	struct mbuf *nextrecord;
4168 	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4169 	int error;
4170 	user_ssize_t len, pktlen, delayed_copy_len = 0;
4171 	struct protosw *pr = so->so_proto;
4172 	user_ssize_t resid;
4173 	struct proc *p = current_proc();
4174 	struct uio *auio = NULL;
4175 	int npkts = 0;
4176 	int sblocked = 0;
4177 	struct sockaddr **psa = NULL;
4178 	struct mbuf **controlp = NULL;
4179 	int can_delay;
4180 	int flags;
4181 	struct mbuf *free_others = NULL;
4182 
4183 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4184 	    so, uiocnt,
4185 	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4186 
4187 	/*
4188 	 * Sanity checks:
4189 	 * - Only non-blocking ("don't wait") flags are supported
4190 	 * - Only datagram sockets are supported (could be extended to raw)
4191 	 * - Must be atomic
4192 	 * - Protocol must support packet chains
4193 	 * - The uio array must not be NULL (should we panic?)
4194 	 */
4195 	if (flagsp != NULL) {
4196 		flags = *flagsp;
4197 	} else {
4198 		flags = 0;
4199 	}
4200 	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4201 	    MSG_NBIO)) {
4202 		printf("%s invalid flags 0x%x\n", __func__, flags);
4203 		error = EINVAL;
4204 		goto out;
4205 	}
4206 	if (so->so_type != SOCK_DGRAM) {
4207 		error = EINVAL;
4208 		goto out;
4209 	}
4210 	if (sosendallatonce(so) == 0) {
4211 		error = EINVAL;
4212 		goto out;
4213 	}
4214 	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4215 		error = EPROTONOSUPPORT;
4216 		goto out;
4217 	}
4218 	if (msgarray == NULL) {
4219 		printf("%s uioarray is NULL\n", __func__);
4220 		error = EINVAL;
4221 		goto out;
4222 	}
4223 	if (uiocnt == 0) {
4224 		printf("%s uiocnt is 0\n", __func__);
4225 		error = EINVAL;
4226 		goto out;
4227 	}
4228 	/*
4229 	 * Sanity check on the length passed by caller as we are making 'int'
4230 	 * comparisons
4231 	 */
4232 	resid = recv_msg_array_resid(msgarray, uiocnt);
4233 	if (resid < 0 || resid > INT_MAX) {
4234 		error = EINVAL;
4235 		goto out;
4236 	}
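	/*
	 * Illustrative userland counterpart (private Apple API; a sketch that
	 * assumes the msghdr_x batch-receive interface): each datagram is
	 * delivered to one element of the uio array checked above:
	 *
	 *	struct msghdr_x msgs[8] = {};
	 *	(point each msgs[i].msg_iov at its own buffer, then:)
	 *	ssize_t n = recvmsg_x(fd, msgs, 8, MSG_DONTWAIT);
	 */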
4237 
4238 	if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4239 		can_delay = 1;
4240 	} else {
4241 		can_delay = 0;
4242 	}
4243 
4244 	socket_lock(so, 1);
4245 	so_update_last_owner_locked(so, p);
4246 	so_update_policy(so);
4247 
4248 #if NECP
4249 	so_update_necp_policy(so, NULL, NULL);
4250 #endif /* NECP */
4251 
4252 	/*
4253 	 * If a recv attempt is made on a previously-accepted socket
4254 	 * that has been marked as inactive (disconnected), reject
4255 	 * the request.
4256 	 */
4257 	if (so->so_flags & SOF_DEFUNCT) {
4258 		struct sockbuf *sb = &so->so_rcv;
4259 
4260 		error = ENOTCONN;
4261 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4262 		    __func__, proc_pid(p), proc_best_name(p),
4263 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4264 		    SOCK_DOM(so), SOCK_TYPE(so), error);
4265 		/*
4266 		 * This socket should have been disconnected and flushed
4267 		 * prior to being returned from sodefunct(); there should
4268 		 * be no data on its receive list, so panic otherwise.
4269 		 */
4270 		if (so->so_state & SS_DEFUNCT) {
4271 			sb_empty_assert(sb, __func__);
4272 		}
4273 		goto release;
4274 	}
4275 
4276 next:
4277 	/*
4278 	 * The uio may be empty
4279 	 */
4280 	if (npkts >= uiocnt) {
4281 		error = 0;
4282 		goto release;
4283 	}
4284 restart:
4285 	/*
4286 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4287 	 * and if so just return to the caller.  This could happen when
4288 	 * soreceive() is called by a socket upcall function during the
4289 	 * time the socket is freed.  The socket buffer would have been
4290 	 * locked across the upcall, therefore we cannot put this thread
4291 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4292 	 * we may livelock), because the lock on the socket buffer will
4293 	 * only be released when the upcall routine returns to its caller.
4294 	 * Because the socket has been officially closed, there can be
4295 	 * no further read on it.
4296 	 */
4297 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4298 	    (SS_NOFDREF | SS_CANTRCVMORE)) {
4299 		error = 0;
4300 		goto release;
4301 	}
4302 
4303 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4304 	if (error) {
4305 		goto release;
4306 	}
4307 	sblocked = 1;
4308 
4309 	m = so->so_rcv.sb_mb;
4310 	/*
4311 	 * Block awaiting more datagrams if needed
4312 	 */
4313 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4314 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4315 	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4316 		/*
4317 		 * Panic if we notice inconsistencies in the socket's
4318 		 * receive list; both sb_mb and sb_cc should correctly
4319 		 * reflect the contents of the list, otherwise we may
4320 		 * end up with false positives during select() or poll()
4321 		 * which could put the application in a bad state.
4322 		 */
4323 		SB_MB_CHECK(&so->so_rcv);
4324 
4325 		if (so->so_error) {
4326 			error = so->so_error;
4327 			if ((flags & MSG_PEEK) == 0) {
4328 				so->so_error = 0;
4329 			}
4330 			goto release;
4331 		}
4332 		if (so->so_state & SS_CANTRCVMORE) {
4333 			goto release;
4334 		}
4335 		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4336 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4337 			error = ENOTCONN;
4338 			goto release;
4339 		}
4340 		if ((so->so_state & SS_NBIO) ||
4341 		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4342 			error = EWOULDBLOCK;
4343 			goto release;
4344 		}
4345 		/*
4346 		 * Do not block if we got some data
4347 		 */
4348 		if (free_list != NULL) {
4349 			error = 0;
4350 			goto release;
4351 		}
4352 
4353 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4354 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4355 
4356 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4357 		sblocked = 0;
4358 
4359 		error = sbwait(&so->so_rcv);
4360 		if (error) {
4361 			goto release;
4362 		}
4363 		goto restart;
4364 	}
4365 
4366 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4367 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4368 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4369 
4370 	/*
4371 	 * Consume the current uio index as we have a datagram
4372 	 */
4373 	auio = msgarray[npkts].uio;
4374 	resid = uio_resid(auio);
4375 	msgarray[npkts].which |= SOCK_MSG_DATA;
4376 	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4377 	    &msgarray[npkts].psa : NULL;
4378 	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4379 	    &msgarray[npkts].controlp : NULL;
4380 	npkts += 1;
4381 	nextrecord = m->m_nextpkt;
4382 
4383 	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4384 		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4385 		if (error == ERESTART) {
4386 			goto restart;
4387 		} else if (error != 0) {
4388 			goto release;
4389 		}
4390 	}
4391 
4392 	if (m != NULL && m->m_type == MT_CONTROL) {
4393 		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4394 		if (error != 0) {
4395 			goto release;
4396 		}
4397 	}
4398 
4399 	if (m->m_pkthdr.len == 0) {
4400 		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4401 		    __func__, __LINE__,
4402 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4403 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4404 		    m->m_type);
4405 	}
4406 
4407 	/*
4408 	 * Loop to copy the mbufs of the current record
4409 	 * Support zero length packets
4410 	 */
4411 	ml = NULL;
4412 	pktlen = 0;
4413 	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4414 		if (m->m_len == 0) {
4415 			panic("%p m_len zero", m);
4416 		}
4417 		if (m->m_type == 0) {
4418 			panic("%p m_type zero", m);
4419 		}
4420 		/*
4421 		 * Clip to the residual length
4422 		 */
4423 		if (len > m->m_len) {
4424 			len = m->m_len;
4425 		}
4426 		pktlen += len;
4427 		/*
4428 		 * Copy the mbufs via the uio or delay the copy
4429 		 * Sockbuf must be consistent here (sb_mb points to the current
4430 		 * mbuf, and m_nextpkt to the next record) when we drop priority;
4431 		 * we must note any additions to the sockbuf when we
4432 		 * block interrupts again.
4433 		 */
4434 		if (len > 0 && can_delay == 0) {
4435 			socket_unlock(so, 0);
4436 			error = uiomove(mtod(m, caddr_t), (int)len, auio);
4437 			socket_lock(so, 0);
4438 			if (error) {
4439 				goto release;
4440 			}
4441 		} else {
4442 			delayed_copy_len += len;
4443 		}
4444 
4445 		if (len == m->m_len) {
4446 			/*
4447 			 * m was entirely copied
4448 			 */
4449 			sbfree(&so->so_rcv, m);
4450 			nextrecord = m->m_nextpkt;
4451 			m->m_nextpkt = NULL;
4452 
4453 			/*
4454 			 * Set the first packet to the head of the free list
4455 			 */
4456 			if (free_list == NULL) {
4457 				free_list = m;
4458 			}
4459 			/*
4460 			 * Link current packet to tail of free list
4461 			 */
4462 			if (ml == NULL) {
4463 				if (free_tail != NULL) {
4464 					free_tail->m_nextpkt = m;
4465 				}
4466 				free_tail = m;
4467 			}
4468 			/*
4469 			 * Link current mbuf to last mbuf of current packet
4470 			 */
4471 			if (ml != NULL) {
4472 				ml->m_next = m;
4473 			}
4474 			ml = m;
4475 
4476 			/*
4477 			 * Move next buf to head of socket buffer
4478 			 */
4479 			so->so_rcv.sb_mb = m = ml->m_next;
4480 			ml->m_next = NULL;
4481 
4482 			if (m != NULL) {
4483 				m->m_nextpkt = nextrecord;
4484 				if (nextrecord == NULL) {
4485 					so->so_rcv.sb_lastrecord = m;
4486 				}
4487 			} else {
4488 				so->so_rcv.sb_mb = nextrecord;
4489 				SB_EMPTY_FIXUP(&so->so_rcv);
4490 			}
4491 			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4492 			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4493 		} else {
4494 			/*
4495 			 * Stop the loop on partial copy
4496 			 */
4497 			break;
4498 		}
4499 	}
4500 #ifdef MORE_LOCKING_DEBUG
4501 	if (so->so_usecount <= 1) {
4502 		panic("%s: after big while so=%llx ref=%d on socket",
4503 		    __func__,
4504 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4505 		/* NOTREACHED */
4506 	}
4507 #endif
4508 	/*
4509 	 * Tell the caller we made a partial copy
4510 	 */
4511 	if (m != NULL) {
4512 		if (so->so_options & SO_DONTTRUNC) {
4513 			/*
4514 			 * Copy out the free list first, then the partial mbuf
4515 			 */
4516 			socket_unlock(so, 0);
4517 			if (delayed_copy_len) {
4518 				error = sodelayed_copy_list(so, msgarray,
4519 				    uiocnt, &free_list, &delayed_copy_len);
4520 			}
4521 
4522 			if (error == 0) {
4523 				error = uiomove(mtod(m, caddr_t), (int)len,
4524 				    auio);
4525 			}
4526 			socket_lock(so, 0);
4527 			if (error) {
4528 				goto release;
4529 			}
4530 
4531 			m->m_data += len;
4532 			m->m_len -= len;
4533 			so->so_rcv.sb_cc -= len;
4534 			flags |= MSG_RCVMORE;
4535 		} else {
4536 			(void) sbdroprecord(&so->so_rcv);
4537 			nextrecord = so->so_rcv.sb_mb;
4538 			m = NULL;
4539 			flags |= MSG_TRUNC;
4540 		}
4541 	}
4542 
4543 	if (m == NULL) {
4544 		so->so_rcv.sb_mb = nextrecord;
4545 		/*
4546 		 * First part is an inline SB_EMPTY_FIXUP().  Second
4547 		 * part makes sure sb_lastrecord is up-to-date if
4548 		 * there is still data in the socket buffer.
4549 		 */
4550 		if (so->so_rcv.sb_mb == NULL) {
4551 			so->so_rcv.sb_mbtail = NULL;
4552 			so->so_rcv.sb_lastrecord = NULL;
4553 		} else if (nextrecord->m_nextpkt == NULL) {
4554 			so->so_rcv.sb_lastrecord = nextrecord;
4555 		}
4556 		SB_MB_CHECK(&so->so_rcv);
4557 	}
4558 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4559 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4560 
4561 	/*
4562 	 * We can continue to the next packet as long as:
4563 	 * - We haven't exhausted the uio array
4564 	 * - There was no error
4565 	 * - A packet was not truncated
4566 	 * - We can still receive more data
4567 	 */
4568 	if (npkts < uiocnt && error == 0 &&
4569 	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4570 	    (so->so_state & SS_CANTRCVMORE) == 0) {
4571 		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4572 		sblocked = 0;
4573 
4574 		goto next;
4575 	}
4576 	if (flagsp != NULL) {
4577 		*flagsp |= flags;
4578 	}
4579 
4580 release:
4581 	/*
4582 	 * pru_rcvd may cause more data to be received if the socket lock
4583 	 * is dropped so we set MSG_HAVEMORE now based on what we know.
4584 	 * That way the caller won't be surprised if it receives less data
4585 	 * than requested.
4586 	 */
4587 	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4588 		flags |= MSG_HAVEMORE;
4589 	}
4590 
4591 	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4592 		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4593 	}
4594 
4595 	if (sblocked) {
4596 		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4597 	} else {
4598 		socket_unlock(so, 1);
4599 	}
4600 
4601 	if (delayed_copy_len) {
4602 		error = sodelayed_copy_list(so, msgarray, uiocnt,
4603 		    &free_list, &delayed_copy_len);
4604 	}
4605 out:
4606 	/*
4607 	 * Amortize the cost of freeing the mbufs
4608 	 */
4609 	if (free_list != NULL) {
4610 		m_freem_list(free_list);
4611 	}
4612 	if (free_others != NULL) {
4613 		m_freem_list(free_others);
4614 	}
4615 
4616 	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4617 	    0, 0, 0, 0);
4618 	return error;
4619 }
4620 
4621 static int
4622 so_statistics_event_to_nstat_event(int64_t *input_options,
4623     uint64_t *nstat_event)
4624 {
4625 	int error = 0;
4626 	switch (*input_options) {
4627 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4628 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4629 		break;
4630 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4631 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4632 		break;
4633 #if (DEBUG || DEVELOPMENT)
4634 	case SO_STATISTICS_EVENT_RESERVED_1:
4635 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4636 		break;
4637 	case SO_STATISTICS_EVENT_RESERVED_2:
4638 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4639 		break;
4640 #endif /* (DEBUG || DEVELOPMENT) */
4641 	default:
4642 		error = EINVAL;
4643 		break;
4644 	}
4645 	return error;
4646 }
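/*
 * Usage sketch (hypothetical caller): translate a userland-supplied event
 * before handing it to ntstat; unrecognized values are rejected with EINVAL:
 *
 *	uint64_t nstat_event = 0;
 *	int64_t opt = SO_STATISTICS_EVENT_ENTER_CELLFALLBACK;
 *	int error = so_statistics_event_to_nstat_event(&opt, &nstat_event);
 *	(on success, nstat_event == NSTAT_EVENT_SRC_ENTER_CELLFALLBACK)
 */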
4647 
4648 /*
4649  * Returns:	0			Success
4650  *		EINVAL
4651  *		ENOTCONN
4652  *	<pru_shutdown>:EINVAL
4653  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4654  *	<pru_shutdown>:ENOBUFS[TCP]
4655  *	<pru_shutdown>:EMSGSIZE[TCP]
4656  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4657  *	<pru_shutdown>:ENETUNREACH[TCP]
4658  *	<pru_shutdown>:ENETDOWN[TCP]
4659  *	<pru_shutdown>:ENOMEM[TCP]
4660  *	<pru_shutdown>:EACCES[TCP]
4663  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4664  *	<pru_shutdown>:???		[other protocol families]
4665  */
4666 int
4667 soshutdown(struct socket *so, int how)
4668 {
4669 	int error;
4670 
4671 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4672 
4673 	switch (how) {
4674 	case SHUT_RD:
4675 	case SHUT_WR:
4676 	case SHUT_RDWR:
4677 		socket_lock(so, 1);
4678 		if ((so->so_state &
4679 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4680 			error = ENOTCONN;
4681 		} else {
4682 			error = soshutdownlock(so, how);
4683 		}
4684 		socket_unlock(so, 1);
4685 		break;
4686 	default:
4687 		error = EINVAL;
4688 		break;
4689 	}
4690 
4691 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4692 
4693 	return error;
4694 }
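/*
 * Userland view (illustrative, hypothetical 'fd'): shutdown(2) maps onto this
 * routine, so an unconnected socket yields ENOTCONN and a bad 'how' EINVAL:
 *
 *	if (shutdown(fd, SHUT_WR) == -1)
 *		perror("shutdown");
 */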
4695 
4696 int
4697 soshutdownlock_final(struct socket *so, int how)
4698 {
4699 	struct protosw *pr = so->so_proto;
4700 	int error = 0;
4701 
4702 	sflt_notify(so, sock_evt_shutdown, &how);
4703 
4704 	if (how != SHUT_WR) {
4705 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4706 			/* read already shut down */
4707 			error = ENOTCONN;
4708 			goto done;
4709 		}
4710 		sorflush(so);
4711 	}
4712 	if (how != SHUT_RD) {
4713 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4714 			/* write already shut down */
4715 			error = ENOTCONN;
4716 			goto done;
4717 		}
4718 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4719 	}
4720 done:
4721 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4722 	return error;
4723 }
4724 
4725 int
4726 soshutdownlock(struct socket *so, int how)
4727 {
4728 	int error = 0;
4729 
4730 #if CONTENT_FILTER
4731 	/*
4732 	 * A content filter may delay the actual shutdown until it
4733 	 * has processed the pending data
4734 	 */
4735 	if (so->so_flags & SOF_CONTENT_FILTER) {
4736 		error = cfil_sock_shutdown(so, &how);
4737 		if (error == EJUSTRETURN) {
4738 			error = 0;
4739 			goto done;
4740 		} else if (error != 0) {
4741 			goto done;
4742 		}
4743 	}
4744 #endif /* CONTENT_FILTER */
4745 
4746 	error = soshutdownlock_final(so, how);
4747 
4748 done:
4749 	return error;
4750 }
4751 
4752 void
4753 sowflush(struct socket *so)
4754 {
4755 	struct sockbuf *sb = &so->so_snd;
4756 
4757 	/*
4758 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4759 	 * to prevent the socket buffer from being unexpectedly altered
4760 	 * while it is used by another thread in socket send/receive.
4761 	 *
4762 	 * sblock() must not fail here, hence the assertion.
4763 	 */
4764 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4765 	VERIFY(sb->sb_flags & SB_LOCK);
4766 
4767 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4768 	sb->sb_flags            |= SB_DROP;
4769 	sb->sb_upcall           = NULL;
4770 	sb->sb_upcallarg        = NULL;
4771 
4772 	sbunlock(sb, TRUE);     /* keep socket locked */
4773 
4774 	selthreadclear(&sb->sb_sel);
4775 	sbrelease(sb);
4776 }
4777 
4778 void
4779 sorflush(struct socket *so)
4780 {
4781 	struct sockbuf *sb = &so->so_rcv;
4782 	struct protosw *pr = so->so_proto;
4783 	struct sockbuf asb;
4784 #ifdef notyet
4785 	lck_mtx_t *mutex_held;
4786 	/*
4787 	 * XXX: This code is currently commented out, because we may get here
4788 	 * as part of sofreelastref(), and at that time, pr_getlock() may no
4789 	 * longer be able to return us the lock; this will be fixed in future.
4790 	 */
4791 	if (so->so_proto->pr_getlock != NULL) {
4792 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4793 	} else {
4794 		mutex_held = so->so_proto->pr_domain->dom_mtx;
4795 	}
4796 
4797 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4798 #endif /* notyet */
4799 
4800 	sflt_notify(so, sock_evt_flush_read, NULL);
4801 
4802 	socantrcvmore(so);
4803 
4804 	/*
4805 	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4806 	 * to prevent the socket buffer from being unexpectedly altered
4807 	 * while it is used by another thread in socket send/receive.
4808 	 *
4809 	 * sblock() must not fail here, hence the assertion.
4810 	 */
4811 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4812 	VERIFY(sb->sb_flags & SB_LOCK);
4813 
4814 	/*
4815 	 * Copy only the relevant fields from "sb" to "asb" which we
4816 	 * need for sbrelease() to function.  In particular, skip
4817 	 * sb_sel as it contains the wait queue linkage, which would
4818 	 * wreak havoc if we were to issue selthreadclear() on "asb".
4819 	 * Make sure to not carry over SB_LOCK in "asb", as we need
4820 	 * to acquire it later as part of sbrelease().
4821 	 */
4822 	bzero(&asb, sizeof(asb));
4823 	asb.sb_cc               = sb->sb_cc;
4824 	asb.sb_hiwat            = sb->sb_hiwat;
4825 	asb.sb_mbcnt            = sb->sb_mbcnt;
4826 	asb.sb_mbmax            = sb->sb_mbmax;
4827 	asb.sb_ctl              = sb->sb_ctl;
4828 	asb.sb_lowat            = sb->sb_lowat;
4829 	asb.sb_mb               = sb->sb_mb;
4830 	asb.sb_mbtail           = sb->sb_mbtail;
4831 	asb.sb_lastrecord       = sb->sb_lastrecord;
4832 	asb.sb_so               = sb->sb_so;
4833 	asb.sb_flags            = sb->sb_flags;
4834 	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4835 	asb.sb_flags            |= SB_DROP;
4836 
4837 	/*
4838 	 * Ideally we'd bzero() these and preserve the ones we need;
4839 	 * but to do that we'd need to shuffle things around in the
4840 	 * sockbuf, and we can't do it now because there are KEXTS
4841 	 * that are directly referring to the socket structure.
4842 	 *
4843 	 * Setting SB_DROP acts as a barrier to prevent further appends.
4844 	 * Clearing SB_SEL is done for selthreadclear() below.
4845 	 */
4846 	sb->sb_cc               = 0;
4847 	sb->sb_hiwat            = 0;
4848 	sb->sb_mbcnt            = 0;
4849 	sb->sb_mbmax            = 0;
4850 	sb->sb_ctl              = 0;
4851 	sb->sb_lowat            = 0;
4852 	sb->sb_mb               = NULL;
4853 	sb->sb_mbtail           = NULL;
4854 	sb->sb_lastrecord       = NULL;
4855 	sb->sb_timeo.tv_sec     = 0;
4856 	sb->sb_timeo.tv_usec    = 0;
4857 	sb->sb_upcall           = NULL;
4858 	sb->sb_upcallarg        = NULL;
4859 	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4860 	sb->sb_flags            |= SB_DROP;
4861 
4862 	sbunlock(sb, TRUE);     /* keep socket locked */
4863 
4864 	/*
4865 	 * Note that selthreadclear() is called on the original "sb" and
4866 	 * not the local "asb" because of the way wait queue linkage is
4867 	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
4868 	 * should no longer be set (cleared above.)
4869 	 */
4870 	selthreadclear(&sb->sb_sel);
4871 
4872 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4873 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4874 	}
4875 
4876 	sbrelease(&asb);
4877 }
4878 
4879 /*
4880  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4881  * an additional variant to handle the case where the option value needs
4882  * to be some kind of integer, but not a specific size.
4883  * In addition to their use here, these functions are also called by the
4884  * protocol-level pr_ctloutput() routines.
4885  *
4886  * Returns:	0			Success
4887  *		EINVAL
4888  *	copyin:EFAULT
4889  */
4890 int
4891 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4892 {
4893 	size_t  valsize;
4894 
4895 	/*
4896 	 * If the user gives us more than we wanted, we ignore it,
4897 	 * but if we don't get the minimum length the caller
4898 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4899 	 * is set to however much we actually retrieved.
4900 	 */
4901 	if ((valsize = sopt->sopt_valsize) < minlen) {
4902 		return EINVAL;
4903 	}
4904 	if (valsize > len) {
4905 		sopt->sopt_valsize = valsize = len;
4906 	}
4907 
4908 	if (sopt->sopt_p != kernproc) {
4909 		return copyin(sopt->sopt_val, buf, valsize);
4910 	}
4911 
4912 	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4913 	return 0;
4914 }
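/*
 * Typical caller pattern (a sketch mirroring the SO_* handlers in
 * sosetoptlock() below):
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error != 0)
 *		goto out;
 */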
4915 
4916 /*
4917  * sooptcopyin_timeval
4918  *   Copy in a timeval value into tv_p, taking into account whether the
4919  *   calling process is 64-bit or 32-bit.  Moved the sanity checking
4920  *   code here so that we can verify the 64-bit tv_sec value before we lose
4921  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4922  */
4923 static int
4924 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4925 {
4926 	int                     error;
4927 
4928 	if (proc_is64bit(sopt->sopt_p)) {
4929 		struct user64_timeval   tv64;
4930 
4931 		if (sopt->sopt_valsize < sizeof(tv64)) {
4932 			return EINVAL;
4933 		}
4934 
4935 		sopt->sopt_valsize = sizeof(tv64);
4936 		if (sopt->sopt_p != kernproc) {
4937 			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4938 			if (error != 0) {
4939 				return error;
4940 			}
4941 		} else {
4942 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4943 			    sizeof(tv64));
4944 		}
4945 		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4946 		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4947 			return EDOM;
4948 		}
4949 
4950 		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4951 		tv_p->tv_usec = tv64.tv_usec;
4952 	} else {
4953 		struct user32_timeval   tv32;
4954 
4955 		if (sopt->sopt_valsize < sizeof(tv32)) {
4956 			return EINVAL;
4957 		}
4958 
4959 		sopt->sopt_valsize = sizeof(tv32);
4960 		if (sopt->sopt_p != kernproc) {
4961 			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4962 			if (error != 0) {
4963 				return error;
4964 			}
4965 		} else {
4966 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4967 			    sizeof(tv32));
4968 		}
4969 #ifndef __LP64__
4970 		/*
4971 		 * K64todo "comparison is always false due to
4972 		 * limited range of data type"
4973 		 */
4974 		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4975 		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4976 			return EDOM;
4977 		}
4978 #endif
4979 		tv_p->tv_sec = tv32.tv_sec;
4980 		tv_p->tv_usec = tv32.tv_usec;
4981 	}
4982 	return 0;
4983 }
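/*
 * Userland counterpart (illustrative, hypothetical 'fd'): the timeval passed
 * to setsockopt(2) is validated here before the SO_SNDTIMEO/SO_RCVTIMEO
 * handlers store it in sb_timeo; tv_usec outside [0, 1000000) fails with EDOM:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */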
4984 
4985 int
4986 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4987     boolean_t ignore_delegate)
4988 {
4989 	kauth_cred_t cred = NULL;
4990 	proc_t ep = PROC_NULL;
4991 	uid_t uid;
4992 	int error = 0;
4993 
4994 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4995 		ep = proc_find(so->e_pid);
4996 		if (ep) {
4997 			cred = kauth_cred_proc_ref(ep);
4998 		}
4999 	}
5000 
5001 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5002 
5003 	/* uid is 0 for root */
5004 	if (uid != 0 || !allow_root) {
5005 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5006 	}
5007 	if (cred) {
5008 		kauth_cred_unref(&cred);
5009 	}
5010 	if (ep != PROC_NULL) {
5011 		proc_rele(ep);
5012 	}
5013 
5014 	return error;
5015 }
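/*
 * Usage sketch, mirroring the SO_AWDL_UNRESTRICTED handler below: check the
 * (possibly delegated) credential for a privilege before lifting a
 * restriction:
 *
 *	error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL, false, false);
 *	if (error == 0)
 *		inp_set_awdl_unrestricted(sotoinpcb(so));
 */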
5016 
5017 /*
5018  * Returns:	0			Success
5019  *		EINVAL
5020  *		ENOPROTOOPT
5021  *		ENOBUFS
5022  *		EDOM
5023  *	sooptcopyin:EINVAL
5024  *	sooptcopyin:EFAULT
5025  *	sooptcopyin_timeval:EINVAL
5026  *	sooptcopyin_timeval:EFAULT
5027  *	sooptcopyin_timeval:EDOM
5028  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5029  *	<pr_ctloutput>:???
5030  *	sflt_attach_private:???		[whatever a filter author chooses]
5031  *	<sf_setoption>:???		[whatever a filter author chooses]
5032  *
5033  * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
5034  *		<sf_setoption> returns depend on what the filter author causes
5035  *		their filter to return.
5036  */
5037 int
5038 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5039 {
5040 	int     error, optval;
5041 	int64_t long_optval;
5042 	struct  linger l;
5043 	struct  timeval tv;
5044 
5045 	if (sopt->sopt_dir != SOPT_SET) {
5046 		sopt->sopt_dir = SOPT_SET;
5047 	}
5048 
5049 	if (dolock) {
5050 		socket_lock(so, 1);
5051 	}
5052 
5053 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5054 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5055 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5056 		/* the socket has been shutdown, no more sockopt's */
5057 		error = EINVAL;
5058 		goto out;
5059 	}
5060 
5061 	error = sflt_setsockopt(so, sopt);
5062 	if (error != 0) {
5063 		if (error == EJUSTRETURN) {
5064 			error = 0;
5065 		}
5066 		goto out;
5067 	}
5068 
5069 	if (sopt->sopt_level != SOL_SOCKET) {
5070 		if (so->so_proto != NULL &&
5071 		    so->so_proto->pr_ctloutput != NULL) {
5072 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5073 			goto out;
5074 		}
5075 		error = ENOPROTOOPT;
5076 	} else {
5077 		/*
5078 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5079 		 * the protocol layer, if needed.  A zero value returned from
5080 		 * the handler means use default socket-level processing as
5081 		 * done by the rest of this routine.  Otherwise, any other
5082 		 * return value indicates that the option is unsupported.
5083 		 */
5084 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5085 		    pru_socheckopt(so, sopt)) != 0) {
5086 			goto out;
5087 		}
5088 
5089 		error = 0;
5090 		switch (sopt->sopt_name) {
5091 		case SO_LINGER:
5092 		case SO_LINGER_SEC:
5093 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5094 			if (error != 0) {
5095 				goto out;
5096 			}
5097 
5098 			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5099 			    (short)l.l_linger : (short)(l.l_linger * hz);
5100 			if (l.l_onoff != 0) {
5101 				so->so_options |= SO_LINGER;
5102 			} else {
5103 				so->so_options &= ~SO_LINGER;
5104 			}
5105 			break;
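			/*
			 * Illustrative userland use (hypothetical 'fd'); note
			 * that SO_LINGER_SEC scales l_linger by hz above,
			 * while plain SO_LINGER stores the value as-is:
			 *
			 *	struct linger l = { .l_onoff = 1, .l_linger = 10 };
			 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
			 */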
5106 
5107 		case SO_DEBUG:
5108 		case SO_KEEPALIVE:
5109 		case SO_DONTROUTE:
5110 		case SO_USELOOPBACK:
5111 		case SO_BROADCAST:
5112 		case SO_REUSEADDR:
5113 		case SO_REUSEPORT:
5114 		case SO_OOBINLINE:
5115 		case SO_TIMESTAMP:
5116 		case SO_TIMESTAMP_MONOTONIC:
5117 		case SO_TIMESTAMP_CONTINUOUS:
5118 		case SO_DONTTRUNC:
5119 		case SO_WANTMORE:
5120 		case SO_WANTOOBFLAG:
5121 		case SO_NOWAKEFROMSLEEP:
5122 		case SO_NOAPNFALLBK:
5123 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5124 			    sizeof(optval));
5125 			if (error != 0) {
5126 				goto out;
5127 			}
5128 			if (optval) {
5129 				so->so_options |= sopt->sopt_name;
5130 			} else {
5131 				so->so_options &= ~sopt->sopt_name;
5132 			}
5133 #if SKYWALK
5134 			inp_update_netns_flags(so);
5135 #endif /* SKYWALK */
5136 			break;
5137 
5138 		case SO_SNDBUF:
5139 		case SO_RCVBUF:
5140 		case SO_SNDLOWAT:
5141 		case SO_RCVLOWAT:
5142 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5143 			    sizeof(optval));
5144 			if (error != 0) {
5145 				goto out;
5146 			}
5147 
5148 			/*
5149 			 * Values < 1 make no sense for any of these
5150 			 * options, so disallow them.
5151 			 */
5152 			if (optval < 1) {
5153 				error = EINVAL;
5154 				goto out;
5155 			}
5156 
5157 			switch (sopt->sopt_name) {
5158 			case SO_SNDBUF:
5159 			case SO_RCVBUF: {
5160 				struct sockbuf *sb =
5161 				    (sopt->sopt_name == SO_SNDBUF) ?
5162 				    &so->so_snd : &so->so_rcv;
5163 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
5164 					error = ENOBUFS;
5165 					goto out;
5166 				}
5167 				sb->sb_flags |= SB_USRSIZE;
5168 				sb->sb_flags &= ~SB_AUTOSIZE;
5169 				sb->sb_idealsize = (u_int32_t)optval;
5170 				break;
5171 			}
5172 			/*
5173 			 * Make sure the low-water is never greater than
5174 			 * the high-water.
5175 			 */
5176 			case SO_SNDLOWAT: {
5177 				int space = sbspace(&so->so_snd);
5178 				u_int32_t hiwat = so->so_snd.sb_hiwat;
5179 
5180 				if (so->so_snd.sb_flags & SB_UNIX) {
5181 					struct unpcb *unp =
5182 					    (struct unpcb *)(so->so_pcb);
5183 					if (unp != NULL &&
5184 					    unp->unp_conn != NULL) {
5185 						hiwat += unp->unp_conn->unp_cc;
5186 					}
5187 				}
5188 
5189 				so->so_snd.sb_lowat =
5190 				    (optval > hiwat) ?
5191 				    hiwat : optval;
5192 
5193 				if (space >= so->so_snd.sb_lowat) {
5194 					sowwakeup(so);
5195 				}
5196 				break;
5197 			}
5198 			case SO_RCVLOWAT: {
5199 				int64_t data_len;
5200 				so->so_rcv.sb_lowat =
5201 				    (optval > so->so_rcv.sb_hiwat) ?
5202 				    so->so_rcv.sb_hiwat : optval;
5203 				data_len = so->so_rcv.sb_cc
5204 				    - so->so_rcv.sb_ctl;
5205 				if (data_len >= so->so_rcv.sb_lowat) {
5206 					sorwakeup(so);
5207 				}
5208 				break;
5209 			}
5210 			}
5211 			break;
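			/*
			 * Example (userland sketch, hypothetical 'fd'):
			 * pinning the receive buffer size sets SB_USRSIZE and
			 * clears SB_AUTOSIZE, disabling autotuning, per the
			 * handler above:
			 *
			 *	int sz = 256 * 1024;
			 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
			 */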
5212 
5213 		case SO_SNDTIMEO:
5214 		case SO_RCVTIMEO:
5215 			error = sooptcopyin_timeval(sopt, &tv);
5216 			if (error != 0) {
5217 				goto out;
5218 			}
5219 
5220 			switch (sopt->sopt_name) {
5221 			case SO_SNDTIMEO:
5222 				so->so_snd.sb_timeo = tv;
5223 				break;
5224 			case SO_RCVTIMEO:
5225 				so->so_rcv.sb_timeo = tv;
5226 				break;
5227 			}
5228 			break;
5229 
5230 		case SO_NKE: {
5231 			struct so_nke nke;
5232 
5233 			error = sooptcopyin(sopt, &nke, sizeof(nke),
5234 			    sizeof(nke));
5235 			if (error != 0) {
5236 				goto out;
5237 			}
5238 
5239 			error = sflt_attach_internal(so, nke.nke_handle);
5240 			break;
5241 		}
5242 
5243 		case SO_NOSIGPIPE:
5244 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5245 			    sizeof(optval));
5246 			if (error != 0) {
5247 				goto out;
5248 			}
5249 			if (optval != 0) {
5250 				so->so_flags |= SOF_NOSIGPIPE;
5251 			} else {
5252 				so->so_flags &= ~SOF_NOSIGPIPE;
5253 			}
5254 			break;
5255 
5256 		case SO_NOADDRERR:
5257 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5258 			    sizeof(optval));
5259 			if (error != 0) {
5260 				goto out;
5261 			}
5262 			if (optval != 0) {
5263 				so->so_flags |= SOF_NOADDRAVAIL;
5264 			} else {
5265 				so->so_flags &= ~SOF_NOADDRAVAIL;
5266 			}
5267 			break;
5268 
5269 		case SO_REUSESHAREUID:
5270 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5271 			    sizeof(optval));
5272 			if (error != 0) {
5273 				goto out;
5274 			}
5275 			if (optval != 0) {
5276 				so->so_flags |= SOF_REUSESHAREUID;
5277 			} else {
5278 				so->so_flags &= ~SOF_REUSESHAREUID;
5279 			}
5280 			break;
5281 
5282 		case SO_NOTIFYCONFLICT:
5283 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5284 				error = EPERM;
5285 				goto out;
5286 			}
5287 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5288 			    sizeof(optval));
5289 			if (error != 0) {
5290 				goto out;
5291 			}
5292 			if (optval != 0) {
5293 				so->so_flags |= SOF_NOTIFYCONFLICT;
5294 			} else {
5295 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5296 			}
5297 			break;
5298 
5299 		case SO_RESTRICTIONS:
5300 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5301 			    sizeof(optval));
5302 			if (error != 0) {
5303 				goto out;
5304 			}
5305 
5306 			error = so_set_restrictions(so, optval);
5307 			break;
5308 
5309 		case SO_AWDL_UNRESTRICTED:
5310 			if (SOCK_DOM(so) != PF_INET &&
5311 			    SOCK_DOM(so) != PF_INET6) {
5312 				error = EOPNOTSUPP;
5313 				goto out;
5314 			}
5315 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5316 			    sizeof(optval));
5317 			if (error != 0) {
5318 				goto out;
5319 			}
5320 			if (optval != 0) {
5321 				error = soopt_cred_check(so,
5322 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5323 				if (error == 0) {
5324 					inp_set_awdl_unrestricted(
5325 						sotoinpcb(so));
5326 				}
5327 			} else {
5328 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5329 			}
5330 			break;
5331 		case SO_INTCOPROC_ALLOW:
5332 			if (SOCK_DOM(so) != PF_INET6) {
5333 				error = EOPNOTSUPP;
5334 				goto out;
5335 			}
5336 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5337 			    sizeof(optval));
5338 			if (error != 0) {
5339 				goto out;
5340 			}
5341 			if (optval != 0 &&
5342 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5343 				error = soopt_cred_check(so,
5344 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5345 				if (error == 0) {
5346 					inp_set_intcoproc_allowed(
5347 						sotoinpcb(so));
5348 				}
5349 			} else if (optval == 0) {
5350 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5351 			}
5352 			break;
5353 
5354 		case SO_LABEL:
5355 			error = EOPNOTSUPP;
5356 			break;
5357 
5358 		case SO_UPCALLCLOSEWAIT:
5359 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5360 			    sizeof(optval));
5361 			if (error != 0) {
5362 				goto out;
5363 			}
5364 			if (optval != 0) {
5365 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5366 			} else {
5367 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5368 			}
5369 			break;
5370 
5371 		case SO_RANDOMPORT:
5372 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5373 			    sizeof(optval));
5374 			if (error != 0) {
5375 				goto out;
5376 			}
5377 			if (optval != 0) {
5378 				so->so_flags |= SOF_BINDRANDOMPORT;
5379 			} else {
5380 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5381 			}
5382 			break;
5383 
5384 		case SO_NP_EXTENSIONS: {
5385 			struct so_np_extensions sonpx;
5386 
5387 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5388 			    sizeof(sonpx));
5389 			if (error != 0) {
5390 				goto out;
5391 			}
5392 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5393 				error = EINVAL;
5394 				goto out;
5395 			}
5396 			/*
5397 			 * Only one bit defined for now
5398 			 */
5399 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5400 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5401 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5402 				} else {
5403 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5404 				}
5405 			}
5406 			break;
5407 		}
5408 
5409 		case SO_TRAFFIC_CLASS: {
5410 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5411 			    sizeof(optval));
5412 			if (error != 0) {
5413 				goto out;
5414 			}
5415 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5416 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5417 				error = so_set_net_service_type(so, netsvc);
5418 				goto out;
5419 			}
5420 			error = so_set_traffic_class(so, optval);
5421 			if (error != 0) {
5422 				goto out;
5423 			}
5424 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5425 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5426 			break;
5427 		}
5428 
5429 		case SO_RECV_TRAFFIC_CLASS: {
5430 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5431 			    sizeof(optval));
5432 			if (error != 0) {
5433 				goto out;
5434 			}
5435 			if (optval == 0) {
5436 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5437 			} else {
5438 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5439 			}
5440 			break;
5441 		}
5442 
5443 #if (DEVELOPMENT || DEBUG)
5444 		case SO_TRAFFIC_CLASS_DBG: {
5445 			struct so_tcdbg so_tcdbg;
5446 
5447 			error = sooptcopyin(sopt, &so_tcdbg,
5448 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5449 			if (error != 0) {
5450 				goto out;
5451 			}
5452 			error = so_set_tcdbg(so, &so_tcdbg);
5453 			if (error != 0) {
5454 				goto out;
5455 			}
5456 			break;
5457 		}
5458 #endif /* (DEVELOPMENT || DEBUG) */
5459 
5460 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5461 			error = priv_check_cred(kauth_cred_get(),
5462 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5463 			if (error != 0) {
5464 				goto out;
5465 			}
5466 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5467 			    sizeof(optval));
5468 			if (error != 0) {
5469 				goto out;
5470 			}
5471 			if (optval == 0) {
5472 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5473 			} else {
5474 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5475 			}
5476 			break;
5477 
5478 #if (DEVELOPMENT || DEBUG)
5479 		case SO_DEFUNCTIT:
5480 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5481 			if (error == 0) {
5482 				error = sodefunct(current_proc(), so, 0);
5483 			}
5484 
5485 			break;
5486 #endif /* (DEVELOPMENT || DEBUG) */
5487 
5488 		case SO_DEFUNCTOK:
5489 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5490 			    sizeof(optval));
5491 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5492 				if (error == 0) {
5493 					error = EBADF;
5494 				}
5495 				goto out;
5496 			}
5497 			/*
5498 			 * Any process can set SO_DEFUNCTOK (clear
5499 			 * SOF_NODEFUNCT), but only root can clear
5500 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5501 			 */
5502 			if (optval == 0 &&
5503 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5504 				error = EPERM;
5505 				goto out;
5506 			}
5507 			if (optval) {
5508 				so->so_flags &= ~SOF_NODEFUNCT;
5509 			} else {
5510 				so->so_flags |= SOF_NODEFUNCT;
5511 			}
5512 
5513 			if (SOCK_DOM(so) == PF_INET ||
5514 			    SOCK_DOM(so) == PF_INET6) {
5515 				char s[MAX_IPv6_STR_LEN];
5516 				char d[MAX_IPv6_STR_LEN];
5517 				struct inpcb *inp = sotoinpcb(so);
5518 
5519 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5520 				    "[%s %s:%d -> %s:%d] is now marked "
5521 				    "as %seligible for "
5522 				    "defunct\n", __func__, proc_selfpid(),
5523 				    proc_best_name(current_proc()),
5524 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5525 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5526 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5527 				    ((SOCK_DOM(so) == PF_INET) ?
5528 				    (void *)&inp->inp_laddr.s_addr :
5529 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5530 				    ntohs(inp->in6p_lport),
5531 				    inet_ntop(SOCK_DOM(so),
5532 				    (SOCK_DOM(so) == PF_INET) ?
5533 				    (void *)&inp->inp_faddr.s_addr :
5534 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5535 				    ntohs(inp->in6p_fport),
5536 				    (so->so_flags & SOF_NODEFUNCT) ?
5537 				    "not " : "");
5538 			} else {
5539 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5540 				    "is now marked as %seligible for "
5541 				    "defunct\n",
5542 				    __func__, proc_selfpid(),
5543 				    proc_best_name(current_proc()),
5544 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5545 				    SOCK_DOM(so), SOCK_TYPE(so),
5546 				    (so->so_flags & SOF_NODEFUNCT) ?
5547 				    "not " : "");
5548 			}
5549 			break;
5550 
5551 		case SO_ISDEFUNCT:
5552 			/* This option is not settable */
5553 			error = EINVAL;
5554 			break;
5555 
5556 		case SO_OPPORTUNISTIC:
5557 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5558 			    sizeof(optval));
5559 			if (error == 0) {
5560 				error = so_set_opportunistic(so, optval);
5561 			}
5562 			break;
5563 
5564 		case SO_FLUSH:
5565 			/* This option is handled by lower layer(s) */
5566 			error = 0;
5567 			break;
5568 
5569 		case SO_RECV_ANYIF:
5570 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5571 			    sizeof(optval));
5572 			if (error == 0) {
5573 				error = so_set_recv_anyif(so, optval);
5574 			}
5575 			break;
5576 
5577 		case SO_TRAFFIC_MGT_BACKGROUND: {
5578 			/* This option is handled by lower layer(s) */
5579 			error = 0;
5580 			break;
5581 		}
5582 
5583 #if FLOW_DIVERT
5584 		case SO_FLOW_DIVERT_TOKEN:
5585 			error = flow_divert_token_set(so, sopt);
5586 			break;
5587 #endif  /* FLOW_DIVERT */
5588 
5589 
5590 		case SO_DELEGATED:
5591 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5592 			    sizeof(optval))) != 0) {
5593 				break;
5594 			}
5595 
5596 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5597 			break;
5598 
5599 		case SO_DELEGATED_UUID: {
5600 			uuid_t euuid;
5601 
5602 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5603 			    sizeof(euuid))) != 0) {
5604 				break;
5605 			}
5606 
5607 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5608 			break;
5609 		}
5610 
5611 #if NECP
5612 		case SO_NECP_ATTRIBUTES:
5613 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5614 				/* Handled by MPTCP itself */
5615 				break;
5616 			}
5617 
5618 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5619 				error = EINVAL;
5620 				goto out;
5621 			}
5622 
5623 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5624 			break;
5625 
5626 		case SO_NECP_CLIENTUUID: {
5627 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5628 				/* Handled by MPTCP itself */
5629 				break;
5630 			}
5631 
5632 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5633 				error = EINVAL;
5634 				goto out;
5635 			}
5636 
5637 			struct inpcb *inp = sotoinpcb(so);
5638 			if (!uuid_is_null(inp->necp_client_uuid)) {
5639 				// Clear out the old client UUID if present
5640 				necp_inpcb_remove_cb(inp);
5641 			}
5642 
5643 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5644 			    sizeof(uuid_t), sizeof(uuid_t));
5645 			if (error != 0) {
5646 				goto out;
5647 			}
5648 
5649 			if (uuid_is_null(inp->necp_client_uuid)) {
5650 				error = EINVAL;
5651 				goto out;
5652 			}
5653 
5654 			pid_t current_pid = proc_pid(current_proc());
5655 			error = necp_client_register_socket_flow(current_pid,
5656 			    inp->necp_client_uuid, inp);
5657 			if (error != 0) {
5658 				uuid_clear(inp->necp_client_uuid);
5659 				goto out;
5660 			}
5661 
5662 			if (inp->inp_lport != 0) {
5663 				// There is a bound local port, so this is not
5664 				// a fresh socket. Assign to the client.
5665 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5666 			}
5667 
5668 			break;
5669 		}
5670 		case SO_NECP_LISTENUUID: {
5671 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5672 				error = EINVAL;
5673 				goto out;
5674 			}
5675 
5676 			struct inpcb *inp = sotoinpcb(so);
5677 			if (!uuid_is_null(inp->necp_client_uuid)) {
5678 				error = EINVAL;
5679 				goto out;
5680 			}
5681 
5682 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5683 			    sizeof(uuid_t), sizeof(uuid_t));
5684 			if (error != 0) {
5685 				goto out;
5686 			}
5687 
5688 			if (uuid_is_null(inp->necp_client_uuid)) {
5689 				error = EINVAL;
5690 				goto out;
5691 			}
5692 
5693 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5694 			    inp->necp_client_uuid, inp);
5695 			if (error != 0) {
5696 				uuid_clear(inp->necp_client_uuid);
5697 				goto out;
5698 			}
5699 
5700 			// Mark that the port registration is held by NECP
5701 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5702 
5703 			break;
5704 		}
5705 #endif /* NECP */
5706 
5707 		case SO_EXTENDED_BK_IDLE:
5708 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5709 			    sizeof(optval));
5710 			if (error == 0) {
5711 				error = so_set_extended_bk_idle(so, optval);
5712 			}
5713 			break;
5714 
5715 		case SO_MARK_CELLFALLBACK:
5716 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5717 			    sizeof(optval));
5718 			if (error != 0) {
5719 				goto out;
5720 			}
5721 			if (optval < 0) {
5722 				error = EINVAL;
5723 				goto out;
5724 			}
5725 			if (optval == 0) {
5726 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5727 			} else {
5728 				so->so_flags1 |= SOF1_CELLFALLBACK;
5729 			}
5730 			break;
5731 
5732 		case SO_FALLBACK_MODE:
5733 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5734 			    sizeof(optval));
5735 			if (error != 0) {
5736 				goto out;
5737 			}
5738 			if (optval < SO_FALLBACK_MODE_NONE ||
5739 			    optval > SO_FALLBACK_MODE_PREFER) {
5740 				error = EINVAL;
5741 				goto out;
5742 			}
5743 			so->so_fallback_mode = (u_int8_t)optval;
5744 			break;
5745 
5746 		case SO_MARK_KNOWN_TRACKER: {
5747 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5748 			    sizeof(optval));
5749 			if (error != 0) {
5750 				goto out;
5751 			}
5752 			if (optval < 0) {
5753 				error = EINVAL;
5754 				goto out;
5755 			}
5756 			if (optval == 0) {
5757 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5758 			} else {
5759 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5760 			}
5761 			break;
5762 		}
5763 
5764 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5765 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5766 			    sizeof(optval));
5767 			if (error != 0) {
5768 				goto out;
5769 			}
5770 			if (optval < 0) {
5771 				error = EINVAL;
5772 				goto out;
5773 			}
5774 			if (optval == 0) {
5775 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5776 			} else {
5777 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5778 			}
5779 			break;
5780 		}
5781 
5782 		case SO_MARK_APPROVED_APP_DOMAIN: {
5783 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5784 			    sizeof(optval));
5785 			if (error != 0) {
5786 				goto out;
5787 			}
5788 			if (optval < 0) {
5789 				error = EINVAL;
5790 				goto out;
5791 			}
5792 			if (optval == 0) {
5793 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5794 			} else {
5795 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5796 			}
5797 			break;
5798 		}
5799 
5800 		case SO_STATISTICS_EVENT:
5801 			error = sooptcopyin(sopt, &long_optval,
5802 			    sizeof(long_optval), sizeof(long_optval));
5803 			if (error != 0) {
5804 				goto out;
5805 			}
5806 			u_int64_t nstat_event = 0;
5807 			error = so_statistics_event_to_nstat_event(
5808 				&long_optval, &nstat_event);
5809 			if (error != 0) {
5810 				goto out;
5811 			}
5812 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5813 			break;
5814 
5815 		case SO_NET_SERVICE_TYPE: {
5816 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5817 			    sizeof(optval));
5818 			if (error != 0) {
5819 				goto out;
5820 			}
5821 			error = so_set_net_service_type(so, optval);
5822 			break;
5823 		}
5824 
5825 		case SO_QOSMARKING_POLICY_OVERRIDE:
5826 			error = priv_check_cred(kauth_cred_get(),
5827 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5828 			if (error != 0) {
5829 				goto out;
5830 			}
5831 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5832 			    sizeof(optval));
5833 			if (error != 0) {
5834 				goto out;
5835 			}
5836 			if (optval == 0) {
5837 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5838 			} else {
5839 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5840 			}
5841 			break;
5842 
5843 		case SO_MPKL_SEND_INFO: {
5844 			struct so_mpkl_send_info so_mpkl_send_info;
5845 
5846 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5847 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5848 			if (error != 0) {
5849 				goto out;
5850 			}
5851 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5852 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5853 
5854 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5855 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5856 			} else {
5857 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5858 			}
5859 			break;
5860 		}
5861 		case SO_WANT_KEV_SOCKET_CLOSED: {
5862 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5863 			    sizeof(optval));
5864 			if (error != 0) {
5865 				goto out;
5866 			}
5867 			if (optval == 0) {
5868 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5869 			} else {
5870 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5871 			}
5872 			break;
5873 		}
5874 		case SO_MARK_WAKE_PKT: {
5875 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5876 			    sizeof(optval));
5877 			if (error != 0) {
5878 				goto out;
5879 			}
5880 			if (optval == 0) {
5881 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5882 			} else {
5883 				so->so_flags |= SOF_MARK_WAKE_PKT;
5884 			}
5885 			break;
5886 		}
5887 		case SO_RECV_WAKE_PKT: {
5888 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5889 			    sizeof(optval));
5890 			if (error != 0) {
5891 				goto out;
5892 			}
5893 			if (optval == 0) {
5894 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5895 			} else {
5896 				so->so_flags |= SOF_RECV_WAKE_PKT;
5897 			}
5898 			break;
5899 		}
5900 		default:
5901 			error = ENOPROTOOPT;
5902 			break;
5903 		}
5904 		if (error == 0 && so->so_proto != NULL &&
5905 		    so->so_proto->pr_ctloutput != NULL) {
5906 			(void) so->so_proto->pr_ctloutput(so, sopt);
5907 		}
5908 	}
5909 out:
5910 	if (dolock) {
5911 		socket_unlock(so, 1);
5912 	}
5913 	return error;
5914 }
5915 
5916 /* Helper routines for getsockopt */
5917 int
5918 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5919 {
5920 	int     error;
5921 	size_t  valsize;
5922 
5923 	error = 0;
5924 
5925 	/*
5926 	 * Documented get behavior is that we always return a value,
5927 	 * possibly truncated to fit in the user's buffer.
5928 	 * Traditional behavior is that we always tell the caller
5929 	 * precisely how much we copied, rather than something useful
5930 	 * like the total amount we had available.
5931 	 * Note that this interface is not idempotent; the entire answer
5932 	 * must be generated ahead of time.
5933 	 */
5934 	valsize = MIN(len, sopt->sopt_valsize);
5935 	sopt->sopt_valsize = valsize;
5936 	if (sopt->sopt_val != USER_ADDR_NULL) {
5937 		if (sopt->sopt_p != kernproc) {
5938 			error = copyout(buf, sopt->sopt_val, valsize);
5939 		} else {
5940 			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5941 		}
5942 	}
5943 	return error;
5944 }
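
/*
 * Illustrative usage (a sketch mirroring the "integer:" pattern in
 * sogetoptlock() below): a getsockopt handler typically copies a single
 * int back to the requesting process, truncated to the user's buffer:
 *
 *	int optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */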
5945 
5946 static int
5947 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5948 {
5949 	int                     error;
5950 	size_t                  len;
5951 	struct user64_timeval   tv64 = {};
5952 	struct user32_timeval   tv32 = {};
5953 	const void *            val;
5954 	size_t                  valsize;
5955 
5956 	error = 0;
5957 	if (proc_is64bit(sopt->sopt_p)) {
5958 		len = sizeof(tv64);
5959 		tv64.tv_sec = tv_p->tv_sec;
5960 		tv64.tv_usec = tv_p->tv_usec;
5961 		val = &tv64;
5962 	} else {
5963 		len = sizeof(tv32);
5964 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5965 		tv32.tv_usec = tv_p->tv_usec;
5966 		val = &tv32;
5967 	}
5968 	valsize = MIN(len, sopt->sopt_valsize);
5969 	sopt->sopt_valsize = valsize;
5970 	if (sopt->sopt_val != USER_ADDR_NULL) {
5971 		if (sopt->sopt_p != kernproc) {
5972 			error = copyout(val, sopt->sopt_val, valsize);
5973 		} else {
5974 			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5975 		}
5976 	}
5977 	return error;
5978 }
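
/*
 * Caller-side view (a sketch using the standard getsockopt(2) API): the
 * 32-/64-bit split above exists because the user-visible struct timeval
 * layout differs between process ABIs, while the caller simply does:
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	getsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, &len);
 */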
5979 
5980 /*
5981  * Return:	0			Success
5982  *		ENOPROTOOPT
5983  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5984  *	<pr_ctloutput>:???
5985  *	<sf_getoption>:???
5986  */
5987 int
5988 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5989 {
5990 	int     error, optval;
5991 	struct  linger l;
5992 	struct  timeval tv;
5993 
5994 	if (sopt->sopt_dir != SOPT_GET) {
5995 		sopt->sopt_dir = SOPT_GET;
5996 	}
5997 
5998 	if (dolock) {
5999 		socket_lock(so, 1);
6000 	}
6001 
6002 	error = sflt_getsockopt(so, sopt);
6003 	if (error != 0) {
6004 		if (error == EJUSTRETURN) {
6005 			error = 0;
6006 		}
6007 		goto out;
6008 	}
6009 
6010 	if (sopt->sopt_level != SOL_SOCKET) {
6011 		if (so->so_proto != NULL &&
6012 		    so->so_proto->pr_ctloutput != NULL) {
6013 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
6014 			goto out;
6015 		}
6016 		error = ENOPROTOOPT;
6017 	} else {
6018 		/*
6019 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
6020 		 * the protocol layer, if needed.  A zero value returned from
6021 		 * the handler means use default socket-level processing as
6022 		 * done by the rest of this routine.  Otherwise, any other
6023 		 * return value indicates that the option is unsupported.
6024 		 */
6025 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6026 		    pru_socheckopt(so, sopt)) != 0) {
6027 			goto out;
6028 		}
6029 
6030 		error = 0;
6031 		switch (sopt->sopt_name) {
6032 		case SO_LINGER:
6033 		case SO_LINGER_SEC:
6034 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6035 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6036 			    so->so_linger : so->so_linger / hz;
6037 			error = sooptcopyout(sopt, &l, sizeof(l));
6038 			break;
6039 
6040 		case SO_USELOOPBACK:
6041 		case SO_DONTROUTE:
6042 		case SO_DEBUG:
6043 		case SO_KEEPALIVE:
6044 		case SO_REUSEADDR:
6045 		case SO_REUSEPORT:
6046 		case SO_BROADCAST:
6047 		case SO_OOBINLINE:
6048 		case SO_TIMESTAMP:
6049 		case SO_TIMESTAMP_MONOTONIC:
6050 		case SO_TIMESTAMP_CONTINUOUS:
6051 		case SO_DONTTRUNC:
6052 		case SO_WANTMORE:
6053 		case SO_WANTOOBFLAG:
6054 		case SO_NOWAKEFROMSLEEP:
6055 		case SO_NOAPNFALLBK:
6056 			optval = so->so_options & sopt->sopt_name;
6057 integer:
6058 			error = sooptcopyout(sopt, &optval, sizeof(optval));
6059 			break;
6060 
6061 		case SO_TYPE:
6062 			optval = so->so_type;
6063 			goto integer;
6064 
6065 		case SO_NREAD:
6066 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6067 				int pkt_total;
6068 				struct mbuf *m1;
6069 
6070 				pkt_total = 0;
6071 				m1 = so->so_rcv.sb_mb;
6072 				while (m1 != NULL) {
6073 					if (m1->m_type == MT_DATA ||
6074 					    m1->m_type == MT_HEADER ||
6075 					    m1->m_type == MT_OOBDATA) {
6076 						pkt_total += m1->m_len;
6077 					}
6078 					m1 = m1->m_next;
6079 				}
6080 				optval = pkt_total;
6081 			} else {
6082 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6083 			}
6084 			goto integer;
6085 
6086 		case SO_NUMRCVPKT:
6087 			if (so->so_proto->pr_flags & PR_ATOMIC) {
6088 				int cnt = 0;
6089 				struct mbuf *m1;
6090 
6091 				m1 = so->so_rcv.sb_mb;
6092 				while (m1 != NULL) {
6093 					cnt += 1;
6094 					m1 = m1->m_nextpkt;
6095 				}
6096 				optval = cnt;
6097 				goto integer;
6098 			} else {
6099 				error = ENOPROTOOPT;
6100 				break;
6101 			}
6102 
6103 		case SO_NWRITE:
6104 			optval = so->so_snd.sb_cc;
6105 			goto integer;
6106 
6107 		case SO_ERROR:
6108 			optval = so->so_error;
6109 			so->so_error = 0;
6110 			goto integer;
6111 
6112 		case SO_SNDBUF: {
6113 			u_int32_t hiwat = so->so_snd.sb_hiwat;
6114 
6115 			if (so->so_snd.sb_flags & SB_UNIX) {
6116 				struct unpcb *unp =
6117 				    (struct unpcb *)(so->so_pcb);
6118 				if (unp != NULL && unp->unp_conn != NULL) {
6119 					hiwat += unp->unp_conn->unp_cc;
6120 				}
6121 			}
6122 
6123 			optval = hiwat;
6124 			goto integer;
6125 		}
6126 		case SO_RCVBUF:
6127 			optval = so->so_rcv.sb_hiwat;
6128 			goto integer;
6129 
6130 		case SO_SNDLOWAT:
6131 			optval = so->so_snd.sb_lowat;
6132 			goto integer;
6133 
6134 		case SO_RCVLOWAT:
6135 			optval = so->so_rcv.sb_lowat;
6136 			goto integer;
6137 
6138 		case SO_SNDTIMEO:
6139 		case SO_RCVTIMEO:
6140 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
6141 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6142 
6143 			error = sooptcopyout_timeval(sopt, &tv);
6144 			break;
6145 
6146 		case SO_NOSIGPIPE:
6147 			optval = (so->so_flags & SOF_NOSIGPIPE);
6148 			goto integer;
6149 
6150 		case SO_NOADDRERR:
6151 			optval = (so->so_flags & SOF_NOADDRAVAIL);
6152 			goto integer;
6153 
6154 		case SO_REUSESHAREUID:
6155 			optval = (so->so_flags & SOF_REUSESHAREUID);
6156 			goto integer;
6157 
6158 
6159 		case SO_NOTIFYCONFLICT:
6160 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6161 			goto integer;
6162 
6163 		case SO_RESTRICTIONS:
6164 			optval = so_get_restrictions(so);
6165 			goto integer;
6166 
6167 		case SO_AWDL_UNRESTRICTED:
6168 			if (SOCK_DOM(so) == PF_INET ||
6169 			    SOCK_DOM(so) == PF_INET6) {
6170 				optval = inp_get_awdl_unrestricted(
6171 					sotoinpcb(so));
6172 				goto integer;
6173 			} else {
6174 				error = EOPNOTSUPP;
6175 			}
6176 			break;
6177 
6178 		case SO_INTCOPROC_ALLOW:
6179 			if (SOCK_DOM(so) == PF_INET6) {
6180 				optval = inp_get_intcoproc_allowed(
6181 					sotoinpcb(so));
6182 				goto integer;
6183 			} else {
6184 				error = EOPNOTSUPP;
6185 			}
6186 			break;
6187 
6188 		case SO_LABEL:
6189 			error = EOPNOTSUPP;
6190 			break;
6191 
6192 		case SO_PEERLABEL:
6193 			error = EOPNOTSUPP;
6194 			break;
6195 
6196 #ifdef __APPLE_API_PRIVATE
6197 		case SO_UPCALLCLOSEWAIT:
6198 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6199 			goto integer;
6200 #endif
6201 		case SO_RANDOMPORT:
6202 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6203 			goto integer;
6204 
6205 		case SO_NP_EXTENSIONS: {
6206 			struct so_np_extensions sonpx = {};
6207 
6208 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6209 			    SONPX_SETOPTSHUT : 0;
6210 			sonpx.npx_mask = SONPX_MASK_VALID;
6211 
6212 			error = sooptcopyout(sopt, &sonpx,
6213 			    sizeof(struct so_np_extensions));
6214 			break;
6215 		}
6216 
6217 		case SO_TRAFFIC_CLASS:
6218 			optval = so->so_traffic_class;
6219 			goto integer;
6220 
6221 		case SO_RECV_TRAFFIC_CLASS:
6222 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6223 			goto integer;
6224 
6225 #if (DEVELOPMENT || DEBUG)
6226 		case SO_TRAFFIC_CLASS_DBG:
6227 			error = sogetopt_tcdbg(so, sopt);
6228 			break;
6229 #endif /* (DEVELOPMENT || DEBUG) */
6230 
6231 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6232 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6233 			goto integer;
6234 
6235 		case SO_DEFUNCTOK:
6236 			optval = !(so->so_flags & SOF_NODEFUNCT);
6237 			goto integer;
6238 
6239 		case SO_ISDEFUNCT:
6240 			optval = (so->so_flags & SOF_DEFUNCT);
6241 			goto integer;
6242 
6243 		case SO_OPPORTUNISTIC:
6244 			optval = so_get_opportunistic(so);
6245 			goto integer;
6246 
6247 		case SO_FLUSH:
6248 			/* This option is not gettable */
6249 			error = EINVAL;
6250 			break;
6251 
6252 		case SO_RECV_ANYIF:
6253 			optval = so_get_recv_anyif(so);
6254 			goto integer;
6255 
6256 		case SO_TRAFFIC_MGT_BACKGROUND:
6257 			/* This option is handled by lower layer(s) */
6258 			if (so->so_proto != NULL &&
6259 			    so->so_proto->pr_ctloutput != NULL) {
6260 				(void) so->so_proto->pr_ctloutput(so, sopt);
6261 			}
6262 			break;
6263 
6264 #if FLOW_DIVERT
6265 		case SO_FLOW_DIVERT_TOKEN:
6266 			error = flow_divert_token_get(so, sopt);
6267 			break;
6268 #endif  /* FLOW_DIVERT */
6269 
6270 #if NECP
6271 		case SO_NECP_ATTRIBUTES:
6272 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6273 				/* Handled by MPTCP itself */
6274 				break;
6275 			}
6276 
6277 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6278 				error = EINVAL;
6279 				goto out;
6280 			}
6281 
6282 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6283 			break;
6284 
6285 		case SO_NECP_CLIENTUUID: {
6286 			uuid_t *ncu;
6287 
6288 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6289 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6290 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6291 				ncu = &sotoinpcb(so)->necp_client_uuid;
6292 			} else {
6293 				error = EINVAL;
6294 				goto out;
6295 			}
6296 
6297 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6298 			break;
6299 		}
6300 
6301 		case SO_NECP_LISTENUUID: {
6302 			uuid_t *nlu;
6303 
6304 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6305 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6306 					nlu = &sotoinpcb(so)->necp_client_uuid;
6307 				} else {
6308 					error = ENOENT;
6309 					goto out;
6310 				}
6311 			} else {
6312 				error = EINVAL;
6313 				goto out;
6314 			}
6315 
6316 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6317 			break;
6318 		}
6319 #endif /* NECP */
6320 
6321 #if CONTENT_FILTER
6322 		case SO_CFIL_SOCK_ID: {
6323 			cfil_sock_id_t sock_id;
6324 
6325 			sock_id = cfil_sock_id_from_socket(so);
6326 
6327 			error = sooptcopyout(sopt, &sock_id,
6328 			    sizeof(cfil_sock_id_t));
6329 			break;
6330 		}
6331 #endif  /* CONTENT_FILTER */
6332 
6333 		case SO_EXTENDED_BK_IDLE:
6334 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6335 			goto integer;
6336 		case SO_MARK_CELLFALLBACK:
6337 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6338 			    ? 1 : 0;
6339 			goto integer;
6340 		case SO_FALLBACK_MODE:
6341 			optval = so->so_fallback_mode;
6342 			goto integer;
6343 		case SO_MARK_KNOWN_TRACKER: {
6344 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6345 			    ? 1 : 0;
6346 			goto integer;
6347 		}
6348 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6349 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6350 			    ? 1 : 0;
6351 			goto integer;
6352 		}
6353 		case SO_MARK_APPROVED_APP_DOMAIN: {
6354 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6355 			    ? 1 : 0;
6356 			goto integer;
6357 		}
6358 		case SO_NET_SERVICE_TYPE: {
6359 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6360 				optval = so->so_netsvctype;
6361 			} else {
6362 				optval = NET_SERVICE_TYPE_BE;
6363 			}
6364 			goto integer;
6365 		}
6366 		case SO_NETSVC_MARKING_LEVEL:
6367 			optval = so_get_netsvc_marking_level(so);
6368 			goto integer;
6369 
6370 		case SO_MPKL_SEND_INFO: {
6371 			struct so_mpkl_send_info so_mpkl_send_info;
6372 
6373 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6374 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6375 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6376 			    sizeof(struct so_mpkl_send_info));
6377 			break;
6378 		}
6379 		case SO_MARK_WAKE_PKT:
6380 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6381 			goto integer;
6382 		case SO_RECV_WAKE_PKT:
6383 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6384 			goto integer;
6385 		default:
6386 			error = ENOPROTOOPT;
6387 			break;
6388 		}
6389 	}
6390 out:
6391 	if (dolock) {
6392 		socket_unlock(so, 1);
6393 	}
6394 	return error;
6395 }
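
/*
 * User-space counterpart (a sketch using getsockopt(2)): querying the
 * Darwin-specific SO_NREAD option handled above yields the number of
 * readable protocol-data bytes, excluding control data:
 *
 *	int nread = 0;
 *	socklen_t len = sizeof(nread);
 *	if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
 *		printf("%d bytes available\n", nread);
 */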
6396 
6397 /*
6398  * The size limits on our soopt_getm are different from those on FreeBSD.
6399  * We limit the size of options to MCLBYTES. This will have to change
6400  * if we need to define options that need more space than MCLBYTES.
6401  */
6402 int
6403 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6404 {
6405 	struct mbuf *m, *m_prev;
6406 	int sopt_size = (int)sopt->sopt_valsize;
6407 	int how;
6408 
6409 	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6410 		return EMSGSIZE;
6411 	}
6412 
6413 	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6414 	MGET(m, how, MT_DATA);
6415 	if (m == NULL) {
6416 		return ENOBUFS;
6417 	}
6418 	if (sopt_size > MLEN) {
6419 		MCLGET(m, how);
6420 		if ((m->m_flags & M_EXT) == 0) {
6421 			m_free(m);
6422 			return ENOBUFS;
6423 		}
6424 		m->m_len = min(MCLBYTES, sopt_size);
6425 	} else {
6426 		m->m_len = min(MLEN, sopt_size);
6427 	}
6428 	sopt_size -= m->m_len;
6429 	*mp = m;
6430 	m_prev = m;
6431 
6432 	while (sopt_size > 0) {
6433 		MGET(m, how, MT_DATA);
6434 		if (m == NULL) {
6435 			m_freem(*mp);
6436 			return ENOBUFS;
6437 		}
6438 		if (sopt_size > MLEN) {
6439 			MCLGET(m, how);
6440 			if ((m->m_flags & M_EXT) == 0) {
6441 				m_freem(*mp);
6442 				m_freem(m);
6443 				return ENOBUFS;
6444 			}
6445 			m->m_len = min(MCLBYTES, sopt_size);
6446 		} else {
6447 			m->m_len = min(MLEN, sopt_size);
6448 		}
6449 		sopt_size -= m->m_len;
6450 		m_prev->m_next = m;
6451 		m_prev = m;
6452 	}
6453 	return 0;
6454 }
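
/*
 * Worked example (a sketch; the exact constants vary by configuration):
 * with MCLBYTES == 2048, a 2048-byte option fits in a single
 * cluster-backed mbuf, and anything larger is rejected up front with
 * EMSGSIZE; options no bigger than MLEN stay in the mbuf's internal
 * data area and need no cluster at all.
 */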
6455 
6456 /* copyin sopt data into mbuf chain */
6457 int
6458 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6459 {
6460 	struct mbuf *m0 = m;
6461 
6462 	if (sopt->sopt_val == USER_ADDR_NULL) {
6463 		return 0;
6464 	}
6465 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6466 		if (sopt->sopt_p != kernproc) {
6467 			int error;
6468 
6469 			error = copyin(sopt->sopt_val, mtod(m, char *),
6470 			    m->m_len);
6471 			if (error != 0) {
6472 				m_freem(m0);
6473 				return error;
6474 			}
6475 		} else {
6476 			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6477 			    mtod(m, char *), m->m_len);
6478 		}
6479 		sopt->sopt_valsize -= m->m_len;
6480 		sopt->sopt_val += m->m_len;
6481 		m = m->m_next;
6482 	}
6483 	/* enough space should have been allocated at ip6_sooptmcopyin() */
6484 	if (m != NULL) {
6485 		panic("soopt_mcopyin");
6486 		/* NOTREACHED */
6487 	}
6488 	return 0;
6489 }
6490 
6491 /* copyout mbuf chain data into soopt */
6492 int
6493 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6494 {
6495 	struct mbuf *m0 = m;
6496 	size_t valsize = 0;
6497 
6498 	if (sopt->sopt_val == USER_ADDR_NULL) {
6499 		return 0;
6500 	}
6501 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6502 		if (sopt->sopt_p != kernproc) {
6503 			int error;
6504 
6505 			error = copyout(mtod(m, char *), sopt->sopt_val,
6506 			    m->m_len);
6507 			if (error != 0) {
6508 				m_freem(m0);
6509 				return error;
6510 			}
6511 		} else {
6512 			bcopy(mtod(m, char *),
6513 			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6514 		}
6515 		sopt->sopt_valsize -= m->m_len;
6516 		sopt->sopt_val += m->m_len;
6517 		valsize += m->m_len;
6518 		m = m->m_next;
6519 	}
6520 	if (m != NULL) {
6521 		/* a large enough soopt buffer should be supplied from user-land */
6522 		m_freem(m0);
6523 		return EINVAL;
6524 	}
6525 	sopt->sopt_valsize = valsize;
6526 	return 0;
6527 }
6528 
6529 void
6530 sohasoutofband(struct socket *so)
6531 {
6532 	if (so->so_pgid < 0) {
6533 		gsignal(-so->so_pgid, SIGURG);
6534 	} else if (so->so_pgid > 0) {
6535 		proc_signal(so->so_pgid, SIGURG);
6536 	}
6537 	selwakeup(&so->so_rcv.sb_sel);
6538 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6539 		KNOTE(&so->so_rcv.sb_sel.si_note,
6540 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6541 	}
6542 }
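
/*
 * User-space counterpart (a sketch using fcntl(2)): a process only
 * receives the SIGURG raised here if it has claimed ownership of the
 * socket beforehand, e.g.:
 *
 *	signal(SIGURG, urg_handler);	// urg_handler is hypothetical
 *	fcntl(s, F_SETOWN, getpid());
 */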
6543 
6544 int
6545 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6546 {
6547 #pragma unused(cred)
6548 	struct proc *p = current_proc();
6549 	int revents = 0;
6550 
6551 	socket_lock(so, 1);
6552 	so_update_last_owner_locked(so, PROC_NULL);
6553 	so_update_policy(so);
6554 
6555 	if (events & (POLLIN | POLLRDNORM)) {
6556 		if (soreadable(so)) {
6557 			revents |= events & (POLLIN | POLLRDNORM);
6558 		}
6559 	}
6560 
6561 	if (events & (POLLOUT | POLLWRNORM)) {
6562 		if (sowriteable(so)) {
6563 			revents |= events & (POLLOUT | POLLWRNORM);
6564 		}
6565 	}
6566 
6567 	if (events & (POLLPRI | POLLRDBAND)) {
6568 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6569 			revents |= events & (POLLPRI | POLLRDBAND);
6570 		}
6571 	}
6572 
6573 	if (revents == 0) {
6574 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6575 			/*
6576 			 * Darwin sets the flag first,
6577 			 * BSD calls selrecord first
6578 			 */
6579 			so->so_rcv.sb_flags |= SB_SEL;
6580 			selrecord(p, &so->so_rcv.sb_sel, wql);
6581 		}
6582 
6583 		if (events & (POLLOUT | POLLWRNORM)) {
6584 			/*
6585 			 * Darwin sets the flag first,
6586 			 * BSD calls selrecord first
6587 			 */
6588 			so->so_snd.sb_flags |= SB_SEL;
6589 			selrecord(p, &so->so_snd.sb_sel, wql);
6590 		}
6591 	}
6592 
6593 	socket_unlock(so, 1);
6594 	return revents;
6595 }
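
/*
 * User-space counterpart (a sketch using poll(2)): the POLLPRI and
 * POLLRDBAND cases above fire while urgent data is pending, so a caller
 * can drain it out-of-band:
 *
 *	struct pollfd pfd = { .fd = s, .events = POLLIN | POLLPRI };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI))
 *		recv(s, buf, sizeof(buf), MSG_OOB);
 */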
6596 
6597 int
6598 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6599 {
6600 	struct socket *so = (struct socket *)fp_get_data(fp);
6601 	int result;
6602 
6603 	socket_lock(so, 1);
6604 	so_update_last_owner_locked(so, PROC_NULL);
6605 	so_update_policy(so);
6606 
6607 	switch (kn->kn_filter) {
6608 	case EVFILT_READ:
6609 		kn->kn_filtid = EVFILTID_SOREAD;
6610 		break;
6611 	case EVFILT_WRITE:
6612 		kn->kn_filtid = EVFILTID_SOWRITE;
6613 		break;
6614 	case EVFILT_SOCK:
6615 		kn->kn_filtid = EVFILTID_SCK;
6616 		break;
6617 	case EVFILT_EXCEPT:
6618 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6619 		break;
6620 	default:
6621 		socket_unlock(so, 1);
6622 		knote_set_error(kn, EINVAL);
6623 		return 0;
6624 	}
6625 
6626 	/*
6627 	 * call the appropriate sub-filter attach
6628 	 * with the socket still locked
6629 	 */
6630 	result = knote_fops(kn)->f_attach(kn, kev);
6631 
6632 	socket_unlock(so, 1);
6633 
6634 	return result;
6635 }
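
/*
 * User-space counterpart (a sketch using kqueue(2)): the filter
 * dispatch above runs when a knote is registered against a socket
 * descriptor, e.g.:
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */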
6636 
6637 static int
6638 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6639 {
6640 	int retval = 0;
6641 	int64_t data = 0;
6642 
6643 	if (so->so_options & SO_ACCEPTCONN) {
6644 		/*
6645 		 * Radar 6615193: handle the listen case dynamically
6646 		 * for the kqueue read filter. This allows listen() to be
6647 		 * called after registering the kqueue EVFILT_READ.
6648 		 */
6649 
6650 		retval = !TAILQ_EMPTY(&so->so_comp);
6651 		data = so->so_qlen;
6652 		goto out;
6653 	}
6654 
6655 	/* socket isn't a listener */
6656 	/*
6657 	 * NOTE_LOWAT specifies new low water mark in data, i.e.
6658 	 * NOTE_LOWAT specifies a new low-water mark in data, i.e.
6659 	 * control bytes.
6660 	 */
6661 	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6662 
6663 	if (kn->kn_sfflags & NOTE_OOB) {
6664 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6665 			kn->kn_fflags |= NOTE_OOB;
6666 			data -= so->so_oobmark;
6667 			retval = 1;
6668 			goto out;
6669 		}
6670 	}
6671 
6672 	if ((so->so_state & SS_CANTRCVMORE)
6673 #if CONTENT_FILTER
6674 	    && cfil_sock_data_pending(&so->so_rcv) == 0
6675 #endif /* CONTENT_FILTER */
6676 	    ) {
6677 		kn->kn_flags |= EV_EOF;
6678 		kn->kn_fflags = so->so_error;
6679 		retval = 1;
6680 		goto out;
6681 	}
6682 
6683 	if (so->so_error) {     /* temporary udp error */
6684 		retval = 1;
6685 		goto out;
6686 	}
6687 
6688 	int64_t lowwat = so->so_rcv.sb_lowat;
6689 	/*
6690 	 * Ensure that when NOTE_LOWAT is used, the derived
6691 	 * low-water mark is bounded by the socket's receive
6692 	 * buffer's high and low water marks.
6693 	 */
6694 	if (kn->kn_sfflags & NOTE_LOWAT) {
6695 		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6696 			lowwat = so->so_rcv.sb_hiwat;
6697 		} else if (kn->kn_sdata > lowwat) {
6698 			lowwat = kn->kn_sdata;
6699 		}
6700 	}
6701 
6702 	/*
6703 	 * While the `data` field is the amount of data to read,
6704 	 * 0-sized packets need to wake up the kqueue, see 58140856,
6705 	 * so we need to take control bytes into account too.
6706 	 */
6707 	retval = (so->so_rcv.sb_cc >= lowwat);
6708 
6709 out:
6710 	if (retval && kev) {
6711 		knote_fill_kevent(kn, kev, data);
6712 	}
6713 	return retval;
6714 }
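
/*
 * Illustrative use of NOTE_LOWAT (a sketch): registering with
 * NOTE_LOWAT in fflags and a byte count in data makes the read filter
 * above fire only once that many bytes are buffered, clipped to the
 * receive buffer's high-water mark:
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 */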
6715 
6716 static int
6717 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6718 {
6719 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6720 
6721 	/* socket locked */
6722 
6723 	/*
6724 	 * If the caller explicitly asked for OOB results (e.g. poll())
6725 	 * from EVFILT_READ, then save that off in the hookid field
6726 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6727 	 */
6728 	if (kn->kn_filter == EVFILT_READ &&
6729 	    kn->kn_flags & EV_OOBAND) {
6730 		kn->kn_flags &= ~EV_OOBAND;
6731 		kn->kn_hook32 = EV_OOBAND;
6732 	} else {
6733 		kn->kn_hook32 = 0;
6734 	}
6735 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6736 		so->so_rcv.sb_flags |= SB_KNOTE;
6737 	}
6738 
6739 	/* indicate whether the event has already fired */
6740 	return filt_soread_common(kn, NULL, so);
6741 }
6742 
6743 static void
6744 filt_sordetach(struct knote *kn)
6745 {
6746 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6747 
6748 	socket_lock(so, 1);
6749 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6750 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6751 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6752 		}
6753 	}
6754 	socket_unlock(so, 1);
6755 }
6756 
6757 /*ARGSUSED*/
6758 static int
6759 filt_soread(struct knote *kn, long hint)
6760 {
6761 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6762 	int retval;
6763 
6764 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6765 		socket_lock(so, 1);
6766 	}
6767 
6768 	retval = filt_soread_common(kn, NULL, so);
6769 
6770 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6771 		socket_unlock(so, 1);
6772 	}
6773 
6774 	return retval;
6775 }
6776 
6777 static int
6778 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6779 {
6780 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6781 	int retval;
6782 
6783 	socket_lock(so, 1);
6784 
6785 	/* save off the new input fflags and data */
6786 	kn->kn_sfflags = kev->fflags;
6787 	kn->kn_sdata = kev->data;
6788 
6789 	/* determine if changes result in fired events */
6790 	retval = filt_soread_common(kn, NULL, so);
6791 
6792 	socket_unlock(so, 1);
6793 
6794 	return retval;
6795 }
6796 
6797 static int
6798 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6799 {
6800 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6801 	int retval;
6802 
6803 	socket_lock(so, 1);
6804 	retval = filt_soread_common(kn, kev, so);
6805 	socket_unlock(so, 1);
6806 
6807 	return retval;
6808 }
6809 
6810 int
6811 so_wait_for_if_feedback(struct socket *so)
6812 {
6813 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6814 	    (so->so_state & SS_ISCONNECTED)) {
6815 		struct inpcb *inp = sotoinpcb(so);
6816 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6817 			return 1;
6818 		}
6819 	}
6820 	return 0;
6821 }
6822 
6823 static int
6824 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6825 {
6826 	int ret = 0;
6827 	int64_t data = sbspace(&so->so_snd);
6828 
6829 	if (so->so_state & SS_CANTSENDMORE) {
6830 		kn->kn_flags |= EV_EOF;
6831 		kn->kn_fflags = so->so_error;
6832 		ret = 1;
6833 		goto out;
6834 	}
6835 
6836 	if (so->so_error) {     /* temporary udp error */
6837 		ret = 1;
6838 		goto out;
6839 	}
6840 
6841 	if (!socanwrite(so)) {
6842 		ret = 0;
6843 		goto out;
6844 	}
6845 
6846 	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6847 		ret = 1;
6848 		goto out;
6849 	}
6850 
6851 	int64_t lowwat = so->so_snd.sb_lowat;
6852 
6853 	if (kn->kn_sfflags & NOTE_LOWAT) {
6854 		if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6855 			lowwat = so->so_snd.sb_hiwat;
6856 		} else if (kn->kn_sdata > lowwat) {
6857 			lowwat = kn->kn_sdata;
6858 		}
6859 	}
6860 
6861 	if (data >= lowwat) {
6862 		if ((so->so_flags & SOF_NOTSENT_LOWAT)
6863 #if (DEBUG || DEVELOPMENT)
6864 		    && so_notsent_lowat_check == 1
6865 #endif /* DEBUG || DEVELOPMENT */
6866 		    ) {
6867 			if ((SOCK_DOM(so) == PF_INET ||
6868 			    SOCK_DOM(so) == PF_INET6) &&
6869 			    so->so_type == SOCK_STREAM) {
6870 				ret = tcp_notsent_lowat_check(so);
6871 			}
6872 #if MPTCP
6873 			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6874 			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
6875 				ret = mptcp_notsent_lowat_check(so);
6876 			}
6877 #endif
6878 			else {
6879 				ret = 1;
6880 				goto out;
6881 			}
6882 		} else {
6883 			ret = 1;
6884 		}
6885 	}
6886 	if (so_wait_for_if_feedback(so)) {
6887 		ret = 0;
6888 	}
6889 
6890 out:
6891 	if (ret && kev) {
6892 		knote_fill_kevent(kn, kev, data);
6893 	}
6894 	return ret;
6895 }
6896 
6897 static int
6898 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6899 {
6900 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6901 
6902 	/* socket locked */
6903 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6904 		so->so_snd.sb_flags |= SB_KNOTE;
6905 	}
6906 
6907 	/* determine whether it has already fired */
6908 	return filt_sowrite_common(kn, NULL, so);
6909 }
6910 
6911 static void
6912 filt_sowdetach(struct knote *kn)
6913 {
6914 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6915 	socket_lock(so, 1);
6916 
6917 	if (so->so_snd.sb_flags & SB_KNOTE) {
6918 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6919 			so->so_snd.sb_flags &= ~SB_KNOTE;
6920 		}
6921 	}
6922 	socket_unlock(so, 1);
6923 }
6924 
6925 /*ARGSUSED*/
6926 static int
6927 filt_sowrite(struct knote *kn, long hint)
6928 {
6929 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6930 	int ret;
6931 
6932 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6933 		socket_lock(so, 1);
6934 	}
6935 
6936 	ret = filt_sowrite_common(kn, NULL, so);
6937 
6938 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6939 		socket_unlock(so, 1);
6940 	}
6941 
6942 	return ret;
6943 }
6944 
6945 static int
6946 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6947 {
6948 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6949 	int ret;
6950 
6951 	socket_lock(so, 1);
6952 
6953 	/* save off the new input fflags and data */
6954 	kn->kn_sfflags = kev->fflags;
6955 	kn->kn_sdata = kev->data;
6956 
6957 	/* determine if these changes result in a triggered event */
6958 	ret = filt_sowrite_common(kn, NULL, so);
6959 
6960 	socket_unlock(so, 1);
6961 
6962 	return ret;
6963 }
6964 
6965 static int
6966 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6967 {
6968 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6969 	int ret;
6970 
6971 	socket_lock(so, 1);
6972 	ret = filt_sowrite_common(kn, kev, so);
6973 	socket_unlock(so, 1);
6974 
6975 	return ret;
6976 }
6977 
6978 static int
6979 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6980     struct socket *so, long ev_hint)
6981 {
6982 	int ret = 0;
6983 	int64_t data = 0;
6984 	uint32_t level_trigger = 0;
6985 
6986 	if (ev_hint & SO_FILT_HINT_CONNRESET) {
6987 		kn->kn_fflags |= NOTE_CONNRESET;
6988 	}
6989 	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6990 		kn->kn_fflags |= NOTE_TIMEOUT;
6991 	}
6992 	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6993 		kn->kn_fflags |= NOTE_NOSRCADDR;
6994 	}
6995 	if (ev_hint & SO_FILT_HINT_IFDENIED) {
6996 		kn->kn_fflags |= NOTE_IFDENIED;
6997 	}
6998 	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6999 		kn->kn_fflags |= NOTE_KEEPALIVE;
7000 	}
7001 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7002 		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7003 	}
7004 	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7005 		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7006 	}
7007 	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7008 	    (so->so_state & SS_ISCONNECTED)) {
7009 		kn->kn_fflags |= NOTE_CONNECTED;
7010 		level_trigger |= NOTE_CONNECTED;
7011 	}
7012 	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7013 	    (so->so_state & SS_ISDISCONNECTED)) {
7014 		kn->kn_fflags |= NOTE_DISCONNECTED;
7015 		level_trigger |= NOTE_DISCONNECTED;
7016 	}
7017 	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7018 		if (so->so_proto != NULL &&
7019 		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7020 			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7021 		}
7022 	}
7023 
7024 	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7025 	    tcp_notify_ack_active(so)) {
7026 		kn->kn_fflags |= NOTE_NOTIFY_ACK;
7027 	}
7028 
7029 	if ((so->so_state & SS_CANTRCVMORE)
7030 #if CONTENT_FILTER
7031 	    && cfil_sock_data_pending(&so->so_rcv) == 0
7032 #endif /* CONTENT_FILTER */
7033 	    ) {
7034 		kn->kn_fflags |= NOTE_READCLOSED;
7035 		level_trigger |= NOTE_READCLOSED;
7036 	}
7037 
7038 	if (so->so_state & SS_CANTSENDMORE) {
7039 		kn->kn_fflags |= NOTE_WRITECLOSED;
7040 		level_trigger |= NOTE_WRITECLOSED;
7041 	}
7042 
7043 	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7044 	    (so->so_flags & SOF_SUSPENDED)) {
7045 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7046 
7047 		/* If resume event was delivered before, reset it */
7048 		kn->kn_hook32 &= ~NOTE_RESUME;
7049 
7050 		kn->kn_fflags |= NOTE_SUSPEND;
7051 		level_trigger |= NOTE_SUSPEND;
7052 	}
7053 
7054 	if ((ev_hint & SO_FILT_HINT_RESUME) ||
7055 	    (so->so_flags & SOF_SUSPENDED) == 0) {
7056 		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7057 
7058 		/* If suspend event was delivered before, reset it */
7059 		kn->kn_hook32 &= ~NOTE_SUSPEND;
7060 
7061 		kn->kn_fflags |= NOTE_RESUME;
7062 		level_trigger |= NOTE_RESUME;
7063 	}
7064 
7065 	if (so->so_error != 0) {
7066 		ret = 1;
7067 		data = so->so_error;
7068 		kn->kn_flags |= EV_EOF;
7069 	} else {
7070 		u_int32_t data32 = 0;
7071 		get_sockev_state(so, &data32);
7072 		data = data32;
7073 	}
7074 
7075 	/* Reset any events that are not requested on this knote */
7076 	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7077 	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7078 
7079 	/* Find the level-triggered events that have already been delivered */
7080 	level_trigger &= kn->kn_hook32;
7081 	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7082 
7083 	/* Do not deliver level-triggered events more than once */
7084 	if ((kn->kn_fflags & ~level_trigger) != 0) {
7085 		ret = 1;
7086 	}
7087 
7088 	if (ret && kev) {
7089 		/*
7090 		 * Store the state of the events being delivered. This
7091 		 * state can be used to deliver level-triggered events
7092 		 * at least once and still avoid waking up the application
7093 		 * multiple times as long as the event is active.
7094 		 */
7095 		if (kn->kn_fflags != 0) {
7096 			kn->kn_hook32 |= (kn->kn_fflags &
7097 			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7098 		}
7099 
7100 		/*
7101 		 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7102 		 * only one of them, and remember which one was
7103 		 * delivered last.
7104 		 */
7105 		if (kn->kn_fflags & NOTE_SUSPEND) {
7106 			kn->kn_hook32 &= ~NOTE_RESUME;
7107 		}
7108 		if (kn->kn_fflags & NOTE_RESUME) {
7109 			kn->kn_hook32 &= ~NOTE_SUSPEND;
7110 		}
7111 
7112 		knote_fill_kevent(kn, kev, data);
7113 	}
7114 	return ret;
7115 }
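
/*
 * Illustrative registration (a sketch; EVFILT_SOCK is Apple-private):
 * a caller interested in the connection life-cycle transitions handled
 * above might register:
 *
 *	EV_SET(&kev, s, EVFILT_SOCK, EV_ADD,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED |
 *	    NOTE_SUSPEND | NOTE_RESUME, 0, NULL);
 */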
7116 
7117 static int
7118 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7119 {
7120 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7121 
7122 	/* socket locked */
7123 	kn->kn_hook32 = 0;
7124 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7125 		so->so_flags |= SOF_KNOTE;
7126 	}
7127 
7128 	/* determine if event already fired */
7129 	return filt_sockev_common(kn, NULL, so, 0);
7130 }
7131 
7132 static void
7133 filt_sockdetach(struct knote *kn)
7134 {
7135 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7136 	socket_lock(so, 1);
7137 
7138 	if ((so->so_flags & SOF_KNOTE) != 0) {
7139 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7140 			so->so_flags &= ~SOF_KNOTE;
7141 		}
7142 	}
7143 	socket_unlock(so, 1);
7144 }
7145 
7146 static int
7147 filt_sockev(struct knote *kn, long hint)
7148 {
7149 	int ret = 0, locked = 0;
7150 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7151 	long ev_hint = (hint & SO_FILT_HINT_EV);
7152 
7153 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7154 		socket_lock(so, 1);
7155 		locked = 1;
7156 	}
7157 
7158 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7159 
7160 	if (locked) {
7161 		socket_unlock(so, 1);
7162 	}
7163 
7164 	return ret;
7165 }
7166 
7167 
7168 
7169 /*
7170  *	filt_socktouch - update event state
7171  */
7172 static int
7173 filt_socktouch(
7174 	struct knote *kn,
7175 	struct kevent_qos_s *kev)
7176 {
7177 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7178 	uint32_t changed_flags;
7179 	int ret;
7180 
7181 	socket_lock(so, 1);
7182 
7183 	/* save off the [result] data and fflags */
7184 	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7185 
7186 	/* save off the new input fflags and data */
7187 	kn->kn_sfflags = kev->fflags;
7188 	kn->kn_sdata = kev->data;
7189 
7190 	/* restrict the current results to the (smaller?) set of new interest */
7191 	/*
7192 	 * For compatibility with previous implementations, we leave kn_fflags
7193 	 * as they were before.
7194 	 */
7195 	//kn->kn_fflags &= kev->fflags;
7196 
7197 	/*
7198 	 * Since we keep track of events that are already
7199 	 * delivered, if any of those events are not requested
7200 	 * anymore the state related to them can be reset
7201 	 */
7202 	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7203 
7204 	/* determine if we have events to deliver */
7205 	ret = filt_sockev_common(kn, NULL, so, 0);
7206 
7207 	socket_unlock(so, 1);
7208 
7209 	return ret;
7210 }
7211 
7212 /*
7213  *	filt_sockprocess - query event fired state and return data
7214  */
7215 static int
7216 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7217 {
7218 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7219 	int ret = 0;
7220 
7221 	socket_lock(so, 1);
7222 
7223 	ret = filt_sockev_common(kn, kev, so, 0);
7224 
7225 	socket_unlock(so, 1);
7226 
7227 	return ret;
7228 }
7229 
7230 void
7231 get_sockev_state(struct socket *so, u_int32_t *statep)
7232 {
7233 	u_int32_t state = *(statep);
7234 
7235 	/*
7236 	 * If the state variable was already set by a previous event,
7237 	 * leave it untouched.
7238 	 */
7239 	if (state != 0) {
7240 		return;
7241 	}
7242 
7243 	if (so->so_state & SS_ISCONNECTED) {
7244 		state |= SOCKEV_CONNECTED;
7245 	} else {
7246 		state &= ~(SOCKEV_CONNECTED);
7247 	}
7248 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7249 	*(statep) = state;
7250 }
7251 
7252 #define SO_LOCK_HISTORY_STR_LEN \
7253 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7254 
7255 __private_extern__ const char *
7256 solockhistory_nr(struct socket *so)
7257 {
7258 	size_t n = 0;
7259 	int i;
7260 	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7261 
7262 	bzero(lock_history_str, sizeof(lock_history_str));
7263 	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7264 		n += scnprintf(lock_history_str + n,
7265 		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7266 		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7267 		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7268 	}
7269 	return lock_history_str;
7270 }
7271 
7272 lck_mtx_t *
7273 socket_getlock(struct socket *so, int flags)
7274 {
7275 	if (so->so_proto->pr_getlock != NULL) {
7276 		return (*so->so_proto->pr_getlock)(so, flags);
7277 	} else {
7278 		return so->so_proto->pr_domain->dom_mtx;
7279 	}
7280 }
7281 
7282 void
7283 socket_lock(struct socket *so, int refcount)
7284 {
7285 	void *lr_saved;
7286 
7287 	lr_saved = __builtin_return_address(0);
7288 
7289 	if (so->so_proto->pr_lock) {
7290 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7291 	} else {
7292 #ifdef MORE_LOCKING_DEBUG
7293 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7294 		    LCK_MTX_ASSERT_NOTOWNED);
7295 #endif
7296 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7297 		if (refcount) {
7298 			so->so_usecount++;
7299 		}
7300 		so->lock_lr[so->next_lock_lr] = lr_saved;
7301 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7302 	}
7303 }
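
/*
 * Illustrative pairing (a sketch drawn from callers in this file): the
 * refcount argument selects whether a use-count reference is taken or
 * dropped along with the mutex:
 *
 *	socket_lock(so, 1);	// lock and take a use-count reference
 *	...
 *	socket_unlock(so, 1);	// drop the reference and unlock
 */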
7304 
7305 void
7306 socket_lock_assert_owned(struct socket *so)
7307 {
7308 	lck_mtx_t *mutex_held;
7309 
7310 	if (so->so_proto->pr_getlock != NULL) {
7311 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7312 	} else {
7313 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7314 	}
7315 
7316 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7317 }
7318 
7319 int
7320 socket_try_lock(struct socket *so)
7321 {
7322 	lck_mtx_t *mtx;
7323 
7324 	if (so->so_proto->pr_getlock != NULL) {
7325 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7326 	} else {
7327 		mtx = so->so_proto->pr_domain->dom_mtx;
7328 	}
7329 
7330 	return lck_mtx_try_lock(mtx);
7331 }
7332 
7333 void
7334 socket_unlock(struct socket *so, int refcount)
7335 {
7336 	void *lr_saved;
7337 	lck_mtx_t *mutex_held;
7338 
7339 	lr_saved = __builtin_return_address(0);
7340 
7341 	if (so == NULL || so->so_proto == NULL) {
7342 		panic("%s: null so_proto so=%p", __func__, so);
7343 		/* NOTREACHED */
7344 	}
7345 
7346 	if (so->so_proto->pr_unlock) {
7347 		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7348 	} else {
7349 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7350 #ifdef MORE_LOCKING_DEBUG
7351 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7352 #endif
7353 		so->unlock_lr[so->next_unlock_lr] = lr_saved;
7354 		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7355 
7356 		if (refcount) {
7357 			if (so->so_usecount <= 0) {
7358 				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7359 				    "lrh=%s", __func__, so->so_usecount, so,
7360 				    SOCK_DOM(so), so->so_type,
7361 				    SOCK_PROTO(so), solockhistory_nr(so));
7362 				/* NOTREACHED */
7363 			}
7364 
7365 			so->so_usecount--;
7366 			if (so->so_usecount == 0) {
7367 				sofreelastref(so, 1);
7368 			}
7369 		}
7370 		lck_mtx_unlock(mutex_held);
7371 	}
7372 }
7373 
7374 /* Called with socket locked, will unlock socket */
7375 void
7376 sofree(struct socket *so)
7377 {
7378 	lck_mtx_t *mutex_held;
7379 
7380 	if (so->so_proto->pr_getlock != NULL) {
7381 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7382 	} else {
7383 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7384 	}
7385 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7386 
7387 	sofreelastref(so, 0);
7388 }
7389 
7390 void
7391 soreference(struct socket *so)
7392 {
7393 	socket_lock(so, 1);     /* locks & take one reference on socket */
7394 	socket_unlock(so, 0);   /* unlock only */
7395 }
7396 
7397 void
7398 sodereference(struct socket *so)
7399 {
7400 	socket_lock(so, 0);
7401 	socket_unlock(so, 1);
7402 }
7403 
7404 /*
7405  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7406  * possibility of using jumbo clusters.  The caller must hold
7407  * the socket lock.
7408  */
7409 void
7410 somultipages(struct socket *so, boolean_t set)
7411 {
7412 	if (set) {
7413 		so->so_flags |= SOF_MULTIPAGES;
7414 	} else {
7415 		so->so_flags &= ~SOF_MULTIPAGES;
7416 	}
7417 }
7418 
7419 void
7420 soif2kcl(struct socket *so, boolean_t set)
7421 {
7422 	if (set) {
7423 		so->so_flags1 |= SOF1_IF_2KCL;
7424 	} else {
7425 		so->so_flags1 &= ~SOF1_IF_2KCL;
7426 	}
7427 }
7428 
7429 int
7430 so_isdstlocal(struct socket *so)
7431 {
7432 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7433 
7434 	if (SOCK_DOM(so) == PF_INET) {
7435 		return inaddr_local(inp->inp_faddr);
7436 	} else if (SOCK_DOM(so) == PF_INET6) {
7437 		return in6addr_local(&inp->in6p_faddr);
7438 	}
7439 
7440 	return 0;
7441 }
7442 
7443 int
7444 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7445 {
7446 	struct sockbuf *rcv, *snd;
7447 	int err = 0, defunct;
7448 
7449 	rcv = &so->so_rcv;
7450 	snd = &so->so_snd;
7451 
7452 	defunct = (so->so_flags & SOF_DEFUNCT);
7453 	if (defunct) {
7454 		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7455 			panic("%s: SB_DROP not set", __func__);
7456 			/* NOTREACHED */
7457 		}
7458 		goto done;
7459 	}
7460 
7461 	if (so->so_flags & SOF_NODEFUNCT) {
7462 		if (noforce) {
7463 			err = EOPNOTSUPP;
7464 			if (p != PROC_NULL) {
7465 				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7466 				    "name %s level %d) so 0x%llx [%d,%d] "
7467 				    "is not eligible for defunct "
7468 				    "(%d)\n", __func__, proc_selfpid(),
7469 				    proc_best_name(current_proc()), proc_pid(p),
7470 				    proc_best_name(p), level,
7471 				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7472 				    SOCK_DOM(so), SOCK_TYPE(so), err);
7473 			}
7474 			return err;
7475 		}
7476 		so->so_flags &= ~SOF_NODEFUNCT;
7477 		if (p != PROC_NULL) {
7478 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7479 			    "name %s level %d) so 0x%llx [%d,%d] "
7480 			    "defunct by force "
7481 			    "(%d)\n", __func__, proc_selfpid(),
7482 			    proc_best_name(current_proc()), proc_pid(p),
7483 			    proc_best_name(p), level,
7484 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7485 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7486 		}
7487 	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7488 		struct inpcb *inp = (struct inpcb *)so->so_pcb;
7489 		struct ifnet *ifp = inp->inp_last_outifp;
7490 
7491 		if (ifp && IFNET_IS_CELLULAR(ifp)) {
7492 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7493 		} else if (so->so_flags & SOF_DELEGATED) {
7494 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7495 		} else if (soextbkidlestat.so_xbkidle_time == 0) {
7496 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7497 		} else if (noforce && p != PROC_NULL) {
7498 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7499 
7500 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7501 			so->so_extended_bk_start = net_uptime();
7502 			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7503 
7504 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7505 
7506 			err = EOPNOTSUPP;
7507 			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7508 			    "name %s level %d) so 0x%llx [%d,%d] "
7509 			    "extend bk idle "
7510 			    "(%d)\n", __func__, proc_selfpid(),
7511 			    proc_best_name(current_proc()), proc_pid(p),
7512 			    proc_best_name(p), level,
7513 			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7514 			    SOCK_DOM(so), SOCK_TYPE(so), err);
7515 			return err;
7516 		} else {
7517 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7518 		}
7519 	}
7520 
7521 	so->so_flags |= SOF_DEFUNCT;
7522 
7523 	/* Prevent further data from being appended to the socket buffers */
7524 	snd->sb_flags |= SB_DROP;
7525 	rcv->sb_flags |= SB_DROP;
7526 
7527 	/* Flush any existing data in the socket buffers */
7528 	if (rcv->sb_cc != 0) {
7529 		rcv->sb_flags &= ~SB_SEL;
7530 		selthreadclear(&rcv->sb_sel);
7531 		sbrelease(rcv);
7532 	}
7533 	if (snd->sb_cc != 0) {
7534 		snd->sb_flags &= ~SB_SEL;
7535 		selthreadclear(&snd->sb_sel);
7536 		sbrelease(snd);
7537 	}
7538 
7539 done:
7540 	if (p != PROC_NULL) {
7541 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7542 		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7543 		    proc_selfpid(), proc_best_name(current_proc()),
7544 		    proc_pid(p), proc_best_name(p), level,
7545 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7546 		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
7547 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7548 		    " extbkidle" : "");
7549 	}
7550 	return err;
7551 }
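
/*
 * Usage sketch: the two-step defunct sequence, as used by
 * so_stop_extended_bk_idle() later in this file -- mark the socket
 * first, then tear it down.
 *
 *	sosetdefunct(p, so, level, FALSE);	// sets SOF_DEFUNCT and SB_DROP
 *	if (so->so_flags & SOF_DEFUNCT)
 *		sodefunct(p, so, level);	// shutdown, flush, SS_DEFUNCT
 */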
7552 
7553 int
7554 sodefunct(struct proc *p, struct socket *so, int level)
7555 {
7556 	struct sockbuf *rcv, *snd;
7557 
7558 	if (!(so->so_flags & SOF_DEFUNCT)) {
7559 		panic("%s improperly called", __func__);
7560 		/* NOTREACHED */
7561 	}
7562 	if (so->so_state & SS_DEFUNCT) {
7563 		goto done;
7564 	}
7565 
7566 	rcv = &so->so_rcv;
7567 	snd = &so->so_snd;
7568 
7569 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7570 		char s[MAX_IPv6_STR_LEN];
7571 		char d[MAX_IPv6_STR_LEN];
7572 		struct inpcb *inp = sotoinpcb(so);
7573 
7574 		if (p != PROC_NULL) {
7575 			SODEFUNCTLOG(
7576 				"%s[%d, %s]: (target pid %d name %s level %d) "
7577 				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7578 				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7579 				"snd_fl 0x%x]\n", __func__,
7580 				proc_selfpid(), proc_best_name(current_proc()),
7581 				proc_pid(p), proc_best_name(p), level,
7582 				(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7583 				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7584 				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7585 				(void *)&inp->inp_laddr.s_addr :
7586 				(void *)&inp->in6p_laddr),
7587 				s, sizeof(s)), ntohs(inp->in6p_lport),
7588 				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7589 				(void *)&inp->inp_faddr.s_addr :
7590 				(void *)&inp->in6p_faddr,
7591 				d, sizeof(d)), ntohs(inp->in6p_fport),
7592 				(uint32_t)rcv->sb_sel.si_flags,
7593 				(uint32_t)snd->sb_sel.si_flags,
7594 				rcv->sb_flags, snd->sb_flags);
7595 		}
7596 	} else if (p != PROC_NULL) {
7597 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7598 		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7599 		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7600 		    proc_selfpid(), proc_best_name(current_proc()),
7601 		    proc_pid(p), proc_best_name(p), level,
7602 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7603 		    SOCK_DOM(so), SOCK_TYPE(so),
7604 		    (uint32_t)rcv->sb_sel.si_flags,
7605 		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7606 		    snd->sb_flags);
7607 	}
7608 
7609 	/*
7610 	 * Unwedge threads blocked on sbwait() and sb_lock().
7611 	 */
7612 	sbwakeup(rcv);
7613 	sbwakeup(snd);
7614 
7615 	so->so_flags1 |= SOF1_DEFUNCTINPROG;
7616 	if (rcv->sb_flags & SB_LOCK) {
7617 		sbunlock(rcv, TRUE);    /* keep socket locked */
7618 	}
7619 	if (snd->sb_flags & SB_LOCK) {
7620 		sbunlock(snd, TRUE);    /* keep socket locked */
7621 	}
7622 	/*
7623 	 * Flush the buffers and disconnect.  We explicitly call shutdown
7624 	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7625 	 * states are set for the socket.  This would also flush out data
7626 	 * hanging off the receive list of this socket.
7627 	 */
7628 	(void) soshutdownlock_final(so, SHUT_RD);
7629 	(void) soshutdownlock_final(so, SHUT_WR);
7630 	(void) sodisconnectlocked(so);
7631 
7632 	/*
7633 	 * Explicitly handle connectionless-protocol disconnection
7634 	 * and release any remaining data in the socket buffers.
7635 	 */
7636 	if (!(so->so_state & SS_ISDISCONNECTED)) {
7637 		(void) soisdisconnected(so);
7638 	}
7639 
7640 	if (so->so_error == 0) {
7641 		so->so_error = EBADF;
7642 	}
7643 
7644 	if (rcv->sb_cc != 0) {
7645 		rcv->sb_flags &= ~SB_SEL;
7646 		selthreadclear(&rcv->sb_sel);
7647 		sbrelease(rcv);
7648 	}
7649 	if (snd->sb_cc != 0) {
7650 		snd->sb_flags &= ~SB_SEL;
7651 		selthreadclear(&snd->sb_sel);
7652 		sbrelease(snd);
7653 	}
7654 	so->so_state |= SS_DEFUNCT;
7655 	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7656 
7657 done:
7658 	return 0;
7659 }
7660 
7661 int
7662 soresume(struct proc *p, struct socket *so, int locked)
7663 {
7664 	if (locked == 0) {
7665 		socket_lock(so, 1);
7666 	}
7667 
7668 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7669 		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7670 		    "[%d,%d] resumed from bk idle\n",
7671 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7672 		    proc_pid(p), proc_best_name(p),
7673 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7674 		    SOCK_DOM(so), SOCK_TYPE(so));
7675 
7676 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7677 		so->so_extended_bk_start = 0;
7678 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7679 
7680 		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7681 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7682 		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7683 	}
7684 	if (locked == 0) {
7685 		socket_unlock(so, 1);
7686 	}
7687 
7688 	return 0;
7689 }
7690 
7691 /*
7692  * Does not attempt to account for sockets that are delegated from
7693  * the current process.
7694  */
7695 int
7696 so_set_extended_bk_idle(struct socket *so, int optval)
7697 {
7698 	int error = 0;
7699 
7700 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7701 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7702 		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7703 		error = EOPNOTSUPP;
7704 	} else if (optval == 0) {
7705 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7706 
7707 		soresume(current_proc(), so, 1);
7708 	} else {
7709 		struct proc *p = current_proc();
7710 		struct fileproc *fp;
7711 		int count = 0;
7712 
7713 		/*
7714 		 * Unlock socket to avoid lock ordering issue with
7715 		 * the proc fd table lock
7716 		 */
7717 		socket_unlock(so, 0);
7718 
7719 		proc_fdlock(p);
7720 		fdt_foreach(fp, p) {
7721 			struct socket *so2;
7722 
7723 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7724 				continue;
7725 			}
7726 
7727 			so2 = (struct socket *)fp_get_data(fp);
7728 			if (so != so2 &&
7729 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7730 				count++;
7731 			}
7732 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7733 				break;
7734 			}
7735 		}
7736 		proc_fdunlock(p);
7737 
7738 		socket_lock(so, 0);
7739 
7740 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7741 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7742 			error = EBUSY;
7743 		} else if (so->so_flags & SOF_DELEGATED) {
7744 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7745 			error = EBUSY;
7746 		} else {
7747 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7748 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7749 		}
7750 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7751 		    "%s marked for extended bk idle\n",
7752 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7753 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7754 		    SOCK_DOM(so), SOCK_TYPE(so),
7755 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7756 		    "is" : "not");
7757 	}
7758 
7759 	return error;
7760 }
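
/*
 * Lock-ordering sketch (drawn from the function above): the proc fd table
 * lock must not be acquired while the socket lock is held, so the socket
 * is unlocked around the fd table scan and relocked afterwards.
 *
 *	socket_unlock(so, 0);
 *	proc_fdlock(p);
 *	fdt_foreach(fp, p) { ... }
 *	proc_fdunlock(p);
 *	socket_lock(so, 0);
 */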
7761 
7762 static void
7763 so_stop_extended_bk_idle(struct socket *so)
7764 {
7765 	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7766 	so->so_extended_bk_start = 0;
7767 
7768 	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7769 	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7770 	/*
7771 	 * Force defunct
7772 	 */
7773 	sosetdefunct(current_proc(), so,
7774 	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7775 	if (so->so_flags & SOF_DEFUNCT) {
7776 		sodefunct(current_proc(), so,
7777 		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7778 	}
7779 }
7780 
7781 void
7782 so_drain_extended_bk_idle(struct socket *so)
7783 {
7784 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7785 		/*
7786 		 * Only penalize sockets that have outstanding data
7787 		 */
7788 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7789 			so_stop_extended_bk_idle(so);
7790 
7791 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7792 		}
7793 	}
7794 }
7795 
7796 /*
7797  * Return value tells whether the socket is still in extended background idle
7798  */
7799 int
7800 so_check_extended_bk_idle_time(struct socket *so)
7801 {
7802 	int ret = 1;
7803 
7804 	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7805 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7806 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7807 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7808 		    SOCK_DOM(so), SOCK_TYPE(so));
7809 		if (net_uptime() - so->so_extended_bk_start >
7810 		    soextbkidlestat.so_xbkidle_time) {
7811 			so_stop_extended_bk_idle(so);
7812 
7813 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7814 
7815 			ret = 0;
7816 		} else {
7817 			struct inpcb *inp = (struct inpcb *)so->so_pcb;
7818 
7819 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7820 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7821 		}
7822 	}
7823 
7824 	return ret;
7825 }
7826 
7827 void
7828 resume_proc_sockets(proc_t p)
7829 {
7830 	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7831 		struct fileproc *fp;
7832 		struct socket *so;
7833 
7834 		proc_fdlock(p);
7835 		fdt_foreach(fp, p) {
7836 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7837 				continue;
7838 			}
7839 
7840 			so = (struct socket *)fp_get_data(fp);
7841 			(void) soresume(p, so, 0);
7842 		}
7843 		proc_fdunlock(p);
7844 
7845 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7846 	}
7847 }
7848 
7849 __private_extern__ int
7850 so_set_recv_anyif(struct socket *so, int optval)
7851 {
7852 	int ret = 0;
7853 
7854 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7855 		if (optval) {
7856 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7857 		} else {
7858 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7859 		}
7860 #if SKYWALK
7861 		inp_update_netns_flags(so);
7862 #endif /* SKYWALK */
7863 	}
7864 
7865 
7866 	return ret;
7867 }
7868 
7869 __private_extern__ int
7870 so_get_recv_anyif(struct socket *so)
7871 {
7872 	int ret = 0;
7873 
7874 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7875 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7876 	}
7877 
7878 	return ret;
7879 }
7880 
7881 int
7882 so_set_restrictions(struct socket *so, uint32_t vals)
7883 {
7884 	int nocell_old, nocell_new;
7885 	int noexpensive_old, noexpensive_new;
7886 	int noconstrained_old, noconstrained_new;
7887 
7888 	/*
7889 	 * Deny-type restrictions are trapdoors; once set they cannot be
7890 	 * unset for the lifetime of the socket.  This allows them to be
7891 	 * issued by a framework on behalf of the application without
7892 	 * having to worry that they can be undone.
7893 	 *
7894 	 * Note here that socket-level restrictions override any protocol-
7895 	 * level restrictions.  For instance, a SO_RESTRICT_DENY_CELLULAR
7896 	 * restriction issued on the socket has a higher precedence
7897 	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7898 	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7899 	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7900 	 */
7901 	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7902 	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7903 	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7904 	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7905 	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7906 	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7907 	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7908 	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7909 	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7910 
7911 	/* we can only set, not clear restrictions */
7912 	if ((nocell_new - nocell_old) == 0 &&
7913 	    (noexpensive_new - noexpensive_old) == 0 &&
7914 	    (noconstrained_new - noconstrained_old) == 0) {
7915 		return 0;
7916 	}
7917 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7918 		if (nocell_new - nocell_old != 0) {
7919 			/*
7920 			 * if deny cellular is now set, do what's needed
7921 			 * for INPCB
7922 			 */
7923 			inp_set_nocellular(sotoinpcb(so));
7924 		}
7925 		if (noexpensive_new - noexpensive_old != 0) {
7926 			inp_set_noexpensive(sotoinpcb(so));
7927 		}
7928 		if (noconstrained_new - noconstrained_old != 0) {
7929 			inp_set_noconstrained(sotoinpcb(so));
7930 		}
7931 	}
7932 
7933 	if (SOCK_DOM(so) == PF_MULTIPATH) {
7934 		mptcp_set_restrictions(so);
7935 	}
7936 
7937 	return 0;
7938 }
7939 
7940 uint32_t
7941 so_get_restrictions(struct socket *so)
7942 {
7943 	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7944 	       SO_RESTRICT_DENY_OUT |
7945 	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7946 }
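
/*
 * Behavior sketch (illustrative): restrictions are one-way, so a later
 * call cannot clear a bit set by an earlier one.
 *
 *	so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);
 *	so_set_restrictions(so, 0);	// no-op; the trapdoor stays shut
 *	// (so_get_restrictions(so) & SO_RESTRICT_DENY_CELLULAR) != 0
 */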
7947 
7948 int
7949 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7950 {
7951 	struct proc *ep = PROC_NULL;
7952 	int error = 0;
7953 
7954 	/* pid 0 is reserved for kernel */
7955 	if (epid == 0) {
7956 		error = EINVAL;
7957 		goto done;
7958 	}
7959 
7960 	/*
7961 	 * If this is an in-kernel socket, prevent its delegate
7962 	 * association from changing unless the socket option is
7963 	 * coming from within the kernel itself.
7964 	 */
7965 	if (so->last_pid == 0 && p != kernproc) {
7966 		error = EACCES;
7967 		goto done;
7968 	}
7969 
7970 	/*
7971 	 * If this is issued by a process that's recorded as the
7972 	 * real owner of the socket, or if the pid is the same as
7973 	 * the process's own pid, then proceed.  Otherwise ensure
7974 	 * that the issuing process has the necessary privileges.
7975 	 */
7976 	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7977 		if ((error = priv_check_cred(kauth_cred_get(),
7978 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7979 			error = EACCES;
7980 			goto done;
7981 		}
7982 	}
7983 
7984 	/* Find the process that corresponds to the effective pid */
7985 	if ((ep = proc_find(epid)) == PROC_NULL) {
7986 		error = ESRCH;
7987 		goto done;
7988 	}
7989 
7990 	/*
7991 	 * If a process tries to delegate the socket to itself, then
7992 	 * there's really nothing to do; treat it as a way for the
7993 	 * delegate association to be cleared.  Note that we check
7994 	 * the passed-in proc rather than calling proc_selfpid(),
7995 	 * as we need to check the process issuing the socket option
7996 	 * which could be kernproc.  Given that we don't allow 0 for
7997 	 * effective pid, it means that a delegated in-kernel socket
7998 	 * stays delegated during its lifetime (which is probably OK.)
7999 	 */
8000 	if (epid == proc_pid(p)) {
8001 		so->so_flags &= ~SOF_DELEGATED;
8002 		so->e_upid = 0;
8003 		so->e_pid = 0;
8004 		uuid_clear(so->e_uuid);
8005 	} else {
8006 		so->so_flags |= SOF_DELEGATED;
8007 		so->e_upid = proc_uniqueid(ep);
8008 		so->e_pid = proc_pid(ep);
8009 		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
8010 
8011 #if defined(XNU_TARGET_OS_OSX)
8012 		if (ep->p_responsible_pid != so->e_pid) {
8013 			proc_t rp = proc_find(ep->p_responsible_pid);
8014 			if (rp != PROC_NULL) {
8015 				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
8016 				so->so_rpid = ep->p_responsible_pid;
8017 				proc_rele(rp);
8018 			} else {
8019 				uuid_clear(so->so_ruuid);
8020 				so->so_rpid = -1;
8021 			}
8022 		}
8023 #endif
8024 	}
8025 	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8026 		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
8027 	}
8028 done:
8029 	if (error == 0 && net_io_policy_log) {
8030 		uuid_string_t buf;
8031 
8032 		uuid_unparse(so->e_uuid, buf);
8033 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8034 		    "euuid %s%s\n", __func__, proc_name_address(p),
8035 		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8036 		    SOCK_DOM(so), SOCK_TYPE(so),
8037 		    so->e_pid, proc_name_address(ep), buf,
8038 		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8039 	} else if (error != 0 && net_io_policy_log) {
8040 		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8041 		    "ERROR (%d)\n", __func__, proc_name_address(p),
8042 		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8043 		    SOCK_DOM(so), SOCK_TYPE(so),
8044 		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
8045 		    proc_name_address(ep), error);
8046 	}
8047 
8048 	/* Update this socket's policy upon success */
8049 	if (error == 0) {
8050 		so->so_policy_gencnt *= -1;
8051 		so_update_policy(so);
8052 #if NECP
8053 		so_update_necp_policy(so, NULL, NULL);
8054 #endif /* NECP */
8055 	}
8056 
8057 	if (ep != PROC_NULL) {
8058 		proc_rele(ep);
8059 	}
8060 
8061 	return error;
8062 }
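
/*
 * Usage sketch (assuming the SO_DELEGATED socket-option path, which is how
 * user space normally reaches this function with check_cred set):
 *
 *	int epid = delegate_pid;	// hypothetical delegate pid
 *	setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 */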
8063 
8064 int
8065 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
8066 {
8067 	uuid_string_t buf;
8068 	uuid_t uuid;
8069 	int error = 0;
8070 
8071 	/* UUID must not be all-zeroes (reserved for kernel) */
8072 	if (uuid_is_null(euuid)) {
8073 		error = EINVAL;
8074 		goto done;
8075 	}
8076 
8077 	/*
8078 	 * If this is an in-kernel socket, prevent its delegate
8079 	 * association from changing unless the socket option is
8080 	 * coming from within the kernel itself.
8081 	 */
8082 	if (so->last_pid == 0 && p != kernproc) {
8083 		error = EACCES;
8084 		goto done;
8085 	}
8086 
8087 	/* Get the UUID of the issuing process */
8088 	proc_getexecutableuuid(p, uuid, sizeof(uuid));
8089 
8090 	/*
8091 	 * If this is issued by a process that's recorded as the
8092 	 * real owner of the socket, or if the uuid is the same as
8093 	 * the process's own uuid, then proceed.  Otherwise ensure
8094 	 * that the issuing process has the necessary privileges.
8095 	 */
8096 	if (check_cred &&
8097 	    (uuid_compare(euuid, so->last_uuid) != 0 ||
8098 	    uuid_compare(euuid, uuid) != 0)) {
8099 		if ((error = priv_check_cred(kauth_cred_get(),
8100 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
8101 			error = EACCES;
8102 			goto done;
8103 		}
8104 	}
8105 
8106 	/*
8107 	 * If a process tries to delegate the socket to itself, then
8108 	 * there's really nothing to do; treat it as a way for the
8109 	 * delegate association to be cleared.  Note that we check
8110 	 * the uuid of the passed-in proc rather than that of the
8111 	 * current process, as we need to check the process issuing
8112 	 * the socket option which could be kernproc itself.  Given
8113 	 * that we don't allow 0 for effective uuid, it means that
8114 	 * a delegated in-kernel socket stays delegated during its
8115 	 * lifetime (which is okay.)
8116 	 */
8117 	if (uuid_compare(euuid, uuid) == 0) {
8118 		so->so_flags &= ~SOF_DELEGATED;
8119 		so->e_upid = 0;
8120 		so->e_pid = 0;
8121 		uuid_clear(so->e_uuid);
8122 	} else {
8123 		so->so_flags |= SOF_DELEGATED;
8124 		/*
8125 		 * Unlike so_set_effective_pid(), we only have the UUID
8126 		 * here and the process ID is not known.  Inherit the
8127 		 * real {pid,upid} of the socket.
8128 		 */
8129 		so->e_upid = so->last_upid;
8130 		so->e_pid = so->last_pid;
8131 		uuid_copy(so->e_uuid, euuid);
8132 	}
8133 	/*
8134 	 * The following will clear the effective process name as it's the same
8135 	 * as the real process
8136 	 */
8137 	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8138 		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
8139 	}
8140 done:
8141 	if (error == 0 && net_io_policy_log) {
8142 		uuid_unparse(so->e_uuid, buf);
8143 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
8144 		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
8145 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8146 		    SOCK_TYPE(so), so->e_pid, buf,
8147 		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8148 	} else if (error != 0 && net_io_policy_log) {
8149 		uuid_unparse(euuid, buf);
8150 		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
8151 		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8152 		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8153 		    SOCK_TYPE(so), buf, error);
8154 	}
8155 
8156 	/* Update this socket's policy upon success */
8157 	if (error == 0) {
8158 		so->so_policy_gencnt *= -1;
8159 		so_update_policy(so);
8160 #if NECP
8161 		so_update_necp_policy(so, NULL, NULL);
8162 #endif /* NECP */
8163 	}
8164 
8165 	return error;
8166 }
8167 
8168 void
8169 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8170     uint32_t ev_datalen)
8171 {
8172 	struct kev_msg ev_msg;
8173 
8174 	/*
8175 	 * A netpolicy event always starts with a netpolicy_event_data
8176 	 * structure, but the caller can provide for a longer event
8177 	 * structure to post, depending on the event code.
8178 	 */
8179 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8180 
8181 	bzero(&ev_msg, sizeof(ev_msg));
8182 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8183 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8184 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8185 	ev_msg.event_code       = ev_code;
8186 
8187 	ev_msg.dv[0].data_ptr   = ev_data;
8188 	ev_msg.dv[0].data_length = ev_datalen;
8189 
8190 	kev_post_msg(&ev_msg);
8191 }
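
/*
 * Caller sketch (hypothetical, for illustration): posting a netpolicy
 * event that carries only the common header.
 *
 *	struct netpolicy_event_data ev_data;
 *	bzero(&ev_data, sizeof(ev_data));
 *	// ... fill in ev_data for the event being reported ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_data, sizeof(ev_data));
 */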
8192 
8193 void
8194 socket_post_kev_msg(uint32_t ev_code,
8195     struct kev_socket_event_data *ev_data,
8196     uint32_t ev_datalen)
8197 {
8198 	struct kev_msg ev_msg;
8199 
8200 	bzero(&ev_msg, sizeof(ev_msg));
8201 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8202 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8203 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8204 	ev_msg.event_code = ev_code;
8205 
8206 	ev_msg.dv[0].data_ptr = ev_data;
8207 	ev_msg.dv[0].data_length = ev_datalen;
8208 
8209 	kev_post_msg(&ev_msg);
8210 }
8211 
8212 void
8213 socket_post_kev_msg_closed(struct socket *so)
8214 {
8215 	struct kev_socket_closed ev = {};
8216 	struct sockaddr *socksa = NULL, *peersa = NULL;
8217 	int err;
8218 
8219 	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8220 		return;
8221 	}
8222 	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8223 	if (err == 0) {
8224 		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8225 		    &peersa);
8226 		if (err == 0) {
8227 			memcpy(&ev.ev_data.kev_sockname, socksa,
8228 			    min(socksa->sa_len,
8229 			    sizeof(ev.ev_data.kev_sockname)));
8230 			memcpy(&ev.ev_data.kev_peername, peersa,
8231 			    min(peersa->sa_len,
8232 			    sizeof(ev.ev_data.kev_peername)));
8233 			socket_post_kev_msg(KEV_SOCKET_CLOSED,
8234 			    &ev.ev_data, sizeof(ev));
8235 		}
8236 	}
8237 	free_sockaddr(socksa);
8238 	free_sockaddr(peersa);
8239 }
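
/*
 * Note (assumption): a socket opts into this event, e.g. via the
 * SO_WANT_KEV_SOCKET_CLOSED socket option, which sets
 * SOF1_WANT_KEV_SOCK_CLOSED; sockets that never asked are skipped above.
 *
 *	int on = 1;
 *	setsockopt(s, SOL_SOCKET, SO_WANT_KEV_SOCKET_CLOSED, &on, sizeof(on));
 */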
8240