xref: /xnu-12377.41.6/bsd/kern/uipc_socket.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 1998-2022, 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <net/sockaddr_utils.h>
106 #include <netinet/in.h>
107 #include <netinet/in_pcb.h>
108 #include <netinet/in_tclass.h>
109 #include <netinet/in_var.h>
110 #include <netinet/tcp_var.h>
111 #include <netinet/ip6.h>
112 #include <netinet6/ip6_var.h>
113 #include <netinet/flow_divert.h>
114 #include <kern/assert.h>
115 #include <kern/locks.h>
116 #include <kern/mem_acct.h>
117 #include <kern/policy_internal.h>
118 #include <kern/uipc_domain.h>
119 #include <kern/uipc_socket.h>
120 #include <kern/task.h>
121 #include <kern/zalloc.h>
122 #include <machine/limits.h>
123 #include <libkern/OSAtomic.h>
124 #include <pexpert/pexpert.h>
125 
126 #include <sys/kpi_mbuf.h>
127 #include <sys/mcache.h>
128 #include <sys/unpcb.h>
129 #include <libkern/section_keywords.h>
130 
131 #include <os/log.h>
132 
133 #if CONFIG_MACF
134 #include <security/mac_framework.h>
135 #endif /* MAC */
136 
137 #if MULTIPATH
138 #include <netinet/mp_pcb.h>
139 #include <netinet/mptcp_var.h>
140 #endif /* MULTIPATH */
141 
/* Round 'a' up to the next multiple of 'b'; 'b' must be a power of two */
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

/*
 * On DEBUG/DEVELOPMENT kernels expose raw kernel pointers in logs;
 * on RELEASE kernels permute them to avoid leaking addresses.
 */
#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

/* Guards against socketinit() running more than once */
static int              socketinit_done;
/* Memory accounting group for socket allocations; set up in socketinit() */
struct mem_acct *socket_memacct;

/* NOTE(review): duplicate include — <machine/limits.h> is already included above */
#include <machine/limits.h>

/* kevent filter callbacks for EVFILT_READ on sockets */
static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

/* kevent filter callbacks for EVFILT_WRITE on sockets */
static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

/* kevent filter callbacks for EVFILT_SOCK (socket state events) */
static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);
static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Helpers to copy struct timeval socket options across the user/kernel boundary */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
178 
/* EVFILT_READ on sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

/* EVFILT_WRITE on sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

/* EVFILT_SOCK: socket state-change events */
SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/*
 * EVFILT_EXCEPT on sockets; shares the read-filter callbacks, which
 * distinguish the two cases via the knote.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
214 
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

/* Socket-layer debug logging; also settable via the "socket_debug" boot-arg */
int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

/* Default for sosend_assert_panic: on for DEBUG/DEVELOPMENT kernels only */
#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

/*
 * Panic (rather than just log) on sosend assertion failures; also settable
 * via the "sosend_assert_panic" boot-arg.
 * NOTE(review): the variable is initialized to 0 while the sysctl's default
 * argument is DEFAULT_SOSEND_ASSERT_PANIC — confirm which one is intended
 * to win on DEBUG/DEVELOPMENT kernels.
 */
int sosend_assert_panic = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");

/* Count of sodefunct() invocations, exported read-mostly for diagnostics */
static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

/* Zone backing all struct socket allocations; memory is zeroed on free */
ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t        so_gencnt;      /* generation count for sockets */

/* kdebug trace codes for the socket layer */
#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

/* Upper bound on listen() backlogs; see solisten() */
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy  = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262 
263 /*
264  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
265  * writes on the socket for all protocols on any network interfaces.
266  * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
268  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
269  * on the outgoing interface
270  * Set this to 1  for testing/debugging purposes only.
271  */
/* See the block comment above: testing/debugging override for SOF1_IF_2KCL */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

/* Verbose logging of socket defunct events (SODEFUNCTLOG) */
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

/* Verbose logging of socket throttling events */
int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
#endif /* DEBUG || DEVELOPMENT */

/* Counter is always maintained; only exported on DEBUG/DEVELOPMENT kernels */
int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600     /* seconds */
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072  /* bytes */

/* Statistics/tunables for extended background idle; zeroed in socketinit() */
struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/* Cap on sendmsg_x()/recvmsg_x() message-list length */
#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 * NOTE(review): sotcdb is initialized to 0 here, which does not appear to
 * include SOTCDB_NO_DSCP — confirm whether the comment or the initializer
 * is stale.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
351 
/*
 * One-time initialization of the socket layer: sanity-check structure
 * layouts, parse boot-args, seed extended-background-idle defaults,
 * initialize the PCB layer, and register socket memory accounting.
 * Safe to call more than once; subsequent calls log and return.
 */
void
socketinit(void)
{
	/* so_gencnt is updated with 64-bit atomics; verify size and alignment */
	static_assert(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	/*
	 * The kernel's struct sa_endpoints must be layout-identical to the
	 * user-visible variant for this address-space model, so copies across
	 * the user/kernel boundary need no field-by-field translation.
	 */
#ifdef __LP64__
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	/* Guard against duplicate initialization */
	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow boot-args to override the debugging defaults */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	/* Reset extended-background-idle stats and install the defaults */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();

	/* Memory accounting group used for all socket allocations; mandatory */
	socket_memacct = mem_acct_register("SOCKET", 0, 0);
	if (socket_memacct == NULL) {
		panic("mem_acct_register returned NULL");
	}
}
398 
399 void
so_update_last_owner_locked(struct socket * so,proc_t self)400 so_update_last_owner_locked(struct socket *so, proc_t self)
401 {
402 	if (so->last_pid != 0) {
403 		/*
404 		 * last_pid and last_upid should remain zero for sockets
405 		 * created using sock_socket. The check above achieves that
406 		 */
407 		if (self == PROC_NULL) {
408 			self = current_proc();
409 		}
410 
411 		if (so->last_upid != proc_uniqueid(self) ||
412 		    so->last_pid != proc_pid(self)) {
413 			so->last_upid = proc_uniqueid(self);
414 			so->last_pid = proc_pid(self);
415 			proc_getexecutableuuid(self, so->last_uuid,
416 			    sizeof(so->last_uuid));
417 			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
418 				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
419 			}
420 		}
421 		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
422 	}
423 }
424 
425 void
so_update_policy(struct socket * so)426 so_update_policy(struct socket *so)
427 {
428 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
429 		(void) inp_update_policy(sotoinpcb(so));
430 	}
431 }
432 
#if NECP
/*
 * Re-evaluate the NECP policy for this socket, optionally overriding the
 * local and/or remote address used for matching.  Only meaningful for
 * IPv4/IPv6 sockets; everything else is a no-op.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
444 
445 /*
446  * Get a socket structure from our zone, and initialize it.
447  *
448  * Note that it would probably be better to allocate socket
449  * and PCB at the same time, but I'm not convinced that all
450  * the protocols can be easily modified to do this.
451  */
452 struct socket *
soalloc(void)453 soalloc(void)
454 {
455 	struct socket *__single so;
456 
457 	so = zalloc_flags(socket_zone, Z_WAITOK_ZERO);
458 	if (so != NULL) {
459 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
460 
461 		/*
462 		 * Increment the socket allocation statistics
463 		 */
464 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
465 	}
466 
467 	return so;
468 }
469 
/*
 * Create a socket in domain 'dom' of the given type/protocol on behalf of
 * process 'p', with 'ep' as the optional delegate (effective) process, and
 * attach it to its protocol.  On success, *aso holds the new socket with a
 * creation reference.  Returns 0 or an errno (see socreate() for the list).
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;        /* responsible pid, if distinct from owner */

	VERIFY(aso != NULL);
	*aso = NULL;

	/* An explicit protocol takes precedence over a type-only lookup */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/* Distinguish unknown family / wrong type / unknown protocol */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	/* Refuse creation when the protocol's memory accounting is exhausted */
	if (proto_memacct_hardlimit(prp)) {
		return ENOBUFS;
	}
	so = soalloc();
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain socket creation statistics */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflow sockets start out non-blocking */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	/* Record the creating process as the last owner */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	/* Record the delegate (effective) process, if one was supplied */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	/* Otherwise fall back to the owner's responsible process, if distinct */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	proto_memacct_add(so->so_proto, sizeof(struct socket));

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
680 
681 /*
682  * Returns:	0			Success
683  *		EAFNOSUPPORT
684  *		EPROTOTYPE
685  *		EPROTONOSUPPORT
686  *		ENOBUFS
687  *	<pru_attach>:ENOBUFS[AF_UNIX]
688  *	<pru_attach>:ENOBUFS[TCP]
689  *	<pru_attach>:ENOMEM[TCP]
690  *	<pru_attach>:???		[other protocol families, IPSEC]
691  */
692 int
socreate(int dom,struct socket ** aso,int type,int proto)693 socreate(int dom, struct socket **aso, int type, int proto)
694 {
695 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
696 	           PROC_NULL);
697 }
698 
699 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)700 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
701 {
702 	int error = 0;
703 	struct proc *ep = PROC_NULL;
704 
705 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
706 		error = ESRCH;
707 		goto done;
708 	}
709 
710 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
711 
712 	/*
713 	 * It might not be wise to hold the proc reference when calling
714 	 * socreate_internal since it calls soalloc with M_WAITOK
715 	 */
716 done:
717 	if (ep != PROC_NULL) {
718 		proc_rele(ep);
719 	}
720 
721 	return error;
722 }
723 
724 /*
725  * Returns:	0			Success
726  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
727  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
728  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
729  *	<pru_bind>:EINVAL		Invalid argument
730  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
731  *	<pru_bind>:EACCES		Permission denied
732  *	<pru_bind>:EADDRINUSE		Address in use
733  *	<pru_bind>:EAGAIN		Resource unavailable, try again
734  *	<pru_bind>:EPERM		Operation not permitted
735  *	<pru_bind>:???
736  *	<sf_bind>:???
737  *
738  * Notes:	It's not possible to fully enumerate the return codes above,
739  *		since socket filter authors and protocol family authors may
740  *		not choose to limit their error returns to those listed, even
741  *		though this may result in some software operating incorrectly.
742  *
743  *		The error codes which are enumerated above are those known to
744  *		be returned by the tcp_usr_bind function supplied.
745  */
746 int
sobindlock(struct socket * so,struct sockaddr * nam,int dolock)747 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
748 {
749 	struct proc *p = current_proc();
750 	int error = 0;
751 
752 	if (dolock) {
753 		socket_lock(so, 1);
754 	}
755 
756 	so_update_last_owner_locked(so, p);
757 	so_update_policy(so);
758 
759 #if NECP
760 	so_update_necp_policy(so, nam, NULL);
761 #endif /* NECP */
762 
763 	/*
764 	 * If this is a bind request on a socket that has been marked
765 	 * as inactive, reject it now before we go any further.
766 	 */
767 	if (so->so_flags & SOF_DEFUNCT) {
768 		error = EINVAL;
769 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
770 		    __func__, proc_pid(p), proc_best_name(p),
771 		    so->so_gencnt,
772 		    SOCK_DOM(so), SOCK_TYPE(so), error);
773 		goto out;
774 	}
775 
776 	/* Socket filter */
777 	error = sflt_bind(so, nam);
778 
779 	if (error == 0) {
780 		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
781 	}
782 out:
783 	if (dolock) {
784 		socket_unlock(so, 1);
785 	}
786 
787 	if (error == EJUSTRETURN) {
788 		error = 0;
789 	}
790 
791 	return error;
792 }
793 
/*
 * Final teardown of a socket: undo the memory accounting done at creation,
 * drop the credential reference, detach any socket filters, bump the
 * generation count (so stale references are detectable), and return the
 * memory to the zone.
 */
void
sodealloc(struct socket *so)
{
	/* Balance the proto_memacct_add() from socreate_internal() */
	proto_memacct_sub(so->so_proto, sizeof(struct socket));

	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	zfree(socket_zone, so);
}
808 
809 /*
810  * Returns:	0			Success
811  *		EINVAL
812  *		EOPNOTSUPP
813  *	<pru_listen>:EINVAL[AF_UNIX]
814  *	<pru_listen>:EINVAL[TCP]
815  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
816  *	<pru_listen>:EINVAL[TCP]	Invalid argument
817  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
818  *	<pru_listen>:EACCES[TCP]	Permission denied
819  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
820  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
821  *	<pru_listen>:EPERM[TCP]		Operation not permitted
822  *	<sf_listen>:???
823  *
824  * Notes:	Other <pru_listen> returns depend on the protocol family; all
825  *		<sf_listen> returns depend on what the filter author causes
826  *		their filter to return.
827  */
/*
 * Put the socket into the listening state with a queue limit derived from
 * 'backlog' (clamped to [1, somaxconn]; see POSIX note below).  Every error
 * path clears the tentatively-set SO_ACCEPTCONN flag.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * Tentatively mark the socket as accepting connections; each error
	 * path below clears SO_ACCEPTCONN again.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* Listening only makes sense for connection-oriented protocols */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Inbound connections are not allowed on in-restricted sockets */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Socket filters first, then the protocol's listen handler */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled, not an error" */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
920 
921 /*
922  * The "accept list lock" protects the fields related to the listener queues
923  * because we can unlock a socket to respect the lock ordering between
924  * the listener socket and its clients sockets. The lock ordering is first to
925  * acquire the client socket before the listener socket.
926  *
927  * The accept list lock serializes access to the following fields:
928  * - of the listener socket:
929  *   - so_comp
930  *   - so_incomp
931  *   - so_qlen
932  *   - so_inqlen
933  * - of client sockets that are in so_comp or so_incomp:
934  *   - so_head
935  *   - so_list
936  *
 * As one can see the accept list lock protects the consistency of the
 * linkage of the client sockets.
939  *
940  * Note that those fields may be read without holding the accept list lock
941  * for a preflight provided the accept list lock is taken when committing
942  * to take an action based on the result of the preflight. The preflight
943  * saves the cost of doing the unlock/lock dance.
944  */
/*
 * Acquire exclusive use of the listener's accept lists (see the block
 * comment above for the fields this serializes).  Caller holds the lock
 * of 'head'; 'so' (optional) is a client socket whose lock may have to
 * be dropped while we sleep.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/*
	 * Protocols that share the domain lock have nothing to serialize
	 * here; only per-socket-lock protocols use SOF1_ACCEPT_LIST_HELD.
	 */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody else holds the accept list */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/* Drop the client lock before sleeping on the listener */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/*
	 * Re-take the client lock respecting the documented ordering
	 * (client socket before listener): momentarily drop 'head',
	 * lock 'so', then re-lock 'head'.
	 */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
975 
976 void
so_release_accept_list(struct socket * head)977 so_release_accept_list(struct socket *head)
978 {
979 	if (head->so_proto->pr_getlock != NULL) {
980 		lck_mtx_t *mutex_held;
981 
982 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
983 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
984 
985 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
986 		wakeup((caddr_t)&head->so_incomp);
987 	}
988 }
989 
/*
 * Called when the last reference to a socket is being released: detach
 * auxiliary subsystems, unlink the socket from its listener's queues if
 * it is still queued, flush the buffers and optionally deallocate.
 * Assumes the socket is locked by the caller.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not yet ready to be freed (pcb still attached or fd reference
	 * still present): just quiesce select/upcall state and return.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1078 
/*
 * Wait, with the socket lock held, for outstanding upcalls to drain
 * before close proceeds.  Only waits when SOF_UPCALLCLOSEWAIT is set.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Disarm further upcalls and mark ourselves as waiting for close */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/* Sleep until woken on &so->so_upcallusecount */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1107 
1108 /*
1109  * Close a socket on last file table reference removal.
1110  * Initiate disconnect if connected.
1111  * Free socket when disconnect complete.
1112  */
1113 int
soclose_locked(struct socket * so)1114 soclose_locked(struct socket *so)
1115 {
1116 	int error = 0;
1117 	struct timespec ts;
1118 
1119 	if (so->so_usecount == 0) {
1120 		panic("soclose: so=%p refcount=0", so);
1121 		/* NOTREACHED */
1122 	}
1123 
1124 	sflt_notify(so, sock_evt_closing, NULL);
1125 
1126 	if (so->so_upcallusecount) {
1127 		soclose_wait_locked(so);
1128 	}
1129 
1130 #if CONTENT_FILTER
1131 	/*
1132 	 * We have to wait until the content filters are done
1133 	 */
1134 	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1135 		cfil_sock_close_wait(so);
1136 		cfil_sock_is_closed(so);
1137 		cfil_sock_detach(so);
1138 	}
1139 #endif /* CONTENT_FILTER */
1140 
1141 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
1142 		soflow_detach(so);
1143 	}
1144 
1145 	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1146 		soresume(current_proc(), so, 1);
1147 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1148 	}
1149 
1150 	if ((so->so_options & SO_ACCEPTCONN)) {
1151 		struct socket *sp, *sonext;
1152 		int persocklock = 0;
1153 		int incomp_overflow_only;
1154 
1155 		/*
1156 		 * We do not want new connection to be added
1157 		 * to the connection queues
1158 		 */
1159 		so->so_options &= ~SO_ACCEPTCONN;
1160 
1161 		/*
1162 		 * We can drop the lock on the listener once
1163 		 * we've acquired the incoming list
1164 		 */
1165 		if (so->so_proto->pr_getlock != NULL) {
1166 			persocklock = 1;
1167 			so_acquire_accept_list(so, NULL);
1168 			socket_unlock(so, 0);
1169 		}
1170 again:
1171 		incomp_overflow_only = 1;
1172 
1173 		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1174 			/*
1175 			 * Radar 5350314
1176 			 * skip sockets thrown away by tcpdropdropblreq
1177 			 * they will get cleanup by the garbage collection.
1178 			 * otherwise, remove the incomp socket from the queue
1179 			 * and let soabort trigger the appropriate cleanup.
1180 			 */
1181 			if (sp->so_flags & SOF_OVERFLOW) {
1182 				continue;
1183 			}
1184 
1185 			if (persocklock != 0) {
1186 				socket_lock(sp, 1);
1187 			}
1188 
1189 			/*
1190 			 * Radar 27945981
1191 			 * The extra reference for the list insure the
1192 			 * validity of the socket pointer when we perform the
1193 			 * unlock of the head above
1194 			 */
1195 			if (sp->so_state & SS_INCOMP) {
1196 				sp->so_state &= ~SS_INCOMP;
1197 				sp->so_head = NULL;
1198 				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1199 				so->so_incqlen--;
1200 				so->so_qlen--;
1201 
1202 				(void) soabort(sp);
1203 			} else {
1204 				panic("%s sp %p in so_incomp but !SS_INCOMP",
1205 				    __func__, sp);
1206 			}
1207 
1208 			if (persocklock != 0) {
1209 				socket_unlock(sp, 1);
1210 			}
1211 		}
1212 
1213 		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1214 			/* Dequeue from so_comp since sofree() won't do it */
1215 			if (persocklock != 0) {
1216 				socket_lock(sp, 1);
1217 			}
1218 
1219 			if (sp->so_state & SS_COMP) {
1220 				sp->so_state &= ~SS_COMP;
1221 				sp->so_head = NULL;
1222 				TAILQ_REMOVE(&so->so_comp, sp, so_list);
1223 				so->so_qlen--;
1224 
1225 				(void) soabort(sp);
1226 			} else {
1227 				panic("%s sp %p in so_comp but !SS_COMP",
1228 				    __func__, sp);
1229 			}
1230 
1231 			if (persocklock) {
1232 				socket_unlock(sp, 1);
1233 			}
1234 		}
1235 
1236 		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1237 #if (DEBUG | DEVELOPMENT)
1238 			panic("%s head %p so_comp not empty", __func__, so);
1239 #endif /* (DEVELOPMENT || DEBUG) */
1240 
1241 			goto again;
1242 		}
1243 
1244 		if (!TAILQ_EMPTY(&so->so_comp)) {
1245 #if (DEBUG | DEVELOPMENT)
1246 			panic("%s head %p so_comp not empty", __func__, so);
1247 #endif /* (DEVELOPMENT || DEBUG) */
1248 
1249 			goto again;
1250 		}
1251 
1252 		if (persocklock) {
1253 			socket_lock(so, 0);
1254 			so_release_accept_list(so);
1255 		}
1256 	}
1257 	if (so->so_pcb == NULL) {
1258 		/* 3915887: mark the socket as ready for dealloc */
1259 		so->so_flags |= SOF_PCBCLEARING;
1260 		goto discard;
1261 	}
1262 
1263 	if (so->so_state & SS_ISCONNECTED) {
1264 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1265 			error = sodisconnectlocked(so);
1266 			if (error) {
1267 				goto drop;
1268 			}
1269 		}
1270 		if (so->so_options & SO_LINGER) {
1271 			if ((so->so_state & SS_ISDISCONNECTING) &&
1272 			    (so->so_state & SS_NBIO)) {
1273 				goto drop;
1274 			}
1275 			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1276 				lck_mtx_t *mutex_held;
1277 
1278 				if (so->so_proto->pr_getlock != NULL) {
1279 					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1280 				} else {
1281 					mutex_held = so->so_proto->pr_domain->dom_mtx;
1282 				}
1283 				ts.tv_sec = (so->so_linger / 100);
1284 				ts.tv_nsec = (so->so_linger % 100) *
1285 				    NSEC_PER_USEC * 1000 * 10;
1286 				error = msleep((caddr_t)&so->so_timeo,
1287 				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1288 				if (error) {
1289 					/*
1290 					 * It's OK when the time fires,
1291 					 * don't report an error
1292 					 */
1293 					if (error == EWOULDBLOCK) {
1294 						error = 0;
1295 					}
1296 					break;
1297 				}
1298 			}
1299 		}
1300 	}
1301 drop:
1302 	if (so->so_usecount == 0) {
1303 		panic("soclose: usecount is zero so=%p", so);
1304 		/* NOTREACHED */
1305 	}
1306 	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1307 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1308 		if (error == 0) {
1309 			error = error2;
1310 		}
1311 	}
1312 	if (so->so_usecount <= 0) {
1313 		panic("soclose: usecount is zero so=%p", so);
1314 		/* NOTREACHED */
1315 	}
1316 discard:
1317 	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1318 	    (so->so_state & SS_NOFDREF)) {
1319 		panic("soclose: NOFDREF");
1320 		/* NOTREACHED */
1321 	}
1322 	so->so_state |= SS_NOFDREF;
1323 
1324 	if ((so->so_flags & SOF_KNOTE) != 0) {
1325 		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1326 	}
1327 
1328 	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1329 
1330 	VERIFY(so->so_usecount > 0);
1331 	so->so_usecount--;
1332 	sofree(so);
1333 	return error;
1334 }
1335 
int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	/*
	 * Only run the full close path when the socket is not retained by
	 * the kernel (so_retaincnt == 0).
	 */
	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		so->so_usecount--;
		/*
		 * A kernel-retained socket must keep at least 2 use counts
		 * after dropping the fd's reference.
		 */
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}
1358 
1359 /*
1360  * Must be called at splnet...
1361  */
1362 /* Should already be locked */
1363 int
soabort(struct socket * so)1364 soabort(struct socket *so)
1365 {
1366 	int error;
1367 
1368 #ifdef MORE_LOCKING_DEBUG
1369 	lck_mtx_t *mutex_held;
1370 
1371 	if (so->so_proto->pr_getlock != NULL) {
1372 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1373 	} else {
1374 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1375 	}
1376 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1377 #endif
1378 
1379 	if ((so->so_flags & SOF_ABORTED) == 0) {
1380 		so->so_flags |= SOF_ABORTED;
1381 		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1382 		if (error) {
1383 			sofree(so);
1384 			return error;
1385 		}
1386 	}
1387 	return 0;
1388 }
1389 
/*
 * Accept a connection on 'so' (a socket taken off a listener's queue),
 * returning the peer address via 'nam'.  Locks the socket when 'dolock'
 * is non-zero.
 */
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/* A socket being accepted must still carry SS_NOFDREF */
	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1416 
/*
 * Convenience wrapper: accept with locking handled internally.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int result;

	result = soacceptlock(so, nam, 1);
	return result;
}
1422 
/*
 * Run the accept socket filters on a newly accepted socket 'so' from
 * listener 'head'.  On failure the socket is closed here and the error
 * (or ECONNABORTED) is propagated to the caller.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *__single local = NULL, *__single remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/*
		 * Clear SS_NOFDREF before soclose(): the close path panics
		 * ("NOFDREF") on a live pcb with that flag set.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1478 
1479 /*
1480  * Returns:	0			Success
1481  *		EOPNOTSUPP		Operation not supported on socket
1482  *		EISCONN			Socket is connected
1483  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1484  *	<pru_connect>:EINVAL		Invalid argument
1485  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1486  *	<pru_connect>:EACCES		Permission denied
1487  *	<pru_connect>:EADDRINUSE	Address in use
1488  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1489  *	<pru_connect>:EPERM		Operation not permitted
1490  *	<sf_connect_out>:???		[anything a filter writer might set]
1491  */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic is administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so,
				    __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
		if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
			// Mark extended timeout on tracker lookup to ensure that the entry stays around
			tracker_metadata_t update_metadata = { };
			update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
			(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata);
		}
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN from a filter means handled, not an error */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1600 
/*
 * Convenience wrapper: connect with locking handled internally.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int result;

	result = soconnectlock(so, nam, 1);
	return result;
}
1606 
1607 /*
1608  * Returns:	0			Success
1609  *	<pru_connect2>:EINVAL[AF_UNIX]
1610  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1611  *	<pru_connect2>:???		[other protocol families]
1612  *
1613  * Notes:	<pru_connect2> is not supported by [TCP].
1614  */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	/*
	 * Lock both endpoints; so2 is locked individually only when its
	 * protocol uses per-socket locks (pr_lock set).
	 */
	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1633 
/*
 * connectx(2) work horse.  The caller is expected to hold the socket
 * lock — this path never takes or drops it itself.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic is administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}

#if NECP
			if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
				// Mark extended timeout on tracker lookup to ensure that the entry stays around
				tracker_metadata_t update_metadata = { };
				update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
				(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata);
			}
#endif /* NECP */
		}
	}

	return error;
}
1770 
1771 int
sodisconnectlocked(struct socket * so)1772 sodisconnectlocked(struct socket *so)
1773 {
1774 	int error;
1775 
1776 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1777 		error = ENOTCONN;
1778 		goto bad;
1779 	}
1780 	if (so->so_state & SS_ISDISCONNECTING) {
1781 		error = EALREADY;
1782 		goto bad;
1783 	}
1784 
1785 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1786 	if (error == 0) {
1787 		sflt_notify(so, sock_evt_disconnected, NULL);
1788 	}
1789 
1790 bad:
1791 	return error;
1792 }
1793 
1794 /* Locking version */
/*
 * Take the socket lock around the unlocked disconnect path.
 */
int
sodisconnect(struct socket *so)
{
	int result;

	socket_lock(so, 1);
	result = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return result;
}
1805 
1806 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1807 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1808 {
1809 	int error;
1810 
1811 	/*
1812 	 * Call the protocol disconnectx handler; let it handle all
1813 	 * matters related to the connection state of this session.
1814 	 */
1815 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1816 	if (error == 0) {
1817 		/*
1818 		 * The event applies only for the session, not for
1819 		 * the disconnection of individual subflows.
1820 		 */
1821 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1822 			sflt_notify(so, sock_evt_disconnected, NULL);
1823 		}
1824 	}
1825 	return error;
1826 }
1827 
1828 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1829 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1830 {
1831 	int error;
1832 
1833 	socket_lock(so, 1);
1834 	error = sodisconnectxlocked(so, aid, cid);
1835 	socket_unlock(so, 1);
1836 	return error;
1837 }
1838 
1839 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1840 
1841 /*
1842  * sosendcheck will lock the socket buffer if it isn't locked and
1843  * verify that there is space for the data being inserted.
1844  *
1845  * Returns:	0			Success
1846  *		EPIPE
1847  *	sblock:EWOULDBLOCK
1848  *	sblock:EINTR
1849  *	sbwait:EBADF
1850  *	sbwait:EINTR
1851  *	[so_error]:???
1852  */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int assumelock = 0;
	int error = 0;
	int32_t space;
	int ret;

restart:
	/*
	 * Grab the send-buffer lock unless a socket filter running on this
	 * very thread already holds it; *sblocked tells the caller whether
	 * it owns the lock and must drop it later.
	 */
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				/* Report defunct sockets as EPIPE, not the sblock error */
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	/* Deliver (and clear) any asynchronous error pending on the socket */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Not connected on a connection-oriented protocol:
			 * allow only control-only sends and preconnect
			 * (TFO-style) data.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless protocol with no destination address */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	/*
	 * Bytes available in the send buffer; signed because it can go
	 * negative when over-committed (see comment in sosend()).
	 */
	space = sbspace(&so->so_snd);

	/* MSG_OOB sends are allowed to slightly overcommit the buffer */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		/* Non-blocking sockets (or recursive filter calls) never wait */
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		/*
		 * Release the send-buffer lock (but keep the socket lock),
		 * wait for space, then retry the full set of checks.
		 */
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		goto restart;
	}

	/*
	 * Protocol memory accounting: hard limit always fails; soft limit
	 * fails only when data is already queued in the send buffer.
	 */
	ret = proto_memacct_limited(so->so_proto);
	if (ret == MEMACCT_HARDLIMIT ||
	    (ret == MEMACCT_SOFTLIMIT && so->so_snd.sb_cc > 0)) {
		return ENOMEM;
	}
	return 0;
}
1983 
1984 /*
1985  * Send on a socket.
1986  * If send must go all at once and message is larger than
1987  * send buffering, then hard error.
1988  * Lock against other senders.
1989  * If must go all at once and not enough room now, then
1990  * inform user that this would block and do nothing.
1991  * Otherwise, if nonblocking, send as much as possible.
1992  * The data to be sent is described by "uio" if nonzero,
1993  * otherwise by the mbuf chain "top" (which must be null
1994  * if uio is not).  Data provided in mbuf chain must be small
1995  * enough to send all at once.
1996  *
1997  * Returns nonzero on error, timeout or signal; callers
1998  * must check for short counts if EINTR/ERESTART are returned.
1999  * Data and control buffers are freed on return.
2000  *
2001  * Returns:	0			Success
2002  *		EOPNOTSUPP
2003  *		EINVAL
2004  *		ENOBUFS
2005  *	uiomove:EFAULT
2006  *	sosendcheck:EPIPE
2007  *	sosendcheck:EWOULDBLOCK
2008  *	sosendcheck:EINTR
2009  *	sosendcheck:EBADF
2010  *	sosendcheck:EINTR
2011  *	sosendcheck:???			[value from so_error]
2012  *	<pru_send>:ECONNRESET[TCP]
2013  *	<pru_send>:EINVAL[TCP]
2014  *	<pru_send>:ENOBUFS[TCP]
2015  *	<pru_send>:EADDRINUSE[TCP]
2016  *	<pru_send>:EADDRNOTAVAIL[TCP]
2017  *	<pru_send>:EAFNOSUPPORT[TCP]
2018  *	<pru_send>:EACCES[TCP]
2019  *	<pru_send>:EAGAIN[TCP]
2020  *	<pru_send>:EPERM[TCP]
2021  *	<pru_send>:EMSGSIZE[TCP]
2022  *	<pru_send>:EHOSTUNREACH[TCP]
2023  *	<pru_send>:ENETUNREACH[TCP]
2024  *	<pru_send>:ENETDOWN[TCP]
2025  *	<pru_send>:ENOMEM[TCP]
2026  *	<pru_send>:ENOBUFS[TCP]
2027  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2028  *	<pru_send>:EINVAL[AF_UNIX]
2029  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2030  *	<pru_send>:EPIPE[AF_UNIX]
2031  *	<pru_send>:ENOTCONN[AF_UNIX]
2032  *	<pru_send>:EISCONN[AF_UNIX]
2033  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2034  *	<sf_data_out>:???		[whatever a filter author chooses]
2035  *
2036  * Notes:	Other <pru_send> returns depend on the protocol family; all
2037  *		<sf_data_out> returns depend on what the filter author causes
2038  *		their filter to return.
2039  */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	mbuf_ref_ref_t mp;
	mbuf_ref_t m, freelist = NULL;
	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/* Bytes still to send: from uio if present, else from the prebuilt chain */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}
	orig_resid = resid;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/*
	 * Register an outbound datagram flow entry; it is released at
	 * out_locked via soflow_free_flow().
	 */
	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, SOFLOW_DIRECTION_OUTBOUND, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful on stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/*
	 * Outer loop: wait (in sosendcheck) until the send buffer has room.
	 * Inner loop: build mbuf chains from uio and hand them to the
	 * protocol until resid is drained or the buffer fills again.
	 */
	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		/* Room left, less what the control mbuf will occupy */
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0 &&
				    bigcl;

				/*
				 * Drop the socket lock while allocating mbufs
				 * and copying in user data; retaken below.
				 */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Pop the next mbuf off the local freelist */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data into the mbuf (socket unlocked) */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);
			if (error == EJUSTRETURN) {
				error = 0;
			}

			/*
			 * Ownership of top/control has passed to the protocol
			 * or to a filter that swallowed the packet; clear the
			 * local references so out_locked doesn't free them.
			 */
packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);


out_locked:
	/*
	 * Sanity check: resid must never grow past its starting value;
	 * panic or log depending on the sosend_assert_panic sysctl.
	 */
	if (resid > orig_resid) {
		char pname[MAXCOMLEN] = {};
		pid_t current_pid = proc_pid(current_proc());
		proc_name(current_pid, pname, sizeof(pname));

		if (sosend_assert_panic != 0) {
			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
			    so, resid, orig_resid, pname, current_pid);
		} else {
			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
			    so->so_gencnt, resid, orig_resid, pname, current_pid);
		}
	}

	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	/* Free anything we still own: unsent chain, control, spare mbufs */
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2511 
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header.
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				// First packet-header mbuf marks the start of data
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		/*
		 * Detach the control run from the data portion.
		 * NOTE(review): if the chain consists solely of MT_CONTROL
		 * mbufs (no successor of a different type), control_end stays
		 * NULL and the chain is passed through unsplit — verify
		 * callers never construct such a chain.
		 */
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	/* Hand the packet to the protocol; EJUSTRETURN means it was consumed */
	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
2560 
2561 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp,struct mbuf ** last_control)2562 mbuf_detach_control_from_list(struct mbuf **mp, struct mbuf **last_control)
2563 {
2564 	struct mbuf *control = NULL;
2565 	struct mbuf *m = *mp;
2566 
2567 	if (m->m_type == MT_CONTROL) {
2568 		struct mbuf *control_end;
2569 		struct mbuf *n;
2570 
2571 		n = control_end = control = m;
2572 
2573 		/*
2574 		 * Break the chain per mbuf type
2575 		 */
2576 		while (n != NULL && n->m_type == MT_CONTROL) {
2577 			control_end = n;
2578 			n = n->m_next;
2579 		}
2580 		control_end->m_next = NULL;
2581 		*mp = n;
2582 		if (last_control != NULL) {
2583 			*last_control = control_end;
2584 		}
2585 	}
2586 	VERIFY(*mp != NULL);
2587 
2588 	return control;
2589 }
2590 
2591 /*
2592  * Supported only connected sockets (no address) without ancillary data
2593  * (control mbuf) for atomic protocols
2594  */
2595 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2596 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2597 {
2598 	mbuf_ref_t m, control = NULL;
2599 	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2600 	int error, dontroute;
2601 	int atomic = sosendallatonce(so);
2602 	int sblocked = 0;
2603 	struct proc *p = current_proc();
2604 	struct mbuf *top = pktlist;
2605 	bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2606 
2607 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2608 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2609 
2610 	if (so->so_type != SOCK_DGRAM) {
2611 		error = EINVAL;
2612 		os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2613 		    error);
2614 		goto out;
2615 	}
2616 	if (atomic == 0) {
2617 		error = EINVAL;
2618 		os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2619 		    error);
2620 		goto out;
2621 	}
2622 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2623 		error = ENOTCONN;
2624 		os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2625 		    error);
2626 		goto out;
2627 	}
2628 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2629 		error = EINVAL;
2630 		os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2631 		    flags, error);
2632 		goto out;
2633 	}
2634 
2635 	socket_lock(so, 1);
2636 	so_update_last_owner_locked(so, p);
2637 	so_update_policy(so);
2638 
2639 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2640 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, SOFLOW_DIRECTION_OUTBOUND, 0);
2641 	}
2642 
2643 #if NECP
2644 	so_update_necp_policy(so, NULL, NULL);
2645 #endif /* NECP */
2646 
2647 	dontroute = (flags & MSG_DONTROUTE) &&
2648 	    (so->so_options & SO_DONTROUTE) == 0 &&
2649 	    (so->so_proto->pr_flags & PR_ATOMIC);
2650 	if (dontroute) {
2651 		so->so_options |= SO_DONTROUTE;
2652 	}
2653 
2654 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2655 
2656 	error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2657 	if (error) {
2658 		os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2659 		    error);
2660 		goto release;
2661 	}
2662 
2663 	if (!skip_filt) {
2664 		mbuf_ref_ref_t prevnextp = NULL;
2665 
2666 		for (m = top; m != NULL; m = m->m_nextpkt) {
2667 			mbuf_ref_t nextpkt, last_control;
2668 
2669 			/*
2670 			 * Remove packet from the list of packets
2671 			 */
2672 			nextpkt = m->m_nextpkt;
2673 			if (prevnextp != NULL) {
2674 				*prevnextp = nextpkt;
2675 			} else {
2676 				top = nextpkt;
2677 			}
2678 			m->m_nextpkt = NULL;
2679 
2680 			/*
2681 			 * Break the chain per mbuf type
2682 			 */
2683 			if (m->m_type == MT_CONTROL) {
2684 				control = mbuf_detach_control_from_list(&m, &last_control);
2685 			}
2686 			/*
2687 			 * Socket filter processing
2688 			 */
2689 			error = sflt_data_out(so, NULL, &m,
2690 			    &control, 0);
2691 			if (error != 0 && error != EJUSTRETURN) {
2692 				os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2693 				    error);
2694 				m_freem(m);
2695 				goto release;
2696 			}
2697 
2698 #if CONTENT_FILTER
2699 			if (error == 0) {
2700 				/*
2701 				 * Content filter processing
2702 				 */
2703 				error = cfil_sock_data_out(so, NULL, m,
2704 				    control, 0, dgram_flow_entry);
2705 				if (error != 0 && error != EJUSTRETURN) {
2706 					os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2707 					    error);
2708 					m_freem(m);
2709 					goto release;
2710 				}
2711 			}
2712 #endif /* CONTENT_FILTER */
2713 			if (error == EJUSTRETURN) {
2714 				/*
2715 				 * When swallowed by a filter, the packet is not
2716 				 * in the list anymore
2717 				 */
2718 				error = 0;
2719 			} else {
2720 				/*
2721 				 * Rebuild the mbuf chain of the packet
2722 				 */
2723 				if (control != NULL) {
2724 					last_control->m_next = m;
2725 					m = control;
2726 				}
2727 				/*
2728 				 * Reinsert the packet in the list of packets
2729 				 */
2730 				m->m_nextpkt = nextpkt;
2731 				if (prevnextp != NULL) {
2732 					*prevnextp = m;
2733 				} else {
2734 					top = m;
2735 				}
2736 				prevnextp = &m->m_nextpkt;
2737 			}
2738 			control = NULL;
2739 		}
2740 	}
2741 
2742 	if (top != NULL) {
2743 		if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2744 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2745 			    (so, top, pktcnt, flags);
2746 			if (error != 0 && error != ENOBUFS) {
2747 				os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2748 				    error);
2749 			}
2750 			top = NULL;
2751 		} else {
2752 			*pktcnt = 0;
2753 			control = NULL;
2754 			for (m = top; m != NULL; m = top) {
2755 				top = m->m_nextpkt;
2756 				m->m_nextpkt = NULL;
2757 
2758 				/*
2759 				 * Break the chain per mbuf type
2760 				 */
2761 				if (m->m_type == MT_CONTROL) {
2762 					control = mbuf_detach_control_from_list(&m, NULL);
2763 				}
2764 
2765 				error = (*so->so_proto->pr_usrreqs->pru_send)
2766 				    (so, 0, m, NULL, control, current_proc());
2767 				if (error == EJUSTRETURN) {
2768 					error = 0;
2769 				}
2770 				if (error != 0) {
2771 					if (error != ENOBUFS) {
2772 						os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2773 						    error);
2774 					}
2775 					control = NULL;
2776 					goto release;
2777 				}
2778 				*pktcnt += 1;
2779 				control = NULL;
2780 			}
2781 		}
2782 	}
2783 
2784 release:
2785 	if (dontroute) {
2786 		so->so_options &= ~SO_DONTROUTE;
2787 	}
2788 	if (sblocked) {
2789 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2790 	} else {
2791 		socket_unlock(so, 1);
2792 	}
2793 out:
2794 	if (control != NULL) {
2795 		m_freem(control);
2796 	}
2797 	if (top != NULL) {
2798 		if (error != ENOBUFS) {
2799 			os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2800 			    error);
2801 		}
2802 		m_freem_list(top);
2803 	}
2804 
2805 	if (dgram_flow_entry != NULL) {
2806 		soflow_free_flow(dgram_flow_entry);
2807 	}
2808 
2809 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2810 	    so->so_snd.sb_cc, 0, error);
2811 
2812 	return error;
2813 }
2814 
2815 /*
2816  * May return ERESTART when packet is dropped by MAC policy check
2817  */
/*
 * Consume the leading MT_SONAME mbuf of the current record: duplicate it
 * into *psa (or hand the raw mbuf to *maddrp), then either skip it
 * (MSG_PEEK) or unlink and release it from the receive buffer.
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    struct mbuf **maddrp,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		/* Socket lock dropped only for the MAC policy upcall */
		socket_unlock(so, 0);

		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	if (psa != NULL) {
		/* Duplicate the sender address for the caller */
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	} else if (maddrp != NULL) {
		/* Caller wants the raw address mbuf instead of a copy */
		*maddrp = m;
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf queued, just step past it */
		m = m->m_next;
	} else {
		/* Consuming: unlink the address mbuf from the receive buffer */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		if (maddrp == NULL) {
			MFREE(m, so->so_rcv.sb_mb);
		} else {
			/* mbuf was handed to *maddrp above; don't free it */
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
		}
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
2944 
2945 /*
2946  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
2947  * so clear the data portion in order not to leak the file pointers
2948  */
2949 static void
sopeek_scm_rights(struct mbuf * rights)2950 sopeek_scm_rights(struct mbuf *rights)
2951 {
2952 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
2953 
2954 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
2955 		VERIFY(cm->cmsg_len <= rights->m_len);
2956 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
2957 	}
2958 }
2959 
2960 /*
2961  * Process one or more MT_CONTROL mbufs present before any data mbufs
2962  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2963  * just copy the data; if !MSG_PEEK, we call into the protocol to
2964  * perform externalization.
2965  */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	mbuf_ref_t cm = NULL, cmn;		/* private chain of unlinked MT_CONTROL mbufs */
	mbuf_ref_ref_t cme = &cm;		/* tail slot for appending to that chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	mbuf_ref_ref_t msgpcm = NULL;		/* head of control mbufs we allocated under MSG_PEEK */
	mbuf_ref_t m = *mp;			/* current mbuf in the first record */
	mbuf_ref_t nextrecord = *nextrecordp;	/* head of the following record */
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				/* Remember the first copy so we can free the whole run on failure */
				if (*controlp == NULL) {
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/*
				 * Peeked SCM_RIGHTS are not externalized;
				 * scrub the in-kernel file pointers from the copy.
				 */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/*
			 * Unlink the control mbuf from the head of the
			 * receive buffer and append it to the private
			 * cm chain for externalization below.
			 */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Re-link the remainder of the record (or the next record) at the head */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		/* Hand the message to the caller on success; otherwise drop it */
		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3100 
3101 /*
3102  * If we have less data than requested, block awaiting more
3103  * (subject to any timeout) if:
3104  *   1. the current count is less than the low water mark, or
3105  *   2. MSG_WAITALL is set, and it is possible to do the entire
3106  *	receive operation at once if we block (resid <= hiwat).
3107  *   3. MSG_DONTWAIT is not set
3108  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3109  * we have to do the receive in sections, and thus risk returning
3110  * a short count if a timeout or signal occurs after we start.
3111  */
3112 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3113 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3114 {
3115 	struct protosw *pr = so->so_proto;
3116 
3117 	/* No mbufs in the receive-queue? Wait! */
3118 	if (m == NULL) {
3119 		return true;
3120 	}
3121 
3122 	/* Not enough data in the receive socket-buffer - we may have to wait */
3123 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3124 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3125 		/*
3126 		 * Application did set the lowater-mark, so we should wait for
3127 		 * this data to be present.
3128 		 */
3129 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3130 			return true;
3131 		}
3132 
3133 		/*
3134 		 * Application wants all the data - so let's try to do the
3135 		 * receive-operation at once by waiting for everything to
3136 		 * be there.
3137 		 */
3138 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3139 			return true;
3140 		}
3141 	}
3142 
3143 	return false;
3144 }
3145 
3146 /*
3147  * Implement receive operations on a socket.
3148  * We depend on the way that records are added to the sockbuf
3149  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3150  * must begin with an address if the protocol so specifies,
3151  * followed by an optional mbuf or mbufs containing ancillary data,
3152  * and then zero or more mbufs of data.
3153  * In order to avoid blocking network interrupts for the entire time here,
3154  * we splx() while doing the actual copy to user space.
3155  * Although the sockbuf is locked, new data may still be appended,
3156  * and thus we must maintain consistency of the sockbuf during that time.
3157  *
3158  * The caller may receive the data as a single mbuf chain by supplying
3159  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3160  * only for the count in uio_resid.
3161  *
3162  * Returns:	0			Success
3163  *		ENOBUFS
3164  *		ENOTCONN
3165  *		EWOULDBLOCK
3166  *	uiomove:EFAULT
3167  *	sblock:EWOULDBLOCK
3168  *	sblock:EINTR
3169  *	sbwait:EBADF
3170  *	sbwait:EINTR
3171  *	sodelayed_copy:EFAULT
3172  *	<pru_rcvoob>:EINVAL[TCP]
3173  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3174  *	<pru_rcvoob>:???
3175  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3176  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3177  *	<pr_domain->dom_externalize>:???
3178  *
3179  * Notes:	Additional return values from calls through <pru_rcvoob> and
3180  *		<pr_domain->dom_externalize> depend on protocols other than
3181  *		TCP or AF_UNIX, which are documented above.
3182  */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;		/* caller's mbuf-chain return slot (mp0) */
	mbuf_ref_t ml = NULL;		/* tail of free_list, for O(1) append */
	mbuf_ref_t nextrecord, free_list;
	int flags, error, offset;	/* offset: peek progress toward oobmark */
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;		/* moff: byte offset into current mbuf */
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;	/* bytes held on free_list pending uiomove */
	int can_delay;			/* nonzero if the uiomove may be batched */
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;	/* energy tracing active for this call */

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/* NOTE(review): "0x%llu" prints decimal with an 0x prefix; %llx was likely intended */
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinitely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/* Drop the lock while copying OOB data out to user space */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			/* A peek must not consume the pending error */
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* OOB data or an end-of-record mark is deliverable right away */
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns.  Therefore, we only sleep in
		 * sbwait() below if and only if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	/* Consume a leading MT_SONAME mbuf (sender address), if present */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above.  In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/* Delayed copy only pays off when consuming a sizable request */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	/*
	 * Main copy loop: walk the mbufs of the first record, moving data
	 * to the uio (or to *mp), until the request is satisfied, the
	 * record ends, or an error occurs.
	 */
	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		/* Never read past the out-of-band mark in one go */
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints; moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* Consumed the entire mbuf: advance to the next one */
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			/* Interrupted wait: return a short count without error */
			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	/* For atomic protocols, a leftover record tail means truncation */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	/* Flush any remaining delayed data to user space */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/* Nothing transferred and more may come: go around again */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
3938 
3939 /*
3940  * Returns:	0			Success
3941  *	uiomove:EFAULT
3942  */
3943 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)3944 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3945     user_ssize_t *resid)
3946 {
3947 	int error = 0;
3948 	struct mbuf *m;
3949 
3950 	m = *free_list;
3951 
3952 	socket_unlock(so, 0);
3953 
3954 	while (m != NULL && error == 0) {
3955 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3956 		m = m->m_next;
3957 	}
3958 	m_freem_list(*free_list);
3959 
3960 	*free_list = NULL;
3961 	*resid = 0;
3962 
3963 	socket_lock(so, 0);
3964 
3965 	return error;
3966 }
3967 
/*
 * Receive up to *pktcntp datagrams from a socket in a single call.
 *
 * On input *pktcntp is the maximum number of packets to receive
 * (must be in [1, SO_MAX_MSG_X]); on output it is set to the number
 * actually received.  The received data chains are linked through
 * m_nextpkt and returned via *mp0.  If maddrp/controlp are non-NULL
 * they receive per-packet address and control mbuf chains; otherwise
 * those mbufs are collected on a local free list and released in one
 * m_freem_list() call at the end.  *flagsp is read for MSG_* input
 * flags and updated (e.g. MSG_HAVEMORE) on the way out.
 *
 * Returns 0 on success or an errno value (EINVAL, ENOTCONN,
 * EWOULDBLOCK, or the pending socket error).
 */
int
soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;
	mbuf_ref_t nextrecord;
	int flags, error;
	struct protosw *pr = so->so_proto;
	struct proc *p = current_proc();
	u_int npkts = 0;
	mbuf_ref_t free_list = NULL;
	int sblocked = 0;	/* nonzero once we hold the sockbuf lock */

	/*
	 * Sanity check on the parameters passed by caller
	 */
	if (mp0 == NULL || pktcntp == NULL) {
		return EINVAL;
	}
	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
		return EINVAL;
	}

	mp = mp0;
	*mp0 = NULL;
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (maddrp != NULL) {
		*maddrp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

	*mp = NULL;

restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			/* Report the pending error; clear it unless peeking. */
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* A record terminated by M_EOR can be delivered right away. */
		for (; m != NULL; m = m->m_next) {
			if (m->m_flags & M_EOR) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		/* Release the sockbuf lock before sleeping for data. */
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error != 0) {
			goto release;
		}
		goto restart;
	}
dontblock:
	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	/*
	 * A record may start with an MT_SONAME mbuf carrying the sender
	 * address; hand it to the caller via maddrp or queue it for freeing.
	 */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		mbuf_ref_t maddr = NULL;

		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
		    &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}

		if (maddr != NULL) {
			maddr->m_nextpkt = NULL;
			maddr->m_next = NULL;
			if (maddrp != NULL) {
				*maddrp = maddr;
				maddrp = &maddr->m_nextpkt;
			} else {
				maddr->m_next = free_list;
				free_list = maddr;
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.
	 * We call into the protocol to perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		mbuf_ref_t control = NULL;

		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		if (control != NULL) {
			control->m_nextpkt = NULL;
			control->m_next = NULL;
			if (controlp != NULL) {
				*controlp = control;
				controlp = &control->m_nextpkt;
			} else {
				control->m_next = free_list;
				free_list = control;
			}
		}
	}

	/*
	 * Link the packet to the list
	 */
	if (m != NULL) {
		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
		}
		m->m_nextpkt = NULL;
		*mp = m;
		mp = &m->m_nextpkt;
	}
	/* Account for the whole chain being removed from the sockbuf. */
	while (m != NULL) {
		sbfree(&so->so_rcv, m);

		m = m->m_next;
	}

	so->so_rcv.sb_mb = nextrecord;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second
	 * part makes sure sb_lastrecord is up-to-date if
	 * there is still data in the socket buffer.
	 */
	if (so->so_rcv.sb_mb == NULL) {
		so->so_rcv.sb_mbtail = NULL;
		so->so_rcv.sb_lastrecord = NULL;
	} else if (nextrecord->m_nextpkt == NULL) {
		so->so_rcv.sb_lastrecord = nextrecord;
	}
	SB_MB_CHECK(&so->so_rcv);

	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	npkts += 1;

	/*
	 * Keep receiving as long as we have fewer packets than requested
	 * and the socket buffer is not empty
	 */
	if (npkts < *pktcntp) {
		if (so->so_rcv.sb_mb != NULL) {
			goto dontblock;
		}
		if ((flags & MSG_WAITALL) != 0) {
			goto restart;
		}
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	*pktcntp = npkts;
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4275 
4276 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4277 so_statistics_event_to_nstat_event(int64_t *input_options,
4278     uint64_t *nstat_event)
4279 {
4280 	int error = 0;
4281 	switch (*input_options) {
4282 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4283 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4284 		break;
4285 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4286 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4287 		break;
4288 #if (DEBUG || DEVELOPMENT)
4289 	case SO_STATISTICS_EVENT_RESERVED_1:
4290 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4291 		break;
4292 	case SO_STATISTICS_EVENT_RESERVED_2:
4293 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4294 		break;
4295 #endif /* (DEBUG || DEVELOPMENT) */
4296 	default:
4297 		error = EINVAL;
4298 		break;
4299 	}
4300 	return error;
4301 }
4302 
4303 /*
4304  * Returns:	0			Success
4305  *		EINVAL
4306  *		ENOTCONN
4307  *	<pru_shutdown>:EINVAL
4308  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4309  *	<pru_shutdown>:ENOBUFS[TCP]
4310  *	<pru_shutdown>:EMSGSIZE[TCP]
4311  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4312  *	<pru_shutdown>:ENETUNREACH[TCP]
4313  *	<pru_shutdown>:ENETDOWN[TCP]
4314  *	<pru_shutdown>:ENOMEM[TCP]
4315  *	<pru_shutdown>:EACCES[TCP]
4316  *	<pru_shutdown>:EMSGSIZE[TCP]
4317  *	<pru_shutdown>:ENOBUFS[TCP]
4318  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4319  *	<pru_shutdown>:???		[other protocol families]
4320  */
4321 int
soshutdown(struct socket * so,int how)4322 soshutdown(struct socket *so, int how)
4323 {
4324 	int error;
4325 
4326 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4327 
4328 	switch (how) {
4329 	case SHUT_RD:
4330 	case SHUT_WR:
4331 	case SHUT_RDWR:
4332 		socket_lock(so, 1);
4333 		if ((so->so_state &
4334 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4335 			error = ENOTCONN;
4336 		} else {
4337 			error = soshutdownlock(so, how);
4338 		}
4339 		socket_unlock(so, 1);
4340 		break;
4341 	default:
4342 		error = EINVAL;
4343 		break;
4344 	}
4345 
4346 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4347 
4348 	return error;
4349 }
4350 
4351 int
soshutdownlock_final(struct socket * so,int how)4352 soshutdownlock_final(struct socket *so, int how)
4353 {
4354 	struct protosw *pr = so->so_proto;
4355 	int error = 0;
4356 
4357 	sflt_notify(so, sock_evt_shutdown, &how);
4358 
4359 	if (how != SHUT_WR) {
4360 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4361 			/* read already shut down */
4362 			error = ENOTCONN;
4363 			goto done;
4364 		}
4365 		sorflush(so);
4366 	}
4367 	if (how != SHUT_RD) {
4368 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4369 			/* write already shut down */
4370 			error = ENOTCONN;
4371 			goto done;
4372 		}
4373 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4374 	}
4375 done:
4376 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4377 	return error;
4378 }
4379 
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int cfil_error = cfil_sock_shutdown(so, &how);

		if (cfil_error == EJUSTRETURN) {
			/* The filter will complete the shutdown later. */
			return 0;
		}
		if (cfil_error != 0) {
			return cfil_error;
		}
	}
#endif /* CONTENT_FILTER */

	return soshutdownlock_final(so, how);
}
4406 
/*
 * Flush the send side of a socket: disable select/upcall notifications
 * on the send buffer, mark it SB_DROP so no more data is appended, and
 * release its mbufs.  The socket remains locked throughout.
 */
void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/* Disable notifications and bar further appends (SB_DROP). */
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/* Clear select() wait-queue linkage, then free buffered data. */
	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}
4432 
/*
 * Flush the receive side of a socket: notify socket filters, mark the
 * socket unable to receive more data, detach the receive buffer's
 * contents into a local copy ("asb"), and dispose of them outside the
 * live sockbuf.  Also disposes of any file-descriptor rights
 * (PR_RIGHTS) held in the buffered control mbufs via dom_dispose.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc               = sb->sb_cc;
	asb.sb_hiwat            = sb->sb_hiwat;
	asb.sb_mbcnt            = sb->sb_mbcnt;
	asb.sb_mbmax            = sb->sb_mbmax;
	asb.sb_ctl              = sb->sb_ctl;
	asb.sb_lowat            = sb->sb_lowat;
	asb.sb_mb               = sb->sb_mb;
	asb.sb_mbtail           = sb->sb_mbtail;
	asb.sb_lastrecord       = sb->sb_lastrecord;
	asb.sb_so               = sb->sb_so;
	asb.sb_flags            = sb->sb_flags;
	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags            |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc               = 0;
	sb->sb_hiwat            = 0;
	sb->sb_mbcnt            = 0;
	sb->sb_mbmax            = 0;
	sb->sb_ctl              = 0;
	sb->sb_lowat            = 0;
	sb->sb_mb               = NULL;
	sb->sb_mbtail           = NULL;
	sb->sb_lastrecord       = NULL;
	sb->sb_timeo.tv_sec     = 0;
	sb->sb_timeo.tv_usec    = 0;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Dispose of any in-flight file-descriptor rights (e.g. AF_UNIX). */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4533 
4534 /*
4535  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4536  * an additional variant to handle the case where the option value needs
4537  * to be some kind of integer, but not a specific size.
4538  * In addition to their use here, these functions are also called by the
4539  * protocol-level pr_ctloutput() routines.
4540  *
4541  * Returns:	0			Success
4542  *		EINVAL
4543  *	copyin:EFAULT
4544  */
4545 int
sooptcopyin(struct sockopt * sopt,void * __sized_by (len)buf,size_t len,size_t minlen)4546 sooptcopyin(struct sockopt *sopt, void *__sized_by(len) buf, size_t len, size_t minlen)
4547 {
4548 	size_t  valsize;
4549 
4550 	/*
4551 	 * If the user gives us more than we wanted, we ignore it,
4552 	 * but if we don't get the minimum length the caller
4553 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4554 	 * is set to however much we actually retrieved.
4555 	 */
4556 	if ((valsize = sopt->sopt_valsize) < minlen) {
4557 		return EINVAL;
4558 	}
4559 	if (valsize > len) {
4560 		sopt->sopt_valsize = valsize = len;
4561 	}
4562 
4563 	if (sopt->sopt_p != kernproc) {
4564 		return copyin(sopt->sopt_val, buf, valsize);
4565 	}
4566 
4567 	caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4568 	    CAST_DOWN(caddr_t, sopt->sopt_val),
4569 	    valsize);
4570 	bcopy(tmp, buf, valsize);
4571 
4572 	return 0;
4573 }
4574 
4575 /*
4576  * sooptcopyin_timeval
 *   Copy in a timeval value into tv_p, and take into account whether
 *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4579  *   code here so that we can verify the 64-bit tv_sec value before we lose
4580  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4581  */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int                     error;

	if (proc_is64bit(sopt->sopt_p)) {
		/* 64-bit caller: expect a user64_timeval layout. */
		struct user64_timeval   tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		/* Record how much we actually consumed. */
		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			/* Kernel-originated value: direct copy, no copyin(). */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv64));
			bcopy(tmp, &tv64, sizeof(tv64));
		}
		/*
		 * Validate before narrowing tv64.tv_sec into tv_p->tv_sec,
		 * so out-of-range values are rejected rather than truncated.
		 */
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		/* 32-bit caller: expect a user32_timeval layout. */
		struct user32_timeval   tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			/* Kernel-originated value: direct copy, no copyin(). */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv32));
			bcopy(tmp, &tv32, sizeof(tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
4647 
4648 int
sooptcopyin_bindtodevice(struct sockopt * sopt,char * __sized_by (bufsize)buf,size_t bufsize)4649 sooptcopyin_bindtodevice(struct sockopt *sopt, char * __sized_by(bufsize) buf, size_t bufsize)
4650 {
4651 #define MIN_BINDTODEVICE_NAME_SIZE    2
4652 	size_t maxlen = bufsize - 1;             /* the max string length that fits in the buffer */
4653 
4654 	if (bufsize < MIN_BINDTODEVICE_NAME_SIZE) {
4655 #if DEBUG || DEVELOPMENT
4656 		os_log(OS_LOG_DEFAULT, "%s: bufsize %lu < MIN_BINDTODEVICE_NAME_SIZE %d",
4657 		    __func__, bufsize, MIN_BINDTODEVICE_NAME_SIZE);
4658 #endif /* DEBUG || DEVELOPMENT */
4659 		return EINVAL;
4660 	}
4661 
4662 	memset(buf, 0, bufsize);
4663 
4664 	/*
4665 	 * bufsize includes the end-of-string because of the uncertainty wether
4666 	 * interface names are passed as strings or byte buffers.
4667 	 * If the user gives us more than the max string length return EINVAL.
4668 	 * On success, sopt->sopt_valsize is not modified
4669 	 */
4670 	maxlen = bufsize - 1;
4671 	if (sopt->sopt_valsize > maxlen) {
4672 		os_log(OS_LOG_DEFAULT, "%s: sopt_valsize %lu > maxlen %lu",
4673 		    __func__, sopt->sopt_valsize, maxlen);
4674 		return EINVAL;
4675 	}
4676 
4677 	if (sopt->sopt_p != kernproc) {
4678 		return copyin(sopt->sopt_val, buf, sopt->sopt_valsize);
4679 	} else {
4680 		caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4681 		    CAST_DOWN(caddr_t, sopt->sopt_val),
4682 		    sopt->sopt_valsize);
4683 		bcopy(tmp, buf, sopt->sopt_valsize);
4684 	}
4685 
4686 	return 0;
4687 #undef MIN_BINDTODEVICE_NAME_SIZE
4688 }
4689 
4690 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4691 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4692     boolean_t ignore_delegate)
4693 {
4694 	kauth_cred_t cred =  NULL;
4695 	proc_t ep = PROC_NULL;
4696 	uid_t uid;
4697 	int error = 0;
4698 
4699 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4700 		ep = proc_find(so->e_pid);
4701 		if (ep) {
4702 			cred = kauth_cred_proc_ref(ep);
4703 		}
4704 	}
4705 
4706 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4707 
4708 	/* uid is 0 for root */
4709 	if (uid != 0 || !allow_root) {
4710 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4711 	}
4712 	if (cred) {
4713 		kauth_cred_unref(&cred);
4714 	}
4715 	if (ep != PROC_NULL) {
4716 		proc_rele(ep);
4717 	}
4718 
4719 	return error;
4720 }
4721 
4722 /*
4723  * Returns:	0			Success
4724  *		EINVAL
4725  *		ENOPROTOOPT
4726  *		ENOBUFS
4727  *		EDOM
4728  *	sooptcopyin:EINVAL
4729  *	sooptcopyin:EFAULT
4730  *	sooptcopyin_timeval:EINVAL
4731  *	sooptcopyin_timeval:EFAULT
4732  *	sooptcopyin_timeval:EDOM
4733  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
4735  *	sflt_attach_private:???		[whatever a filter author chooses]
4736  *	<sf_setoption>:???		[whatever a filter author chooses]
4737  *
4738  * Notes:	Other <pru_listen> returns depend on the protocol family; all
4739  *		<sf_listen> returns depend on what the filter author causes
4740  *		their filter to return.
4741  */
4742 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4743 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4744 {
4745 	int     error, optval;
4746 	int64_t long_optval;
4747 	struct  linger l;
4748 	struct  timeval tv;
4749 
4750 	if (sopt->sopt_dir != SOPT_SET) {
4751 		sopt->sopt_dir = SOPT_SET;
4752 	}
4753 
4754 	if (dolock) {
4755 		socket_lock(so, 1);
4756 	}
4757 
4758 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4759 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4760 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4761 		/* the socket has been shutdown, no more sockopt's */
4762 		error = EINVAL;
4763 		goto out;
4764 	}
4765 
4766 	error = sflt_setsockopt(so, sopt);
4767 	if (error != 0) {
4768 		if (error == EJUSTRETURN) {
4769 			error = 0;
4770 		}
4771 		goto out;
4772 	}
4773 
4774 	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
4775 		if (so->so_proto != NULL &&
4776 		    so->so_proto->pr_ctloutput != NULL) {
4777 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4778 			goto out;
4779 		}
4780 		error = ENOPROTOOPT;
4781 	} else {
4782 		/*
4783 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4784 		 * the protocol layer, if needed.  A zero value returned from
4785 		 * the handler means use default socket-level processing as
4786 		 * done by the rest of this routine.  Otherwise, any other
4787 		 * return value indicates that the option is unsupported.
4788 		 */
4789 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4790 		    pru_socheckopt(so, sopt)) != 0) {
4791 			goto out;
4792 		}
4793 
4794 		error = 0;
4795 		switch (sopt->sopt_name) {
4796 		case SO_LINGER:
4797 		case SO_LINGER_SEC: {
4798 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4799 			if (error != 0) {
4800 				goto out;
4801 			}
4802 			/* Make sure to use sane values */
4803 			if (sopt->sopt_name == SO_LINGER) {
4804 				so->so_linger = (short)l.l_linger;
4805 			} else {
4806 				so->so_linger = (short)((long)l.l_linger * hz);
4807 			}
4808 			if (l.l_onoff != 0) {
4809 				so->so_options |= SO_LINGER;
4810 			} else {
4811 				so->so_options &= ~SO_LINGER;
4812 			}
4813 			break;
4814 		}
4815 		case SO_DEBUG:
4816 		case SO_KEEPALIVE:
4817 		case SO_DONTROUTE:
4818 		case SO_USELOOPBACK:
4819 		case SO_BROADCAST:
4820 		case SO_REUSEADDR:
4821 		case SO_REUSEPORT:
4822 		case SO_OOBINLINE:
4823 		case SO_TIMESTAMP:
4824 		case SO_TIMESTAMP_MONOTONIC:
4825 		case SO_TIMESTAMP_CONTINUOUS:
4826 		case SO_DONTTRUNC:
4827 		case SO_WANTMORE:
4828 		case SO_WANTOOBFLAG:
4829 		case SO_NOWAKEFROMSLEEP:
4830 		case SO_NOAPNFALLBK:
4831 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4832 			    sizeof(optval));
4833 			if (error != 0) {
4834 				goto out;
4835 			}
4836 			if (optval) {
4837 				so->so_options |= sopt->sopt_name;
4838 			} else {
4839 				so->so_options &= ~sopt->sopt_name;
4840 			}
4841 #if SKYWALK
4842 			inp_update_netns_flags(so);
4843 #endif /* SKYWALK */
4844 			break;
4845 
4846 		case SO_SNDBUF:
4847 		case SO_RCVBUF:
4848 		case SO_SNDLOWAT:
4849 		case SO_RCVLOWAT:
4850 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4851 			    sizeof(optval));
4852 			if (error != 0) {
4853 				goto out;
4854 			}
4855 
4856 			/*
4857 			 * Values < 1 make no sense for any of these
4858 			 * options, so disallow them.
4859 			 */
4860 			if (optval < 1) {
4861 				error = EINVAL;
4862 				goto out;
4863 			}
4864 
4865 			switch (sopt->sopt_name) {
4866 			case SO_SNDBUF:
4867 			case SO_RCVBUF: {
4868 				struct sockbuf *sb =
4869 				    (sopt->sopt_name == SO_SNDBUF) ?
4870 				    &so->so_snd : &so->so_rcv;
4871 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
4872 					error = ENOBUFS;
4873 					goto out;
4874 				}
4875 				sb->sb_flags |= SB_USRSIZE;
4876 				sb->sb_flags &= ~SB_AUTOSIZE;
4877 				sb->sb_idealsize = (u_int32_t)optval;
4878 				break;
4879 			}
4880 			/*
4881 			 * Make sure the low-water is never greater than
4882 			 * the high-water.
4883 			 */
4884 			case SO_SNDLOWAT: {
4885 				int space = sbspace(&so->so_snd);
4886 				uint32_t hiwat = so->so_snd.sb_hiwat;
4887 
4888 				if (so->so_snd.sb_flags & SB_UNIX) {
4889 					struct unpcb *unp =
4890 					    (struct unpcb *)(so->so_pcb);
4891 					if (unp != NULL &&
4892 					    unp->unp_conn != NULL) {
4893 						struct socket *so2 = unp->unp_conn->unp_socket;
4894 						hiwat += unp->unp_conn->unp_cc;
4895 						space = sbspace(&so2->so_rcv);
4896 					}
4897 				}
4898 
4899 				so->so_snd.sb_lowat =
4900 				    (optval > hiwat) ?
4901 				    hiwat : optval;
4902 
4903 				if (space >= so->so_snd.sb_lowat) {
4904 					sowwakeup(so);
4905 				}
4906 				break;
4907 			}
4908 			case SO_RCVLOWAT: {
4909 				int64_t data_len;
4910 				so->so_rcv.sb_lowat =
4911 				    (optval > so->so_rcv.sb_hiwat) ?
4912 				    so->so_rcv.sb_hiwat : optval;
4913 				if (so->so_rcv.sb_flags & SB_UNIX) {
4914 					struct unpcb *unp =
4915 					    (struct unpcb *)(so->so_pcb);
4916 					if (unp != NULL &&
4917 					    unp->unp_conn != NULL) {
4918 						struct socket *so2 = unp->unp_conn->unp_socket;
4919 						data_len = so2->so_snd.sb_cc
4920 						    - so2->so_snd.sb_ctl;
4921 					} else {
4922 						data_len = so->so_rcv.sb_cc
4923 						    - so->so_rcv.sb_ctl;
4924 					}
4925 				} else {
4926 					data_len = so->so_rcv.sb_cc
4927 					    - so->so_rcv.sb_ctl;
4928 				}
4929 
4930 				if (data_len >= so->so_rcv.sb_lowat) {
4931 					sorwakeup(so);
4932 				}
4933 				break;
4934 			}
4935 			}
4936 			break;
4937 
4938 		case SO_SNDTIMEO:
4939 		case SO_RCVTIMEO:
4940 			error = sooptcopyin_timeval(sopt, &tv);
4941 			if (error != 0) {
4942 				goto out;
4943 			}
4944 
4945 			switch (sopt->sopt_name) {
4946 			case SO_SNDTIMEO:
4947 				so->so_snd.sb_timeo = tv;
4948 				break;
4949 			case SO_RCVTIMEO:
4950 				so->so_rcv.sb_timeo = tv;
4951 				break;
4952 			}
4953 			break;
4954 
4955 		case SO_NKE: {
4956 			struct so_nke nke;
4957 
4958 			error = sooptcopyin(sopt, &nke, sizeof(nke),
4959 			    sizeof(nke));
4960 			if (error != 0) {
4961 				goto out;
4962 			}
4963 
4964 			error = sflt_attach_internal(so, nke.nke_handle);
4965 			break;
4966 		}
4967 
4968 		case SO_NOSIGPIPE:
4969 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4970 			    sizeof(optval));
4971 			if (error != 0) {
4972 				goto out;
4973 			}
4974 			if (optval != 0) {
4975 				so->so_flags |= SOF_NOSIGPIPE;
4976 			} else {
4977 				so->so_flags &= ~SOF_NOSIGPIPE;
4978 			}
4979 			break;
4980 
4981 		case SO_NOADDRERR:
4982 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4983 			    sizeof(optval));
4984 			if (error != 0) {
4985 				goto out;
4986 			}
4987 			if (optval != 0) {
4988 				so->so_flags |= SOF_NOADDRAVAIL;
4989 			} else {
4990 				so->so_flags &= ~SOF_NOADDRAVAIL;
4991 			}
4992 			break;
4993 
4994 		case SO_REUSESHAREUID:
4995 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4996 			    sizeof(optval));
4997 			if (error != 0) {
4998 				goto out;
4999 			}
5000 			if (optval != 0) {
5001 				so->so_flags |= SOF_REUSESHAREUID;
5002 			} else {
5003 				so->so_flags &= ~SOF_REUSESHAREUID;
5004 			}
5005 			break;
5006 
5007 		case SO_NOTIFYCONFLICT:
5008 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5009 				error = EPERM;
5010 				goto out;
5011 			}
5012 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5013 			    sizeof(optval));
5014 			if (error != 0) {
5015 				goto out;
5016 			}
5017 			if (optval != 0) {
5018 				so->so_flags |= SOF_NOTIFYCONFLICT;
5019 			} else {
5020 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5021 			}
5022 			break;
5023 
5024 		case SO_RESTRICTIONS:
5025 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5026 			    sizeof(optval));
5027 			if (error != 0) {
5028 				goto out;
5029 			}
5030 
5031 			error = so_set_restrictions(so, optval);
5032 			break;
5033 
5034 		case SO_AWDL_UNRESTRICTED:
5035 			if (SOCK_DOM(so) != PF_INET &&
5036 			    SOCK_DOM(so) != PF_INET6) {
5037 				error = EOPNOTSUPP;
5038 				goto out;
5039 			}
5040 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5041 			    sizeof(optval));
5042 			if (error != 0) {
5043 				goto out;
5044 			}
5045 			if (optval != 0) {
5046 				error = soopt_cred_check(so,
5047 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5048 				if (error == 0) {
5049 					inp_set_awdl_unrestricted(
5050 						sotoinpcb(so));
5051 				}
5052 			} else {
5053 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5054 			}
5055 			break;
5056 		case SO_INTCOPROC_ALLOW:
5057 			if (SOCK_DOM(so) != PF_INET6) {
5058 				error = EOPNOTSUPP;
5059 				goto out;
5060 			}
5061 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5062 			    sizeof(optval));
5063 			if (error != 0) {
5064 				goto out;
5065 			}
5066 			if (optval != 0 &&
5067 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5068 				error = soopt_cred_check(so,
5069 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5070 				if (error == 0) {
5071 					inp_set_intcoproc_allowed(
5072 						sotoinpcb(so));
5073 				}
5074 			} else if (optval == 0) {
5075 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5076 			}
5077 			break;
5078 
5079 		case SO_LABEL:
5080 			error = EOPNOTSUPP;
5081 			break;
5082 
5083 		case SO_UPCALLCLOSEWAIT:
5084 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5085 			    sizeof(optval));
5086 			if (error != 0) {
5087 				goto out;
5088 			}
5089 			if (optval != 0) {
5090 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5091 			} else {
5092 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5093 			}
5094 			break;
5095 
5096 		case SO_RANDOMPORT:
5097 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5098 			    sizeof(optval));
5099 			if (error != 0) {
5100 				goto out;
5101 			}
5102 			if (optval != 0) {
5103 				so->so_flags |= SOF_BINDRANDOMPORT;
5104 			} else {
5105 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5106 			}
5107 			break;
5108 
5109 		case SO_NP_EXTENSIONS: {
5110 			struct so_np_extensions sonpx;
5111 
5112 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5113 			    sizeof(sonpx));
5114 			if (error != 0) {
5115 				goto out;
5116 			}
5117 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5118 				error = EINVAL;
5119 				goto out;
5120 			}
5121 			/*
5122 			 * Only one bit defined for now
5123 			 */
5124 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5125 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5126 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5127 				} else {
5128 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5129 				}
5130 			}
5131 			break;
5132 		}
5133 
5134 		case SO_TRAFFIC_CLASS: {
5135 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5136 			    sizeof(optval));
5137 			if (error != 0) {
5138 				goto out;
5139 			}
5140 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5141 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5142 				error = so_set_net_service_type(so, netsvc);
5143 				goto out;
5144 			}
5145 			error = so_set_traffic_class(so, optval);
5146 			if (error != 0) {
5147 				goto out;
5148 			}
5149 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5150 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5151 			break;
5152 		}
5153 
5154 		case SO_RECV_TRAFFIC_CLASS: {
5155 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5156 			    sizeof(optval));
5157 			if (error != 0) {
5158 				goto out;
5159 			}
5160 			if (optval == 0) {
5161 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5162 			} else {
5163 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5164 			}
5165 			break;
5166 		}
5167 
5168 #if (DEVELOPMENT || DEBUG)
5169 		case SO_TRAFFIC_CLASS_DBG: {
5170 			struct so_tcdbg so_tcdbg;
5171 
5172 			error = sooptcopyin(sopt, &so_tcdbg,
5173 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5174 			if (error != 0) {
5175 				goto out;
5176 			}
5177 			error = so_set_tcdbg(so, &so_tcdbg);
5178 			if (error != 0) {
5179 				goto out;
5180 			}
5181 			break;
5182 		}
5183 #endif /* (DEVELOPMENT || DEBUG) */
5184 
5185 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5186 			error = priv_check_cred(kauth_cred_get(),
5187 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5188 			if (error != 0) {
5189 				goto out;
5190 			}
5191 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5192 			    sizeof(optval));
5193 			if (error != 0) {
5194 				goto out;
5195 			}
5196 			if (optval == 0) {
5197 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5198 			} else {
5199 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5200 			}
5201 			break;
5202 
5203 #if (DEVELOPMENT || DEBUG)
5204 		case SO_DEFUNCTIT:
5205 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5206 			if (error == 0) {
5207 				error = sodefunct(current_proc(), so, 0);
5208 			}
5209 
5210 			break;
5211 #endif /* (DEVELOPMENT || DEBUG) */
5212 
5213 		case SO_DEFUNCTOK:
5214 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5215 			    sizeof(optval));
5216 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5217 				if (error == 0) {
5218 					error = EBADF;
5219 				}
5220 				goto out;
5221 			}
5222 			/*
5223 			 * Any process can set SO_DEFUNCTOK (clear
5224 			 * SOF_NODEFUNCT), but only root can clear
5225 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5226 			 */
5227 			if (optval == 0 &&
5228 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5229 				error = EPERM;
5230 				goto out;
5231 			}
5232 			if (optval) {
5233 				so->so_flags &= ~SOF_NODEFUNCT;
5234 			} else {
5235 				so->so_flags |= SOF_NODEFUNCT;
5236 			}
5237 
5238 			if (SOCK_DOM(so) == PF_INET ||
5239 			    SOCK_DOM(so) == PF_INET6) {
5240 				char s[MAX_IPv6_STR_LEN];
5241 				char d[MAX_IPv6_STR_LEN];
5242 				struct inpcb *inp = sotoinpcb(so);
5243 
5244 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5245 				    "[%s %s:%d -> %s:%d] is now marked "
5246 				    "as %seligible for "
5247 				    "defunct\n", __func__, proc_selfpid(),
5248 				    proc_best_name(current_proc()),
5249 				    so->so_gencnt,
5250 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5251 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5252 				    ((SOCK_DOM(so) == PF_INET) ?
5253 				    (void *)&inp->inp_laddr.s_addr :
5254 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5255 				    ntohs(inp->in6p_lport),
5256 				    inet_ntop(SOCK_DOM(so),
5257 				    (SOCK_DOM(so) == PF_INET) ?
5258 				    (void *)&inp->inp_faddr.s_addr :
5259 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5260 				    ntohs(inp->in6p_fport),
5261 				    (so->so_flags & SOF_NODEFUNCT) ?
5262 				    "not " : "");
5263 			} else {
5264 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5265 				    "is now marked as %seligible for "
5266 				    "defunct\n",
5267 				    __func__, proc_selfpid(),
5268 				    proc_best_name(current_proc()),
5269 				    so->so_gencnt,
5270 				    SOCK_DOM(so), SOCK_TYPE(so),
5271 				    (so->so_flags & SOF_NODEFUNCT) ?
5272 				    "not " : "");
5273 			}
5274 			break;
5275 
5276 		case SO_ISDEFUNCT:
5277 			/* This option is not settable */
5278 			error = EINVAL;
5279 			break;
5280 
5281 		case SO_OPPORTUNISTIC:
5282 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5283 			    sizeof(optval));
5284 			if (error == 0) {
5285 				error = so_set_opportunistic(so, optval);
5286 			}
5287 			break;
5288 
5289 		case SO_FLUSH:
5290 			/* This option is handled by lower layer(s) */
5291 			error = 0;
5292 			break;
5293 
5294 		case SO_RECV_ANYIF:
5295 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5296 			    sizeof(optval));
5297 			if (error == 0) {
5298 				error = so_set_recv_anyif(so, optval);
5299 			}
5300 			break;
5301 
5302 		case SO_TRAFFIC_MGT_BACKGROUND: {
5303 			/* This option is handled by lower layer(s) */
5304 			error = 0;
5305 			break;
5306 		}
5307 
5308 #if FLOW_DIVERT
5309 		case SO_FLOW_DIVERT_TOKEN:
5310 			error = flow_divert_token_set(so, sopt);
5311 			break;
5312 #endif  /* FLOW_DIVERT */
5313 
5314 
5315 		case SO_DELEGATED:
5316 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5317 			    sizeof(optval))) != 0) {
5318 				break;
5319 			}
5320 
5321 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5322 			break;
5323 
5324 		case SO_DELEGATED_UUID: {
5325 			uuid_t euuid;
5326 
5327 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5328 			    sizeof(euuid))) != 0) {
5329 				break;
5330 			}
5331 
5332 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5333 			break;
5334 		}
5335 
5336 #if NECP
5337 		case SO_NECP_ATTRIBUTES:
5338 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5339 				/* Handled by MPTCP itself */
5340 				break;
5341 			}
5342 
5343 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5344 				error = EINVAL;
5345 				goto out;
5346 			}
5347 
5348 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5349 			break;
5350 
5351 		case SO_NECP_CLIENTUUID: {
5352 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5353 				/* Handled by MPTCP itself */
5354 				break;
5355 			}
5356 
5357 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5358 				error = EINVAL;
5359 				goto out;
5360 			}
5361 
5362 			struct inpcb *inp = sotoinpcb(so);
5363 			if (!uuid_is_null(inp->necp_client_uuid)) {
5364 				// Clear out the old client UUID if present
5365 				necp_inpcb_remove_cb(inp);
5366 			}
5367 
5368 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5369 			    sizeof(uuid_t), sizeof(uuid_t));
5370 			if (error != 0) {
5371 				goto out;
5372 			}
5373 
5374 			if (uuid_is_null(inp->necp_client_uuid)) {
5375 				error = EINVAL;
5376 				goto out;
5377 			}
5378 
5379 			pid_t current_pid = proc_pid(current_proc());
5380 			error = necp_client_register_socket_flow(current_pid,
5381 			    inp->necp_client_uuid, inp);
5382 			if (error != 0) {
5383 				uuid_clear(inp->necp_client_uuid);
5384 				goto out;
5385 			}
5386 
5387 			if (inp->inp_lport != 0) {
5388 				// There is a bound local port, so this is not
5389 				// a fresh socket. Assign to the client.
5390 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5391 			}
5392 
5393 			break;
5394 		}
5395 		case SO_NECP_LISTENUUID: {
5396 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5397 				error = EINVAL;
5398 				goto out;
5399 			}
5400 
5401 			struct inpcb *inp = sotoinpcb(so);
5402 			if (!uuid_is_null(inp->necp_client_uuid)) {
5403 				error = EINVAL;
5404 				goto out;
5405 			}
5406 
5407 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5408 			    sizeof(uuid_t), sizeof(uuid_t));
5409 			if (error != 0) {
5410 				goto out;
5411 			}
5412 
5413 			if (uuid_is_null(inp->necp_client_uuid)) {
5414 				error = EINVAL;
5415 				goto out;
5416 			}
5417 
5418 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5419 			    inp->necp_client_uuid, inp);
5420 			if (error != 0) {
5421 				uuid_clear(inp->necp_client_uuid);
5422 				goto out;
5423 			}
5424 
5425 			// Mark that the port registration is held by NECP
5426 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5427 
5428 			break;
5429 		}
5430 
5431 		case SO_RESOLVER_SIGNATURE: {
5432 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5433 				error = EINVAL;
5434 				goto out;
5435 			}
5436 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5437 			break;
5438 		}
5439 #endif /* NECP */
5440 
5441 		case SO_EXTENDED_BK_IDLE:
5442 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5443 			    sizeof(optval));
5444 			if (error == 0) {
5445 				error = so_set_extended_bk_idle(so, optval);
5446 			}
5447 			break;
5448 
5449 		case SO_MARK_CELLFALLBACK:
5450 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5451 			    sizeof(optval));
5452 			if (error != 0) {
5453 				goto out;
5454 			}
5455 			if (optval < 0) {
5456 				error = EINVAL;
5457 				goto out;
5458 			}
5459 			if (optval == 0) {
5460 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5461 			} else {
5462 				so->so_flags1 |= SOF1_CELLFALLBACK;
5463 			}
5464 			break;
5465 
5466 		case SO_MARK_CELLFALLBACK_UUID:
5467 		{
5468 			struct so_mark_cellfallback_uuid_args args;
5469 
5470 			error = sooptcopyin(sopt, &args, sizeof(args),
5471 			    sizeof(args));
5472 			if (error != 0) {
5473 				goto out;
5474 			}
5475 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5476 			    args.flow_cellfallback);
5477 			break;
5478 		}
5479 
5480 		case SO_FALLBACK_MODE:
5481 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5482 			    sizeof(optval));
5483 			if (error != 0) {
5484 				goto out;
5485 			}
5486 			if (optval < SO_FALLBACK_MODE_NONE ||
5487 			    optval > SO_FALLBACK_MODE_PREFER) {
5488 				error = EINVAL;
5489 				goto out;
5490 			}
5491 			so->so_fallback_mode = (u_int8_t)optval;
5492 			break;
5493 
5494 		case SO_MARK_KNOWN_TRACKER: {
5495 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5496 			    sizeof(optval));
5497 			if (error != 0) {
5498 				goto out;
5499 			}
5500 			if (optval < 0) {
5501 				error = EINVAL;
5502 				goto out;
5503 			}
5504 			if (optval == 0) {
5505 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5506 			} else {
5507 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5508 			}
5509 			break;
5510 		}
5511 
5512 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5513 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5514 			    sizeof(optval));
5515 			if (error != 0) {
5516 				goto out;
5517 			}
5518 			if (optval < 0) {
5519 				error = EINVAL;
5520 				goto out;
5521 			}
5522 			if (optval == 0) {
5523 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5524 			} else {
5525 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5526 			}
5527 			break;
5528 		}
5529 
5530 		case SO_MARK_APPROVED_APP_DOMAIN: {
5531 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5532 			    sizeof(optval));
5533 			if (error != 0) {
5534 				goto out;
5535 			}
5536 			if (optval < 0) {
5537 				error = EINVAL;
5538 				goto out;
5539 			}
5540 			if (optval == 0) {
5541 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5542 			} else {
5543 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5544 			}
5545 			break;
5546 		}
5547 
5548 		case SO_STATISTICS_EVENT:
5549 			error = sooptcopyin(sopt, &long_optval,
5550 			    sizeof(long_optval), sizeof(long_optval));
5551 			if (error != 0) {
5552 				goto out;
5553 			}
5554 			u_int64_t nstat_event = 0;
5555 			error = so_statistics_event_to_nstat_event(
5556 				&long_optval, &nstat_event);
5557 			if (error != 0) {
5558 				goto out;
5559 			}
5560 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5561 			break;
5562 
5563 		case SO_NET_SERVICE_TYPE: {
5564 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5565 			    sizeof(optval));
5566 			if (error != 0) {
5567 				goto out;
5568 			}
5569 			error = so_set_net_service_type(so, optval);
5570 			break;
5571 		}
5572 
5573 		case SO_QOSMARKING_POLICY_OVERRIDE:
5574 			error = priv_check_cred(kauth_cred_get(),
5575 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5576 			if (error != 0) {
5577 				goto out;
5578 			}
5579 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5580 			    sizeof(optval));
5581 			if (error != 0) {
5582 				goto out;
5583 			}
5584 			if (optval == 0) {
5585 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5586 			} else {
5587 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5588 			}
5589 			break;
5590 
5591 		case SO_MPKL_SEND_INFO: {
5592 			struct so_mpkl_send_info so_mpkl_send_info;
5593 
5594 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5595 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5596 			if (error != 0) {
5597 				goto out;
5598 			}
5599 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5600 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5601 
5602 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5603 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5604 			} else {
5605 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5606 			}
5607 			break;
5608 		}
5609 		case SO_WANT_KEV_SOCKET_CLOSED: {
5610 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5611 			    sizeof(optval));
5612 			if (error != 0) {
5613 				goto out;
5614 			}
5615 			if (optval == 0) {
5616 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5617 			} else {
5618 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5619 			}
5620 			break;
5621 		}
5622 		case SO_MARK_WAKE_PKT: {
5623 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5624 			    sizeof(optval));
5625 			if (error != 0) {
5626 				goto out;
5627 			}
5628 			if (optval == 0) {
5629 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5630 			} else {
5631 				so->so_flags |= SOF_MARK_WAKE_PKT;
5632 			}
5633 			break;
5634 		}
5635 		case SO_RECV_WAKE_PKT: {
5636 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5637 			    sizeof(optval));
5638 			if (error != 0) {
5639 				goto out;
5640 			}
5641 			if (optval == 0) {
5642 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5643 			} else {
5644 				so->so_flags |= SOF_RECV_WAKE_PKT;
5645 			}
5646 			break;
5647 		}
5648 		case SO_APPLICATION_ID: {
5649 			so_application_id_t application_id = { 0 };
5650 
5651 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5652 				error = EINVAL;
5653 				goto out;
5654 			}
5655 			error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5656 			    sizeof(application_id));
5657 			if (error != 0) {
5658 				goto out;
5659 			}
5660 
5661 			// The user needs to match
5662 			if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5663 				error = EINVAL;
5664 				printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5665 				goto out;
5666 			}
5667 			error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5668 			if (error != 0) {
5669 				printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5670 				goto out;
5671 			}
5672 			if (application_id.persona_id != PERSONA_ID_NONE) {
5673 				so->so_persona_id = application_id.persona_id;
5674 			}
5675 			break;
5676 		}
5677 		case SO_MARK_DOMAIN_INFO_SILENT:
5678 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5679 			    sizeof(optval));
5680 			if (error != 0) {
5681 				goto out;
5682 			}
5683 			if (optval < 0) {
5684 				error = EINVAL;
5685 				goto out;
5686 			}
5687 			if (optval == 0) {
5688 				so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5689 			} else {
5690 				so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5691 			}
5692 			break;
5693 		case SO_MAX_PACING_RATE: {
5694 			uint64_t pacingrate;
5695 
5696 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5697 				error = EINVAL;
5698 				goto out;
5699 			}
5700 
5701 			error = sooptcopyin(sopt, &pacingrate,
5702 			    sizeof(pacingrate), sizeof(pacingrate));
5703 			if (error != 0) {
5704 				goto out;
5705 			}
5706 
5707 			if (pacingrate == 0) {
5708 				error = EINVAL;
5709 				goto out;
5710 			}
5711 			sotoinpcb(so)->inp_max_pacing_rate = pacingrate;
5712 			break;
5713 		}
5714 		case SO_CONNECTION_IDLE: {
5715 			int is_idle;
5716 
5717 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5718 				error = EINVAL;
5719 				goto out;
5720 			}
5721 
5722 			error = sooptcopyin(sopt, &is_idle,
5723 			    sizeof(is_idle), sizeof(is_idle));
5724 			if (error != 0) {
5725 				goto out;
5726 			}
5727 
5728 			if (is_idle != 0) {
5729 				sotoinpcb(so)->inp_flags2 |= INP2_CONNECTION_IDLE;
5730 			} else {
5731 				sotoinpcb(so)->inp_flags2 &= ~INP2_CONNECTION_IDLE;
5732 			}
5733 			break;
5734 		}
5735 		default:
5736 			error = ENOPROTOOPT;
5737 			break;
5738 		}
5739 		if (error == 0 && so->so_proto != NULL &&
5740 		    so->so_proto->pr_ctloutput != NULL) {
5741 			(void) so->so_proto->pr_ctloutput(so, sopt);
5742 		}
5743 	}
5744 out:
5745 	if (dolock) {
5746 		socket_unlock(so, 1);
5747 	}
5748 	return error;
5749 }
5750 
5751 /* Helper routines for getsockopt */
5752 int
sooptcopyout(struct sockopt * sopt,void * __sized_by (len)buf,size_t len)5753 sooptcopyout(struct sockopt *sopt, void *__sized_by(len) buf, size_t len)
5754 {
5755 	int     error;
5756 	size_t  valsize;
5757 
5758 	error = 0;
5759 
5760 	/*
5761 	 * Documented get behavior is that we always return a value,
5762 	 * possibly truncated to fit in the user's buffer.
5763 	 * Traditional behavior is that we always tell the user
5764 	 * precisely how much we copied, rather than something useful
5765 	 * like the total amount we had available for her.
5766 	 * Note that this interface is not idempotent; the entire answer must
5767 	 * generated ahead of time.
5768 	 */
5769 	valsize = MIN(len, sopt->sopt_valsize);
5770 	sopt->sopt_valsize = valsize;
5771 	if (sopt->sopt_valsize != 0 && sopt->sopt_val != USER_ADDR_NULL) {
5772 		if (sopt->sopt_p != kernproc) {
5773 			error = copyout(buf, sopt->sopt_val, valsize);
5774 		} else {
5775 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5776 			    CAST_DOWN(caddr_t, sopt->sopt_val),
5777 			    valsize);
5778 			bcopy(buf, tmp, valsize);
5779 		}
5780 	}
5781 	return error;
5782 }
5783 
5784 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5785 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5786 {
5787 	int                     error;
5788 	size_t                  len;
5789 	struct user64_timeval   tv64 = {};
5790 	struct user32_timeval   tv32 = {};
5791 	const void *            val;
5792 	size_t                  valsize;
5793 
5794 	error = 0;
5795 	if (proc_is64bit(sopt->sopt_p)) {
5796 		len = sizeof(tv64);
5797 		tv64.tv_sec = tv_p->tv_sec;
5798 		tv64.tv_usec = tv_p->tv_usec;
5799 		val = &tv64;
5800 	} else {
5801 		len = sizeof(tv32);
5802 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5803 		tv32.tv_usec = tv_p->tv_usec;
5804 		val = &tv32;
5805 	}
5806 	valsize = MIN(len, sopt->sopt_valsize);
5807 	sopt->sopt_valsize = valsize;
5808 	if (sopt->sopt_val != USER_ADDR_NULL) {
5809 		if (sopt->sopt_p != kernproc) {
5810 			error = copyout(val, sopt->sopt_val, valsize);
5811 		} else {
5812 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5813 			    CAST_DOWN(caddr_t, sopt->sopt_val),
5814 			    valsize);
5815 			bcopy(val, tmp, valsize);
5816 		}
5817 	}
5818 	return error;
5819 }
5820 
5821 /*
5822  * Return:	0			Success
5823  *		ENOPROTOOPT
5824  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5825  *	<pr_ctloutput>:???
5826  *	<sf_getoption>:???
5827  */
5828 int
sogetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5829 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5830 {
5831 	int     error, optval;
5832 	struct  linger l;
5833 	struct  timeval tv;
5834 
5835 	if (sopt->sopt_dir != SOPT_GET) {
5836 		sopt->sopt_dir = SOPT_GET;
5837 	}
5838 
5839 	if (dolock) {
5840 		socket_lock(so, 1);
5841 	}
5842 
5843 	error = sflt_getsockopt(so, sopt);
5844 	if (error != 0) {
5845 		if (error == EJUSTRETURN) {
5846 			error = 0;
5847 		}
5848 		goto out;
5849 	}
5850 
5851 	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
5852 		if (so->so_proto != NULL &&
5853 		    so->so_proto->pr_ctloutput != NULL) {
5854 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5855 			goto out;
5856 		}
5857 		error = ENOPROTOOPT;
5858 	} else {
5859 		/*
5860 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5861 		 * the protocol layer, if needed.  A zero value returned from
5862 		 * the handler means use default socket-level processing as
5863 		 * done by the rest of this routine.  Otherwise, any other
5864 		 * return value indicates that the option is unsupported.
5865 		 */
5866 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5867 		    pru_socheckopt(so, sopt)) != 0) {
5868 			goto out;
5869 		}
5870 
5871 		error = 0;
5872 		switch (sopt->sopt_name) {
5873 		case SO_LINGER:
5874 		case SO_LINGER_SEC:
5875 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5876 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5877 			    so->so_linger : so->so_linger / hz;
5878 			error = sooptcopyout(sopt, &l, sizeof(l));
5879 			break;
5880 
5881 		case SO_USELOOPBACK:
5882 		case SO_DONTROUTE:
5883 		case SO_DEBUG:
5884 		case SO_KEEPALIVE:
5885 		case SO_REUSEADDR:
5886 		case SO_REUSEPORT:
5887 		case SO_BROADCAST:
5888 		case SO_OOBINLINE:
5889 		case SO_TIMESTAMP:
5890 		case SO_TIMESTAMP_MONOTONIC:
5891 		case SO_TIMESTAMP_CONTINUOUS:
5892 		case SO_DONTTRUNC:
5893 		case SO_WANTMORE:
5894 		case SO_WANTOOBFLAG:
5895 		case SO_NOWAKEFROMSLEEP:
5896 		case SO_NOAPNFALLBK:
5897 			optval = so->so_options & sopt->sopt_name;
5898 integer:
5899 			error = sooptcopyout(sopt, &optval, sizeof(optval));
5900 			break;
5901 
5902 		case SO_TYPE:
5903 			optval = so->so_type;
5904 			goto integer;
5905 
5906 		case SO_NREAD:
5907 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5908 				int pkt_total;
5909 				struct mbuf *m1;
5910 
5911 				pkt_total = 0;
5912 				m1 = so->so_rcv.sb_mb;
5913 				while (m1 != NULL) {
5914 					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
5915 						pkt_total += m1->m_len;
5916 					}
5917 					m1 = m1->m_next;
5918 				}
5919 				optval = pkt_total;
5920 			} else {
5921 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5922 			}
5923 			goto integer;
5924 
5925 		case SO_NUMRCVPKT:
5926 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5927 				int cnt = 0;
5928 				struct mbuf *m1;
5929 
5930 				m1 = so->so_rcv.sb_mb;
5931 				while (m1 != NULL) {
5932 					cnt += 1;
5933 					m1 = m1->m_nextpkt;
5934 				}
5935 				optval = cnt;
5936 				goto integer;
5937 			} else {
5938 				error = ENOPROTOOPT;
5939 				break;
5940 			}
5941 
5942 		case SO_NWRITE:
5943 			optval = so->so_snd.sb_cc;
5944 			goto integer;
5945 
5946 		case SO_ERROR:
5947 			optval = so->so_error;
5948 			so->so_error = 0;
5949 			goto integer;
5950 
5951 		case SO_SNDBUF: {
5952 			u_int32_t hiwat = so->so_snd.sb_hiwat;
5953 
5954 			if (so->so_snd.sb_flags & SB_UNIX) {
5955 				struct unpcb *unp =
5956 				    (struct unpcb *)(so->so_pcb);
5957 				if (unp != NULL && unp->unp_conn != NULL) {
5958 					hiwat += unp->unp_conn->unp_cc;
5959 				}
5960 			}
5961 
5962 			optval = hiwat;
5963 			goto integer;
5964 		}
5965 		case SO_RCVBUF:
5966 			optval = so->so_rcv.sb_hiwat;
5967 			goto integer;
5968 
5969 		case SO_SNDLOWAT:
5970 			optval = so->so_snd.sb_lowat;
5971 			goto integer;
5972 
5973 		case SO_RCVLOWAT:
5974 			optval = so->so_rcv.sb_lowat;
5975 			goto integer;
5976 
5977 		case SO_SNDTIMEO:
5978 		case SO_RCVTIMEO:
5979 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
5980 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5981 
5982 			error = sooptcopyout_timeval(sopt, &tv);
5983 			break;
5984 
5985 		case SO_NOSIGPIPE:
5986 			optval = (so->so_flags & SOF_NOSIGPIPE);
5987 			goto integer;
5988 
5989 		case SO_NOADDRERR:
5990 			optval = (so->so_flags & SOF_NOADDRAVAIL);
5991 			goto integer;
5992 
5993 		case SO_REUSESHAREUID:
5994 			optval = (so->so_flags & SOF_REUSESHAREUID);
5995 			goto integer;
5996 
5997 
5998 		case SO_NOTIFYCONFLICT:
5999 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6000 			goto integer;
6001 
6002 		case SO_RESTRICTIONS:
6003 			optval = so_get_restrictions(so);
6004 			goto integer;
6005 
6006 		case SO_AWDL_UNRESTRICTED:
6007 			if (SOCK_DOM(so) == PF_INET ||
6008 			    SOCK_DOM(so) == PF_INET6) {
6009 				optval = inp_get_awdl_unrestricted(
6010 					sotoinpcb(so));
6011 				goto integer;
6012 			} else {
6013 				error = EOPNOTSUPP;
6014 			}
6015 			break;
6016 
6017 		case SO_INTCOPROC_ALLOW:
6018 			if (SOCK_DOM(so) == PF_INET6) {
6019 				optval = inp_get_intcoproc_allowed(
6020 					sotoinpcb(so));
6021 				goto integer;
6022 			} else {
6023 				error = EOPNOTSUPP;
6024 			}
6025 			break;
6026 
6027 		case SO_LABEL:
6028 			error = EOPNOTSUPP;
6029 			break;
6030 
6031 		case SO_PEERLABEL:
6032 			error = EOPNOTSUPP;
6033 			break;
6034 
6035 #ifdef __APPLE_API_PRIVATE
6036 		case SO_UPCALLCLOSEWAIT:
6037 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6038 			goto integer;
6039 #endif
6040 		case SO_RANDOMPORT:
6041 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6042 			goto integer;
6043 
6044 		case SO_NP_EXTENSIONS: {
6045 			struct so_np_extensions sonpx = {};
6046 
6047 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6048 			    SONPX_SETOPTSHUT : 0;
6049 			sonpx.npx_mask = SONPX_MASK_VALID;
6050 
6051 			error = sooptcopyout(sopt, &sonpx,
6052 			    sizeof(struct so_np_extensions));
6053 			break;
6054 		}
6055 
6056 		case SO_TRAFFIC_CLASS:
6057 			optval = so->so_traffic_class;
6058 			goto integer;
6059 
6060 		case SO_RECV_TRAFFIC_CLASS:
6061 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6062 			goto integer;
6063 
6064 #if (DEVELOPMENT || DEBUG)
6065 		case SO_TRAFFIC_CLASS_DBG:
6066 			error = sogetopt_tcdbg(so, sopt);
6067 			break;
6068 #endif /* (DEVELOPMENT || DEBUG) */
6069 
6070 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6071 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6072 			goto integer;
6073 
6074 		case SO_DEFUNCTOK:
6075 			optval = !(so->so_flags & SOF_NODEFUNCT);
6076 			goto integer;
6077 
6078 		case SO_ISDEFUNCT:
6079 			optval = (so->so_flags & SOF_DEFUNCT);
6080 			goto integer;
6081 
6082 		case SO_OPPORTUNISTIC:
6083 			optval = so_get_opportunistic(so);
6084 			goto integer;
6085 
6086 		case SO_FLUSH:
6087 			/* This option is not gettable */
6088 			error = EINVAL;
6089 			break;
6090 
6091 		case SO_RECV_ANYIF:
6092 			optval = so_get_recv_anyif(so);
6093 			goto integer;
6094 
6095 		case SO_TRAFFIC_MGT_BACKGROUND:
6096 			/* This option is handled by lower layer(s) */
6097 			if (so->so_proto != NULL &&
6098 			    so->so_proto->pr_ctloutput != NULL) {
6099 				(void) so->so_proto->pr_ctloutput(so, sopt);
6100 			}
6101 			break;
6102 
6103 #if FLOW_DIVERT
6104 		case SO_FLOW_DIVERT_TOKEN:
6105 			error = flow_divert_token_get(so, sopt);
6106 			break;
6107 #endif  /* FLOW_DIVERT */
6108 
6109 #if NECP
6110 		case SO_NECP_ATTRIBUTES:
6111 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6112 				/* Handled by MPTCP itself */
6113 				break;
6114 			}
6115 
6116 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6117 				error = EINVAL;
6118 				goto out;
6119 			}
6120 
6121 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6122 			break;
6123 
6124 		case SO_NECP_CLIENTUUID: {
6125 			uuid_t *ncu;
6126 
6127 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6128 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6129 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6130 				ncu = &sotoinpcb(so)->necp_client_uuid;
6131 			} else {
6132 				error = EINVAL;
6133 				goto out;
6134 			}
6135 
6136 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6137 			break;
6138 		}
6139 
6140 		case SO_NECP_LISTENUUID: {
6141 			uuid_t *nlu;
6142 
6143 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6144 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6145 					nlu = &sotoinpcb(so)->necp_client_uuid;
6146 				} else {
6147 					error = ENOENT;
6148 					goto out;
6149 				}
6150 			} else {
6151 				error = EINVAL;
6152 				goto out;
6153 			}
6154 
6155 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6156 			break;
6157 		}
6158 
6159 		case SO_RESOLVER_SIGNATURE: {
6160 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6161 				error = EINVAL;
6162 				goto out;
6163 			}
6164 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6165 			break;
6166 		}
6167 
6168 #endif /* NECP */
6169 
6170 #if CONTENT_FILTER
6171 		case SO_CFIL_SOCK_ID: {
6172 			cfil_sock_id_t sock_id;
6173 
6174 			sock_id = cfil_sock_id_from_socket(so);
6175 
6176 			error = sooptcopyout(sopt, &sock_id,
6177 			    sizeof(cfil_sock_id_t));
6178 			break;
6179 		}
6180 #endif  /* CONTENT_FILTER */
6181 
6182 		case SO_EXTENDED_BK_IDLE:
6183 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6184 			goto integer;
6185 		case SO_MARK_CELLFALLBACK:
6186 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6187 			    ? 1 : 0;
6188 			goto integer;
6189 		case SO_FALLBACK_MODE:
6190 			optval = so->so_fallback_mode;
6191 			goto integer;
6192 		case SO_MARK_KNOWN_TRACKER: {
6193 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6194 			    ? 1 : 0;
6195 			goto integer;
6196 		}
6197 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6198 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6199 			    ? 1 : 0;
6200 			goto integer;
6201 		}
6202 		case SO_MARK_APPROVED_APP_DOMAIN: {
6203 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6204 			    ? 1 : 0;
6205 			goto integer;
6206 		}
6207 		case SO_NET_SERVICE_TYPE: {
6208 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6209 				optval = so->so_netsvctype;
6210 			} else {
6211 				optval = NET_SERVICE_TYPE_BE;
6212 			}
6213 			goto integer;
6214 		}
6215 		case SO_NETSVC_MARKING_LEVEL:
6216 			optval = so_get_netsvc_marking_level(so);
6217 			goto integer;
6218 
6219 		case SO_MPKL_SEND_INFO: {
6220 			struct so_mpkl_send_info so_mpkl_send_info;
6221 
6222 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6223 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6224 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6225 			    sizeof(struct so_mpkl_send_info));
6226 			break;
6227 		}
6228 		case SO_MARK_WAKE_PKT:
6229 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6230 			goto integer;
6231 		case SO_RECV_WAKE_PKT:
6232 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6233 			goto integer;
6234 		case SO_APPLICATION_ID: {
6235 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6236 				error = EINVAL;
6237 				goto out;
6238 			}
6239 			so_application_id_t application_id = { 0 };
6240 			application_id.uid = kauth_cred_getuid(so->so_cred);
6241 			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
6242 			application_id.persona_id = so->so_persona_id;
6243 			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
6244 			break;
6245 		}
6246 		case SO_MARK_DOMAIN_INFO_SILENT:
6247 			optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
6248 			    ? 1 : 0;
6249 			goto integer;
6250 		case SO_MAX_PACING_RATE: {
6251 			uint64_t pacingrate;
6252 
6253 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6254 				error = EINVAL;
6255 				goto out;
6256 			}
6257 
6258 			pacingrate = sotoinpcb(so)->inp_max_pacing_rate;
6259 
6260 			error = sooptcopyout(sopt, &pacingrate, sizeof(pacingrate));
6261 			break;
6262 		}
6263 		case SO_CONNECTION_IDLE: {
6264 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6265 				error = EINVAL;
6266 				goto out;
6267 			}
6268 			optval = sotoinpcb(so)->inp_flags2 & INP2_CONNECTION_IDLE ?
6269 			    1 : 0;
6270 			goto integer;
6271 		}
6272 		default:
6273 			error = ENOPROTOOPT;
6274 			break;
6275 		}
6276 	}
6277 out:
6278 	if (dolock) {
6279 		socket_unlock(so, 1);
6280 	}
6281 	return error;
6282 }
6283 
6284 /*
6285  * The size limits on our soopt_getm is different from that on FreeBSD.
6286  * We limit the size of options to MCLBYTES. This will have to change
6287  * if we need to define options that need more space than MCLBYTES.
6288  */
/*
 * Allocate an mbuf chain through *mp big enough to hold the
 * sopt->sopt_valsize bytes of option data.  The first mbuf gets a
 * cluster when the request exceeds MLEN; subsequent mbufs are chained
 * until the whole size is covered.  Returns 0, EMSGSIZE for
 * zero/oversized requests, or ENOBUFS on allocation failure.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = (int)sopt->sopt_valsize;
	int how;

	/* Options are limited to one cluster (MCLBYTES); see above. */
	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
		return EMSGSIZE;
	}

	/* Only block for mbufs when called on behalf of a user process. */
	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL) {
		return ENOBUFS;
	}
	if (sopt_size > MLEN) {
		/* Too big for a plain mbuf: attach a cluster. */
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Chain further mbufs until the requested size is covered. */
	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			/*
			 * NOTE(review): *mp is left pointing at the freed
			 * chain on failure; callers must discard it on a
			 * non-zero return — confirm all callers do.
			 */
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				/* `m' is not yet linked: free chain and mbuf separately */
				m_freem(*mp);
				m_freem(m);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
6342 
6343 /* copyin sopt data into mbuf chain */
/*
 * Copy option data described by `sopt' from user (or kernel) space
 * into the mbuf chain `m'.  On a copyin fault the entire chain (m0)
 * is freed and the error returned.  sopt_val/sopt_valsize are
 * advanced/decremented as data is consumed.  The chain must have been
 * sized to fit within sopt_valsize; a leftover mbuf is a programming
 * error and panics.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			/* user-space source: must copyin */
			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return error;
			}
		} else {
			/* kernel-space source: plain bcopy */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    m->m_len);
			bcopy(tmp, mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* should be allocated enoughly at ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return 0;
}
6379 
6380 /* copyout mbuf chain data into soopt */
/*
 * Copy data from the mbuf chain `m' out to the buffer described by
 * `sopt'.  On success sopt_valsize is rewritten to the number of
 * bytes actually copied.  Returns EINVAL (freeing the chain) when the
 * caller's buffer is too small for the chain, or the copyout error.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			/* user-space destination: must copyout */
			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return error;
			}
		} else {
			/* kernel-space destination: plain bcopy */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    m->m_len);

			bcopy(mtod(m, char *), tmp, m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		/* track the total for the final sopt_valsize rewrite */
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return EINVAL;
	}
	sopt->sopt_valsize = valsize;
	return 0;
}
6420 
6421 void
sohasoutofband(struct socket * so)6422 sohasoutofband(struct socket *so)
6423 {
6424 	if (so->so_pgid < 0) {
6425 		gsignal(-so->so_pgid, SIGURG);
6426 	} else if (so->so_pgid > 0) {
6427 		proc_signal(so->so_pgid, SIGURG);
6428 	}
6429 	selwakeup(&so->so_rcv.sb_sel);
6430 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6431 		KNOTE(&so->so_rcv.sb_sel.si_note,
6432 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6433 	}
6434 }
6435 
6436 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6437 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6438 {
6439 #pragma unused(cred)
6440 	struct proc *p = current_proc();
6441 	int revents = 0;
6442 
6443 	socket_lock(so, 1);
6444 	so_update_last_owner_locked(so, PROC_NULL);
6445 	so_update_policy(so);
6446 
6447 	if (events & (POLLIN | POLLRDNORM)) {
6448 		if (soreadable(so)) {
6449 			revents |= events & (POLLIN | POLLRDNORM);
6450 		}
6451 	}
6452 
6453 	if (events & (POLLOUT | POLLWRNORM)) {
6454 		if (sowriteable(so)) {
6455 			revents |= events & (POLLOUT | POLLWRNORM);
6456 		}
6457 	}
6458 
6459 	if (events & (POLLPRI | POLLRDBAND)) {
6460 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6461 			revents |= events & (POLLPRI | POLLRDBAND);
6462 		}
6463 	}
6464 
6465 	if (revents == 0) {
6466 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6467 			/*
6468 			 * Darwin sets the flag first,
6469 			 * BSD calls selrecord first
6470 			 */
6471 			so->so_rcv.sb_flags |= SB_SEL;
6472 			selrecord(p, &so->so_rcv.sb_sel, wql);
6473 		}
6474 
6475 		if (events & (POLLOUT | POLLWRNORM)) {
6476 			/*
6477 			 * Darwin sets the flag first,
6478 			 * BSD calls selrecord first
6479 			 */
6480 			so->so_snd.sb_flags |= SB_SEL;
6481 			selrecord(p, &so->so_snd.sb_sel, wql);
6482 		}
6483 	}
6484 
6485 	socket_unlock(so, 1);
6486 	return revents;
6487 }
6488 
6489 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6490 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6491 {
6492 	struct socket *so = (struct socket *)fp_get_data(fp);
6493 	int result;
6494 
6495 	socket_lock(so, 1);
6496 	so_update_last_owner_locked(so, PROC_NULL);
6497 	so_update_policy(so);
6498 
6499 	switch (kn->kn_filter) {
6500 	case EVFILT_READ:
6501 		kn->kn_filtid = EVFILTID_SOREAD;
6502 		break;
6503 	case EVFILT_WRITE:
6504 		kn->kn_filtid = EVFILTID_SOWRITE;
6505 		break;
6506 	case EVFILT_SOCK:
6507 		kn->kn_filtid = EVFILTID_SCK;
6508 		break;
6509 	case EVFILT_EXCEPT:
6510 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6511 		break;
6512 	default:
6513 		socket_unlock(so, 1);
6514 		knote_set_error(kn, EINVAL);
6515 		return 0;
6516 	}
6517 
6518 	/*
6519 	 * call the appropriate sub-filter attach
6520 	 * with the socket still locked
6521 	 */
6522 	result = knote_fops(kn)->f_attach(kn, kev);
6523 
6524 	socket_unlock(so, 1);
6525 
6526 	return result;
6527 }
6528 
/*
 * Shared readiness check for EVFILT_READ knotes.  Returns non-zero
 * when the event fires; when `kev' is non-NULL the event payload is
 * filled in as well.  Called with the socket lock held.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* listener: ready when a completed connection is queued */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			/* OOB pending: fire and report bytes up to the mark */
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		/* read side shut down (and no filtered data queued): EOF */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6607 
6608 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6609 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6610 {
6611 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6612 
6613 	/* socket locked */
6614 
6615 	/*
6616 	 * If the caller explicitly asked for OOB results (e.g. poll())
6617 	 * from EVFILT_READ, then save that off in the hookid field
6618 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6619 	 */
6620 	if (kn->kn_filter == EVFILT_READ &&
6621 	    kn->kn_flags & EV_OOBAND) {
6622 		kn->kn_flags &= ~EV_OOBAND;
6623 		kn->kn_hook32 = EV_OOBAND;
6624 	} else {
6625 		kn->kn_hook32 = 0;
6626 	}
6627 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6628 		so->so_rcv.sb_flags |= SB_KNOTE;
6629 	}
6630 
6631 	/* indicate if event is already fired */
6632 	return filt_soread_common(kn, NULL, so);
6633 }
6634 
6635 static void
filt_sordetach(struct knote * kn)6636 filt_sordetach(struct knote *kn)
6637 {
6638 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6639 
6640 	socket_lock(so, 1);
6641 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6642 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6643 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6644 		}
6645 	}
6646 	socket_unlock(so, 1);
6647 }
6648 
6649 /*ARGSUSED*/
6650 static int
filt_soread(struct knote * kn,long hint)6651 filt_soread(struct knote *kn, long hint)
6652 {
6653 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6654 	int retval;
6655 
6656 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6657 		socket_lock(so, 1);
6658 	}
6659 
6660 	retval = filt_soread_common(kn, NULL, so);
6661 
6662 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6663 		socket_unlock(so, 1);
6664 	}
6665 
6666 	return retval;
6667 }
6668 
6669 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6670 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6671 {
6672 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6673 	int retval;
6674 
6675 	socket_lock(so, 1);
6676 
6677 	/* save off the new input fflags and data */
6678 	kn->kn_sfflags = kev->fflags;
6679 	kn->kn_sdata = kev->data;
6680 
6681 	/* determine if changes result in fired events */
6682 	retval = filt_soread_common(kn, NULL, so);
6683 
6684 	socket_unlock(so, 1);
6685 
6686 	return retval;
6687 }
6688 
6689 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6690 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6691 {
6692 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6693 	int retval;
6694 
6695 	socket_lock(so, 1);
6696 	retval = filt_soread_common(kn, kev, so);
6697 	socket_unlock(so, 1);
6698 
6699 	return retval;
6700 }
6701 
6702 int
so_wait_for_if_feedback(struct socket * so)6703 so_wait_for_if_feedback(struct socket *so)
6704 {
6705 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6706 	    (so->so_state & SS_ISCONNECTED)) {
6707 		struct inpcb *inp = sotoinpcb(so);
6708 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6709 			return 1;
6710 		}
6711 	}
6712 	return 0;
6713 }
6714 
/*
 * Shared writability check for EVFILT_WRITE knotes.  Returns non-zero
 * when the event fires; when `kev' is non-NULL the event payload
 * (available send space) is filled in.  Called with the socket lock
 * held.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	if (so->so_state & SS_CANTSENDMORE) {
		/* write side shut down: EOF */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		/* data may be queued before connect completes */
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* Clamp a NOTE_LOWAT request between sb_lowat and sb_hiwat. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/* defer to the transport's not-sent low-water check */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* suppress the event while awaiting interface feedback */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6826 
6827 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6828 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6829 {
6830 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6831 
6832 	/* socket locked */
6833 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6834 		so->so_snd.sb_flags |= SB_KNOTE;
6835 	}
6836 
6837 	/* determine if its already fired */
6838 	return filt_sowrite_common(kn, NULL, so);
6839 }
6840 
6841 static void
filt_sowdetach(struct knote * kn)6842 filt_sowdetach(struct knote *kn)
6843 {
6844 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6845 	socket_lock(so, 1);
6846 
6847 	if (so->so_snd.sb_flags & SB_KNOTE) {
6848 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6849 			so->so_snd.sb_flags &= ~SB_KNOTE;
6850 		}
6851 	}
6852 	socket_unlock(so, 1);
6853 }
6854 
6855 /*ARGSUSED*/
6856 static int
filt_sowrite(struct knote * kn,long hint)6857 filt_sowrite(struct knote *kn, long hint)
6858 {
6859 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6860 	int ret;
6861 
6862 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6863 		socket_lock(so, 1);
6864 	}
6865 
6866 	ret = filt_sowrite_common(kn, NULL, so);
6867 
6868 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6869 		socket_unlock(so, 1);
6870 	}
6871 
6872 	return ret;
6873 }
6874 
6875 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6876 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6877 {
6878 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6879 	int ret;
6880 
6881 	socket_lock(so, 1);
6882 
6883 	/*save off the new input fflags and data */
6884 	kn->kn_sfflags = kev->fflags;
6885 	kn->kn_sdata = kev->data;
6886 
6887 	/* determine if these changes result in a triggered event */
6888 	ret = filt_sowrite_common(kn, NULL, so);
6889 
6890 	socket_unlock(so, 1);
6891 
6892 	return ret;
6893 }
6894 
6895 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6896 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6897 {
6898 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6899 	int ret;
6900 
6901 	socket_lock(so, 1);
6902 	ret = filt_sowrite_common(kn, kev, so);
6903 	socket_unlock(so, 1);
6904 
6905 	return ret;
6906 }
6907 
/*
 * Shared evaluation for EVFILT_SOCK knotes.  Translates the hint bits
 * and current socket state into kn_fflags, suppresses re-delivery of
 * level-triggered events already recorded in kn_hook32, and fills in
 * *kev when the event fires.  Called with the socket lock held.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered events: driven purely by the hint bits. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/*
	 * Level-triggered events: also derived from current socket state
	 * so they are seen even without a hint.
	 */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		/* only protocols that publish connection info */
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* SUSPEND and RESUME are mutually exclusive; latest wins. */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7048 
7049 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7050 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7051 {
7052 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7053 
7054 	/* socket locked */
7055 	kn->kn_hook32 = 0;
7056 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7057 		so->so_flags |= SOF_KNOTE;
7058 	}
7059 
7060 	/* determine if event already fired */
7061 	return filt_sockev_common(kn, NULL, so, 0);
7062 }
7063 
7064 static void
filt_sockdetach(struct knote * kn)7065 filt_sockdetach(struct knote *kn)
7066 {
7067 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7068 	socket_lock(so, 1);
7069 
7070 	if ((so->so_flags & SOF_KNOTE) != 0) {
7071 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7072 			so->so_flags &= ~SOF_KNOTE;
7073 		}
7074 	}
7075 	socket_unlock(so, 1);
7076 }
7077 
7078 static int
filt_sockev(struct knote * kn,long hint)7079 filt_sockev(struct knote *kn, long hint)
7080 {
7081 	int ret = 0, locked = 0;
7082 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7083 	long ev_hint = (hint & SO_FILT_HINT_EV);
7084 
7085 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7086 		socket_lock(so, 1);
7087 		locked = 1;
7088 	}
7089 
7090 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7091 
7092 	if (locked) {
7093 		socket_unlock(so, 1);
7094 	}
7095 
7096 	return ret;
7097 }
7098 
7099 
7100 
7101 /*
7102  *	filt_socktouch - update event state
7103  */
/*
 * Refresh an EVFILT_SOCK knote from a new kevent.  kn_hook32 records
 * which level-triggered events were already delivered; interest bits
 * that changed are cleared from it so those events can fire again.
 * Returns non-zero when events are pending.
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}
7143 
7144 /*
7145  *	filt_sockprocess - query event fired state and return data
7146  */
7147 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7148 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7149 {
7150 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7151 	int ret = 0;
7152 
7153 	socket_lock(so, 1);
7154 
7155 	ret = filt_sockev_common(kn, kev, so, 0);
7156 
7157 	socket_unlock(so, 1);
7158 
7159 	return ret;
7160 }
7161 
7162 void
get_sockev_state(struct socket * so,u_int32_t * statep)7163 get_sockev_state(struct socket *so, u_int32_t *statep)
7164 {
7165 	u_int32_t state = *(statep);
7166 
7167 	/*
7168 	 * If the state variable is already used by a previous event,
7169 	 * reset it.
7170 	 */
7171 	if (state != 0) {
7172 		return;
7173 	}
7174 
7175 	if (so->so_state & SS_ISCONNECTED) {
7176 		state |= SOCKEV_CONNECTED;
7177 	} else {
7178 		state &= ~(SOCKEV_CONNECTED);
7179 	}
7180 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7181 	*(statep) = state;
7182 }
7183 
/* Worst-case length of the formatted lock history string below. */
#define SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

/*
 * Format the socket's recorded lock/unlock return addresses as
 * "lock:unlock " pairs, oldest first, for diagnostic panic messages.
 * NOTE: returns a pointer to a static buffer, so the result is not
 * reentrant and is only meaningful in panic/debug context.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		/* index relative to the circular cursors: oldest entry first */
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return __unsafe_null_terminated_from_indexable(lock_history_str);
}
7203 
7204 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7205 socket_getlock(struct socket *so, int flags)
7206 {
7207 	if (so->so_proto->pr_getlock != NULL) {
7208 		return (*so->so_proto->pr_getlock)(so, flags);
7209 	} else {
7210 		return so->so_proto->pr_domain->dom_mtx;
7211 	}
7212 }
7213 
7214 void
socket_lock(struct socket * so,int refcount)7215 socket_lock(struct socket *so, int refcount)
7216 {
7217 	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));
7218 
7219 	if (so->so_proto->pr_lock) {
7220 		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
7221 	} else {
7222 #ifdef MORE_LOCKING_DEBUG
7223 		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7224 		    LCK_MTX_ASSERT_NOTOWNED);
7225 #endif
7226 		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7227 		if (refcount) {
7228 			so->so_usecount++;
7229 		}
7230 		so->lock_lr[so->next_lock_lr] = lr_saved;
7231 		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7232 	}
7233 }
7234 
7235 void
socket_lock_assert_owned(struct socket * so)7236 socket_lock_assert_owned(struct socket *so)
7237 {
7238 	lck_mtx_t *mutex_held;
7239 
7240 	if (so->so_proto->pr_getlock != NULL) {
7241 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7242 	} else {
7243 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7244 	}
7245 
7246 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7247 }
7248 
7249 int
socket_try_lock(struct socket * so)7250 socket_try_lock(struct socket *so)
7251 {
7252 	lck_mtx_t *mtx;
7253 
7254 	if (so->so_proto->pr_getlock != NULL) {
7255 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7256 	} else {
7257 		mtx = so->so_proto->pr_domain->dom_mtx;
7258 	}
7259 
7260 	return lck_mtx_try_lock(mtx);
7261 }
7262 
/*
 * socket_unlock - release the socket's lock, optionally dropping one
 * use-count reference (refcount != 0).  When the last reference is
 * dropped, sofreelastref() is invoked (while the domain mutex is still
 * held) to dispose of the socket.  Panics on a NULL protocol or on a
 * use-count underflow, dumping the lock history for diagnosis.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	lck_mtx_t *mutex_held;
	/* Caller's return address, recorded in the unlock history below. */
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		/* Protocol does its own unlock and debug bookkeeping. */
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* Record caller in the circular unlock history (debug aid). */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				/* Last reference: dispose before unlocking. */
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7301 
7302 /* Called with socket locked, will unlock socket */
7303 void
sofree(struct socket * so)7304 sofree(struct socket *so)
7305 {
7306 	lck_mtx_t *mutex_held;
7307 
7308 	if (so->so_proto->pr_getlock != NULL) {
7309 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7310 	} else {
7311 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7312 	}
7313 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7314 
7315 	sofreelastref(so, 0);
7316 }
7317 
/*
 * soreference - take one use-count reference on the socket.
 * Implemented as a lock (which bumps so_usecount) immediately followed
 * by an unlock that does not drop the count.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7324 
/*
 * sodereference - drop one use-count reference on the socket.
 * Locks without taking a reference, then unlocks while dropping one;
 * if that was the last reference the socket is freed by socket_unlock().
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7331 
7332 /*
7333  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7334  * possibility of using jumbo clusters.  Caller must ensure to hold
7335  * the socket lock.
7336  */
7337 void
somultipages(struct socket * so,boolean_t set)7338 somultipages(struct socket *so, boolean_t set)
7339 {
7340 	if (set) {
7341 		so->so_flags |= SOF_MULTIPAGES;
7342 	} else {
7343 		so->so_flags &= ~SOF_MULTIPAGES;
7344 	}
7345 }
7346 
7347 void
soif2kcl(struct socket * so,boolean_t set)7348 soif2kcl(struct socket *so, boolean_t set)
7349 {
7350 	if (set) {
7351 		so->so_flags1 |= SOF1_IF_2KCL;
7352 	} else {
7353 		so->so_flags1 &= ~SOF1_IF_2KCL;
7354 	}
7355 }
7356 
7357 int
so_isdstlocal(struct socket * so)7358 so_isdstlocal(struct socket *so)
7359 {
7360 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7361 
7362 	if (SOCK_DOM(so) == PF_INET) {
7363 		return inaddr_local(inp->inp_faddr);
7364 	} else if (SOCK_DOM(so) == PF_INET6) {
7365 		return in6addr_local(&inp->in6p_faddr);
7366 	}
7367 
7368 	return 0;
7369 }
7370 
/*
 * sosetdefunct - mark a socket defunct (SOF_DEFUNCT), set SB_DROP on both
 * socket buffers so no further data is appended, and flush any data they
 * already hold.
 *
 * Returns 0 on success (or when the socket is already defunct), EOPNOTSUPP
 * when the socket opted out via SOF_NODEFUNCT and noforce is set, or when
 * the socket is granted an extended background-idle grace period instead
 * of being defuncted right away.
 *
 * NOTE(review): the unprotected sb_flags updates suggest the caller holds
 * the socket lock — confirm at call sites.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both buffers must have SB_DROP; nothing to do. */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		/* Socket opted out; only a forced request may proceed. */
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* Forced: clear the opt-out and fall through to defunct. */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * Socket asked for extended background-idle time; decide
		 * whether it qualifies (not cellular, not delegated, a
		 * non-zero grace time configured, and not forced).
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* Grant the grace period instead of defuncting now. */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			/* Re-check later via the lazy inpcb timer. */
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7480 
/*
 * sodefunct - carry out the actual defunct of a socket previously marked
 * SOF_DEFUNCT by sosetdefunct(): notify the protocol (pru_defunct), wake
 * threads blocked in sbwait()/sb_lock(), shut down both data directions,
 * disconnect, flush both socket buffers, and set SS_DEFUNCT.
 *
 * Panics if SOF_DEFUNCT is not set; a no-op (returns 0) if SS_DEFUNCT is
 * already set.  Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void)  (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	/* Surface an error so pending/future callers don't block forever. */
	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7593 
/*
 * soresume - take a socket out of the extended background-idle
 * in-progress state (SOF1_EXTEND_BK_IDLE_INPROG), clearing the grace
 * timer start and the per-process P_LXBKIDLEINPROG advisory flag, and
 * updating the extended-idle statistics.  Locks the socket itself when
 * locked == 0.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7623 
7624 /*
7625  * Does not attempt to account for sockets that are delegated from
7626  * the current process
7627  */
7628 int
so_set_extended_bk_idle(struct socket * so,int optval)7629 so_set_extended_bk_idle(struct socket *so, int optval)
7630 {
7631 	int error = 0;
7632 
7633 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7634 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7635 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7636 		error = EOPNOTSUPP;
7637 	} else if (optval == 0) {
7638 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7639 
7640 		soresume(current_proc(), so, 1);
7641 	} else {
7642 		struct proc *p = current_proc();
7643 		struct fileproc *fp;
7644 		int count = 0;
7645 
7646 		/*
7647 		 * Unlock socket to avoid lock ordering issue with
7648 		 * the proc fd table lock
7649 		 */
7650 		socket_unlock(so, 0);
7651 
7652 		proc_fdlock(p);
7653 		fdt_foreach(fp, p) {
7654 			struct socket *so2;
7655 
7656 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7657 				continue;
7658 			}
7659 
7660 			so2 = (struct socket *)fp_get_data(fp);
7661 			if (so != so2 &&
7662 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7663 				count++;
7664 			}
7665 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7666 				break;
7667 			}
7668 		}
7669 		proc_fdunlock(p);
7670 
7671 		socket_lock(so, 0);
7672 
7673 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7674 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7675 			error = EBUSY;
7676 		} else if (so->so_flags & SOF_DELEGATED) {
7677 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7678 			error = EBUSY;
7679 		} else {
7680 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7681 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7682 		}
7683 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7684 		    "%s marked for extended bk idle\n",
7685 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7686 		    so->so_gencnt,
7687 		    SOCK_DOM(so), SOCK_TYPE(so),
7688 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7689 		    "is" : "not");
7690 	}
7691 
7692 	return error;
7693 }
7694 
/*
 * so_stop_extended_bk_idle - terminate an extended background-idle grace
 * period: clear SOF1_EXTEND_BK_IDLE_INPROG, update the stats, then force
 * the socket defunct via sosetdefunct()/sodefunct() at the internal
 * disconnect level.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7713 
7714 void
so_drain_extended_bk_idle(struct socket * so)7715 so_drain_extended_bk_idle(struct socket *so)
7716 {
7717 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7718 		/*
7719 		 * Only penalize sockets that have outstanding data
7720 		 */
7721 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7722 			so_stop_extended_bk_idle(so);
7723 
7724 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7725 		}
7726 	}
7727 }
7728 
/*
 * Return values tells if socket is still in extended background idle
 */
/*
 * Returns 1 if the socket remains in (or never entered) the extended
 * background-idle state; returns 0 when the grace period has expired
 * and the socket was forced defunct.  If the period has not yet
 * expired, reschedules the lazy inpcb timer to check again later.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));
		/* Grace period elapsed? (so_xbkidle_time is in seconds of uptime) */
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			/* Not yet expired: check again on the lazy timer. */
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}
7759 
/*
 * resume_proc_sockets - resume every socket belonging to a process that
 * has at least one socket in extended background idle (as indicated by
 * the P_LXBKIDLEINPROG advisory flag), then clear that flag.  Walks the
 * process's fd table under proc_fdlock().
 */
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			/* soresume() is a no-op for sockets not in bk idle. */
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
7781 
7782 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7783 so_set_recv_anyif(struct socket *so, int optval)
7784 {
7785 	int ret = 0;
7786 
7787 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7788 		if (optval) {
7789 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7790 		} else {
7791 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7792 		}
7793 #if SKYWALK
7794 		inp_update_netns_flags(so);
7795 #endif /* SKYWALK */
7796 	}
7797 
7798 
7799 	return ret;
7800 }
7801 
7802 __private_extern__ int
so_get_recv_anyif(struct socket * so)7803 so_get_recv_anyif(struct socket *so)
7804 {
7805 	int ret = 0;
7806 
7807 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7808 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7809 	}
7810 
7811 	return ret;
7812 }
7813 
/*
 * so_set_restrictions - OR the given SO_RESTRICT_DENY_* bits into the
 * socket's restrictions and propagate newly-set cellular / expensive /
 * constrained denials to the inpcb (INET/INET6) or the MPTCP layer
 * (PF_MULTIPATH).  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		/* Nothing newly set; no propagation needed. */
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7872 
/*
 * so_get_restrictions - return the socket's deny restrictions.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED can be set via
 * so_set_restrictions() but is not included in this mask, so callers
 * never see it reported — confirm whether that omission is intentional.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
7880 
/*
 * so_set_effective_pid - delegate the socket to the process identified by
 * epid: set SOF_DELEGATED and record that process's unique id, pid, and
 * executable UUID as the socket's effective identity (e_upid/e_pid/e_uuid).
 * Delegating to the issuing process itself clears the delegation instead.
 * On success the socket's NECP/IO policy is re-evaluated.
 *
 * Returns 0, or EINVAL (epid 0), EACCES (in-kernel socket changed from
 * userland, or insufficient privilege), ESRCH (no such process).
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	/*
	 * NOTE(review): with ||, the privilege check runs unless epid
	 * matches BOTH last_pid and proc_pid(p) — stricter than the
	 * "or" in the comment above suggests; confirm intent.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* Also record the responsible process, when distinct. */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Negating the generation count presumably forces
		 * so_update_policy() to re-evaluate — confirm against
		 * so_update_policy()'s gencnt comparison.
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
7996 
/*
 * so_set_effective_uuid - delegate the socket to the process whose
 * executable UUID is euuid: set SOF_DELEGATED and record euuid as the
 * socket's effective UUID.  Unlike so_set_effective_pid(), only the UUID
 * is known here, so e_upid/e_pid inherit the socket's real owner values.
 * Passing the issuing process's own UUID clears the delegation instead.
 * On success the socket's NECP/IO policy is re-evaluated.
 *
 * Returns 0, or EINVAL (null UUID), EACCES (in-kernel socket changed
 * from userland, or insufficient privilege).
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	/*
	 * NOTE(review): with ||, the privilege check runs unless euuid
	 * matches BOTH last_uuid and the issuer's uuid — stricter than
	 * the "or" in the comment above suggests; confirm intent.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Negating the generation count presumably forces
		 * so_update_policy() to re-evaluate — confirm against
		 * so_update_policy()'s gencnt comparison.
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8100 
8101 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8102 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8103     uint32_t ev_datalen)
8104 {
8105 	struct kev_msg ev_msg;
8106 
8107 	/*
8108 	 * A netpolicy event always starts with a netpolicy_event_data
8109 	 * structure, but the caller can provide for a longer event
8110 	 * structure to post, depending on the event code.
8111 	 */
8112 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8113 
8114 	bzero(&ev_msg, sizeof(ev_msg));
8115 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8116 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8117 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8118 	ev_msg.event_code       = ev_code;
8119 
8120 	ev_msg.dv[0].data_ptr   = ev_data;
8121 	ev_msg.dv[0].data_length = ev_datalen;
8122 
8123 	kev_post_msg(&ev_msg);
8124 }
8125 
8126 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8127 socket_post_kev_msg(uint32_t ev_code,
8128     struct kev_socket_event_data *ev_data,
8129     uint32_t ev_datalen)
8130 {
8131 	struct kev_msg ev_msg;
8132 
8133 	bzero(&ev_msg, sizeof(ev_msg));
8134 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8135 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8136 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8137 	ev_msg.event_code = ev_code;
8138 
8139 	ev_msg.dv[0].data_ptr = ev_data;
8140 	ev_msg.dv[0].data_length = ev_datalen;
8141 
8142 	kev_post_msg(&ev_msg);
8143 }
8144 
/*
 * socket_post_kev_msg_closed - post a KEV_SOCKET_CLOSED event for a
 * socket that opted in via SOF1_WANT_KEV_SOCK_CLOSED, including both
 * the local and peer sockaddrs (truncated to the event's field sizes).
 * Silently does nothing if either address cannot be obtained.
 */
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *__single socksa = NULL, *__single peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* Copy at most the event field size of each address. */
			SOCKADDR_COPY(socksa, &ev.ev_data.kev_sockname,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			SOCKADDR_COPY(peersa, &ev.ev_data.kev_peername,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* pru_sockaddr/pru_peeraddr allocate; free_sockaddr handles NULL. */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8173 
/*
 * sock_parse_cm_info - walk the SOL_SOCKET control messages in `control'
 * and fill in the recognized fields of `sockcminfo': traffic class
 * (SO_TRAFFIC_CLASS, also accepting net-service-type values offset by
 * SO_TC_NET_SERVICE_OFFSET), net service type (SO_NET_SERVICE_TYPE,
 * which also derives sotc), and transmit time (SCM_TXTIME).
 * Malformed or unrecognized cmsgs are skipped.
 */
void
sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo)
{
	struct cmsghdr *cm;

	for (cm = M_FIRST_CMSGHDR(control);
	    is_cmsg_valid(control, cm);
	    cm = M_NXT_CMSGHDR(control, cm)) {
		int val;

		if (cm->cmsg_level != SOL_SOCKET) {
			continue;
		}

		/*
		 * val is only assigned for int-sized payloads; every use
		 * below is guarded by the same cmsg_len check, so it is
		 * never read uninitialized.
		 */
		if (cm->cmsg_len == CMSG_LEN(sizeof(int))) {
			val = *(int *)(void *)CMSG_DATA(cm);
		}

		switch (cm->cmsg_type) {
		case SO_TRAFFIC_CLASS:
			if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
				break;
			}
			if (SO_VALID_TC(val)) {
				sockcminfo->sotc = val;
				break;
			} else if (val < SO_TC_NET_SERVICE_OFFSET) {
				break;
			}
			/*
			 * Handle the case SO_NET_SERVICE_TYPE values are
			 * passed using SO_TRAFFIC_CLASS
			 */
			val = val - SO_TC_NET_SERVICE_OFFSET;

			OS_FALLTHROUGH;
		case SO_NET_SERVICE_TYPE:
			if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
				break;
			}

			if (!IS_VALID_NET_SERVICE_TYPE(val)) {
				break;
			}
			/* Service type also implies a traffic class. */
			sockcminfo->netsvctype = val;
			sockcminfo->sotc = sotc_by_netservicetype[val];
			break;
		case SCM_TXTIME:
			if (cm->cmsg_len != CMSG_LEN(sizeof(uint64_t))) {
				break;
			}

			sockcminfo->tx_time = *(uint64_t *)(void *)CMSG_DATA(cm);
			break;
		default:
			break;
		}
	}
}
8233 
/*
 * assfail - assertion-failure handler: panic with the failed expression,
 * file, and line.  Declared int for historical callers but never returns.
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8242