xref: /xnu-12377.1.9/bsd/kern/uipc_socket.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 1998-2022, 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <net/sockaddr_utils.h>
106 #include <netinet/in.h>
107 #include <netinet/in_pcb.h>
108 #include <netinet/in_tclass.h>
109 #include <netinet/in_var.h>
110 #include <netinet/tcp_var.h>
111 #include <netinet/ip6.h>
112 #include <netinet6/ip6_var.h>
113 #include <netinet/flow_divert.h>
114 #include <kern/assert.h>
115 #include <kern/locks.h>
116 #include <kern/mem_acct.h>
117 #include <kern/policy_internal.h>
118 #include <kern/uipc_domain.h>
119 #include <kern/uipc_socket.h>
120 #include <kern/task.h>
121 #include <kern/zalloc.h>
122 #include <machine/limits.h>
123 #include <libkern/OSAtomic.h>
124 #include <pexpert/pexpert.h>
125 
126 #include <sys/kpi_mbuf.h>
127 #include <sys/mcache.h>
128 #include <sys/unpcb.h>
129 #include <libkern/section_keywords.h>
130 
131 #include <os/log.h>
132 
133 #if CONFIG_MACF
134 #include <security/mac_framework.h>
135 #endif /* MAC */
136 
137 #if MULTIPATH
138 #include <netinet/mp_pcb.h>
139 #include <netinet/mptcp_var.h>
140 #endif /* MULTIPATH */
141 
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

/*
 * On DEBUG/DEVELOPMENT kernels expose raw kernel addresses for easier
 * debugging; release kernels obfuscate them via VM_KERNEL_ADDRPERM.
 */
#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

/* Guards socketinit() against running more than once. */
static int              socketinit_done;
/* Memory-accounting handle for the socket layer; set up in socketinit(). */
struct mem_acct *socket_memacct;

#include <machine/limits.h>  /* NOTE(review): duplicate include; also included above */

/* kqueue filter routines for EVFILT_READ on sockets. */
static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

/* kqueue filter routines for EVFILT_WRITE on sockets. */
static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

/* kqueue filter routines for EVFILT_SOCK on sockets. */
static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);
static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Helpers to copy struct timeval socket options in/out of user space. */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/* EVFILT_EXCEPT shares the read-filter implementation. */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

int sosend_assert_panic = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t        so_gencnt;      /* generation count for sockets */

/* kdebug trace codes for the socket layer. */
#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy  = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface
 * Set this to 1  for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
#endif /* DEBUG || DEVELOPMENT */

/* Count of times an accept had to wait for the listener's incomplete list. */
int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

/* Statistics and tunables for extended background idle sockets. */
struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
351 
/*
 * One-time initialization of the socket layer: validates layout
 * invariants, parses boot-args, installs extended background idle
 * defaults, and brings up PCB and memory-accounting state.
 */
void
socketinit(void)
{
	/* so_gencnt is bumped with a 64-bit atomic; verify size/alignment. */
	static_assert(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	/*
	 * The kernel's struct sa_endpoints must be layout-compatible with
	 * the matching user-space variant so it can be copied in verbatim.
	 */
#ifdef __LP64__
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	/* Run at most once. */
	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow debugging knobs to be set from the boot command line. */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	/* Reset stats and install extended background idle defaults. */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();

	/* Register the socket layer with kernel memory accounting. */
	socket_memacct = mem_acct_register("SOCKET", 0, 0);
	if (socket_memacct == NULL) {
		panic("mem_acct_register returned NULL");
	}
}
398 
399 void
so_update_last_owner_locked(struct socket * so,proc_t self)400 so_update_last_owner_locked(struct socket *so, proc_t self)
401 {
402 	if (so->last_pid != 0) {
403 		/*
404 		 * last_pid and last_upid should remain zero for sockets
405 		 * created using sock_socket. The check above achieves that
406 		 */
407 		if (self == PROC_NULL) {
408 			self = current_proc();
409 		}
410 
411 		if (so->last_upid != proc_uniqueid(self) ||
412 		    so->last_pid != proc_pid(self)) {
413 			so->last_upid = proc_uniqueid(self);
414 			so->last_pid = proc_pid(self);
415 			proc_getexecutableuuid(self, so->last_uuid,
416 			    sizeof(so->last_uuid));
417 			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
418 				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
419 			}
420 		}
421 		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
422 	}
423 }
424 
425 void
so_update_policy(struct socket * so)426 so_update_policy(struct socket *so)
427 {
428 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
429 		(void) inp_update_policy(sotoinpcb(so));
430 	}
431 }
432 
#if NECP
/*
 * Re-evaluate NECP policy for an IPv4/IPv6 socket, optionally overriding
 * the local and/or remote address used for matching.  No-op for other
 * domains.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	const int dom = SOCK_DOM(so);

	if (dom != PF_INET && dom != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
444 
445 /*
446  * Get a socket structure from our zone, and initialize it.
447  *
448  * Note that it would probably be better to allocate socket
449  * and PCB at the same time, but I'm not convinced that all
450  * the protocols can be easily modified to do this.
451  */
452 struct socket *
soalloc(void)453 soalloc(void)
454 {
455 	struct socket *__single so;
456 
457 	so = zalloc_flags(socket_zone, Z_WAITOK_ZERO);
458 	if (so != NULL) {
459 		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
460 
461 		/*
462 		 * Increment the socket allocation statistics
463 		 */
464 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
465 	}
466 
467 	return so;
468 }
469 
/*
 * Create and initialize a socket for the given domain/type/protocol on
 * behalf of process p; ep, if not PROC_NULL, is the delegated ("effective")
 * process.  flags carries SOCF_* creation flags (e.g. SOCF_MPTCP).
 * On success *aso holds the new socket with one use count; on failure
 * *aso is NULL and an errno value is returned.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;

	VERIFY(aso != NULL);
	*aso = NULL;

	/* Resolve the protocol switch entry: by protocol, or by type. */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	/*
	 * Distinguish the failure cases so the right errno is returned:
	 * unknown domain, protocol exists but with a different type, or
	 * the protocol is simply not supported.
	 */
	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	/* Refuse creation when the protocol's memory budget is exhausted. */
	if (proto_memacct_hardlimit(prp)) {
		return ENOBUFS;
	}
	so = soalloc();
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain creation statistics. */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflow sockets start in non-blocking mode. */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	/* Record the creating process as the initial "last owner". */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	/* Delegated creation: remember the effective process' identity. */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	/* Fall back to the creating process' responsible pid, if distinct. */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	/* Take a credential reference; released in sodealloc(). */
	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/* Charge the socket against the protocol's memory accounting. */
	proto_memacct_add(so->so_proto, sizeof(struct socket));

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
680 
681 /*
682  * Returns:	0			Success
683  *		EAFNOSUPPORT
684  *		EPROTOTYPE
685  *		EPROTONOSUPPORT
686  *		ENOBUFS
687  *	<pru_attach>:ENOBUFS[AF_UNIX]
688  *	<pru_attach>:ENOBUFS[TCP]
689  *	<pru_attach>:ENOMEM[TCP]
690  *	<pru_attach>:???		[other protocol families, IPSEC]
691  */
692 int
socreate(int dom,struct socket ** aso,int type,int proto)693 socreate(int dom, struct socket **aso, int type, int proto)
694 {
695 	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
696 	           PROC_NULL);
697 }
698 
699 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)700 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
701 {
702 	int error = 0;
703 	struct proc *ep = PROC_NULL;
704 
705 	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
706 		error = ESRCH;
707 		goto done;
708 	}
709 
710 	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
711 
712 	/*
713 	 * It might not be wise to hold the proc reference when calling
714 	 * socreate_internal since it calls soalloc with M_WAITOK
715 	 */
716 done:
717 	if (ep != PROC_NULL) {
718 		proc_rele(ep);
719 	}
720 
721 	return error;
722 }
723 
724 /*
725  * Returns:	0			Success
726  *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
727  *	<pru_bind>:EAFNOSUPPORT		Address family not supported
728  *	<pru_bind>:EADDRNOTAVAIL	Address not available.
729  *	<pru_bind>:EINVAL		Invalid argument
730  *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
731  *	<pru_bind>:EACCES		Permission denied
732  *	<pru_bind>:EADDRINUSE		Address in use
733  *	<pru_bind>:EAGAIN		Resource unavailable, try again
734  *	<pru_bind>:EPERM		Operation not permitted
735  *	<pru_bind>:???
736  *	<sf_bind>:???
737  *
738  * Notes:	It's not possible to fully enumerate the return codes above,
739  *		since socket filter authors and protocol family authors may
740  *		not choose to limit their error returns to those listed, even
741  *		though this may result in some software operating incorrectly.
742  *
743  *		The error codes which are enumerated above are those known to
744  *		be returned by the tcp_usr_bind function supplied.
745  */
746 int
sobindlock(struct socket * so,struct sockaddr * nam,int dolock)747 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
748 {
749 	struct proc *p = current_proc();
750 	int error = 0;
751 
752 	if (dolock) {
753 		socket_lock(so, 1);
754 	}
755 
756 	so_update_last_owner_locked(so, p);
757 	so_update_policy(so);
758 
759 #if NECP
760 	so_update_necp_policy(so, nam, NULL);
761 #endif /* NECP */
762 
763 	/*
764 	 * If this is a bind request on a socket that has been marked
765 	 * as inactive, reject it now before we go any further.
766 	 */
767 	if (so->so_flags & SOF_DEFUNCT) {
768 		error = EINVAL;
769 		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
770 		    __func__, proc_pid(p), proc_best_name(p),
771 		    so->so_gencnt,
772 		    SOCK_DOM(so), SOCK_TYPE(so), error);
773 		goto out;
774 	}
775 
776 	/* Socket filter */
777 	error = sflt_bind(so, nam);
778 
779 	if (error == 0) {
780 		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
781 	}
782 out:
783 	if (dolock) {
784 		socket_unlock(so, 1);
785 	}
786 
787 	if (error == EJUSTRETURN) {
788 		error = 0;
789 	}
790 
791 	return error;
792 }
793 
/*
 * Final teardown of a socket: release accounting, credentials and
 * filters, then return the memory to the zone.
 */
void
sodealloc(struct socket *so)
{
	/* Credit back the memory charged in socreate_internal(). */
	proto_memacct_sub(so->so_proto, sizeof(struct socket));

	/* Drop the credential reference taken at creation time. */
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* Bump the generation count one last time before the slot is reused. */
	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	zfree(socket_zone, so);
}
808 
809 /*
810  * Returns:	0			Success
811  *		EINVAL
812  *		EOPNOTSUPP
813  *	<pru_listen>:EINVAL[AF_UNIX]
814  *	<pru_listen>:EINVAL[TCP]
815  *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
816  *	<pru_listen>:EINVAL[TCP]	Invalid argument
817  *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
818  *	<pru_listen>:EACCES[TCP]	Permission denied
819  *	<pru_listen>:EADDRINUSE[TCP]	Address in use
820  *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
821  *	<pru_listen>:EPERM[TCP]		Operation not permitted
822  *	<sf_listen>:???
823  *
824  * Notes:	Other <pru_listen> returns depend on the protocol family; all
825  *		<sf_listen> returns depend on what the filter author causes
826  *		their filter to return.
827  */
828 int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * Optimistically mark the socket as accepting connections (only
	 * when the completed-connection queue is empty); every error path
	 * below clears SO_ACCEPTCONN again before bailing out.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/* No protocol attached: cannot listen */
	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* listen(2) only makes sense for connection-oriented protocols */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Inbound traffic administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Give the socket filters a chance, then the protocol (pru_listen) */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled, no error" */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
920 
921 /*
922  * The "accept list lock" protects the fields related to the listener queues
923  * because we can unlock a socket to respect the lock ordering between
924  * the listener socket and its clients sockets. The lock ordering is first to
925  * acquire the client socket before the listener socket.
926  *
927  * The accept list lock serializes access to the following fields:
928  * - of the listener socket:
929  *   - so_comp
930  *   - so_incomp
931  *   - so_qlen
932  *   - so_inqlen
933  * - of client sockets that are in so_comp or so_incomp:
934  *   - so_head
935  *   - so_list
936  *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
939  *
940  * Note that those fields may be read without holding the accept list lock
941  * for a preflight provided the accept list lock is taken when committing
942  * to take an action based on the result of the preflight. The preflight
943  * saves the cost of doing the unlock/lock dance.
944  */
/*
 * Take exclusive ownership of the listener's accept lists by setting
 * SOF1_ACCEPT_LIST_HELD on `head' (see the block comment above for the
 * fields this serializes).  `so', when non-NULL, is a client socket
 * already locked by the caller; it is unlocked while we sleep and
 * re-locked before the listener afterwards, preserving the documented
 * client-before-listener lock ordering.  No-op for protocols without
 * per-socket locks.  Caller must hold the listener's lock.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the accept list; claim it and return */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/* Drop the client lock before sleeping (lock ordering) */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	/* Wait for the current holder; so_release_accept_list() wakes us */
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		/*
		 * Re-acquire in the required order: client socket first,
		 * then the listener.
		 */
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
975 
976 void
so_release_accept_list(struct socket * head)977 so_release_accept_list(struct socket *head)
978 {
979 	if (head->so_proto->pr_getlock != NULL) {
980 		lck_mtx_t *mutex_held;
981 
982 		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
983 		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
984 
985 		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
986 		wakeup((caddr_t)&head->so_incomp);
987 	}
988 }
989 
/*
 * Release the last reference on a socket: detach flow-divert /
 * content-filter / datagram-flow-tracking state, unlink the socket from
 * its listener's queues if still queued, flush the socket buffers, and
 * optionally deallocate.  Caller must hold the socket lock.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready for reclaim yet: the protocol still owns the PCB or a
	 * file-descriptor reference remains.  Quiesce select/upcall state
	 * and bail.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the incomplete-connection queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			/* Has a listener but is on neither queue: unexpected */
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1078 
/*
 * Block a closing socket until outstanding upcalls have drained.
 * Only waits when the socket opted in via SOF_UPCALLCLOSEWAIT and an
 * upcall is actually in flight.  Caller must hold the socket lock; the
 * lock is dropped while asleep (msleep) and re-held on return.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Prevent new upcalls from being armed while we wait */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/*
	 * Sleep on &so->so_upcallusecount; presumably woken when the last
	 * upcall reference drops (wakeup site is not in this file chunk).
	 */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1107 
1108 /*
1109  * Close a socket on last file table reference removal.
1110  * Initiate disconnect if connected.
1111  * Free socket when disconnect complete.
1112  */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	/* Let the socket filters know the socket is going away */
	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/* End any extended background-idle grace period in progress */
	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	/* Listening socket: abort everything still sitting on its queues */
	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connection to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq
			 * they will get cleanup by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		/*
		 * NOTE(review): incomp_overflow_only is set to 1 at "again:"
		 * and never cleared in this function, so this branch appears
		 * unreachable -- confirm intent.  The panic text also says
		 * "so_comp" while the check is on so_incomp (looks like a
		 * copy/paste of the check below) -- confirm.
		 */
		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}

	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		/* SO_LINGER: wait (bounded) for the disconnect to complete */
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
				lck_mtx_t *mutex_held;

				if (so->so_proto->pr_getlock != NULL) {
					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
				} else {
					mutex_held = so->so_proto->pr_domain->dom_mtx;
				}
				/*
				 * Convert so_linger to a timespec; the /100
				 * and %100 suggest so_linger is in 1/100 s
				 * ticks here -- confirm against setsockopt.
				 */
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	/* Detach the protocol control block, preserving any earlier error */
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);

	/* Drop the file reference; sofree() reclaims if it was the last */
	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}
1335 
1336 int
soclose(struct socket * so)1337 soclose(struct socket *so)
1338 {
1339 	int error = 0;
1340 	socket_lock(so, 1);
1341 
1342 	if (so->so_retaincnt == 0) {
1343 		error = soclose_locked(so);
1344 	} else {
1345 		/*
1346 		 * if the FD is going away, but socket is
1347 		 * retained in kernel remove its reference
1348 		 */
1349 		so->so_usecount--;
1350 		if (so->so_usecount < 2) {
1351 			panic("soclose: retaincnt non null and so=%p "
1352 			    "usecount=%d\n", so, so->so_usecount);
1353 		}
1354 	}
1355 	socket_unlock(so, 1);
1356 	return error;
1357 }
1358 
1359 /*
1360  * Must be called at splnet...
1361  */
1362 /* Should already be locked */
1363 int
soabort(struct socket * so)1364 soabort(struct socket *so)
1365 {
1366 	int error;
1367 
1368 #ifdef MORE_LOCKING_DEBUG
1369 	lck_mtx_t *mutex_held;
1370 
1371 	if (so->so_proto->pr_getlock != NULL) {
1372 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1373 	} else {
1374 		mutex_held = so->so_proto->pr_domain->dom_mtx;
1375 	}
1376 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1377 #endif
1378 
1379 	if ((so->so_flags & SOF_ABORTED) == 0) {
1380 		so->so_flags |= SOF_ABORTED;
1381 		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1382 		if (error) {
1383 			sofree(so);
1384 			return error;
1385 		}
1386 	}
1387 	return 0;
1388 }
1389 
1390 int
soacceptlock(struct socket * so,struct sockaddr ** nam,int dolock)1391 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1392 {
1393 	int error;
1394 
1395 	if (dolock) {
1396 		socket_lock(so, 1);
1397 	}
1398 
1399 	so_update_last_owner_locked(so, PROC_NULL);
1400 	so_update_policy(so);
1401 #if NECP
1402 	so_update_necp_policy(so, NULL, NULL);
1403 #endif /* NECP */
1404 
1405 	if ((so->so_state & SS_NOFDREF) == 0) {
1406 		panic("soaccept: !NOFDREF");
1407 	}
1408 	so->so_state &= ~SS_NOFDREF;
1409 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1410 
1411 	if (dolock) {
1412 		socket_unlock(so, 1);
1413 	}
1414 	return error;
1415 }
1416 
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	/* Convenience wrapper: accept with locking handled internally */
	int error;

	error = soacceptlock(so, nam, 1);
	return error;
}
1422 
/*
 * Run the accept socket filters over a freshly accepted socket `so'
 * from listener `head'.  If a filter rejects the connection the socket
 * is closed and the filter's error propagated; EJUSTRETURN marks the
 * socket defunct but still hands it to the caller.  Returns 0 or errno.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *__single local = NULL, *__single remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Not yet exposed to filters: safe to close unlocked */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1478 
1479 /*
1480  * Returns:	0			Success
1481  *		EOPNOTSUPP		Operation not supported on socket
1482  *		EISCONN			Socket is connected
1483  *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1484  *	<pru_connect>:EINVAL		Invalid argument
1485  *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1486  *	<pru_connect>:EACCES		Permission denied
1487  *	<pru_connect>:EADDRINUSE	Address in use
1488  *	<pru_connect>:EAGAIN		Resource unavailable, try again
1489  *	<pru_connect>:EPERM		Operation not permitted
1490  *	<sf_connect_out>:???		[anything a filter writer might set]
1491  */
/*
 * Initiate a connection on a socket (connect(2) backend).  Rejects
 * listening or defunct sockets (EOPNOTSUPP) and sockets restricted for
 * outbound traffic (EPERM); connection-oriented sockets that are already
 * connected/connecting yield EISCONN.  Otherwise performs tracker and
 * NECP bookkeeping and runs the connect-out socket filters before
 * handing off to the protocol's pru_connect.  When dolock is non-zero
 * this routine takes and releases the socket lock; otherwise the caller
 * must already hold it.
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain.  Mark socket accordingly.  Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			/* Use the effective UUID when the traffic is delegated */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so,
				    __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
		if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
			// Mark extended timeout on tracker lookup to ensure that the entry stays around
			tracker_metadata_t update_metadata = { };
			update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
			(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata);
		}
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN means a filter took over the connect */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1600 
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	/* Convenience wrapper: connect with locking handled internally */
	int error;

	error = soconnectlock(so, nam, 1);
	return error;
}
1606 
1607 /*
1608  * Returns:	0			Success
1609  *	<pru_connect2>:EINVAL[AF_UNIX]
1610  *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1611  *	<pru_connect2>:???		[other protocol families]
1612  *
1613  * Notes:	<pru_connect2> is not supported by [TCP].
1614  */
/*
 * Connect two sockets to each other (socketpair-style) via the
 * protocol's pru_connect2.  so2 gets its own lock only when pr_lock is
 * set -- presumably the per-socket-lock case, mirroring the pr_getlock
 * pattern used elsewhere in this file; confirm against the protocol
 * switch.  Note the unlock order (so1 before so2) is deliberate and
 * must not be "fixed" casually.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1633 
/*
 * connectx(2) backend: initiate a connection, optionally with initial
 * data (TFO-style preconnect) and resume-on-read/write semantics.
 * Mirrors soconnectlock()'s rejection rules (listening/defunct ->
 * EOPNOTSUPP, outbound-restricted -> EPERM, already connected ->
 * EISCONN unless PR_MULTICONN), then performs tracker/NECP bookkeeping,
 * runs the connect-out filters, and dispatches to pru_connectx.
 * Caller must hold the socket lock.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			/* Use the effective UUID when the traffic is delegated */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data pending */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}

#if NECP
			if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
				// Mark extended timeout on tracker lookup to ensure that the entry stays around
				tracker_metadata_t update_metadata = { };
				update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
				(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata);
			}
#endif /* NECP */
		}
	}

	return error;
}
1770 
1771 int
sodisconnectlocked(struct socket * so)1772 sodisconnectlocked(struct socket *so)
1773 {
1774 	int error;
1775 
1776 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1777 		error = ENOTCONN;
1778 		goto bad;
1779 	}
1780 	if (so->so_state & SS_ISDISCONNECTING) {
1781 		error = EALREADY;
1782 		goto bad;
1783 	}
1784 
1785 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1786 	if (error == 0) {
1787 		sflt_notify(so, sock_evt_disconnected, NULL);
1788 	}
1789 
1790 bad:
1791 	return error;
1792 }
1793 
/* Locking wrapper around sodisconnectlocked() */
int
sodisconnect(struct socket *so)
{
	int err;

	socket_lock(so, 1);
	err = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return err;
}
1805 
1806 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1807 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1808 {
1809 	int error;
1810 
1811 	/*
1812 	 * Call the protocol disconnectx handler; let it handle all
1813 	 * matters related to the connection state of this session.
1814 	 */
1815 	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1816 	if (error == 0) {
1817 		/*
1818 		 * The event applies only for the session, not for
1819 		 * the disconnection of individual subflows.
1820 		 */
1821 		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1822 			sflt_notify(so, sock_evt_disconnected, NULL);
1823 		}
1824 	}
1825 	return error;
1826 }
1827 
1828 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1829 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1830 {
1831 	int error;
1832 
1833 	socket_lock(so, 1);
1834 	error = sodisconnectxlocked(so, aid, cid);
1835 	socket_unlock(so, 1);
1836 	return error;
1837 }
1838 
/* sblock() flags: wait for the sockbuf lock unless MSG_DONTWAIT is set */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1840 
1841 /*
1842  * sosendcheck will lock the socket buffer if it isn't locked and
1843  * verify that there is space for the data being inserted.
1844  *
1845  * Returns:	0			Success
1846  *		EPIPE
1847  *	sblock:EWOULDBLOCK
1848  *	sblock:EINTR
1849  *	sbwait:EBADF
1850  *	sbwait:EINTR
1851  *	[so_error]:???
1852  */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int assumelock = 0;
	int error = 0;
	int32_t space;
	int ret;

restart:
	/*
	 * Acquire the send-buffer lock unless a previous pass already did
	 * (*sblocked != 0); on success *sblocked is set so the caller knows
	 * to release it.
	 */
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				/* A defunct socket reports EPIPE, not the sblock error */
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	/* Report and clear any asynchronously-posted socket error */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Connection-oriented protocol: data sends require an
			 * established connection, except control-only sends and
			 * sockets accepting pre-connect (TFO-style) data.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless protocol with no destination supplied */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* Out-of-band data gets a small allowance beyond the buffer space */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	/* A message that can never fit in the send buffer is a hard error */
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	/*
	 * Not enough room right now: either fail with EWOULDBLOCK for
	 * non-blocking sends, or drop the sb lock, wait for space, and retry.
	 */
	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		goto restart;
	}

	/* Enforce per-protocol memory accounting limits */
	ret = proto_memacct_limited(so->so_proto);
	if (ret == MEMACCT_HARDLIMIT ||
	    (ret == MEMACCT_SOFTLIMIT && so->so_snd.sb_cc > 0)) {
		return ENOMEM;
	}
	return 0;
}
1983 
1984 /*
1985  * Send on a socket.
1986  * If send must go all at once and message is larger than
1987  * send buffering, then hard error.
1988  * Lock against other senders.
1989  * If must go all at once and not enough room now, then
1990  * inform user that this would block and do nothing.
1991  * Otherwise, if nonblocking, send as much as possible.
1992  * The data to be sent is described by "uio" if nonzero,
1993  * otherwise by the mbuf chain "top" (which must be null
1994  * if uio is not).  Data provided in mbuf chain must be small
1995  * enough to send all at once.
1996  *
1997  * Returns nonzero on error, timeout or signal; callers
1998  * must check for short counts if EINTR/ERESTART are returned.
1999  * Data and control buffers are freed on return.
2000  *
2001  * Returns:	0			Success
2002  *		EOPNOTSUPP
2003  *		EINVAL
2004  *		ENOBUFS
2005  *	uiomove:EFAULT
2006  *	sosendcheck:EPIPE
2007  *	sosendcheck:EWOULDBLOCK
2008  *	sosendcheck:EINTR
2009  *	sosendcheck:EBADF
2010  *	sosendcheck:EINTR
2011  *	sosendcheck:???			[value from so_error]
2012  *	<pru_send>:ECONNRESET[TCP]
2013  *	<pru_send>:EINVAL[TCP]
2014  *	<pru_send>:ENOBUFS[TCP]
2015  *	<pru_send>:EADDRINUSE[TCP]
2016  *	<pru_send>:EADDRNOTAVAIL[TCP]
2017  *	<pru_send>:EAFNOSUPPORT[TCP]
2018  *	<pru_send>:EACCES[TCP]
2019  *	<pru_send>:EAGAIN[TCP]
2020  *	<pru_send>:EPERM[TCP]
2021  *	<pru_send>:EMSGSIZE[TCP]
2022  *	<pru_send>:EHOSTUNREACH[TCP]
2023  *	<pru_send>:ENETUNREACH[TCP]
2024  *	<pru_send>:ENETDOWN[TCP]
2025  *	<pru_send>:ENOMEM[TCP]
2026  *	<pru_send>:ENOBUFS[TCP]
2027  *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
2028  *	<pru_send>:EINVAL[AF_UNIX]
2029  *	<pru_send>:EOPNOTSUPP[AF_UNIX]
2030  *	<pru_send>:EPIPE[AF_UNIX]
2031  *	<pru_send>:ENOTCONN[AF_UNIX]
2032  *	<pru_send>:EISCONN[AF_UNIX]
2033  *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
2034  *	<sf_data_out>:???		[whatever a filter author chooses]
2035  *
2036  * Notes:	Other <pru_send> returns depend on the protocol family; all
2037  *		<sf_data_out> returns depend on what the filter author causes
2038  *		their filter to return.
2039  */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	mbuf_ref_ref_t mp;
	mbuf_ref_t m, freelist = NULL;
	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/*
	 * Bytes to send: what remains in "uio", or for a prepackaged chain
	 * the packet header length of "top".
	 */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}
	orig_resid = resid;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/* Datagram flow tracking for flow-aware filtering, when required */
	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, SOFLOW_DIRECTION_OUTBOUND, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only supported on stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	/* Optionally reserve leading space for protocol headers (sysctl-gated) */
	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	do {
		/* Wait for buffer space / validate socket state for this pass */
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0 &&
				    bigcl;

				/* Drop the socket lock while allocating and copying in data */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Take the next mbuf off the local free list */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf, after any leading space */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data into the mbuf (socket is unlocked here) */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					/* EJUSTRETURN: a filter swallowed the packet */
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Hand the packet (and control) to the protocol; it owns them now */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/* Ownership transferred: don't free top/control on the way out */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);


out_locked:
	/* Sanity check: resid must never grow past its starting value */
	if (resid > orig_resid) {
		char pname[MAXCOMLEN] = {};
		pid_t current_pid = proc_pid(current_proc());
		proc_name(current_pid, pname, sizeof(pname));

		if (sosend_assert_panic != 0) {
			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
			    so, resid, orig_resid, pname, current_pid);
		} else {
			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
			    so->so_gencnt, resid, orig_resid, pname, current_pid);
		}
	}

	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	/* Free anything the protocol did not consume */
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2508 
/*
 * Re-inject a packet into the protocol's send path (used by filters that
 * previously swallowed it).  Caller must hold the socket lock.
 */
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must points to mbuf chain to be sent.
	 * If control is not NULL, top must be packet header
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				/* First packet-header mbuf is the start of data */
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		/* Detach the control run from the data portion of the chain */
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	/* Hand the (possibly split) chain to the protocol */
	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());

	return error;
}
2554 
2555 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp,struct mbuf ** last_control)2556 mbuf_detach_control_from_list(struct mbuf **mp, struct mbuf **last_control)
2557 {
2558 	struct mbuf *control = NULL;
2559 	struct mbuf *m = *mp;
2560 
2561 	if (m->m_type == MT_CONTROL) {
2562 		struct mbuf *control_end;
2563 		struct mbuf *n;
2564 
2565 		n = control_end = control = m;
2566 
2567 		/*
2568 		 * Break the chain per mbuf type
2569 		 */
2570 		while (n != NULL && n->m_type == MT_CONTROL) {
2571 			control_end = n;
2572 			n = n->m_next;
2573 		}
2574 		control_end->m_next = NULL;
2575 		*mp = n;
2576 		if (last_control != NULL) {
2577 			*last_control = control_end;
2578 		}
2579 	}
2580 	VERIFY(*mp != NULL);
2581 
2582 	return control;
2583 }
2584 
2585 /*
2586  * Supported only connected sockets (no address) without ancillary data
2587  * (control mbuf) for atomic protocols
2588  */
2589 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2590 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2591 {
2592 	mbuf_ref_t m, control = NULL;
2593 	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2594 	int error, dontroute;
2595 	int atomic = sosendallatonce(so);
2596 	int sblocked = 0;
2597 	struct proc *p = current_proc();
2598 	struct mbuf *top = pktlist;
2599 	bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2600 
2601 	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2602 	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2603 
2604 	if (so->so_type != SOCK_DGRAM) {
2605 		error = EINVAL;
2606 		os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2607 		    error);
2608 		goto out;
2609 	}
2610 	if (atomic == 0) {
2611 		error = EINVAL;
2612 		os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2613 		    error);
2614 		goto out;
2615 	}
2616 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2617 		error = ENOTCONN;
2618 		os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2619 		    error);
2620 		goto out;
2621 	}
2622 	if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2623 		error = EINVAL;
2624 		os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2625 		    flags, error);
2626 		goto out;
2627 	}
2628 
2629 	socket_lock(so, 1);
2630 	so_update_last_owner_locked(so, p);
2631 	so_update_policy(so);
2632 
2633 	if (NEED_DGRAM_FLOW_TRACKING(so)) {
2634 		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, SOFLOW_DIRECTION_OUTBOUND, 0);
2635 	}
2636 
2637 #if NECP
2638 	so_update_necp_policy(so, NULL, NULL);
2639 #endif /* NECP */
2640 
2641 	dontroute = (flags & MSG_DONTROUTE) &&
2642 	    (so->so_options & SO_DONTROUTE) == 0 &&
2643 	    (so->so_proto->pr_flags & PR_ATOMIC);
2644 	if (dontroute) {
2645 		so->so_options |= SO_DONTROUTE;
2646 	}
2647 
2648 	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2649 
2650 	error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2651 	if (error) {
2652 		os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2653 		    error);
2654 		goto release;
2655 	}
2656 
2657 	if (!skip_filt) {
2658 		mbuf_ref_ref_t prevnextp = NULL;
2659 
2660 		for (m = top; m != NULL; m = m->m_nextpkt) {
2661 			mbuf_ref_t nextpkt, last_control;
2662 
2663 			/*
2664 			 * Remove packet from the list of packets
2665 			 */
2666 			nextpkt = m->m_nextpkt;
2667 			if (prevnextp != NULL) {
2668 				*prevnextp = nextpkt;
2669 			} else {
2670 				top = nextpkt;
2671 			}
2672 			m->m_nextpkt = NULL;
2673 
2674 			/*
2675 			 * Break the chain per mbuf type
2676 			 */
2677 			if (m->m_type == MT_CONTROL) {
2678 				control = mbuf_detach_control_from_list(&m, &last_control);
2679 			}
2680 			/*
2681 			 * Socket filter processing
2682 			 */
2683 			error = sflt_data_out(so, NULL, &m,
2684 			    &control, 0);
2685 			if (error != 0 && error != EJUSTRETURN) {
2686 				os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2687 				    error);
2688 				m_freem(m);
2689 				goto release;
2690 			}
2691 
2692 #if CONTENT_FILTER
2693 			if (error == 0) {
2694 				/*
2695 				 * Content filter processing
2696 				 */
2697 				error = cfil_sock_data_out(so, NULL, m,
2698 				    control, 0, dgram_flow_entry);
2699 				if (error != 0 && error != EJUSTRETURN) {
2700 					os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2701 					    error);
2702 					m_freem(m);
2703 					goto release;
2704 				}
2705 			}
2706 #endif /* CONTENT_FILTER */
2707 			if (error == EJUSTRETURN) {
2708 				/*
2709 				 * When swallowed by a filter, the packet is not
2710 				 * in the list anymore
2711 				 */
2712 				error = 0;
2713 			} else {
2714 				/*
2715 				 * Rebuild the mbuf chain of the packet
2716 				 */
2717 				if (control != NULL) {
2718 					last_control->m_next = m;
2719 					m = control;
2720 				}
2721 				/*
2722 				 * Reinsert the packet in the list of packets
2723 				 */
2724 				m->m_nextpkt = nextpkt;
2725 				if (prevnextp != NULL) {
2726 					*prevnextp = m;
2727 				} else {
2728 					top = m;
2729 				}
2730 				prevnextp = &m->m_nextpkt;
2731 			}
2732 			control = NULL;
2733 		}
2734 	}
2735 
2736 	if (top != NULL) {
2737 		if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2738 			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2739 			    (so, top, pktcnt, flags);
2740 			if (error != 0 && error != ENOBUFS) {
2741 				os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2742 				    error);
2743 			}
2744 			top = NULL;
2745 		} else {
2746 			*pktcnt = 0;
2747 			control = NULL;
2748 			for (m = top; m != NULL; m = top) {
2749 				top = m->m_nextpkt;
2750 				m->m_nextpkt = NULL;
2751 
2752 				/*
2753 				 * Break the chain per mbuf type
2754 				 */
2755 				if (m->m_type == MT_CONTROL) {
2756 					control = mbuf_detach_control_from_list(&m, NULL);
2757 				}
2758 
2759 				error = (*so->so_proto->pr_usrreqs->pru_send)
2760 				    (so, 0, m, NULL, control, current_proc());
2761 				if (error != 0) {
2762 					if (error != ENOBUFS) {
2763 						os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2764 						    error);
2765 					}
2766 					control = NULL;
2767 					goto release;
2768 				}
2769 				*pktcnt += 1;
2770 				control = NULL;
2771 			}
2772 		}
2773 	}
2774 
2775 release:
2776 	if (dontroute) {
2777 		so->so_options &= ~SO_DONTROUTE;
2778 	}
2779 	if (sblocked) {
2780 		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2781 	} else {
2782 		socket_unlock(so, 1);
2783 	}
2784 out:
2785 	if (control != NULL) {
2786 		m_freem(control);
2787 	}
2788 	if (top != NULL) {
2789 		if (error != ENOBUFS) {
2790 			os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2791 			    error);
2792 		}
2793 		m_freem_list(top);
2794 	}
2795 
2796 	if (dgram_flow_entry != NULL) {
2797 		soflow_free_flow(dgram_flow_entry);
2798 	}
2799 
2800 	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2801 	    so->so_snd.sb_cc, 0, error);
2802 
2803 	return error;
2804 }
2805 
2806 /*
2807  * May return ERESTART when packet is dropped by MAC policy check
2808  */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    struct mbuf **maddrp,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	/* The first mbuf of the record must carry the peer address */
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		/* MAC check runs with the socket unlocked */
		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	/* Return the address: duplicated into *psa, or as the raw mbuf in *maddrp */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	} else if (maddrp != NULL) {
		*maddrp = m;
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf on the queue */
		m = m->m_next;
	} else {
		/* Consuming: unlink the address mbuf from the receive buffer */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		if (maddrp == NULL) {
			MFREE(m, so->so_rcv.sb_mb);
		} else {
			/* Caller took ownership of the mbuf via *maddrp */
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
		}
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
2935 
2936 /*
2937  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
2938  * so clear the data portion in order not to leak the file pointers
2939  */
2940 static void
sopeek_scm_rights(struct mbuf * rights)2941 sopeek_scm_rights(struct mbuf *rights)
2942 {
2943 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
2944 
2945 	if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
2946 		VERIFY(cm->cmsg_len <= rights->m_len);
2947 		memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
2948 	}
2949 }
2950 
2951 /*
2952  * Process one or more MT_CONTROL mbufs present before any data mbufs
2953  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2954  * just copy the data; if !MSG_PEEK, we call into the protocol to
2955  * perform externalization.
2956  */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	mbuf_ref_t cm = NULL, cmn;
	mbuf_ref_ref_t cme = &cm;	/* tail pointer of the unlinked control chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	mbuf_ref_ref_t msgpcm = NULL;	/* first control mbuf copied for this call (MSG_PEEK only) */
	mbuf_ref_t m = *mp;
	mbuf_ref_t nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* Remember head of what we copy, for cleanup on failure */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/* Don't expose raw file pointers on a peek (see sopeek_scm_rights) */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Unlink the control mbuf from the sockbuf, append to cm chain */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Re-attach the remainder of the record (if any) to the sockbuf head */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed: drop it */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3091 
3092 /*
3093  * If we have less data than requested, block awaiting more
3094  * (subject to any timeout) if:
3095  *   1. the current count is less than the low water mark, or
3096  *   2. MSG_WAITALL is set, and it is possible to do the entire
3097  *	receive operation at once if we block (resid <= hiwat).
3098  *   3. MSG_DONTWAIT is not set
3099  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3100  * we have to do the receive in sections, and thus risk returning
3101  * a short count if a timeout or signal occurs after we start.
3102  */
3103 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3104 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3105 {
3106 	struct protosw *pr = so->so_proto;
3107 
3108 	/* No mbufs in the receive-queue? Wait! */
3109 	if (m == NULL) {
3110 		return true;
3111 	}
3112 
3113 	/* Not enough data in the receive socket-buffer - we may have to wait */
3114 	if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3115 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3116 		/*
3117 		 * Application did set the lowater-mark, so we should wait for
3118 		 * this data to be present.
3119 		 */
3120 		if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3121 			return true;
3122 		}
3123 
3124 		/*
3125 		 * Application wants all the data - so let's try to do the
3126 		 * receive-operation at once by waiting for everything to
3127 		 * be there.
3128 		 */
3129 		if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3130 			return true;
3131 		}
3132 	}
3133 
3134 	return false;
3135 }
3136 
3137 /*
3138  * Implement receive operations on a socket.
3139  * We depend on the way that records are added to the sockbuf
3140  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3141  * must begin with an address if the protocol so specifies,
3142  * followed by an optional mbuf or mbufs containing ancillary data,
3143  * and then zero or more mbufs of data.
3144  * In order to avoid blocking network interrupts for the entire time here,
3145  * we splx() while doing the actual copy to user space.
3146  * Although the sockbuf is locked, new data may still be appended,
3147  * and thus we must maintain consistency of the sockbuf during that time.
3148  *
3149  * The caller may receive the data as a single mbuf chain by supplying
3150  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3151  * only for the count in uio_resid.
3152  *
3153  * Returns:	0			Success
3154  *		ENOBUFS
3155  *		ENOTCONN
3156  *		EWOULDBLOCK
3157  *	uiomove:EFAULT
3158  *	sblock:EWOULDBLOCK
3159  *	sblock:EINTR
3160  *	sbwait:EBADF
3161  *	sbwait:EINTR
3162  *	sodelayed_copy:EFAULT
3163  *	<pru_rcvoob>:EINVAL[TCP]
3164  *	<pru_rcvoob>:EWOULDBLOCK[TCP]
3165  *	<pru_rcvoob>:???
3166  *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3167  *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3168  *	<pr_domain->dom_externalize>:???
3169  *
3170  * Notes:	Additional return values from calls through <pru_rcvoob> and
3171  *		<pr_domain->dom_externalize> depend on protocols other than
3172  *		TCP or AF_UNIX, which are documented above.
3173  */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;	/* non-NULL when caller wants the raw mbuf chain (mp0) */
	mbuf_ref_t ml = NULL;	/* tail of free_list, for O(1) appends */
	mbuf_ref_t nextrecord, free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;	/* moff: offset into current mbuf (MSG_PEEK) */
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;		/* OK to batch uiomove copies via free_list */
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinitely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/* Copy the OOB data out with the socket unlocked */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* OOB or end-of-record data must be delivered without waiting */
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns.  Therefore, we only sleep in
		 * sbwait() below if and only if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	/* Leading MT_SONAME mbuf: deliver the peer address (datagram protocols) */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above.  In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/* Batched copies pay off only when consuming a sizeable request */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		/* Don't read past the out-of-band mark */
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* Entire mbuf consumed (or peeked past) */
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				/* sbwait was interrupted: return short count, no error */
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	/* Atomic protocol with leftover data: record was truncated */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	/* Flush any copies still pending from the delayed-copy optimization */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/* Nothing consumed and no terminating condition: try again */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
3929 
3930 /*
3931  * Returns:	0			Success
3932  *	uiomove:EFAULT
3933  */
3934 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)3935 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3936     user_ssize_t *resid)
3937 {
3938 	int error = 0;
3939 	struct mbuf *m;
3940 
3941 	m = *free_list;
3942 
3943 	socket_unlock(so, 0);
3944 
3945 	while (m != NULL && error == 0) {
3946 		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3947 		m = m->m_next;
3948 	}
3949 	m_freem_list(*free_list);
3950 
3951 	*free_list = NULL;
3952 	*resid = 0;
3953 
3954 	socket_lock(so, 0);
3955 
3956 	return error;
3957 }
3958 
/*
 * Receive a list of packets from a datagram socket.
 *
 * On input *pktcntp is the maximum number of packets to receive
 * (1..SO_MAX_MSG_X); on return it holds the number actually received.
 * Received data chains are linked through m_nextpkt at *mp0; per-packet
 * address and control mbufs are returned through maddrp / controlp when
 * those are non-NULL, and freed otherwise.
 *
 * Returns:	0			Success
 *		EINVAL			Bad parameters
 *		ENOTCONN		Defunct or unconnected socket
 *		EWOULDBLOCK		Non-blocking socket with no data
 *	sblock:???			Interrupted lock wait
 *	sbwait:???			Interrupted data wait
 */
int
soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;
	mbuf_ref_t nextrecord;
	int flags, error;
	struct protosw *pr = so->so_proto;
	struct proc *p = current_proc();
	u_int npkts = 0;
	mbuf_ref_t free_list = NULL;
	int sblocked = 0;

	/*
	 * Sanity check on the parameters passed by caller
	 */
	if (mp0 == NULL || pktcntp == NULL) {
		return EINVAL;
	}
	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
		return EINVAL;
	}

	/* Initialize all output parameters before any early exit */
	mp = mp0;
	*mp0 = NULL;
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (maddrp != NULL) {
		*maddrp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

	*mp = NULL;

restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			/* Deliver queued data before reporting the error */
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* A complete record (M_EOR) can be consumed without waiting */
		for (; m != NULL; m = m->m_next) {
			if (m->m_flags & M_EOR) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		/* Drop the sockbuf lock while asleep so senders can append */
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error != 0) {
			goto release;
		}
		goto restart;
	}
dontblock:
	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	/* Leading MT_SONAME mbuf carries the datagram's source address */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		mbuf_ref_t maddr = NULL;

		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
		    &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}

		if (maddr != NULL) {
			maddr->m_nextpkt = NULL;
			maddr->m_next = NULL;
			if (maddrp != NULL) {
				*maddrp = maddr;
				maddrp = &maddr->m_nextpkt;
			} else {
				/* Caller does not want addresses: free later */
				maddr->m_next = free_list;
				free_list = maddr;
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.
	 * We call into the protocol to perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		mbuf_ref_t control = NULL;

		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		if (control != NULL) {
			control->m_nextpkt = NULL;
			control->m_next = NULL;
			if (controlp != NULL) {
				*controlp = control;
				controlp = &control->m_nextpkt;
			} else {
				/* Caller does not want control data: free later */
				control->m_next = free_list;
				free_list = control;
			}
		}
	}

	/*
	 * Link the packet to the list
	 */
	if (m != NULL) {
		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
		}
		m->m_nextpkt = NULL;
		*mp = m;
		mp = &m->m_nextpkt;
	}
	/* Account in the sockbuf for every byte handed to the caller */
	while (m != NULL) {
		sbfree(&so->so_rcv, m);

		m = m->m_next;
	}

	so->so_rcv.sb_mb = nextrecord;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second
	 * part makes sure sb_lastrecord is up-to-date if
	 * there is still data in the socket buffer.
	 */
	if (so->so_rcv.sb_mb == NULL) {
		so->so_rcv.sb_mbtail = NULL;
		so->so_rcv.sb_lastrecord = NULL;
	} else if (nextrecord->m_nextpkt == NULL) {
		so->so_rcv.sb_lastrecord = nextrecord;
	}
	SB_MB_CHECK(&so->so_rcv);

	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	npkts += 1;

	/*
	 * Keep receiving while we have fewer packets than requested:
	 * grab the next queued record right away, or, with MSG_WAITALL,
	 * go back and wait for more data to arrive.
	 */
	if (npkts < *pktcntp) {
		if (so->so_rcv.sb_mb != NULL) {
			goto dontblock;
		}
		/*
		 * NOTE(review): this path re-enters sblock() via restart while
		 * sblocked is still 1 — confirm sblock() tolerates re-entry by
		 * the owning thread for MSG_WAITALL callers.
		 */
		if ((flags & MSG_WAITALL) != 0) {
			goto restart;
		}
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	*pktcntp = npkts;
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4266 
4267 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4268 so_statistics_event_to_nstat_event(int64_t *input_options,
4269     uint64_t *nstat_event)
4270 {
4271 	int error = 0;
4272 	switch (*input_options) {
4273 	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4274 		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4275 		break;
4276 	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4277 		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4278 		break;
4279 #if (DEBUG || DEVELOPMENT)
4280 	case SO_STATISTICS_EVENT_RESERVED_1:
4281 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4282 		break;
4283 	case SO_STATISTICS_EVENT_RESERVED_2:
4284 		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4285 		break;
4286 #endif /* (DEBUG || DEVELOPMENT) */
4287 	default:
4288 		error = EINVAL;
4289 		break;
4290 	}
4291 	return error;
4292 }
4293 
4294 /*
4295  * Returns:	0			Success
4296  *		EINVAL
4297  *		ENOTCONN
4298  *	<pru_shutdown>:EINVAL
4299  *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
4300  *	<pru_shutdown>:ENOBUFS[TCP]
4301  *	<pru_shutdown>:EMSGSIZE[TCP]
4302  *	<pru_shutdown>:EHOSTUNREACH[TCP]
4303  *	<pru_shutdown>:ENETUNREACH[TCP]
4304  *	<pru_shutdown>:ENETDOWN[TCP]
4305  *	<pru_shutdown>:ENOMEM[TCP]
4306  *	<pru_shutdown>:EACCES[TCP]
4307  *	<pru_shutdown>:EMSGSIZE[TCP]
4308  *	<pru_shutdown>:ENOBUFS[TCP]
4309  *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
4310  *	<pru_shutdown>:???		[other protocol families]
4311  */
4312 int
soshutdown(struct socket * so,int how)4313 soshutdown(struct socket *so, int how)
4314 {
4315 	int error;
4316 
4317 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4318 
4319 	switch (how) {
4320 	case SHUT_RD:
4321 	case SHUT_WR:
4322 	case SHUT_RDWR:
4323 		socket_lock(so, 1);
4324 		if ((so->so_state &
4325 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4326 			error = ENOTCONN;
4327 		} else {
4328 			error = soshutdownlock(so, how);
4329 		}
4330 		socket_unlock(so, 1);
4331 		break;
4332 	default:
4333 		error = EINVAL;
4334 		break;
4335 	}
4336 
4337 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4338 
4339 	return error;
4340 }
4341 
4342 int
soshutdownlock_final(struct socket * so,int how)4343 soshutdownlock_final(struct socket *so, int how)
4344 {
4345 	struct protosw *pr = so->so_proto;
4346 	int error = 0;
4347 
4348 	sflt_notify(so, sock_evt_shutdown, &how);
4349 
4350 	if (how != SHUT_WR) {
4351 		if ((so->so_state & SS_CANTRCVMORE) != 0) {
4352 			/* read already shut down */
4353 			error = ENOTCONN;
4354 			goto done;
4355 		}
4356 		sorflush(so);
4357 	}
4358 	if (how != SHUT_RD) {
4359 		if ((so->so_state & SS_CANTSENDMORE) != 0) {
4360 			/* write already shut down */
4361 			error = ENOTCONN;
4362 			goto done;
4363 		}
4364 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
4365 	}
4366 done:
4367 	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4368 	return error;
4369 }
4370 
/*
 * Shut down a socket with the socket lock already held, giving any
 * attached content filter a chance to defer or veto the shutdown
 * before the final protocol-level shutdown runs.
 *
 * Returns:	0			Success (or shutdown deferred by filter)
 *	cfil_sock_shutdown:???		Content filter errors
 *	soshutdownlock_final:ENOTCONN
 *	soshutdownlock_final:???	Protocol-specific errors
 */
int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* Filter took ownership of the shutdown; report success */
			error = 0;
			goto done;
		} else if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return error;
}
4397 
/*
 * Flush the send side of a socket: mark the send buffer as dropping
 * further data, detach select/upcall state, and release its mbufs.
 */
void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/* No further select() notifications or upcalls; drop new appends */
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/* Wake any threads blocked in select() on this buffer, then free it */
	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}
4423 
/*
 * Flush the receive side of a socket: mark it unable to receive more,
 * transplant the queued mbufs into a local sockbuf copy, reset the
 * live sockbuf, then dispose of rights (if the protocol requires it)
 * and free the transplanted data outside the live buffer.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc               = sb->sb_cc;
	asb.sb_hiwat            = sb->sb_hiwat;
	asb.sb_mbcnt            = sb->sb_mbcnt;
	asb.sb_mbmax            = sb->sb_mbmax;
	asb.sb_ctl              = sb->sb_ctl;
	asb.sb_lowat            = sb->sb_lowat;
	asb.sb_mb               = sb->sb_mb;
	asb.sb_mbtail           = sb->sb_mbtail;
	asb.sb_lastrecord       = sb->sb_lastrecord;
	asb.sb_so               = sb->sb_so;
	asb.sb_flags            = sb->sb_flags;
	asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags            |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc               = 0;
	sb->sb_hiwat            = 0;
	sb->sb_mbcnt            = 0;
	sb->sb_mbmax            = 0;
	sb->sb_ctl              = 0;
	sb->sb_lowat            = 0;
	sb->sb_mb               = NULL;
	sb->sb_mbtail           = NULL;
	sb->sb_lastrecord       = NULL;
	sb->sb_timeo.tv_sec     = 0;
	sb->sb_timeo.tv_usec    = 0;
	sb->sb_upcall           = NULL;
	sb->sb_upcallarg        = NULL;
	sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags            |= SB_DROP;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let the protocol dispose of any rights (e.g. file descriptors) */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4524 
4525 /*
4526  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4527  * an additional variant to handle the case where the option value needs
4528  * to be some kind of integer, but not a specific size.
4529  * In addition to their use here, these functions are also called by the
4530  * protocol-level pr_ctloutput() routines.
4531  *
4532  * Returns:	0			Success
4533  *		EINVAL
4534  *	copyin:EFAULT
4535  */
4536 int
sooptcopyin(struct sockopt * sopt,void * __sized_by (len)buf,size_t len,size_t minlen)4537 sooptcopyin(struct sockopt *sopt, void *__sized_by(len) buf, size_t len, size_t minlen)
4538 {
4539 	size_t  valsize;
4540 
4541 	/*
4542 	 * If the user gives us more than we wanted, we ignore it,
4543 	 * but if we don't get the minimum length the caller
4544 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4545 	 * is set to however much we actually retrieved.
4546 	 */
4547 	if ((valsize = sopt->sopt_valsize) < minlen) {
4548 		return EINVAL;
4549 	}
4550 	if (valsize > len) {
4551 		sopt->sopt_valsize = valsize = len;
4552 	}
4553 
4554 	if (sopt->sopt_p != kernproc) {
4555 		return copyin(sopt->sopt_val, buf, valsize);
4556 	}
4557 
4558 	caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4559 	    CAST_DOWN(caddr_t, sopt->sopt_val),
4560 	    valsize);
4561 	bcopy(tmp, buf, valsize);
4562 
4563 	return 0;
4564 }
4565 
4566 /*
4567  * sooptcopyin_timeval
4568  *   Copy in a timeval value into tv_p, and take into account whether the
4569  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4570  *   code here so that we can verify the 64-bit tv_sec value before we lose
4571  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4572  */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int                     error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval   tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			/* In-kernel caller: sopt_val is a kernel pointer */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv64));
			bcopy(tmp, &tv64, sizeof(tv64));
		}
		/* Validate before the 64-bit tv_sec is narrowed below */
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval   tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			/* In-kernel caller: sopt_val is a kernel pointer */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv32));
			bcopy(tmp, &tv32, sizeof(tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
4638 
4639 int
sooptcopyin_bindtodevice(struct sockopt * sopt,char * __sized_by (bufsize)buf,size_t bufsize)4640 sooptcopyin_bindtodevice(struct sockopt *sopt, char * __sized_by(bufsize) buf, size_t bufsize)
4641 {
4642 #define MIN_BINDTODEVICE_NAME_SIZE    2
4643 	size_t maxlen = bufsize - 1;             /* the max string length that fits in the buffer */
4644 
4645 	if (bufsize < MIN_BINDTODEVICE_NAME_SIZE) {
4646 #if DEBUG || DEVELOPMENT
4647 		os_log(OS_LOG_DEFAULT, "%s: bufsize %lu < MIN_BINDTODEVICE_NAME_SIZE %d",
4648 		    __func__, bufsize, MIN_BINDTODEVICE_NAME_SIZE);
4649 #endif /* DEBUG || DEVELOPMENT */
4650 		return EINVAL;
4651 	}
4652 
4653 	memset(buf, 0, bufsize);
4654 
4655 	/*
4656 	 * bufsize includes the end-of-string because of the uncertainty wether
4657 	 * interface names are passed as strings or byte buffers.
4658 	 * If the user gives us more than the max string length return EINVAL.
4659 	 * On success, sopt->sopt_valsize is not modified
4660 	 */
4661 	maxlen = bufsize - 1;
4662 	if (sopt->sopt_valsize > maxlen) {
4663 		os_log(OS_LOG_DEFAULT, "%s: sopt_valsize %lu > maxlen %lu",
4664 		    __func__, sopt->sopt_valsize, maxlen);
4665 		return EINVAL;
4666 	}
4667 
4668 	if (sopt->sopt_p != kernproc) {
4669 		return copyin(sopt->sopt_val, buf, sopt->sopt_valsize);
4670 	} else {
4671 		caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4672 		    CAST_DOWN(caddr_t, sopt->sopt_val),
4673 		    sopt->sopt_valsize);
4674 		bcopy(tmp, buf, sopt->sopt_valsize);
4675 	}
4676 
4677 	return 0;
4678 #undef MIN_BINDTODEVICE_NAME_SIZE
4679 }
4680 
4681 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4682 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4683     boolean_t ignore_delegate)
4684 {
4685 	kauth_cred_t cred =  NULL;
4686 	proc_t ep = PROC_NULL;
4687 	uid_t uid;
4688 	int error = 0;
4689 
4690 	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4691 		ep = proc_find(so->e_pid);
4692 		if (ep) {
4693 			cred = kauth_cred_proc_ref(ep);
4694 		}
4695 	}
4696 
4697 	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4698 
4699 	/* uid is 0 for root */
4700 	if (uid != 0 || !allow_root) {
4701 		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4702 	}
4703 	if (cred) {
4704 		kauth_cred_unref(&cred);
4705 	}
4706 	if (ep != PROC_NULL) {
4707 		proc_rele(ep);
4708 	}
4709 
4710 	return error;
4711 }
4712 
4713 /*
4714  * Returns:	0			Success
4715  *		EINVAL
4716  *		ENOPROTOOPT
4717  *		ENOBUFS
4718  *		EDOM
4719  *	sooptcopyin:EINVAL
4720  *	sooptcopyin:EFAULT
4721  *	sooptcopyin_timeval:EINVAL
4722  *	sooptcopyin_timeval:EFAULT
4723  *	sooptcopyin_timeval:EDOM
4724  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4725  *	<pr_ctloutput>:???w
4726  *	sflt_attach_private:???		[whatever a filter author chooses]
4727  *	<sf_setoption>:???		[whatever a filter author chooses]
4728  *
4729  * Notes:	Other <pru_listen> returns depend on the protocol family; all
4730  *		<sf_listen> returns depend on what the filter author causes
4731  *		their filter to return.
4732  */
4733 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4734 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4735 {
4736 	int     error, optval;
4737 	int64_t long_optval;
4738 	struct  linger l;
4739 	struct  timeval tv;
4740 
4741 	if (sopt->sopt_dir != SOPT_SET) {
4742 		sopt->sopt_dir = SOPT_SET;
4743 	}
4744 
4745 	if (dolock) {
4746 		socket_lock(so, 1);
4747 	}
4748 
4749 	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4750 	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4751 	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4752 		/* the socket has been shutdown, no more sockopt's */
4753 		error = EINVAL;
4754 		goto out;
4755 	}
4756 
4757 	error = sflt_setsockopt(so, sopt);
4758 	if (error != 0) {
4759 		if (error == EJUSTRETURN) {
4760 			error = 0;
4761 		}
4762 		goto out;
4763 	}
4764 
4765 	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
4766 		if (so->so_proto != NULL &&
4767 		    so->so_proto->pr_ctloutput != NULL) {
4768 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4769 			goto out;
4770 		}
4771 		error = ENOPROTOOPT;
4772 	} else {
4773 		/*
4774 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4775 		 * the protocol layer, if needed.  A zero value returned from
4776 		 * the handler means use default socket-level processing as
4777 		 * done by the rest of this routine.  Otherwise, any other
4778 		 * return value indicates that the option is unsupported.
4779 		 */
4780 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4781 		    pru_socheckopt(so, sopt)) != 0) {
4782 			goto out;
4783 		}
4784 
4785 		error = 0;
4786 		switch (sopt->sopt_name) {
4787 		case SO_LINGER:
4788 		case SO_LINGER_SEC: {
4789 			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4790 			if (error != 0) {
4791 				goto out;
4792 			}
4793 			/* Make sure to use sane values */
4794 			if (sopt->sopt_name == SO_LINGER) {
4795 				so->so_linger = (short)l.l_linger;
4796 			} else {
4797 				so->so_linger = (short)((long)l.l_linger * hz);
4798 			}
4799 			if (l.l_onoff != 0) {
4800 				so->so_options |= SO_LINGER;
4801 			} else {
4802 				so->so_options &= ~SO_LINGER;
4803 			}
4804 			break;
4805 		}
4806 		case SO_DEBUG:
4807 		case SO_KEEPALIVE:
4808 		case SO_DONTROUTE:
4809 		case SO_USELOOPBACK:
4810 		case SO_BROADCAST:
4811 		case SO_REUSEADDR:
4812 		case SO_REUSEPORT:
4813 		case SO_OOBINLINE:
4814 		case SO_TIMESTAMP:
4815 		case SO_TIMESTAMP_MONOTONIC:
4816 		case SO_TIMESTAMP_CONTINUOUS:
4817 		case SO_DONTTRUNC:
4818 		case SO_WANTMORE:
4819 		case SO_WANTOOBFLAG:
4820 		case SO_NOWAKEFROMSLEEP:
4821 		case SO_NOAPNFALLBK:
4822 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4823 			    sizeof(optval));
4824 			if (error != 0) {
4825 				goto out;
4826 			}
4827 			if (optval) {
4828 				so->so_options |= sopt->sopt_name;
4829 			} else {
4830 				so->so_options &= ~sopt->sopt_name;
4831 			}
4832 #if SKYWALK
4833 			inp_update_netns_flags(so);
4834 #endif /* SKYWALK */
4835 			break;
4836 
4837 		case SO_SNDBUF:
4838 		case SO_RCVBUF:
4839 		case SO_SNDLOWAT:
4840 		case SO_RCVLOWAT:
4841 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4842 			    sizeof(optval));
4843 			if (error != 0) {
4844 				goto out;
4845 			}
4846 
4847 			/*
4848 			 * Values < 1 make no sense for any of these
4849 			 * options, so disallow them.
4850 			 */
4851 			if (optval < 1) {
4852 				error = EINVAL;
4853 				goto out;
4854 			}
4855 
4856 			switch (sopt->sopt_name) {
4857 			case SO_SNDBUF:
4858 			case SO_RCVBUF: {
4859 				struct sockbuf *sb =
4860 				    (sopt->sopt_name == SO_SNDBUF) ?
4861 				    &so->so_snd : &so->so_rcv;
4862 				if (sbreserve(sb, (u_int32_t)optval) == 0) {
4863 					error = ENOBUFS;
4864 					goto out;
4865 				}
4866 				sb->sb_flags |= SB_USRSIZE;
4867 				sb->sb_flags &= ~SB_AUTOSIZE;
4868 				sb->sb_idealsize = (u_int32_t)optval;
4869 				break;
4870 			}
4871 			/*
4872 			 * Make sure the low-water is never greater than
4873 			 * the high-water.
4874 			 */
4875 			case SO_SNDLOWAT: {
4876 				int space = sbspace(&so->so_snd);
4877 				uint32_t hiwat = so->so_snd.sb_hiwat;
4878 
4879 				if (so->so_snd.sb_flags & SB_UNIX) {
4880 					struct unpcb *unp =
4881 					    (struct unpcb *)(so->so_pcb);
4882 					if (unp != NULL &&
4883 					    unp->unp_conn != NULL) {
4884 						struct socket *so2 = unp->unp_conn->unp_socket;
4885 						hiwat += unp->unp_conn->unp_cc;
4886 						space = sbspace(&so2->so_rcv);
4887 					}
4888 				}
4889 
4890 				so->so_snd.sb_lowat =
4891 				    (optval > hiwat) ?
4892 				    hiwat : optval;
4893 
4894 				if (space >= so->so_snd.sb_lowat) {
4895 					sowwakeup(so);
4896 				}
4897 				break;
4898 			}
4899 			case SO_RCVLOWAT: {
4900 				int64_t data_len;
4901 				so->so_rcv.sb_lowat =
4902 				    (optval > so->so_rcv.sb_hiwat) ?
4903 				    so->so_rcv.sb_hiwat : optval;
4904 				if (so->so_rcv.sb_flags & SB_UNIX) {
4905 					struct unpcb *unp =
4906 					    (struct unpcb *)(so->so_pcb);
4907 					if (unp != NULL &&
4908 					    unp->unp_conn != NULL) {
4909 						struct socket *so2 = unp->unp_conn->unp_socket;
4910 						data_len = so2->so_snd.sb_cc
4911 						    - so2->so_snd.sb_ctl;
4912 					} else {
4913 						data_len = so->so_rcv.sb_cc
4914 						    - so->so_rcv.sb_ctl;
4915 					}
4916 				} else {
4917 					data_len = so->so_rcv.sb_cc
4918 					    - so->so_rcv.sb_ctl;
4919 				}
4920 
4921 				if (data_len >= so->so_rcv.sb_lowat) {
4922 					sorwakeup(so);
4923 				}
4924 				break;
4925 			}
4926 			}
4927 			break;
4928 
4929 		case SO_SNDTIMEO:
4930 		case SO_RCVTIMEO:
4931 			error = sooptcopyin_timeval(sopt, &tv);
4932 			if (error != 0) {
4933 				goto out;
4934 			}
4935 
4936 			switch (sopt->sopt_name) {
4937 			case SO_SNDTIMEO:
4938 				so->so_snd.sb_timeo = tv;
4939 				break;
4940 			case SO_RCVTIMEO:
4941 				so->so_rcv.sb_timeo = tv;
4942 				break;
4943 			}
4944 			break;
4945 
4946 		case SO_NKE: {
4947 			struct so_nke nke;
4948 
4949 			error = sooptcopyin(sopt, &nke, sizeof(nke),
4950 			    sizeof(nke));
4951 			if (error != 0) {
4952 				goto out;
4953 			}
4954 
4955 			error = sflt_attach_internal(so, nke.nke_handle);
4956 			break;
4957 		}
4958 
4959 		case SO_NOSIGPIPE:
4960 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4961 			    sizeof(optval));
4962 			if (error != 0) {
4963 				goto out;
4964 			}
4965 			if (optval != 0) {
4966 				so->so_flags |= SOF_NOSIGPIPE;
4967 			} else {
4968 				so->so_flags &= ~SOF_NOSIGPIPE;
4969 			}
4970 			break;
4971 
4972 		case SO_NOADDRERR:
4973 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4974 			    sizeof(optval));
4975 			if (error != 0) {
4976 				goto out;
4977 			}
4978 			if (optval != 0) {
4979 				so->so_flags |= SOF_NOADDRAVAIL;
4980 			} else {
4981 				so->so_flags &= ~SOF_NOADDRAVAIL;
4982 			}
4983 			break;
4984 
4985 		case SO_REUSESHAREUID:
4986 			error = sooptcopyin(sopt, &optval, sizeof(optval),
4987 			    sizeof(optval));
4988 			if (error != 0) {
4989 				goto out;
4990 			}
4991 			if (optval != 0) {
4992 				so->so_flags |= SOF_REUSESHAREUID;
4993 			} else {
4994 				so->so_flags &= ~SOF_REUSESHAREUID;
4995 			}
4996 			break;
4997 
4998 		case SO_NOTIFYCONFLICT:
4999 			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5000 				error = EPERM;
5001 				goto out;
5002 			}
5003 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5004 			    sizeof(optval));
5005 			if (error != 0) {
5006 				goto out;
5007 			}
5008 			if (optval != 0) {
5009 				so->so_flags |= SOF_NOTIFYCONFLICT;
5010 			} else {
5011 				so->so_flags &= ~SOF_NOTIFYCONFLICT;
5012 			}
5013 			break;
5014 
5015 		case SO_RESTRICTIONS:
5016 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5017 			    sizeof(optval));
5018 			if (error != 0) {
5019 				goto out;
5020 			}
5021 
5022 			error = so_set_restrictions(so, optval);
5023 			break;
5024 
5025 		case SO_AWDL_UNRESTRICTED:
5026 			if (SOCK_DOM(so) != PF_INET &&
5027 			    SOCK_DOM(so) != PF_INET6) {
5028 				error = EOPNOTSUPP;
5029 				goto out;
5030 			}
5031 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5032 			    sizeof(optval));
5033 			if (error != 0) {
5034 				goto out;
5035 			}
5036 			if (optval != 0) {
5037 				error = soopt_cred_check(so,
5038 				    PRIV_NET_RESTRICTED_AWDL, false, false);
5039 				if (error == 0) {
5040 					inp_set_awdl_unrestricted(
5041 						sotoinpcb(so));
5042 				}
5043 			} else {
5044 				inp_clear_awdl_unrestricted(sotoinpcb(so));
5045 			}
5046 			break;
5047 		case SO_INTCOPROC_ALLOW:
5048 			if (SOCK_DOM(so) != PF_INET6) {
5049 				error = EOPNOTSUPP;
5050 				goto out;
5051 			}
5052 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5053 			    sizeof(optval));
5054 			if (error != 0) {
5055 				goto out;
5056 			}
5057 			if (optval != 0 &&
5058 			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5059 				error = soopt_cred_check(so,
5060 				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5061 				if (error == 0) {
5062 					inp_set_intcoproc_allowed(
5063 						sotoinpcb(so));
5064 				}
5065 			} else if (optval == 0) {
5066 				inp_clear_intcoproc_allowed(sotoinpcb(so));
5067 			}
5068 			break;
5069 
5070 		case SO_LABEL:
5071 			error = EOPNOTSUPP;
5072 			break;
5073 
5074 		case SO_UPCALLCLOSEWAIT:
5075 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5076 			    sizeof(optval));
5077 			if (error != 0) {
5078 				goto out;
5079 			}
5080 			if (optval != 0) {
5081 				so->so_flags |= SOF_UPCALLCLOSEWAIT;
5082 			} else {
5083 				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5084 			}
5085 			break;
5086 
5087 		case SO_RANDOMPORT:
5088 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5089 			    sizeof(optval));
5090 			if (error != 0) {
5091 				goto out;
5092 			}
5093 			if (optval != 0) {
5094 				so->so_flags |= SOF_BINDRANDOMPORT;
5095 			} else {
5096 				so->so_flags &= ~SOF_BINDRANDOMPORT;
5097 			}
5098 			break;
5099 
5100 		case SO_NP_EXTENSIONS: {
5101 			struct so_np_extensions sonpx;
5102 
5103 			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5104 			    sizeof(sonpx));
5105 			if (error != 0) {
5106 				goto out;
5107 			}
5108 			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5109 				error = EINVAL;
5110 				goto out;
5111 			}
5112 			/*
5113 			 * Only one bit defined for now
5114 			 */
5115 			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5116 				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5117 					so->so_flags |= SOF_NPX_SETOPTSHUT;
5118 				} else {
5119 					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5120 				}
5121 			}
5122 			break;
5123 		}
5124 
5125 		case SO_TRAFFIC_CLASS: {
5126 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5127 			    sizeof(optval));
5128 			if (error != 0) {
5129 				goto out;
5130 			}
5131 			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5132 				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5133 				error = so_set_net_service_type(so, netsvc);
5134 				goto out;
5135 			}
5136 			error = so_set_traffic_class(so, optval);
5137 			if (error != 0) {
5138 				goto out;
5139 			}
5140 			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5141 			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5142 			break;
5143 		}
5144 
5145 		case SO_RECV_TRAFFIC_CLASS: {
5146 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5147 			    sizeof(optval));
5148 			if (error != 0) {
5149 				goto out;
5150 			}
5151 			if (optval == 0) {
5152 				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5153 			} else {
5154 				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5155 			}
5156 			break;
5157 		}
5158 
5159 #if (DEVELOPMENT || DEBUG)
5160 		case SO_TRAFFIC_CLASS_DBG: {
5161 			struct so_tcdbg so_tcdbg;
5162 
5163 			error = sooptcopyin(sopt, &so_tcdbg,
5164 			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5165 			if (error != 0) {
5166 				goto out;
5167 			}
5168 			error = so_set_tcdbg(so, &so_tcdbg);
5169 			if (error != 0) {
5170 				goto out;
5171 			}
5172 			break;
5173 		}
5174 #endif /* (DEVELOPMENT || DEBUG) */
5175 
5176 		case SO_PRIVILEGED_TRAFFIC_CLASS:
5177 			error = priv_check_cred(kauth_cred_get(),
5178 			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5179 			if (error != 0) {
5180 				goto out;
5181 			}
5182 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5183 			    sizeof(optval));
5184 			if (error != 0) {
5185 				goto out;
5186 			}
5187 			if (optval == 0) {
5188 				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5189 			} else {
5190 				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5191 			}
5192 			break;
5193 
5194 #if (DEVELOPMENT || DEBUG)
5195 		case SO_DEFUNCTIT:
5196 			error = sosetdefunct(current_proc(), so, 0, FALSE);
5197 			if (error == 0) {
5198 				error = sodefunct(current_proc(), so, 0);
5199 			}
5200 
5201 			break;
5202 #endif /* (DEVELOPMENT || DEBUG) */
5203 
5204 		case SO_DEFUNCTOK:
5205 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5206 			    sizeof(optval));
5207 			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5208 				if (error == 0) {
5209 					error = EBADF;
5210 				}
5211 				goto out;
5212 			}
5213 			/*
5214 			 * Any process can set SO_DEFUNCTOK (clear
5215 			 * SOF_NODEFUNCT), but only root can clear
5216 			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5217 			 */
5218 			if (optval == 0 &&
5219 			    kauth_cred_issuser(kauth_cred_get()) == 0) {
5220 				error = EPERM;
5221 				goto out;
5222 			}
5223 			if (optval) {
5224 				so->so_flags &= ~SOF_NODEFUNCT;
5225 			} else {
5226 				so->so_flags |= SOF_NODEFUNCT;
5227 			}
5228 
5229 			if (SOCK_DOM(so) == PF_INET ||
5230 			    SOCK_DOM(so) == PF_INET6) {
5231 				char s[MAX_IPv6_STR_LEN];
5232 				char d[MAX_IPv6_STR_LEN];
5233 				struct inpcb *inp = sotoinpcb(so);
5234 
5235 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5236 				    "[%s %s:%d -> %s:%d] is now marked "
5237 				    "as %seligible for "
5238 				    "defunct\n", __func__, proc_selfpid(),
5239 				    proc_best_name(current_proc()),
5240 				    so->so_gencnt,
5241 				    (SOCK_TYPE(so) == SOCK_STREAM) ?
5242 				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5243 				    ((SOCK_DOM(so) == PF_INET) ?
5244 				    (void *)&inp->inp_laddr.s_addr :
5245 				    (void *)&inp->in6p_laddr), s, sizeof(s)),
5246 				    ntohs(inp->in6p_lport),
5247 				    inet_ntop(SOCK_DOM(so),
5248 				    (SOCK_DOM(so) == PF_INET) ?
5249 				    (void *)&inp->inp_faddr.s_addr :
5250 				    (void *)&inp->in6p_faddr, d, sizeof(d)),
5251 				    ntohs(inp->in6p_fport),
5252 				    (so->so_flags & SOF_NODEFUNCT) ?
5253 				    "not " : "");
5254 			} else {
5255 				SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5256 				    "is now marked as %seligible for "
5257 				    "defunct\n",
5258 				    __func__, proc_selfpid(),
5259 				    proc_best_name(current_proc()),
5260 				    so->so_gencnt,
5261 				    SOCK_DOM(so), SOCK_TYPE(so),
5262 				    (so->so_flags & SOF_NODEFUNCT) ?
5263 				    "not " : "");
5264 			}
5265 			break;
5266 
5267 		case SO_ISDEFUNCT:
5268 			/* This option is not settable */
5269 			error = EINVAL;
5270 			break;
5271 
5272 		case SO_OPPORTUNISTIC:
5273 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5274 			    sizeof(optval));
5275 			if (error == 0) {
5276 				error = so_set_opportunistic(so, optval);
5277 			}
5278 			break;
5279 
5280 		case SO_FLUSH:
5281 			/* This option is handled by lower layer(s) */
5282 			error = 0;
5283 			break;
5284 
5285 		case SO_RECV_ANYIF:
5286 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5287 			    sizeof(optval));
5288 			if (error == 0) {
5289 				error = so_set_recv_anyif(so, optval);
5290 			}
5291 			break;
5292 
5293 		case SO_TRAFFIC_MGT_BACKGROUND: {
5294 			/* This option is handled by lower layer(s) */
5295 			error = 0;
5296 			break;
5297 		}
5298 
5299 #if FLOW_DIVERT
5300 		case SO_FLOW_DIVERT_TOKEN:
5301 			error = flow_divert_token_set(so, sopt);
5302 			break;
5303 #endif  /* FLOW_DIVERT */
5304 
5305 
5306 		case SO_DELEGATED:
5307 			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5308 			    sizeof(optval))) != 0) {
5309 				break;
5310 			}
5311 
5312 			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5313 			break;
5314 
5315 		case SO_DELEGATED_UUID: {
5316 			uuid_t euuid;
5317 
5318 			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5319 			    sizeof(euuid))) != 0) {
5320 				break;
5321 			}
5322 
5323 			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5324 			break;
5325 		}
5326 
5327 #if NECP
5328 		case SO_NECP_ATTRIBUTES:
5329 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5330 				/* Handled by MPTCP itself */
5331 				break;
5332 			}
5333 
5334 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5335 				error = EINVAL;
5336 				goto out;
5337 			}
5338 
5339 			error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5340 			break;
5341 
5342 		case SO_NECP_CLIENTUUID: {
5343 			if (SOCK_DOM(so) == PF_MULTIPATH) {
5344 				/* Handled by MPTCP itself */
5345 				break;
5346 			}
5347 
5348 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5349 				error = EINVAL;
5350 				goto out;
5351 			}
5352 
5353 			struct inpcb *inp = sotoinpcb(so);
5354 			if (!uuid_is_null(inp->necp_client_uuid)) {
5355 				// Clear out the old client UUID if present
5356 				necp_inpcb_remove_cb(inp);
5357 			}
5358 
5359 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5360 			    sizeof(uuid_t), sizeof(uuid_t));
5361 			if (error != 0) {
5362 				goto out;
5363 			}
5364 
5365 			if (uuid_is_null(inp->necp_client_uuid)) {
5366 				error = EINVAL;
5367 				goto out;
5368 			}
5369 
5370 			pid_t current_pid = proc_pid(current_proc());
5371 			error = necp_client_register_socket_flow(current_pid,
5372 			    inp->necp_client_uuid, inp);
5373 			if (error != 0) {
5374 				uuid_clear(inp->necp_client_uuid);
5375 				goto out;
5376 			}
5377 
5378 			if (inp->inp_lport != 0) {
5379 				// There is a bound local port, so this is not
5380 				// a fresh socket. Assign to the client.
5381 				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5382 			}
5383 
5384 			break;
5385 		}
5386 		case SO_NECP_LISTENUUID: {
5387 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5388 				error = EINVAL;
5389 				goto out;
5390 			}
5391 
5392 			struct inpcb *inp = sotoinpcb(so);
5393 			if (!uuid_is_null(inp->necp_client_uuid)) {
5394 				error = EINVAL;
5395 				goto out;
5396 			}
5397 
5398 			error = sooptcopyin(sopt, &inp->necp_client_uuid,
5399 			    sizeof(uuid_t), sizeof(uuid_t));
5400 			if (error != 0) {
5401 				goto out;
5402 			}
5403 
5404 			if (uuid_is_null(inp->necp_client_uuid)) {
5405 				error = EINVAL;
5406 				goto out;
5407 			}
5408 
5409 			error = necp_client_register_socket_listener(proc_pid(current_proc()),
5410 			    inp->necp_client_uuid, inp);
5411 			if (error != 0) {
5412 				uuid_clear(inp->necp_client_uuid);
5413 				goto out;
5414 			}
5415 
5416 			// Mark that the port registration is held by NECP
5417 			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5418 
5419 			break;
5420 		}
5421 
5422 		case SO_RESOLVER_SIGNATURE: {
5423 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5424 				error = EINVAL;
5425 				goto out;
5426 			}
5427 			error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5428 			break;
5429 		}
5430 #endif /* NECP */
5431 
5432 		case SO_EXTENDED_BK_IDLE:
5433 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5434 			    sizeof(optval));
5435 			if (error == 0) {
5436 				error = so_set_extended_bk_idle(so, optval);
5437 			}
5438 			break;
5439 
5440 		case SO_MARK_CELLFALLBACK:
5441 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5442 			    sizeof(optval));
5443 			if (error != 0) {
5444 				goto out;
5445 			}
5446 			if (optval < 0) {
5447 				error = EINVAL;
5448 				goto out;
5449 			}
5450 			if (optval == 0) {
5451 				so->so_flags1 &= ~SOF1_CELLFALLBACK;
5452 			} else {
5453 				so->so_flags1 |= SOF1_CELLFALLBACK;
5454 			}
5455 			break;
5456 
5457 		case SO_MARK_CELLFALLBACK_UUID:
5458 		{
5459 			struct so_mark_cellfallback_uuid_args args;
5460 
5461 			error = sooptcopyin(sopt, &args, sizeof(args),
5462 			    sizeof(args));
5463 			if (error != 0) {
5464 				goto out;
5465 			}
5466 			error = nstat_userland_mark_rnf_override(args.flow_uuid,
5467 			    args.flow_cellfallback);
5468 			break;
5469 		}
5470 
5471 		case SO_FALLBACK_MODE:
5472 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5473 			    sizeof(optval));
5474 			if (error != 0) {
5475 				goto out;
5476 			}
5477 			if (optval < SO_FALLBACK_MODE_NONE ||
5478 			    optval > SO_FALLBACK_MODE_PREFER) {
5479 				error = EINVAL;
5480 				goto out;
5481 			}
5482 			so->so_fallback_mode = (u_int8_t)optval;
5483 			break;
5484 
5485 		case SO_MARK_KNOWN_TRACKER: {
5486 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5487 			    sizeof(optval));
5488 			if (error != 0) {
5489 				goto out;
5490 			}
5491 			if (optval < 0) {
5492 				error = EINVAL;
5493 				goto out;
5494 			}
5495 			if (optval == 0) {
5496 				so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5497 			} else {
5498 				so->so_flags1 |= SOF1_KNOWN_TRACKER;
5499 			}
5500 			break;
5501 		}
5502 
5503 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5504 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5505 			    sizeof(optval));
5506 			if (error != 0) {
5507 				goto out;
5508 			}
5509 			if (optval < 0) {
5510 				error = EINVAL;
5511 				goto out;
5512 			}
5513 			if (optval == 0) {
5514 				so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5515 			} else {
5516 				so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5517 			}
5518 			break;
5519 		}
5520 
5521 		case SO_MARK_APPROVED_APP_DOMAIN: {
5522 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5523 			    sizeof(optval));
5524 			if (error != 0) {
5525 				goto out;
5526 			}
5527 			if (optval < 0) {
5528 				error = EINVAL;
5529 				goto out;
5530 			}
5531 			if (optval == 0) {
5532 				so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5533 			} else {
5534 				so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5535 			}
5536 			break;
5537 		}
5538 
5539 		case SO_STATISTICS_EVENT:
5540 			error = sooptcopyin(sopt, &long_optval,
5541 			    sizeof(long_optval), sizeof(long_optval));
5542 			if (error != 0) {
5543 				goto out;
5544 			}
5545 			u_int64_t nstat_event = 0;
5546 			error = so_statistics_event_to_nstat_event(
5547 				&long_optval, &nstat_event);
5548 			if (error != 0) {
5549 				goto out;
5550 			}
5551 			nstat_pcb_event(sotoinpcb(so), nstat_event);
5552 			break;
5553 
5554 		case SO_NET_SERVICE_TYPE: {
5555 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5556 			    sizeof(optval));
5557 			if (error != 0) {
5558 				goto out;
5559 			}
5560 			error = so_set_net_service_type(so, optval);
5561 			break;
5562 		}
5563 
5564 		case SO_QOSMARKING_POLICY_OVERRIDE:
5565 			error = priv_check_cred(kauth_cred_get(),
5566 			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5567 			if (error != 0) {
5568 				goto out;
5569 			}
5570 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5571 			    sizeof(optval));
5572 			if (error != 0) {
5573 				goto out;
5574 			}
5575 			if (optval == 0) {
5576 				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5577 			} else {
5578 				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5579 			}
5580 			break;
5581 
5582 		case SO_MPKL_SEND_INFO: {
5583 			struct so_mpkl_send_info so_mpkl_send_info;
5584 
5585 			error = sooptcopyin(sopt, &so_mpkl_send_info,
5586 			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5587 			if (error != 0) {
5588 				goto out;
5589 			}
5590 			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5591 			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5592 
5593 			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5594 				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5595 			} else {
5596 				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5597 			}
5598 			break;
5599 		}
5600 		case SO_WANT_KEV_SOCKET_CLOSED: {
5601 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5602 			    sizeof(optval));
5603 			if (error != 0) {
5604 				goto out;
5605 			}
5606 			if (optval == 0) {
5607 				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5608 			} else {
5609 				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5610 			}
5611 			break;
5612 		}
5613 		case SO_MARK_WAKE_PKT: {
5614 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 			    sizeof(optval));
5616 			if (error != 0) {
5617 				goto out;
5618 			}
5619 			if (optval == 0) {
5620 				so->so_flags &= ~SOF_MARK_WAKE_PKT;
5621 			} else {
5622 				so->so_flags |= SOF_MARK_WAKE_PKT;
5623 			}
5624 			break;
5625 		}
5626 		case SO_RECV_WAKE_PKT: {
5627 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5628 			    sizeof(optval));
5629 			if (error != 0) {
5630 				goto out;
5631 			}
5632 			if (optval == 0) {
5633 				so->so_flags &= ~SOF_RECV_WAKE_PKT;
5634 			} else {
5635 				so->so_flags |= SOF_RECV_WAKE_PKT;
5636 			}
5637 			break;
5638 		}
5639 		case SO_APPLICATION_ID: {
5640 			so_application_id_t application_id = { 0 };
5641 
5642 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5643 				error = EINVAL;
5644 				goto out;
5645 			}
5646 			error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5647 			    sizeof(application_id));
5648 			if (error != 0) {
5649 				goto out;
5650 			}
5651 
5652 			// The user needs to match
5653 			if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5654 				error = EINVAL;
5655 				printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5656 				goto out;
5657 			}
5658 			error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5659 			if (error != 0) {
5660 				printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5661 				goto out;
5662 			}
5663 			if (application_id.persona_id != PERSONA_ID_NONE) {
5664 				so->so_persona_id = application_id.persona_id;
5665 			}
5666 			break;
5667 		}
5668 		case SO_MARK_DOMAIN_INFO_SILENT:
5669 			error = sooptcopyin(sopt, &optval, sizeof(optval),
5670 			    sizeof(optval));
5671 			if (error != 0) {
5672 				goto out;
5673 			}
5674 			if (optval < 0) {
5675 				error = EINVAL;
5676 				goto out;
5677 			}
5678 			if (optval == 0) {
5679 				so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5680 			} else {
5681 				so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5682 			}
5683 			break;
5684 		case SO_MAX_PACING_RATE: {
5685 			uint64_t pacingrate;
5686 
5687 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5688 				error = EINVAL;
5689 				goto out;
5690 			}
5691 
5692 			error = sooptcopyin(sopt, &pacingrate,
5693 			    sizeof(pacingrate), sizeof(pacingrate));
5694 			if (error != 0) {
5695 				goto out;
5696 			}
5697 
5698 			if (pacingrate == 0) {
5699 				error = EINVAL;
5700 				goto out;
5701 			}
5702 			sotoinpcb(so)->inp_max_pacing_rate = pacingrate;
5703 			break;
5704 		}
5705 		case SO_CONNECTION_IDLE: {
5706 			int is_idle;
5707 
5708 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5709 				error = EINVAL;
5710 				goto out;
5711 			}
5712 
5713 			error = sooptcopyin(sopt, &is_idle,
5714 			    sizeof(is_idle), sizeof(is_idle));
5715 			if (error != 0) {
5716 				goto out;
5717 			}
5718 
5719 			if (is_idle != 0) {
5720 				sotoinpcb(so)->inp_flags2 |= INP2_CONNECTION_IDLE;
5721 			} else {
5722 				sotoinpcb(so)->inp_flags2 &= ~INP2_CONNECTION_IDLE;
5723 			}
5724 			break;
5725 		}
5726 		default:
5727 			error = ENOPROTOOPT;
5728 			break;
5729 		}
5730 		if (error == 0 && so->so_proto != NULL &&
5731 		    so->so_proto->pr_ctloutput != NULL) {
5732 			(void) so->so_proto->pr_ctloutput(so, sopt);
5733 		}
5734 	}
5735 out:
5736 	if (dolock) {
5737 		socket_unlock(so, 1);
5738 	}
5739 	return error;
5740 }
5741 
5742 /* Helper routines for getsockopt */
5743 int
sooptcopyout(struct sockopt * sopt,void * __sized_by (len)buf,size_t len)5744 sooptcopyout(struct sockopt *sopt, void *__sized_by(len) buf, size_t len)
5745 {
5746 	int     error;
5747 	size_t  valsize;
5748 
5749 	error = 0;
5750 
5751 	/*
5752 	 * Documented get behavior is that we always return a value,
5753 	 * possibly truncated to fit in the user's buffer.
5754 	 * Traditional behavior is that we always tell the user
5755 	 * precisely how much we copied, rather than something useful
5756 	 * like the total amount we had available for her.
5757 	 * Note that this interface is not idempotent; the entire answer must
5758 	 * generated ahead of time.
5759 	 */
5760 	valsize = MIN(len, sopt->sopt_valsize);
5761 	sopt->sopt_valsize = valsize;
5762 	if (sopt->sopt_valsize != 0 && sopt->sopt_val != USER_ADDR_NULL) {
5763 		if (sopt->sopt_p != kernproc) {
5764 			error = copyout(buf, sopt->sopt_val, valsize);
5765 		} else {
5766 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5767 			    CAST_DOWN(caddr_t, sopt->sopt_val),
5768 			    valsize);
5769 			bcopy(buf, tmp, valsize);
5770 		}
5771 	}
5772 	return error;
5773 }
5774 
5775 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5776 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5777 {
5778 	int                     error;
5779 	size_t                  len;
5780 	struct user64_timeval   tv64 = {};
5781 	struct user32_timeval   tv32 = {};
5782 	const void *            val;
5783 	size_t                  valsize;
5784 
5785 	error = 0;
5786 	if (proc_is64bit(sopt->sopt_p)) {
5787 		len = sizeof(tv64);
5788 		tv64.tv_sec = tv_p->tv_sec;
5789 		tv64.tv_usec = tv_p->tv_usec;
5790 		val = &tv64;
5791 	} else {
5792 		len = sizeof(tv32);
5793 		tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5794 		tv32.tv_usec = tv_p->tv_usec;
5795 		val = &tv32;
5796 	}
5797 	valsize = MIN(len, sopt->sopt_valsize);
5798 	sopt->sopt_valsize = valsize;
5799 	if (sopt->sopt_val != USER_ADDR_NULL) {
5800 		if (sopt->sopt_p != kernproc) {
5801 			error = copyout(val, sopt->sopt_val, valsize);
5802 		} else {
5803 			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5804 			    CAST_DOWN(caddr_t, sopt->sopt_val),
5805 			    valsize);
5806 			bcopy(val, tmp, valsize);
5807 		}
5808 	}
5809 	return error;
5810 }
5811 
5812 /*
5813  * Return:	0			Success
5814  *		ENOPROTOOPT
5815  *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5816  *	<pr_ctloutput>:???
5817  *	<sf_getoption>:???
5818  */
5819 int
sogetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5820 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5821 {
5822 	int     error, optval;
5823 	struct  linger l;
5824 	struct  timeval tv;
5825 
5826 	if (sopt->sopt_dir != SOPT_GET) {
5827 		sopt->sopt_dir = SOPT_GET;
5828 	}
5829 
5830 	if (dolock) {
5831 		socket_lock(so, 1);
5832 	}
5833 
5834 	error = sflt_getsockopt(so, sopt);
5835 	if (error != 0) {
5836 		if (error == EJUSTRETURN) {
5837 			error = 0;
5838 		}
5839 		goto out;
5840 	}
5841 
5842 	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
5843 		if (so->so_proto != NULL &&
5844 		    so->so_proto->pr_ctloutput != NULL) {
5845 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
5846 			goto out;
5847 		}
5848 		error = ENOPROTOOPT;
5849 	} else {
5850 		/*
5851 		 * Allow socket-level (SOL_SOCKET) options to be filtered by
5852 		 * the protocol layer, if needed.  A zero value returned from
5853 		 * the handler means use default socket-level processing as
5854 		 * done by the rest of this routine.  Otherwise, any other
5855 		 * return value indicates that the option is unsupported.
5856 		 */
5857 		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5858 		    pru_socheckopt(so, sopt)) != 0) {
5859 			goto out;
5860 		}
5861 
5862 		error = 0;
5863 		switch (sopt->sopt_name) {
5864 		case SO_LINGER:
5865 		case SO_LINGER_SEC:
5866 			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5867 			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5868 			    so->so_linger : so->so_linger / hz;
5869 			error = sooptcopyout(sopt, &l, sizeof(l));
5870 			break;
5871 
5872 		case SO_USELOOPBACK:
5873 		case SO_DONTROUTE:
5874 		case SO_DEBUG:
5875 		case SO_KEEPALIVE:
5876 		case SO_REUSEADDR:
5877 		case SO_REUSEPORT:
5878 		case SO_BROADCAST:
5879 		case SO_OOBINLINE:
5880 		case SO_TIMESTAMP:
5881 		case SO_TIMESTAMP_MONOTONIC:
5882 		case SO_TIMESTAMP_CONTINUOUS:
5883 		case SO_DONTTRUNC:
5884 		case SO_WANTMORE:
5885 		case SO_WANTOOBFLAG:
5886 		case SO_NOWAKEFROMSLEEP:
5887 		case SO_NOAPNFALLBK:
5888 			optval = so->so_options & sopt->sopt_name;
5889 integer:
5890 			error = sooptcopyout(sopt, &optval, sizeof(optval));
5891 			break;
5892 
5893 		case SO_TYPE:
5894 			optval = so->so_type;
5895 			goto integer;
5896 
5897 		case SO_NREAD:
5898 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5899 				int pkt_total;
5900 				struct mbuf *m1;
5901 
5902 				pkt_total = 0;
5903 				m1 = so->so_rcv.sb_mb;
5904 				while (m1 != NULL) {
5905 					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
5906 						pkt_total += m1->m_len;
5907 					}
5908 					m1 = m1->m_next;
5909 				}
5910 				optval = pkt_total;
5911 			} else {
5912 				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5913 			}
5914 			goto integer;
5915 
5916 		case SO_NUMRCVPKT:
5917 			if (so->so_proto->pr_flags & PR_ATOMIC) {
5918 				int cnt = 0;
5919 				struct mbuf *m1;
5920 
5921 				m1 = so->so_rcv.sb_mb;
5922 				while (m1 != NULL) {
5923 					cnt += 1;
5924 					m1 = m1->m_nextpkt;
5925 				}
5926 				optval = cnt;
5927 				goto integer;
5928 			} else {
5929 				error = ENOPROTOOPT;
5930 				break;
5931 			}
5932 
5933 		case SO_NWRITE:
5934 			optval = so->so_snd.sb_cc;
5935 			goto integer;
5936 
5937 		case SO_ERROR:
5938 			optval = so->so_error;
5939 			so->so_error = 0;
5940 			goto integer;
5941 
5942 		case SO_SNDBUF: {
5943 			u_int32_t hiwat = so->so_snd.sb_hiwat;
5944 
5945 			if (so->so_snd.sb_flags & SB_UNIX) {
5946 				struct unpcb *unp =
5947 				    (struct unpcb *)(so->so_pcb);
5948 				if (unp != NULL && unp->unp_conn != NULL) {
5949 					hiwat += unp->unp_conn->unp_cc;
5950 				}
5951 			}
5952 
5953 			optval = hiwat;
5954 			goto integer;
5955 		}
5956 		case SO_RCVBUF:
5957 			optval = so->so_rcv.sb_hiwat;
5958 			goto integer;
5959 
5960 		case SO_SNDLOWAT:
5961 			optval = so->so_snd.sb_lowat;
5962 			goto integer;
5963 
5964 		case SO_RCVLOWAT:
5965 			optval = so->so_rcv.sb_lowat;
5966 			goto integer;
5967 
5968 		case SO_SNDTIMEO:
5969 		case SO_RCVTIMEO:
5970 			tv = (sopt->sopt_name == SO_SNDTIMEO ?
5971 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5972 
5973 			error = sooptcopyout_timeval(sopt, &tv);
5974 			break;
5975 
5976 		case SO_NOSIGPIPE:
5977 			optval = (so->so_flags & SOF_NOSIGPIPE);
5978 			goto integer;
5979 
5980 		case SO_NOADDRERR:
5981 			optval = (so->so_flags & SOF_NOADDRAVAIL);
5982 			goto integer;
5983 
5984 		case SO_REUSESHAREUID:
5985 			optval = (so->so_flags & SOF_REUSESHAREUID);
5986 			goto integer;
5987 
5988 
5989 		case SO_NOTIFYCONFLICT:
5990 			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5991 			goto integer;
5992 
5993 		case SO_RESTRICTIONS:
5994 			optval = so_get_restrictions(so);
5995 			goto integer;
5996 
5997 		case SO_AWDL_UNRESTRICTED:
5998 			if (SOCK_DOM(so) == PF_INET ||
5999 			    SOCK_DOM(so) == PF_INET6) {
6000 				optval = inp_get_awdl_unrestricted(
6001 					sotoinpcb(so));
6002 				goto integer;
6003 			} else {
6004 				error = EOPNOTSUPP;
6005 			}
6006 			break;
6007 
6008 		case SO_INTCOPROC_ALLOW:
6009 			if (SOCK_DOM(so) == PF_INET6) {
6010 				optval = inp_get_intcoproc_allowed(
6011 					sotoinpcb(so));
6012 				goto integer;
6013 			} else {
6014 				error = EOPNOTSUPP;
6015 			}
6016 			break;
6017 
6018 		case SO_LABEL:
6019 			error = EOPNOTSUPP;
6020 			break;
6021 
6022 		case SO_PEERLABEL:
6023 			error = EOPNOTSUPP;
6024 			break;
6025 
6026 #ifdef __APPLE_API_PRIVATE
6027 		case SO_UPCALLCLOSEWAIT:
6028 			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6029 			goto integer;
6030 #endif
6031 		case SO_RANDOMPORT:
6032 			optval = (so->so_flags & SOF_BINDRANDOMPORT);
6033 			goto integer;
6034 
6035 		case SO_NP_EXTENSIONS: {
6036 			struct so_np_extensions sonpx = {};
6037 
6038 			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6039 			    SONPX_SETOPTSHUT : 0;
6040 			sonpx.npx_mask = SONPX_MASK_VALID;
6041 
6042 			error = sooptcopyout(sopt, &sonpx,
6043 			    sizeof(struct so_np_extensions));
6044 			break;
6045 		}
6046 
6047 		case SO_TRAFFIC_CLASS:
6048 			optval = so->so_traffic_class;
6049 			goto integer;
6050 
6051 		case SO_RECV_TRAFFIC_CLASS:
6052 			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6053 			goto integer;
6054 
6055 #if (DEVELOPMENT || DEBUG)
6056 		case SO_TRAFFIC_CLASS_DBG:
6057 			error = sogetopt_tcdbg(so, sopt);
6058 			break;
6059 #endif /* (DEVELOPMENT || DEBUG) */
6060 
6061 		case SO_PRIVILEGED_TRAFFIC_CLASS:
6062 			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6063 			goto integer;
6064 
6065 		case SO_DEFUNCTOK:
6066 			optval = !(so->so_flags & SOF_NODEFUNCT);
6067 			goto integer;
6068 
6069 		case SO_ISDEFUNCT:
6070 			optval = (so->so_flags & SOF_DEFUNCT);
6071 			goto integer;
6072 
6073 		case SO_OPPORTUNISTIC:
6074 			optval = so_get_opportunistic(so);
6075 			goto integer;
6076 
6077 		case SO_FLUSH:
6078 			/* This option is not gettable */
6079 			error = EINVAL;
6080 			break;
6081 
6082 		case SO_RECV_ANYIF:
6083 			optval = so_get_recv_anyif(so);
6084 			goto integer;
6085 
6086 		case SO_TRAFFIC_MGT_BACKGROUND:
6087 			/* This option is handled by lower layer(s) */
6088 			if (so->so_proto != NULL &&
6089 			    so->so_proto->pr_ctloutput != NULL) {
6090 				(void) so->so_proto->pr_ctloutput(so, sopt);
6091 			}
6092 			break;
6093 
6094 #if FLOW_DIVERT
6095 		case SO_FLOW_DIVERT_TOKEN:
6096 			error = flow_divert_token_get(so, sopt);
6097 			break;
6098 #endif  /* FLOW_DIVERT */
6099 
6100 #if NECP
6101 		case SO_NECP_ATTRIBUTES:
6102 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6103 				/* Handled by MPTCP itself */
6104 				break;
6105 			}
6106 
6107 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6108 				error = EINVAL;
6109 				goto out;
6110 			}
6111 
6112 			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6113 			break;
6114 
6115 		case SO_NECP_CLIENTUUID: {
6116 			uuid_t *ncu;
6117 
6118 			if (SOCK_DOM(so) == PF_MULTIPATH) {
6119 				ncu = &mpsotomppcb(so)->necp_client_uuid;
6120 			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6121 				ncu = &sotoinpcb(so)->necp_client_uuid;
6122 			} else {
6123 				error = EINVAL;
6124 				goto out;
6125 			}
6126 
6127 			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6128 			break;
6129 		}
6130 
6131 		case SO_NECP_LISTENUUID: {
6132 			uuid_t *nlu;
6133 
6134 			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6135 				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6136 					nlu = &sotoinpcb(so)->necp_client_uuid;
6137 				} else {
6138 					error = ENOENT;
6139 					goto out;
6140 				}
6141 			} else {
6142 				error = EINVAL;
6143 				goto out;
6144 			}
6145 
6146 			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6147 			break;
6148 		}
6149 
6150 		case SO_RESOLVER_SIGNATURE: {
6151 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6152 				error = EINVAL;
6153 				goto out;
6154 			}
6155 			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6156 			break;
6157 		}
6158 
6159 #endif /* NECP */
6160 
6161 #if CONTENT_FILTER
6162 		case SO_CFIL_SOCK_ID: {
6163 			cfil_sock_id_t sock_id;
6164 
6165 			sock_id = cfil_sock_id_from_socket(so);
6166 
6167 			error = sooptcopyout(sopt, &sock_id,
6168 			    sizeof(cfil_sock_id_t));
6169 			break;
6170 		}
6171 #endif  /* CONTENT_FILTER */
6172 
6173 		case SO_EXTENDED_BK_IDLE:
6174 			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6175 			goto integer;
6176 		case SO_MARK_CELLFALLBACK:
6177 			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6178 			    ? 1 : 0;
6179 			goto integer;
6180 		case SO_FALLBACK_MODE:
6181 			optval = so->so_fallback_mode;
6182 			goto integer;
6183 		case SO_MARK_KNOWN_TRACKER: {
6184 			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6185 			    ? 1 : 0;
6186 			goto integer;
6187 		}
6188 		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6189 			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6190 			    ? 1 : 0;
6191 			goto integer;
6192 		}
6193 		case SO_MARK_APPROVED_APP_DOMAIN: {
6194 			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6195 			    ? 1 : 0;
6196 			goto integer;
6197 		}
6198 		case SO_NET_SERVICE_TYPE: {
6199 			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6200 				optval = so->so_netsvctype;
6201 			} else {
6202 				optval = NET_SERVICE_TYPE_BE;
6203 			}
6204 			goto integer;
6205 		}
6206 		case SO_NETSVC_MARKING_LEVEL:
6207 			optval = so_get_netsvc_marking_level(so);
6208 			goto integer;
6209 
6210 		case SO_MPKL_SEND_INFO: {
6211 			struct so_mpkl_send_info so_mpkl_send_info;
6212 
6213 			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6214 			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6215 			error = sooptcopyout(sopt, &so_mpkl_send_info,
6216 			    sizeof(struct so_mpkl_send_info));
6217 			break;
6218 		}
6219 		case SO_MARK_WAKE_PKT:
6220 			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6221 			goto integer;
6222 		case SO_RECV_WAKE_PKT:
6223 			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6224 			goto integer;
6225 		case SO_APPLICATION_ID: {
6226 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6227 				error = EINVAL;
6228 				goto out;
6229 			}
6230 			so_application_id_t application_id = { 0 };
6231 			application_id.uid = kauth_cred_getuid(so->so_cred);
6232 			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
6233 			application_id.persona_id = so->so_persona_id;
6234 			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
6235 			break;
6236 		}
6237 		case SO_MARK_DOMAIN_INFO_SILENT:
6238 			optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
6239 			    ? 1 : 0;
6240 			goto integer;
6241 		case SO_MAX_PACING_RATE: {
6242 			uint64_t pacingrate;
6243 
6244 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6245 				error = EINVAL;
6246 				goto out;
6247 			}
6248 
6249 			pacingrate = sotoinpcb(so)->inp_max_pacing_rate;
6250 
6251 			error = sooptcopyout(sopt, &pacingrate, sizeof(pacingrate));
6252 			break;
6253 		}
6254 		case SO_CONNECTION_IDLE: {
6255 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6256 				error = EINVAL;
6257 				goto out;
6258 			}
6259 			optval = sotoinpcb(so)->inp_flags2 & INP2_CONNECTION_IDLE ?
6260 			    1 : 0;
6261 			goto integer;
6262 		}
6263 		default:
6264 			error = ENOPROTOOPT;
6265 			break;
6266 		}
6267 	}
6268 out:
6269 	if (dolock) {
6270 		socket_unlock(so, 1);
6271 	}
6272 	return error;
6273 }
6274 
6275 /*
6276  * The size limits on our soopt_getm is different from that on FreeBSD.
6277  * We limit the size of options to MCLBYTES. This will have to change
6278  * if we need to define options that need more space than MCLBYTES.
6279  */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = (int)sopt->sopt_valsize;
	int how;

	/* Reject empty options and anything larger than a single cluster */
	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
		return EMSGSIZE;
	}

	/* Requests from user processes may block; kernel callers must not */
	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	/* First mbuf becomes the head of the chain returned in *mp */
	MGET(m, how, MT_DATA);
	if (m == NULL) {
		return ENOBUFS;
	}
	if (sopt_size > MLEN) {
		/* Too big for an internal buffer; attach a cluster */
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Append additional mbufs until the whole option size is covered */
	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			/* Free the partially built chain before bailing */
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				/* m is not yet linked into *mp; free both */
				m_freem(*mp);
				m_freem(m);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
6333 
6334 /* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	/* Nothing to copy when no value buffer was supplied */
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	/* Fill each mbuf in turn as long as enough option bytes remain */
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			/* Source is a user-space address */
			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				/* Caller loses ownership; free whole chain */
				m_freem(m0);
				return error;
			}
		} else {
			/* Kernel caller: sopt_val is a kernel address */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    m->m_len);
			bcopy(tmp, mtod(m, char *), m->m_len);
		}
		/* Advance the source cursor past the bytes just consumed */
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* should be allocated enoughly at ip6_sooptmcopyin() */
	if (m != NULL) {
		/* Chain longer than the option value: allocator bug */
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return 0;
}
6370 
6371 /* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	/* Nothing to do when the caller supplied no value buffer */
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	/* Copy out each mbuf while the destination still has room for it */
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			/* Destination is a user-space address */
			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				/* Chain is consumed even on failure */
				m_freem(m0);
				return error;
			}
		} else {
			/* Kernel caller: sopt_val is a kernel address */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    m->m_len);

			bcopy(mtod(m, char *), tmp, m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return EINVAL;
	}
	/* Report the number of bytes actually copied out */
	sopt->sopt_valsize = valsize;
	return 0;
}
6411 
6412 void
sohasoutofband(struct socket * so)6413 sohasoutofband(struct socket *so)
6414 {
6415 	if (so->so_pgid < 0) {
6416 		gsignal(-so->so_pgid, SIGURG);
6417 	} else if (so->so_pgid > 0) {
6418 		proc_signal(so->so_pgid, SIGURG);
6419 	}
6420 	selwakeup(&so->so_rcv.sb_sel);
6421 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6422 		KNOTE(&so->so_rcv.sb_sel.si_note,
6423 		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
6424 	}
6425 }
6426 
6427 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6428 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6429 {
6430 #pragma unused(cred)
6431 	struct proc *p = current_proc();
6432 	int revents = 0;
6433 
6434 	socket_lock(so, 1);
6435 	so_update_last_owner_locked(so, PROC_NULL);
6436 	so_update_policy(so);
6437 
6438 	if (events & (POLLIN | POLLRDNORM)) {
6439 		if (soreadable(so)) {
6440 			revents |= events & (POLLIN | POLLRDNORM);
6441 		}
6442 	}
6443 
6444 	if (events & (POLLOUT | POLLWRNORM)) {
6445 		if (sowriteable(so)) {
6446 			revents |= events & (POLLOUT | POLLWRNORM);
6447 		}
6448 	}
6449 
6450 	if (events & (POLLPRI | POLLRDBAND)) {
6451 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6452 			revents |= events & (POLLPRI | POLLRDBAND);
6453 		}
6454 	}
6455 
6456 	if (revents == 0) {
6457 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6458 			/*
6459 			 * Darwin sets the flag first,
6460 			 * BSD calls selrecord first
6461 			 */
6462 			so->so_rcv.sb_flags |= SB_SEL;
6463 			selrecord(p, &so->so_rcv.sb_sel, wql);
6464 		}
6465 
6466 		if (events & (POLLOUT | POLLWRNORM)) {
6467 			/*
6468 			 * Darwin sets the flag first,
6469 			 * BSD calls selrecord first
6470 			 */
6471 			so->so_snd.sb_flags |= SB_SEL;
6472 			selrecord(p, &so->so_snd.sb_sel, wql);
6473 		}
6474 	}
6475 
6476 	socket_unlock(so, 1);
6477 	return revents;
6478 }
6479 
6480 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6481 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6482 {
6483 	struct socket *so = (struct socket *)fp_get_data(fp);
6484 	int result;
6485 
6486 	socket_lock(so, 1);
6487 	so_update_last_owner_locked(so, PROC_NULL);
6488 	so_update_policy(so);
6489 
6490 	switch (kn->kn_filter) {
6491 	case EVFILT_READ:
6492 		kn->kn_filtid = EVFILTID_SOREAD;
6493 		break;
6494 	case EVFILT_WRITE:
6495 		kn->kn_filtid = EVFILTID_SOWRITE;
6496 		break;
6497 	case EVFILT_SOCK:
6498 		kn->kn_filtid = EVFILTID_SCK;
6499 		break;
6500 	case EVFILT_EXCEPT:
6501 		kn->kn_filtid = EVFILTID_SOEXCEPT;
6502 		break;
6503 	default:
6504 		socket_unlock(so, 1);
6505 		knote_set_error(kn, EINVAL);
6506 		return 0;
6507 	}
6508 
6509 	/*
6510 	 * call the appropriate sub-filter attach
6511 	 * with the socket still locked
6512 	 */
6513 	result = knote_fops(kn)->f_attach(kn, kev);
6514 
6515 	socket_unlock(so, 1);
6516 
6517 	return result;
6518 }
6519 
/*
 * Shared readability check for the EVFILT_READ socket filter.
 * Returns non-zero when the knote should fire; when `kev' is non-NULL
 * the kevent is also filled in with `data' (bytes available, or the
 * listen backlog length for listeners).  Caller holds the socket lock.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* Listener fires when completed connections are queued */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			/* OOB requested and present: fire immediately */
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		/* Peer shut down (and no filtered data pending): EOF */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6598 
6599 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6600 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6601 {
6602 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6603 
6604 	/* socket locked */
6605 
6606 	/*
6607 	 * If the caller explicitly asked for OOB results (e.g. poll())
6608 	 * from EVFILT_READ, then save that off in the hookid field
6609 	 * and reserve the kn_flags EV_OOBAND bit for output only.
6610 	 */
6611 	if (kn->kn_filter == EVFILT_READ &&
6612 	    kn->kn_flags & EV_OOBAND) {
6613 		kn->kn_flags &= ~EV_OOBAND;
6614 		kn->kn_hook32 = EV_OOBAND;
6615 	} else {
6616 		kn->kn_hook32 = 0;
6617 	}
6618 	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6619 		so->so_rcv.sb_flags |= SB_KNOTE;
6620 	}
6621 
6622 	/* indicate if event is already fired */
6623 	return filt_soread_common(kn, NULL, so);
6624 }
6625 
6626 static void
filt_sordetach(struct knote * kn)6627 filt_sordetach(struct knote *kn)
6628 {
6629 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6630 
6631 	socket_lock(so, 1);
6632 	if (so->so_rcv.sb_flags & SB_KNOTE) {
6633 		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6634 			so->so_rcv.sb_flags &= ~SB_KNOTE;
6635 		}
6636 	}
6637 	socket_unlock(so, 1);
6638 }
6639 
6640 /*ARGSUSED*/
6641 static int
filt_soread(struct knote * kn,long hint)6642 filt_soread(struct knote *kn, long hint)
6643 {
6644 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6645 	int retval;
6646 
6647 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6648 		socket_lock(so, 1);
6649 	}
6650 
6651 	retval = filt_soread_common(kn, NULL, so);
6652 
6653 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6654 		socket_unlock(so, 1);
6655 	}
6656 
6657 	return retval;
6658 }
6659 
6660 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6661 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6662 {
6663 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6664 	int retval;
6665 
6666 	socket_lock(so, 1);
6667 
6668 	/* save off the new input fflags and data */
6669 	kn->kn_sfflags = kev->fflags;
6670 	kn->kn_sdata = kev->data;
6671 
6672 	/* determine if changes result in fired events */
6673 	retval = filt_soread_common(kn, NULL, so);
6674 
6675 	socket_unlock(so, 1);
6676 
6677 	return retval;
6678 }
6679 
6680 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6681 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6682 {
6683 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6684 	int retval;
6685 
6686 	socket_lock(so, 1);
6687 	retval = filt_soread_common(kn, kev, so);
6688 	socket_unlock(so, 1);
6689 
6690 	return retval;
6691 }
6692 
6693 int
so_wait_for_if_feedback(struct socket * so)6694 so_wait_for_if_feedback(struct socket *so)
6695 {
6696 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6697 	    (so->so_state & SS_ISCONNECTED)) {
6698 		struct inpcb *inp = sotoinpcb(so);
6699 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6700 			return 1;
6701 		}
6702 	}
6703 	return 0;
6704 }
6705 
/*
 * Shared writability check for the EVFILT_WRITE socket filter.
 * Returns non-zero when the knote should fire; when `kev' is non-NULL
 * the kevent is filled in with `data' (send buffer space available).
 * Caller holds the socket lock.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	if (so->so_state & SS_CANTSENDMORE) {
		/* Send side shut down: report EOF plus any pending error */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		/* Pre-connect data allowed: always writable */
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* NOTE_LOWAT overrides the low water mark, clamped to [lowwat, hiwat] */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/* Not-sent low water mark: defer to the transport */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress the event while waiting for interface feedback */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6817 
6818 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6819 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6820 {
6821 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6822 
6823 	/* socket locked */
6824 	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6825 		so->so_snd.sb_flags |= SB_KNOTE;
6826 	}
6827 
6828 	/* determine if its already fired */
6829 	return filt_sowrite_common(kn, NULL, so);
6830 }
6831 
6832 static void
filt_sowdetach(struct knote * kn)6833 filt_sowdetach(struct knote *kn)
6834 {
6835 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6836 	socket_lock(so, 1);
6837 
6838 	if (so->so_snd.sb_flags & SB_KNOTE) {
6839 		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6840 			so->so_snd.sb_flags &= ~SB_KNOTE;
6841 		}
6842 	}
6843 	socket_unlock(so, 1);
6844 }
6845 
6846 /*ARGSUSED*/
6847 static int
filt_sowrite(struct knote * kn,long hint)6848 filt_sowrite(struct knote *kn, long hint)
6849 {
6850 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6851 	int ret;
6852 
6853 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6854 		socket_lock(so, 1);
6855 	}
6856 
6857 	ret = filt_sowrite_common(kn, NULL, so);
6858 
6859 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6860 		socket_unlock(so, 1);
6861 	}
6862 
6863 	return ret;
6864 }
6865 
6866 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6867 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6868 {
6869 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6870 	int ret;
6871 
6872 	socket_lock(so, 1);
6873 
6874 	/*save off the new input fflags and data */
6875 	kn->kn_sfflags = kev->fflags;
6876 	kn->kn_sdata = kev->data;
6877 
6878 	/* determine if these changes result in a triggered event */
6879 	ret = filt_sowrite_common(kn, NULL, so);
6880 
6881 	socket_unlock(so, 1);
6882 
6883 	return ret;
6884 }
6885 
6886 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6887 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6888 {
6889 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6890 	int ret;
6891 
6892 	socket_lock(so, 1);
6893 	ret = filt_sowrite_common(kn, kev, so);
6894 	socket_unlock(so, 1);
6895 
6896 	return ret;
6897 }
6898 
/*
 * Shared event evaluation for the EVFILT_SOCK filter.  Merges the hint
 * bits (`ev_hint') and current socket state into kn_fflags, suppresses
 * re-delivery of level-triggered events already recorded in kn_hook32,
 * and (when `kev' is non-NULL) fills in the kevent.  Returns non-zero
 * when the knote should fire.  Caller holds the socket lock.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered events: taken from the hint bits only */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* Level-triggered events: hint OR current socket state */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		/* Only protocols that opt in deliver conninfo updates */
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* SUSPEND and RESUME are mutually exclusive; keep only the latest */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* data carries either the socket error or the connection state */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7039 
7040 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7041 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7042 {
7043 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7044 
7045 	/* socket locked */
7046 	kn->kn_hook32 = 0;
7047 	if (KNOTE_ATTACH(&so->so_klist, kn)) {
7048 		so->so_flags |= SOF_KNOTE;
7049 	}
7050 
7051 	/* determine if event already fired */
7052 	return filt_sockev_common(kn, NULL, so, 0);
7053 }
7054 
7055 static void
filt_sockdetach(struct knote * kn)7056 filt_sockdetach(struct knote *kn)
7057 {
7058 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7059 	socket_lock(so, 1);
7060 
7061 	if ((so->so_flags & SOF_KNOTE) != 0) {
7062 		if (KNOTE_DETACH(&so->so_klist, kn)) {
7063 			so->so_flags &= ~SOF_KNOTE;
7064 		}
7065 	}
7066 	socket_unlock(so, 1);
7067 }
7068 
7069 static int
filt_sockev(struct knote * kn,long hint)7070 filt_sockev(struct knote *kn, long hint)
7071 {
7072 	int ret = 0, locked = 0;
7073 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7074 	long ev_hint = (hint & SO_FILT_HINT_EV);
7075 
7076 	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7077 		socket_lock(so, 1);
7078 		locked = 1;
7079 	}
7080 
7081 	ret = filt_sockev_common(kn, NULL, so, ev_hint);
7082 
7083 	if (locked) {
7084 		socket_unlock(so, 1);
7085 	}
7086 
7087 	return ret;
7088 }
7089 
7090 
7091 
7092 /*
7093  *	filt_socktouch - update event state
7094  */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	/* bits whose interest changed relative to what was delivered */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}
7134 
7135 /*
7136  *	filt_sockprocess - query event fired state and return data
7137  */
7138 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7139 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7140 {
7141 	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7142 	int ret = 0;
7143 
7144 	socket_lock(so, 1);
7145 
7146 	ret = filt_sockev_common(kn, kev, so, 0);
7147 
7148 	socket_unlock(so, 1);
7149 
7150 	return ret;
7151 }
7152 
7153 void
get_sockev_state(struct socket * so,u_int32_t * statep)7154 get_sockev_state(struct socket *so, u_int32_t *statep)
7155 {
7156 	u_int32_t state = *(statep);
7157 
7158 	/*
7159 	 * If the state variable is already used by a previous event,
7160 	 * reset it.
7161 	 */
7162 	if (state != 0) {
7163 		return;
7164 	}
7165 
7166 	if (so->so_state & SS_ISCONNECTED) {
7167 		state |= SOCKEV_CONNECTED;
7168 	} else {
7169 		state &= ~(SOCKEV_CONNECTED);
7170 	}
7171 	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7172 	*(statep) = state;
7173 }
7174 
7175 #define SO_LOCK_HISTORY_STR_LEN \
7176 	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7177 
/*
 * Format the socket's lock/unlock return-address history as a string of
 * "lock:unlock " pointer pairs, most recent entry first.
 * NOTE: returns a static buffer, overwritten by the next call.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	/* next_*_lr points at the oldest slot; walk the rings newest-first */
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return __unsafe_null_terminated_from_indexable(lock_history_str);
}
7194 
7195 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7196 socket_getlock(struct socket *so, int flags)
7197 {
7198 	if (so->so_proto->pr_getlock != NULL) {
7199 		return (*so->so_proto->pr_getlock)(so, flags);
7200 	} else {
7201 		return so->so_proto->pr_domain->dom_mtx;
7202 	}
7203 }
7204 
/*
 * Lock a socket, optionally taking a use-count reference.  Protocols
 * that supply pr_lock do their own locking (and reference counting);
 * otherwise the shared domain mutex is used.  The caller's return
 * address is recorded in the lock-history ring buffer for debugging.
 */
void
socket_lock(struct socket *so, int refcount)
{
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		/* record caller in the lock-history ring buffer */
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7225 
7226 void
socket_lock_assert_owned(struct socket * so)7227 socket_lock_assert_owned(struct socket *so)
7228 {
7229 	lck_mtx_t *mutex_held;
7230 
7231 	if (so->so_proto->pr_getlock != NULL) {
7232 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7233 	} else {
7234 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7235 	}
7236 
7237 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7238 }
7239 
7240 int
socket_try_lock(struct socket * so)7241 socket_try_lock(struct socket *so)
7242 {
7243 	lck_mtx_t *mtx;
7244 
7245 	if (so->so_proto->pr_getlock != NULL) {
7246 		mtx = (*so->so_proto->pr_getlock)(so, 0);
7247 	} else {
7248 		mtx = so->so_proto->pr_domain->dom_mtx;
7249 	}
7250 
7251 	return lck_mtx_try_lock(mtx);
7252 }
7253 
/*
 * Release the lock taken by socket_lock().  When `refcount' is non-zero a
 * use-count reference is also dropped, and the socket is reclaimed via
 * sofreelastref() once the last reference goes away.  Protocols providing
 * their own pr_unlock routine perform the equivalent bookkeeping there.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	lck_mtx_t *mutex_held;
	/* Caller's return address, recorded in the unlock-site history */
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* Dropping a reference nobody holds is fatal */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			/* Last reference: free before releasing the mutex */
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7292 
7293 /* Called with socket locked, will unlock socket */
7294 void
sofree(struct socket * so)7295 sofree(struct socket *so)
7296 {
7297 	lck_mtx_t *mutex_held;
7298 
7299 	if (so->so_proto->pr_getlock != NULL) {
7300 		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7301 	} else {
7302 		mutex_held = so->so_proto->pr_domain->dom_mtx;
7303 	}
7304 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7305 
7306 	sofreelastref(so, 0);
7307 }
7308 
/*
 * Take an additional use-count reference on the socket.  The lock is
 * acquired with refcount=1 (which bumps so_usecount) and then released
 * without dropping the reference.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7315 
/*
 * Drop a use-count reference previously taken with soreference().
 * socket_unlock() with refcount=1 decrements so_usecount and frees the
 * socket via sofreelastref() when the last reference is gone.
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7322 
7323 /*
7324  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7325  * possibility of using jumbo clusters.  Caller must ensure to hold
7326  * the socket lock.
7327  */
7328 void
somultipages(struct socket * so,boolean_t set)7329 somultipages(struct socket *so, boolean_t set)
7330 {
7331 	if (set) {
7332 		so->so_flags |= SOF_MULTIPAGES;
7333 	} else {
7334 		so->so_flags &= ~SOF_MULTIPAGES;
7335 	}
7336 }
7337 
7338 void
soif2kcl(struct socket * so,boolean_t set)7339 soif2kcl(struct socket *so, boolean_t set)
7340 {
7341 	if (set) {
7342 		so->so_flags1 |= SOF1_IF_2KCL;
7343 	} else {
7344 		so->so_flags1 &= ~SOF1_IF_2KCL;
7345 	}
7346 }
7347 
7348 int
so_isdstlocal(struct socket * so)7349 so_isdstlocal(struct socket *so)
7350 {
7351 	struct inpcb *inp = (struct inpcb *)so->so_pcb;
7352 
7353 	if (SOCK_DOM(so) == PF_INET) {
7354 		return inaddr_local(inp->inp_faddr);
7355 	} else if (SOCK_DOM(so) == PF_INET6) {
7356 		return in6addr_local(&inp->in6p_faddr);
7357 	}
7358 
7359 	return 0;
7360 }
7361 
/*
 * First phase of defuncting a socket: mark it SOF_DEFUNCT, set SB_DROP on
 * both socket buffers so no further data is queued, and flush whatever is
 * already buffered.  The teardown is later completed by sodefunct().
 *
 * Returns 0 on success, or EOPNOTSUPP when the socket declines to be
 * defuncted: either SOF_NODEFUNCT is set and `noforce' honors it, or an
 * extended-background-idle grace period is granted instead.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both buffers must be in drop mode */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			/* Socket opted out and the caller is not forcing */
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* Forced: clear the opt-out flag and defunct anyway */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		/*
		 * Socket asked for extended background idle time: decide
		 * whether to grant a grace period instead of defuncting,
		 * and account for the outcome in soextbkidlestat.
		 */
		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* Grant the grace period; a lazy timer re-checks it */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7471 
/*
 * Second phase of defuncting: called after sosetdefunct() has set
 * SOF_DEFUNCT.  Tells the protocol the flow is defunct, wakes blocked
 * threads, shuts down both directions, disconnects, releases any data
 * still buffered and finally marks the socket SS_DEFUNCT.  Idempotent:
 * a socket already in SS_DEFUNCT is left untouched.  Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		/* Log the local/foreign addresses of the defuncted flow */
		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void)  (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* Release any data still queued in either socket buffer */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7584 
/*
 * Take a socket out of the extended-background-idle "in progress" state,
 * e.g. when its owning process resumes.  `locked' indicates whether the
 * caller already holds the socket lock.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		/* Clear the in-progress state on socket and process */
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		/* Update the global extended-bk-idle statistics */
		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7614 
7615 /*
7616  * Does not attempt to account for sockets that are delegated from
7617  * the current process
7618  */
7619 int
so_set_extended_bk_idle(struct socket * so,int optval)7620 so_set_extended_bk_idle(struct socket *so, int optval)
7621 {
7622 	int error = 0;
7623 
7624 	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7625 	    SOCK_PROTO(so) != IPPROTO_TCP) {
7626 		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7627 		error = EOPNOTSUPP;
7628 	} else if (optval == 0) {
7629 		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7630 
7631 		soresume(current_proc(), so, 1);
7632 	} else {
7633 		struct proc *p = current_proc();
7634 		struct fileproc *fp;
7635 		int count = 0;
7636 
7637 		/*
7638 		 * Unlock socket to avoid lock ordering issue with
7639 		 * the proc fd table lock
7640 		 */
7641 		socket_unlock(so, 0);
7642 
7643 		proc_fdlock(p);
7644 		fdt_foreach(fp, p) {
7645 			struct socket *so2;
7646 
7647 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7648 				continue;
7649 			}
7650 
7651 			so2 = (struct socket *)fp_get_data(fp);
7652 			if (so != so2 &&
7653 			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7654 				count++;
7655 			}
7656 			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7657 				break;
7658 			}
7659 		}
7660 		proc_fdunlock(p);
7661 
7662 		socket_lock(so, 0);
7663 
7664 		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7665 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7666 			error = EBUSY;
7667 		} else if (so->so_flags & SOF_DELEGATED) {
7668 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7669 			error = EBUSY;
7670 		} else {
7671 			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7672 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7673 		}
7674 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7675 		    "%s marked for extended bk idle\n",
7676 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7677 		    so->so_gencnt,
7678 		    SOCK_DOM(so), SOCK_TYPE(so),
7679 		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7680 		    "is" : "not");
7681 	}
7682 
7683 	return error;
7684 }
7685 
/*
 * Terminate a socket's extended-background-idle grace period and force
 * it to be defuncted via sosetdefunct()/sodefunct().
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	/* sodefunct() only if the first phase actually marked it */
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7704 
7705 void
so_drain_extended_bk_idle(struct socket * so)7706 so_drain_extended_bk_idle(struct socket *so)
7707 {
7708 	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7709 		/*
7710 		 * Only penalize sockets that have outstanding data
7711 		 */
7712 		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7713 			so_stop_extended_bk_idle(so);
7714 
7715 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7716 		}
7717 	}
7718 }
7719 
7720 /*
7721  * Return values tells if socket is still in extended background idle
7722  */
7723 int
so_check_extended_bk_idle_time(struct socket * so)7724 so_check_extended_bk_idle_time(struct socket *so)
7725 {
7726 	int ret = 1;
7727 
7728 	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7729 		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7730 		    __func__, proc_selfpid(), proc_best_name(current_proc()),
7731 		    so->so_gencnt,
7732 		    SOCK_DOM(so), SOCK_TYPE(so));
7733 		if (net_uptime() - so->so_extended_bk_start >
7734 		    soextbkidlestat.so_xbkidle_time) {
7735 			so_stop_extended_bk_idle(so);
7736 
7737 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7738 
7739 			ret = 0;
7740 		} else {
7741 			struct inpcb *inp = (struct inpcb *)so->so_pcb;
7742 
7743 			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7744 			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7745 		}
7746 	}
7747 
7748 	return ret;
7749 }
7750 
7751 void
resume_proc_sockets(proc_t p)7752 resume_proc_sockets(proc_t p)
7753 {
7754 	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7755 		struct fileproc *fp;
7756 		struct socket *so;
7757 
7758 		proc_fdlock(p);
7759 		fdt_foreach(fp, p) {
7760 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7761 				continue;
7762 			}
7763 
7764 			so = (struct socket *)fp_get_data(fp);
7765 			(void) soresume(p, so, 0);
7766 		}
7767 		proc_fdunlock(p);
7768 
7769 		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7770 	}
7771 }
7772 
7773 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7774 so_set_recv_anyif(struct socket *so, int optval)
7775 {
7776 	int ret = 0;
7777 
7778 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7779 		if (optval) {
7780 			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7781 		} else {
7782 			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7783 		}
7784 #if SKYWALK
7785 		inp_update_netns_flags(so);
7786 #endif /* SKYWALK */
7787 	}
7788 
7789 
7790 	return ret;
7791 }
7792 
7793 __private_extern__ int
so_get_recv_anyif(struct socket * so)7794 so_get_recv_anyif(struct socket *so)
7795 {
7796 	int ret = 0;
7797 
7798 	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7799 		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7800 	}
7801 
7802 	return ret;
7803 }
7804 
/*
 * Apply deny-type restrictions (in/out/cellular/expensive/constrained)
 * to a socket and propagate newly-set ones to the inpcb or MPTCP layer.
 * Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7863 
/*
 * Return the deny-type restrictions currently set on the socket.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED is settable via
 * so_set_restrictions() but is not part of this mask — confirm whether
 * its omission here is intentional.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
7871 
/*
 * Delegate the socket to the process identified by `epid' (the
 * "effective" pid).  When check_cred is set, the issuing process must be
 * suitably related to the socket or hold the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege.  Delegating the socket
 * to the issuing process itself clears any existing delegation.
 *
 * Returns 0 on success (and re-evaluates the socket's policy state), or
 * EINVAL / EACCES / ESRCH on failure.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): the comment above reads "owner OR self", but the
	 * `||' below only skips the privilege check when epid matches
	 * both last_pid and the issuing pid — confirm intent.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		/* Record the delegate's identity on the socket */
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* Also track the delegate's responsible process, if any */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
7987 
/*
 * Delegate the socket to the process identified by executable UUID
 * `euuid'.  Analogous to so_set_effective_pid(), but since only the UUID
 * is known, the socket's real {pid, upid} are inherited as the effective
 * ones.  When check_cred is set, the issuing process must be suitably
 * related to the socket or hold PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.
 *
 * Returns 0 on success (and re-evaluates the socket's policy state), or
 * EINVAL / EACCES on failure.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): as in so_set_effective_pid(), the `||' below only
	 * skips the privilege check when euuid matches both last_uuid and
	 * the issuing process's uuid — confirm intent.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8091 
8092 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8093 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8094     uint32_t ev_datalen)
8095 {
8096 	struct kev_msg ev_msg;
8097 
8098 	/*
8099 	 * A netpolicy event always starts with a netpolicy_event_data
8100 	 * structure, but the caller can provide for a longer event
8101 	 * structure to post, depending on the event code.
8102 	 */
8103 	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8104 
8105 	bzero(&ev_msg, sizeof(ev_msg));
8106 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8107 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
8108 	ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8109 	ev_msg.event_code       = ev_code;
8110 
8111 	ev_msg.dv[0].data_ptr   = ev_data;
8112 	ev_msg.dv[0].data_length = ev_datalen;
8113 
8114 	kev_post_msg(&ev_msg);
8115 }
8116 
8117 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8118 socket_post_kev_msg(uint32_t ev_code,
8119     struct kev_socket_event_data *ev_data,
8120     uint32_t ev_datalen)
8121 {
8122 	struct kev_msg ev_msg;
8123 
8124 	bzero(&ev_msg, sizeof(ev_msg));
8125 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
8126 	ev_msg.kev_class = KEV_NETWORK_CLASS;
8127 	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8128 	ev_msg.event_code = ev_code;
8129 
8130 	ev_msg.dv[0].data_ptr = ev_data;
8131 	ev_msg.dv[0].data_length = ev_datalen;
8132 
8133 	kev_post_msg(&ev_msg);
8134 }
8135 
8136 void
socket_post_kev_msg_closed(struct socket * so)8137 socket_post_kev_msg_closed(struct socket *so)
8138 {
8139 	struct kev_socket_closed ev = {};
8140 	struct sockaddr *__single socksa = NULL, *__single peersa = NULL;
8141 	int err;
8142 
8143 	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8144 		return;
8145 	}
8146 	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8147 	if (err == 0) {
8148 		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8149 		    &peersa);
8150 		if (err == 0) {
8151 			SOCKADDR_COPY(socksa, &ev.ev_data.kev_sockname,
8152 			    min(socksa->sa_len,
8153 			    sizeof(ev.ev_data.kev_sockname)));
8154 			SOCKADDR_COPY(peersa, &ev.ev_data.kev_peername,
8155 			    min(peersa->sa_len,
8156 			    sizeof(ev.ev_data.kev_peername)));
8157 			socket_post_kev_msg(KEV_SOCKET_CLOSED,
8158 			    &ev.ev_data, sizeof(ev));
8159 		}
8160 	}
8161 	free_sockaddr(socksa);
8162 	free_sockaddr(peersa);
8163 }
8164 
8165 void
sock_parse_cm_info(struct mbuf * control,struct sock_cm_info * sockcminfo)8166 sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo)
8167 {
8168 	struct cmsghdr *cm;
8169 
8170 	for (cm = M_FIRST_CMSGHDR(control);
8171 	    is_cmsg_valid(control, cm);
8172 	    cm = M_NXT_CMSGHDR(control, cm)) {
8173 		int val;
8174 
8175 		if (cm->cmsg_level != SOL_SOCKET) {
8176 			continue;
8177 		}
8178 
8179 		if (cm->cmsg_len == CMSG_LEN(sizeof(int))) {
8180 			val = *(int *)(void *)CMSG_DATA(cm);
8181 		}
8182 
8183 		switch (cm->cmsg_type) {
8184 		case SO_TRAFFIC_CLASS:
8185 			if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
8186 				break;
8187 			}
8188 			if (SO_VALID_TC(val)) {
8189 				sockcminfo->sotc = val;
8190 				break;
8191 			} else if (val < SO_TC_NET_SERVICE_OFFSET) {
8192 				break;
8193 			}
8194 			/*
8195 			 * Handle the case SO_NET_SERVICE_TYPE values are
8196 			 * passed using SO_TRAFFIC_CLASS
8197 			 */
8198 			val = val - SO_TC_NET_SERVICE_OFFSET;
8199 
8200 			OS_FALLTHROUGH;
8201 		case SO_NET_SERVICE_TYPE:
8202 			if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
8203 				break;
8204 			}
8205 
8206 			if (!IS_VALID_NET_SERVICE_TYPE(val)) {
8207 				break;
8208 			}
8209 			sockcminfo->netsvctype = val;
8210 			sockcminfo->sotc = sotc_by_netservicetype[val];
8211 			break;
8212 		case SCM_TXTIME:
8213 			if (cm->cmsg_len != CMSG_LEN(sizeof(uint64_t))) {
8214 				break;
8215 			}
8216 
8217 			sockcminfo->tx_time = *(uint64_t *)(void *)CMSG_DATA(cm);
8218 			break;
8219 		default:
8220 			break;
8221 		}
8222 	}
8223 }
8224 
/*
 * Assertion-failure backstop: panics with the failed expression, file
 * and line.  Declared to return int so it can be used inside macro
 * expressions, but it never actually returns.
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8233