1 /*
2 * Copyright (c) 1998-2022, 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <net/sockaddr_utils.h>
106 #include <netinet/in.h>
107 #include <netinet/in_pcb.h>
108 #include <netinet/in_tclass.h>
109 #include <netinet/in_var.h>
110 #include <netinet/tcp_var.h>
111 #include <netinet/ip6.h>
112 #include <netinet6/ip6_var.h>
113 #include <netinet/flow_divert.h>
114 #include <kern/assert.h>
115 #include <kern/locks.h>
116 #include <kern/mem_acct.h>
117 #include <kern/policy_internal.h>
118 #include <kern/uipc_domain.h>
119 #include <kern/uipc_socket.h>
120 #include <kern/task.h>
121 #include <kern/zalloc.h>
122 #include <machine/limits.h>
123 #include <libkern/OSAtomic.h>
124 #include <pexpert/pexpert.h>
125
126 #include <sys/kpi_mbuf.h>
127 #include <sys/mcache.h>
128 #include <sys/unpcb.h>
129 #include <libkern/section_keywords.h>
130
131 #include <os/log.h>
132
133 #if CONFIG_MACF
134 #include <security/mac_framework.h>
135 #endif /* MAC */
136
137 #if MULTIPATH
138 #include <netinet/mp_pcb.h>
139 #include <netinet/mptcp_var.h>
140 #endif /* MULTIPATH */
141
142 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
143
144 #if DEBUG || DEVELOPMENT
145 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
146 #else
147 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
148 #endif
149
/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

/* Set once by socketinit() to guard against repeated initialization. */
static int socketinit_done;
/* Memory-accounting handle for all sockets; registered in socketinit(). */
struct mem_acct *socket_memacct;
157
/* EVFILT_READ knote callbacks (also reused for EVFILT_EXCEPT, see soexcept_filtops) */
static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

/* EVFILT_WRITE knote callbacks */
static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

/* EVFILT_SOCK knote callbacks */
static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Helpers for copying struct timeval socket options in and out of a sockopt */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
178
/* EVFILT_READ filter operations for sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

/* EVFILT_WRITE filter operations for sockets */
SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

/* EVFILT_SOCK filter operations (socket state-change events) */
SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/*
 * EVFILT_EXCEPT filter operations; shares the read-side callbacks
 * (same function pointers as soread_filtops above).
 */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
214
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

/* Verbose socket-layer debugging; overridable via the "socket_debug" boot-arg. */
int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

/*
 * NOTE(review): the variable is initialized to 0 while
 * DEFAULT_SOSEND_ASSERT_PANIC is passed as the SYSCTL_INT static-value
 * argument, which is typically unused when a variable pointer is supplied.
 * The effective default thus appears to be 0 (modulo the
 * "sosend_assert_panic" boot-arg parsed in socketinit()) — confirm this
 * is intentional on DEBUG/DEVELOPMENT builds.
 */
int sosend_assert_panic = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");

/* Running count of sodefunct() invocations, exported via sysctl. */
static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");
236
/* Zone backing all struct socket allocations; memory is cleared on free. */
ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;             /* generation count for sockets */

/* kdebug trace codes for the socket layer */
#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
249
/* Upper bound for listen() backlogs; also used when backlog <= 0 (see solisten). */
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

/* When set, SODEFUNCTLOG() emits log messages about socket defuncting. */
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
#endif /* DEBUG || DEVELOPMENT */

/* Incremented by so_acquire_accept_list() each time it must sleep. */
int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */
307
extern struct inpcbinfo tcbinfo;

/* Forward declaration for the delayed-copy receive helper. */
static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
#define SO_IDLE_BK_IDLE_TIME 600
#define SO_IDLE_BK_IDLE_RCV_HIWAT 131072

/* Tunables and counters for extended-background-idle sockets; seeded in socketinit(). */
struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 *
 * NOTE(review): the initializer below is 0, which does not appear to have
 * SOTCDB_NO_DSCP set — confirm whether the comment above is stale or the
 * default is established elsewhere.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
351
/*
 * One-time initialization of the socket layer.  Verifies layout invariants
 * (sa_endpoints must match the user32/user64 ABI layouts field-for-field),
 * applies boot-args, seeds the extended-background-idle defaults,
 * initializes the inpcb module, and registers socket memory accounting.
 * Subsequent calls are no-ops (guarded by socketinit_done).
 */
void
socketinit(void)
{
	static_assert(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	/* Kernel struct sa_endpoints must mirror the 64-bit user layout. */
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	/* Kernel struct sa_endpoints must mirror the 32-bit user layout. */
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow boot-args to override the debug/assert knobs. */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	/* Default policy for extended-background-idle sockets. */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();

	socket_memacct = mem_acct_register("SOCKET", 0, 0);
	if (socket_memacct == NULL) {
		panic("mem_acct_register returned NULL");
	}
}
398
399 void
so_update_last_owner_locked(struct socket * so,proc_t self)400 so_update_last_owner_locked(struct socket *so, proc_t self)
401 {
402 if (so->last_pid != 0) {
403 /*
404 * last_pid and last_upid should remain zero for sockets
405 * created using sock_socket. The check above achieves that
406 */
407 if (self == PROC_NULL) {
408 self = current_proc();
409 }
410
411 if (so->last_upid != proc_uniqueid(self) ||
412 so->last_pid != proc_pid(self)) {
413 so->last_upid = proc_uniqueid(self);
414 so->last_pid = proc_pid(self);
415 proc_getexecutableuuid(self, so->last_uuid,
416 sizeof(so->last_uuid));
417 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
418 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
419 }
420 }
421 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
422 }
423 }
424
425 void
so_update_policy(struct socket * so)426 so_update_policy(struct socket *so)
427 {
428 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
429 (void) inp_update_policy(sotoinpcb(so));
430 }
431 }
432
#if NECP
/*
 * Re-evaluate the NECP policy for an inpcb-based (IPv4/IPv6) socket,
 * optionally overriding the local and/or remote address used for matching.
 * Sockets in other domains are ignored.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	const int dom = SOCK_DOM(so);

	if (dom != PF_INET && dom != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
444
445 /*
446 * Get a socket structure from our zone, and initialize it.
447 *
448 * Note that it would probably be better to allocate socket
449 * and PCB at the same time, but I'm not convinced that all
450 * the protocols can be easily modified to do this.
451 */
452 struct socket *
soalloc(void)453 soalloc(void)
454 {
455 struct socket *__single so;
456
457 so = zalloc_flags(socket_zone, Z_WAITOK_ZERO);
458 if (so != NULL) {
459 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
460
461 /*
462 * Increment the socket allocation statistics
463 */
464 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
465 }
466
467 return so;
468 }
469
/*
 * Common socket creation path shared by socreate() and socreate_delegate().
 *
 * dom/type/proto select the protocol switch entry; `p' is the creating
 * process, `ep' an optional delegate ("effective") process, and `flags'
 * may carry SOCF_MPTCP.  On success *aso receives the new socket with a
 * use count of 1; on failure *aso is NULL and an errno value is returned
 * (see the enumeration in the comment above socreate()).
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;

	VERIFY(aso != NULL);
	*aso = NULL;

	/* A non-zero protocol takes precedence over the type-based lookup. */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/* Distinguish "no such domain" from "no such protocol". */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			/* Protocol exists in the domain, but not for this type. */
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	/* Refuse creation when the protocol is over its memory hard limit. */
	if (proto_memacct_hardlimit(prp)) {
		return ENOBUFS;
	}
	so = soalloc();
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain socket creation statistics. */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	/* Stamp the creating process as the socket's last/initial owner. */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	/* Record delegate identity when creating on behalf of another process. */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	/* Fall back to the creator's responsible process when no delegate set one. */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	proto_memacct_add(so->so_proto, sizeof(struct socket));

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1); /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
680
681 /*
682 * Returns: 0 Success
683 * EAFNOSUPPORT
684 * EPROTOTYPE
685 * EPROTONOSUPPORT
686 * ENOBUFS
687 * <pru_attach>:ENOBUFS[AF_UNIX]
688 * <pru_attach>:ENOBUFS[TCP]
689 * <pru_attach>:ENOMEM[TCP]
690 * <pru_attach>:??? [other protocol families, IPSEC]
691 */
692 int
socreate(int dom,struct socket ** aso,int type,int proto)693 socreate(int dom, struct socket **aso, int type, int proto)
694 {
695 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
696 PROC_NULL);
697 }
698
699 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)700 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
701 {
702 int error = 0;
703 struct proc *ep = PROC_NULL;
704
705 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
706 error = ESRCH;
707 goto done;
708 }
709
710 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
711
712 /*
713 * It might not be wise to hold the proc reference when calling
714 * socreate_internal since it calls soalloc with M_WAITOK
715 */
716 done:
717 if (ep != PROC_NULL) {
718 proc_rele(ep);
719 }
720
721 return error;
722 }
723
724 /*
725 * Returns: 0 Success
726 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
727 * <pru_bind>:EAFNOSUPPORT Address family not supported
728 * <pru_bind>:EADDRNOTAVAIL Address not available.
729 * <pru_bind>:EINVAL Invalid argument
730 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
731 * <pru_bind>:EACCES Permission denied
732 * <pru_bind>:EADDRINUSE Address in use
733 * <pru_bind>:EAGAIN Resource unavailable, try again
734 * <pru_bind>:EPERM Operation not permitted
735 * <pru_bind>:???
736 * <sf_bind>:???
737 *
738 * Notes: It's not possible to fully enumerate the return codes above,
739 * since socket filter authors and protocol family authors may
740 * not choose to limit their error returns to those listed, even
741 * though this may result in some software operating incorrectly.
742 *
743 * The error codes which are enumerated above are those known to
744 * be returned by the tcp_usr_bind function supplied.
745 */
/*
 * Bind address `nam' to socket `so'.  When `dolock' is non-zero the socket
 * lock is acquired and released here; otherwise the caller must already
 * hold it.  Socket filters run before the protocol's pru_bind and may veto
 * the request; EJUSTRETURN from a filter is translated to success.
 * See the error enumeration in the comment block above.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership attribution and policy state before binding. */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means it handled the bind; report success. */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
793
/*
 * Final teardown of a socket structure: undo the per-protocol memory
 * accounting, drop the credential reference taken at creation, detach any
 * socket filters, bump the generation count and return the memory to the
 * socket zone.  `so' must not be referenced after this returns.
 */
void
sodealloc(struct socket *so)
{
	proto_memacct_sub(so->so_proto, sizeof(struct socket));

	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* A new generation number marks this socket slot as recycled. */
	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	zfree(socket_zone, so);
}
808
809 /*
810 * Returns: 0 Success
811 * EINVAL
812 * EOPNOTSUPP
813 * <pru_listen>:EINVAL[AF_UNIX]
814 * <pru_listen>:EINVAL[TCP]
815 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
816 * <pru_listen>:EINVAL[TCP] Invalid argument
817 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
818 * <pru_listen>:EACCES[TCP] Permission denied
819 * <pru_listen>:EADDRINUSE[TCP] Address in use
820 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
821 * <pru_listen>:EPERM[TCP] Operation not permitted
822 * <sf_listen>:???
823 *
824 * Notes: Other <pru_listen> returns depend on the protocol family; all
825 * <sf_listen> returns depend on what the filter author causes
826 * their filter to return.
827 */
/*
 * Put socket `so' into the listening state with queue limit `backlog'.
 * SO_ACCEPTCONN is set optimistically up front (when the completed-connection
 * queue is empty) and cleared again on every error path below.
 * See the error enumeration in the comment block above.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/* Optimistically mark the socket as accepting connections. */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* Listening only makes sense for connection-oriented protocols. */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Sockets restricted from receiving may not listen. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Socket filters first; then the protocol's listen handler. */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled"; report success. */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
920
921 /*
922 * The "accept list lock" protects the fields related to the listener queues
923 * because we can unlock a socket to respect the lock ordering between
924 * the listener socket and its clients sockets. The lock ordering is first to
925 * acquire the client socket before the listener socket.
926 *
927 * The accept list lock serializes access to the following fields:
928 * - of the listener socket:
929 * - so_comp
930 * - so_incomp
931 * - so_qlen
932 * - so_inqlen
933 * - of client sockets that are in so_comp or so_incomp:
934 * - so_head
935 * - so_list
936 *
937 * As one can see the accept list lock protects the consistent of the
938 * linkage of the client sockets.
939 *
940 * Note that those fields may be read without holding the accept list lock
941 * for a preflight provided the accept list lock is taken when committing
942 * to take an action based on the result of the preflight. The preflight
943 * saves the cost of doing the unlock/lock dance.
944 */
/*
 * Acquire the "accept list lock" of listener 'head' (see the comment
 * above for what it protects).  For protocols without per-socket locks
 * (pr_getlock == NULL) this is a no-op.  If the flag is already held by
 * another thread, this sleeps on &head->so_incomp until it is released;
 * the optional client socket 'so' is unlocked across the wait and then
 * re-locked in the canonical order (client before listener).
 * Caller must hold the listener's socket lock on entry and still holds
 * it on return, with SOF1_ACCEPT_LIST_HELD set.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* No per-socket locks: nothing to serialize against */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the accept list, take it and return */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/* Drop the client lock before sleeping to respect lock ordering */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		/* Woken by so_release_accept_list() via wakeup(&so_incomp) */
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/*
	 * Re-lock in the documented order: client socket first, then
	 * the listener.  This requires briefly dropping the head lock.
	 */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
975
976 void
so_release_accept_list(struct socket * head)977 so_release_accept_list(struct socket *head)
978 {
979 if (head->so_proto->pr_getlock != NULL) {
980 lck_mtx_t *mutex_held;
981
982 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
983 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
984
985 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
986 wakeup((caddr_t)&head->so_incomp);
987 }
988 }
989
/*
 * Drop the last reference to a socket: detach flow-divert, content
 * filter and datagram flow-tracking state, unlink the socket from its
 * listener's incomplete queue (if any), flush the socket buffers and,
 * when 'dealloc' is non-zero, free the socket itself.
 * Caller must hold the socket lock.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready for teardown yet: either the protocol hasn't cleared
	 * its PCB, or a file-descriptor reference remains.  Just quiesce
	 * the select threads and upcalls and return without freeing.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/*
			 * Unlink from the listener's incomplete queue and
			 * fix up both queue counters.
			 */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			/* Has a head but queued on neither list: log it */
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1078
/*
 * Wait for outstanding socket upcalls to drain before close proceeds.
 * Only blocks when the socket has a non-zero upcall use count AND is
 * marked SOF_UPCALLCLOSEWAIT; otherwise returns immediately.  The
 * msleep() drops and re-takes the socket's mutex, so callers must be
 * prepared for the socket state to have changed across this call.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Stop new upcalls from being armed while we wait */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/*
	 * Sleep until woken on &so->so_upcallusecount — presumably by the
	 * path that drops the last upcall reference (waker is outside this
	 * view).
	 */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1107
1108 /*
1109 * Close a socket on last file table reference removal.
1110 * Initiate disconnect if connected.
1111 * Free socket when disconnect complete.
1112 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	/* Tell attached socket filters the socket is closing */
	sflt_notify(so, sock_evt_closing, NULL);

	/* Drain any in-flight upcalls before tearing things down */
	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/* Cancel any in-progress extended background-idle grace period */
	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	/* Listener: abort every queued (incomplete and completed) connection */
	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connection to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq
			 * they will get cleanup by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference taken for list membership
			 * ensures the validity of the socket pointer across
			 * the unlock of the head above.
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		/*
		 * NOTE(review): incomp_overflow_only is set to 1 at 'again'
		 * and never cleared anywhere in this function, so this
		 * branch is unreachable as written — confirm whether the
		 * non-SOF_OVERFLOW path above was meant to clear it.  Also
		 * the panic text says "so_comp" while the check is on
		 * so_incomp.
		 */
		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}

	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket: don't linger, just drop */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
				lck_mtx_t *mutex_held;

				if (so->so_proto->pr_getlock != NULL) {
					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
				} else {
					mutex_held = so->so_proto->pr_domain->dom_mtx;
				}
				/*
				 * The arithmetic below treats so_linger as
				 * 1/100ths of a second (whole seconds via
				 * /100, remainder scaled to 10ms in nsec).
				 */
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	/* Detach the protocol control block; keep the first error seen */
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);

	/* Drop our use count; sofree() frees if this was the last ref */
	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}
1335
1336 int
soclose(struct socket * so)1337 soclose(struct socket *so)
1338 {
1339 int error = 0;
1340 socket_lock(so, 1);
1341
1342 if (so->so_retaincnt == 0) {
1343 error = soclose_locked(so);
1344 } else {
1345 /*
1346 * if the FD is going away, but socket is
1347 * retained in kernel remove its reference
1348 */
1349 so->so_usecount--;
1350 if (so->so_usecount < 2) {
1351 panic("soclose: retaincnt non null and so=%p "
1352 "usecount=%d\n", so, so->so_usecount);
1353 }
1354 }
1355 socket_unlock(so, 1);
1356 return error;
1357 }
1358
1359 /*
1360 * Must be called at splnet...
1361 */
1362 /* Should already be locked */
1363 int
soabort(struct socket * so)1364 soabort(struct socket *so)
1365 {
1366 int error;
1367
1368 #ifdef MORE_LOCKING_DEBUG
1369 lck_mtx_t *mutex_held;
1370
1371 if (so->so_proto->pr_getlock != NULL) {
1372 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1373 } else {
1374 mutex_held = so->so_proto->pr_domain->dom_mtx;
1375 }
1376 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1377 #endif
1378
1379 if ((so->so_flags & SOF_ABORTED) == 0) {
1380 so->so_flags |= SOF_ABORTED;
1381 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1382 if (error) {
1383 sofree(so);
1384 return error;
1385 }
1386 }
1387 return 0;
1388 }
1389
/*
 * Accept a connection on socket 'so', returning the peer address via
 * 'nam' through the protocol's pru_accept.  'dolock' selects whether
 * this routine takes the socket lock itself.  The socket must still be
 * marked SS_NOFDREF (not yet attached to a descriptor) on entry.
 */
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/* Accepting a socket that already has an FD ref is a bug */
	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	/* The socket is about to get a file descriptor */
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1416
/* Convenience wrapper: accept with the socket lock taken internally */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	error = soacceptlock(so, nam, 1);
	return error;
}
1422
/*
 * Run the attach-time socket filters on a newly accepted socket 'so'
 * from listener 'head'.  On filter rejection the socket is closed here
 * and the filter's error (or ECONNABORTED on address failure) is
 * propagated to the caller; on EJUSTRETURN the socket is returned but
 * marked defunct/disconnected internally.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *__single local = NULL, *__single remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Re-mark for close: soclose() expects !SS_NOFDREF */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1478
1479 /*
1480 * Returns: 0 Success
1481 * EOPNOTSUPP Operation not supported on socket
1482 * EISCONN Socket is connected
1483 * <pru_connect>:EADDRNOTAVAIL Address not available.
1484 * <pru_connect>:EINVAL Invalid argument
1485 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1486 * <pru_connect>:EACCES Permission denied
1487 * <pru_connect>:EADDRINUSE Address in use
1488 * <pru_connect>:EAGAIN Resource unavailable, try again
1489 * <pru_connect>:EPERM Operation not permitted
1490 * <sf_connect_out>:??? [anything a filter writer might set]
1491 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic administratively denied for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			/* Attribute the lookup to the effective UUID when delegated */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so,
				    __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
		if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
			// Mark extended timeout on tracker lookup to ensure that the entry stays around
			tracker_metadata_t update_metadata = { };
			update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
			(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata);
		}
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: a filter absorbed the connect; not an error */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				/* A failed connect attempt must not leave us "connecting" */
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1600
/* Convenience wrapper: connect with the socket lock taken internally */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int error;

	error = soconnectlock(so, nam, 1);
	return error;
}
1606
1607 /*
1608 * Returns: 0 Success
1609 * <pru_connect2>:EINVAL[AF_UNIX]
1610 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1611 * <pru_connect2>:??? [other protocol families]
1612 *
1613 * Notes: <pru_connect2> is not supported by [TCP].
1614 */
/*
 * Connect two sockets to each other through <pru_connect2>.
 * so2 is only locked when its protocol uses per-socket locks
 * (pr_lock != NULL); note the unlock order mirrors the lock order
 * (so1 released before so2) rather than being reversed.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1633
/*
 * connectx(2) backend: establish a connection for socket 'so' from
 * 'src' to 'dst', optionally carrying preconnect data from 'auio'
 * (TCP Fast Open style) and idempotent-data flags.  Caller holds the
 * socket lock.  Returns 0 or an errno from the checks, the socket
 * filters, or the protocol's pru_connectx.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic administratively denied for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set. Otherwise, if connected,
	 * try to disconnect first. This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			/* Attribute the lookup to the effective UUID when delegated */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			/* EJUSTRETURN: a filter absorbed the connect; not an error */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data armed */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}

#if NECP
			if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
				// Mark extended timeout on tracker lookup to ensure that the entry stays around
				tracker_metadata_t update_metadata = { };
				update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
				(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata);
			}
#endif /* NECP */
		}
	}

	return error;
}
1770
1771 int
sodisconnectlocked(struct socket * so)1772 sodisconnectlocked(struct socket *so)
1773 {
1774 int error;
1775
1776 if ((so->so_state & SS_ISCONNECTED) == 0) {
1777 error = ENOTCONN;
1778 goto bad;
1779 }
1780 if (so->so_state & SS_ISDISCONNECTING) {
1781 error = EALREADY;
1782 goto bad;
1783 }
1784
1785 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1786 if (error == 0) {
1787 sflt_notify(so, sock_evt_disconnected, NULL);
1788 }
1789
1790 bad:
1791 return error;
1792 }
1793
/* Locking version of sodisconnectlocked() */
int
sodisconnect(struct socket *so)
{
	int result;

	socket_lock(so, 1);
	result = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return result;
}
1805
1806 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1807 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1808 {
1809 int error;
1810
1811 /*
1812 * Call the protocol disconnectx handler; let it handle all
1813 * matters related to the connection state of this session.
1814 */
1815 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1816 if (error == 0) {
1817 /*
1818 * The event applies only for the session, not for
1819 * the disconnection of individual subflows.
1820 */
1821 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1822 sflt_notify(so, sock_evt_disconnected, NULL);
1823 }
1824 }
1825 return error;
1826 }
1827
1828 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1829 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1830 {
1831 int error;
1832
1833 socket_lock(so, 1);
1834 error = sodisconnectxlocked(so, aid, cid);
1835 socket_unlock(so, 1);
1836 return error;
1837 }
1838
/*
 * Map send/recv flags to an sblock() wait mode: don't wait (0) when
 * MSG_DONTWAIT is set, otherwise block (SBL_WAIT).
 */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1840
1841 /*
1842 * sosendcheck will lock the socket buffer if it isn't locked and
1843 * verify that there is space for the data being inserted.
1844 *
1845 * Returns: 0 Success
1846 * EPIPE
1847 * sblock:EWOULDBLOCK
1848 * sblock:EINTR
1849 * sbwait:EBADF
1850 * sbwait:EINTR
1851 * [so_error]:???
1852 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int assumelock = 0;
	int error = 0;
	int32_t space;
	int ret;

restart:
	/* Take the send-buffer lock unless we already hold it */
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	/* Report and clear any asynchronous error on the socket */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Connection-required protocol that isn't connected:
			 * allow only control-only sends while confirming, or
			 * preconnect-data sends; otherwise ENOTCONN.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Datagram send without a destination address */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* MSG_OOB sends are granted a little extra headroom */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	/* Not enough room (or interface feedback wait): block or bail */
	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		/* Release the buffer lock, wait for space, then retry */
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		goto restart;
	}

	/* Enforce protocol memory accounting limits */
	ret = proto_memacct_limited(so->so_proto);
	if (ret == MEMACCT_HARDLIMIT ||
	    (ret == MEMACCT_SOFTLIMIT && so->so_snd.sb_cc > 0)) {
		return ENOMEM;
	}
	return 0;
}
1983
1984 /*
1985 * Send on a socket.
1986 * If send must go all at once and message is larger than
1987 * send buffering, then hard error.
1988 * Lock against other senders.
1989 * If must go all at once and not enough room now, then
1990 * inform user that this would block and do nothing.
1991 * Otherwise, if nonblocking, send as much as possible.
1992 * The data to be sent is described by "uio" if nonzero,
1993 * otherwise by the mbuf chain "top" (which must be null
1994 * if uio is not). Data provided in mbuf chain must be small
1995 * enough to send all at once.
1996 *
1997 * Returns nonzero on error, timeout or signal; callers
1998 * must check for short counts if EINTR/ERESTART are returned.
1999 * Data and control buffers are freed on return.
2000 *
2001 * Returns: 0 Success
2002 * EOPNOTSUPP
2003 * EINVAL
2004 * ENOBUFS
2005 * uiomove:EFAULT
2006 * sosendcheck:EPIPE
2007 * sosendcheck:EWOULDBLOCK
2008 * sosendcheck:EINTR
2009 * sosendcheck:EBADF
2010 * sosendcheck:EINTR
2011 * sosendcheck:??? [value from so_error]
2012 * <pru_send>:ECONNRESET[TCP]
2013 * <pru_send>:EINVAL[TCP]
2014 * <pru_send>:ENOBUFS[TCP]
2015 * <pru_send>:EADDRINUSE[TCP]
2016 * <pru_send>:EADDRNOTAVAIL[TCP]
2017 * <pru_send>:EAFNOSUPPORT[TCP]
2018 * <pru_send>:EACCES[TCP]
2019 * <pru_send>:EAGAIN[TCP]
2020 * <pru_send>:EPERM[TCP]
2021 * <pru_send>:EMSGSIZE[TCP]
2022 * <pru_send>:EHOSTUNREACH[TCP]
2023 * <pru_send>:ENETUNREACH[TCP]
2024 * <pru_send>:ENETDOWN[TCP]
2025 * <pru_send>:ENOMEM[TCP]
2026 * <pru_send>:ENOBUFS[TCP]
2027 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2028 * <pru_send>:EINVAL[AF_UNIX]
2029 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2030 * <pru_send>:EPIPE[AF_UNIX]
2031 * <pru_send>:ENOTCONN[AF_UNIX]
2032 * <pru_send>:EISCONN[AF_UNIX]
2033 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2034 * <sf_data_out>:??? [whatever a filter author chooses]
2035 *
2036 * Notes: Other <pru_send> returns depend on the protocol family; all
2037 * <sf_data_out> returns depend on what the filter author causes
2038 * their filter to return.
2039 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	mbuf_ref_ref_t mp;
	mbuf_ref_t m, freelist = NULL;
	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/*
	 * The residual byte count comes either from the uio (userland
	 * writes) or from the prepackaged chain "top" (kernel callers).
	 */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}
	orig_resid = resid;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/* Datagram flow tracking for content-filter bookkeeping, if needed */
	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, SOFLOW_DIRECTION_OUTBOUND, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful on stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	/* Reserve per-packet headroom only when the sysctl enables it */
	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/*
	 * Outer loop: one pass per protocol send; repeats until all of
	 * resid has been handed to the protocol or an error occurs.
	 */
	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		/* Inner loop: fill mbufs from the uio until space runs out */
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0 &&
				    bigcl;

				/* Drop the lock for the (possibly blocking) allocations */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, then fall back to trying
					 * for mbufs if we fail early (or
					 * miscalculate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid splitting the data in more
					 * than one segment; using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Dequeue the head of the local free list */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf/cluster */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data in; socket is unlocked here */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						/* Filter swallowed the packet */
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						/* Content filter swallowed the packet */
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Hand the chain to the protocol; it consumes top/control */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/* top/control are consumed; don't free them at out_locked */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);


out_locked:
	/* resid growing would indicate corrupted accounting; log or panic */
	if (resid > orig_resid) {
		char pname[MAXCOMLEN] = {};
		pid_t current_pid = proc_pid(current_proc());
		proc_name(current_pid, pname, sizeof(pname));

		if (sosend_assert_panic != 0) {
			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
			    so, resid, orig_resid, pname, current_pid);
		} else {
			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
			    so->so_gencnt, resid, orig_resid, pname, current_pid);
		}
	}

	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2508
/*
 * Re-inject a previously filtered packet chain into the protocol.
 * The socket lock must already be held by the caller.
 */
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				/* Data starts here; leave control chain behind */
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		/* Detach control mbufs from the data portion of the chain */
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	/* Hand the (possibly re-split) chain back to the protocol */
	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());

	return error;
}
2554
2555 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp,struct mbuf ** last_control)2556 mbuf_detach_control_from_list(struct mbuf **mp, struct mbuf **last_control)
2557 {
2558 struct mbuf *control = NULL;
2559 struct mbuf *m = *mp;
2560
2561 if (m->m_type == MT_CONTROL) {
2562 struct mbuf *control_end;
2563 struct mbuf *n;
2564
2565 n = control_end = control = m;
2566
2567 /*
2568 * Break the chain per mbuf type
2569 */
2570 while (n != NULL && n->m_type == MT_CONTROL) {
2571 control_end = n;
2572 n = n->m_next;
2573 }
2574 control_end->m_next = NULL;
2575 *mp = n;
2576 if (last_control != NULL) {
2577 *last_control = control_end;
2578 }
2579 }
2580 VERIFY(*mp != NULL);
2581
2582 return control;
2583 }
2584
/*
 * Supports only connected sockets (no destination address) without
 * ancillary data (control mbufs), for atomic protocols.
 */
2589 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2590 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2591 {
2592 mbuf_ref_t m, control = NULL;
2593 struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2594 int error, dontroute;
2595 int atomic = sosendallatonce(so);
2596 int sblocked = 0;
2597 struct proc *p = current_proc();
2598 struct mbuf *top = pktlist;
2599 bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2600
2601 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2602 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2603
2604 if (so->so_type != SOCK_DGRAM) {
2605 error = EINVAL;
2606 os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2607 error);
2608 goto out;
2609 }
2610 if (atomic == 0) {
2611 error = EINVAL;
2612 os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2613 error);
2614 goto out;
2615 }
2616 if ((so->so_state & SS_ISCONNECTED) == 0) {
2617 error = ENOTCONN;
2618 os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2619 error);
2620 goto out;
2621 }
2622 if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2623 error = EINVAL;
2624 os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2625 flags, error);
2626 goto out;
2627 }
2628
2629 socket_lock(so, 1);
2630 so_update_last_owner_locked(so, p);
2631 so_update_policy(so);
2632
2633 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2634 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, SOFLOW_DIRECTION_OUTBOUND, 0);
2635 }
2636
2637 #if NECP
2638 so_update_necp_policy(so, NULL, NULL);
2639 #endif /* NECP */
2640
2641 dontroute = (flags & MSG_DONTROUTE) &&
2642 (so->so_options & SO_DONTROUTE) == 0 &&
2643 (so->so_proto->pr_flags & PR_ATOMIC);
2644 if (dontroute) {
2645 so->so_options |= SO_DONTROUTE;
2646 }
2647
2648 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2649
2650 error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2651 if (error) {
2652 os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2653 error);
2654 goto release;
2655 }
2656
2657 if (!skip_filt) {
2658 mbuf_ref_ref_t prevnextp = NULL;
2659
2660 for (m = top; m != NULL; m = m->m_nextpkt) {
2661 mbuf_ref_t nextpkt, last_control;
2662
2663 /*
2664 * Remove packet from the list of packets
2665 */
2666 nextpkt = m->m_nextpkt;
2667 if (prevnextp != NULL) {
2668 *prevnextp = nextpkt;
2669 } else {
2670 top = nextpkt;
2671 }
2672 m->m_nextpkt = NULL;
2673
2674 /*
2675 * Break the chain per mbuf type
2676 */
2677 if (m->m_type == MT_CONTROL) {
2678 control = mbuf_detach_control_from_list(&m, &last_control);
2679 }
2680 /*
2681 * Socket filter processing
2682 */
2683 error = sflt_data_out(so, NULL, &m,
2684 &control, 0);
2685 if (error != 0 && error != EJUSTRETURN) {
2686 os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2687 error);
2688 m_freem(m);
2689 goto release;
2690 }
2691
2692 #if CONTENT_FILTER
2693 if (error == 0) {
2694 /*
2695 * Content filter processing
2696 */
2697 error = cfil_sock_data_out(so, NULL, m,
2698 control, 0, dgram_flow_entry);
2699 if (error != 0 && error != EJUSTRETURN) {
2700 os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2701 error);
2702 m_freem(m);
2703 goto release;
2704 }
2705 }
2706 #endif /* CONTENT_FILTER */
2707 if (error == EJUSTRETURN) {
2708 /*
2709 * When swallowed by a filter, the packet is not
2710 * in the list anymore
2711 */
2712 error = 0;
2713 } else {
2714 /*
2715 * Rebuild the mbuf chain of the packet
2716 */
2717 if (control != NULL) {
2718 last_control->m_next = m;
2719 m = control;
2720 }
2721 /*
2722 * Reinsert the packet in the list of packets
2723 */
2724 m->m_nextpkt = nextpkt;
2725 if (prevnextp != NULL) {
2726 *prevnextp = m;
2727 } else {
2728 top = m;
2729 }
2730 prevnextp = &m->m_nextpkt;
2731 }
2732 control = NULL;
2733 }
2734 }
2735
2736 if (top != NULL) {
2737 if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2738 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2739 (so, top, pktcnt, flags);
2740 if (error != 0 && error != ENOBUFS) {
2741 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2742 error);
2743 }
2744 top = NULL;
2745 } else {
2746 *pktcnt = 0;
2747 control = NULL;
2748 for (m = top; m != NULL; m = top) {
2749 top = m->m_nextpkt;
2750 m->m_nextpkt = NULL;
2751
2752 /*
2753 * Break the chain per mbuf type
2754 */
2755 if (m->m_type == MT_CONTROL) {
2756 control = mbuf_detach_control_from_list(&m, NULL);
2757 }
2758
2759 error = (*so->so_proto->pr_usrreqs->pru_send)
2760 (so, 0, m, NULL, control, current_proc());
2761 if (error != 0) {
2762 if (error != ENOBUFS) {
2763 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2764 error);
2765 }
2766 control = NULL;
2767 goto release;
2768 }
2769 *pktcnt += 1;
2770 control = NULL;
2771 }
2772 }
2773 }
2774
2775 release:
2776 if (dontroute) {
2777 so->so_options &= ~SO_DONTROUTE;
2778 }
2779 if (sblocked) {
2780 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2781 } else {
2782 socket_unlock(so, 1);
2783 }
2784 out:
2785 if (control != NULL) {
2786 m_freem(control);
2787 }
2788 if (top != NULL) {
2789 if (error != ENOBUFS) {
2790 os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2791 error);
2792 }
2793 m_freem_list(top);
2794 }
2795
2796 if (dgram_flow_entry != NULL) {
2797 soflow_free_flow(dgram_flow_entry);
2798 }
2799
2800 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2801 so->so_snd.sb_cc, 0, error);
2802
2803 return error;
2804 }
2805
2806 /*
2807 * May return ERESTART when packet is dropped by MAC policy check
2808 */
/*
 * Consume the leading MT_SONAME mbuf of the current record, returning the
 * sender address either as a duplicated sockaddr (*psa) or as the raw mbuf
 * (*maddrp). Updates *mp/*nextrecordp to reflect what was consumed.
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    struct mbuf **maddrp,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list. Upon MAC policy failure, the record
		 * will be freed. Otherwise, we'll add it back to
		 * the head of the list. We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available). We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	if (psa != NULL) {
		/* Caller wants a sockaddr copy; may fail under memory pressure */
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	} else if (maddrp != NULL) {
		/* Caller wants the raw MT_SONAME mbuf itself */
		*maddrp = m;
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf in the socket buffer */
		m = m->m_next;
	} else {
		/* Consuming: unlink the address mbuf from the receive buffer */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		if (maddrp == NULL) {
			MFREE(m, so->so_rcv.sb_mb);
		} else {
			/* Ownership of m moved to *maddrp above; just unlink it */
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
		}
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
2935
2936 /*
2937 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
2938 * so clear the data portion in order not to leak the file pointers
2939 */
2940 static void
sopeek_scm_rights(struct mbuf * rights)2941 sopeek_scm_rights(struct mbuf *rights)
2942 {
2943 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
2944
2945 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
2946 VERIFY(cm->cmsg_len <= rights->m_len);
2947 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
2948 }
2949 }
2950
2951 /*
2952 * Process one or more MT_CONTROL mbufs present before any data mbufs
2953 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2954 * just copy the data; if !MSG_PEEK, we call into the protocol to
2955 * perform externalization.
2956 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	mbuf_ref_t cm = NULL, cmn;
	mbuf_ref_ref_t cme = &cm;
	struct sockbuf *sb_rcv = &so->so_rcv;
	mbuf_ref_ref_t msgpcm = NULL;
	mbuf_ref_t m = *mp;
	mbuf_ref_t nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below. Once we re-acquire the
	 * lock, the mbuf chain might change. In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* Remember head of copied chain for cleanup */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/* Don't leak raw file pointers when peeking SCM_RIGHTS */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Unlink this control mbuf and append it to the cm chain */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	if (!(flags & MSG_PEEK)) {
		/* Restore record linkage now that control mbufs are detached */
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success. Otherwise, all other control messages are
		 * returned unmodified to the caller. Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171. This
			 * would also allow more records to be appended
			 * to the socket buffer. We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed: drop it */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3091
3092 /*
3093 * If we have less data than requested, block awaiting more
3094 * (subject to any timeout) if:
3095 * 1. the current count is less than the low water mark, or
3096 * 2. MSG_WAITALL is set, and it is possible to do the entire
3097 * receive operation at once if we block (resid <= hiwat).
3098 * 3. MSG_DONTWAIT is not set
3099 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3100 * we have to do the receive in sections, and thus risk returning
3101 * a short count if a timeout or signal occurs after we start.
3102 */
3103 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3104 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3105 {
3106 struct protosw *pr = so->so_proto;
3107
3108 /* No mbufs in the receive-queue? Wait! */
3109 if (m == NULL) {
3110 return true;
3111 }
3112
3113 /* Not enough data in the receive socket-buffer - we may have to wait */
3114 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3115 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3116 /*
3117 * Application did set the lowater-mark, so we should wait for
3118 * this data to be present.
3119 */
3120 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3121 return true;
3122 }
3123
3124 /*
3125 * Application wants all the data - so let's try to do the
3126 * receive-operation at once by waiting for everything to
3127 * be there.
3128 */
3129 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3130 return true;
3131 }
3132 }
3133
3134 return false;
3135 }
3136
3137 /*
3138 * Implement receive operations on a socket.
3139 * We depend on the way that records are added to the sockbuf
3140 * by sbappend*. In particular, each record (mbufs linked through m_next)
3141 * must begin with an address if the protocol so specifies,
3142 * followed by an optional mbuf or mbufs containing ancillary data,
3143 * and then zero or more mbufs of data.
3144 * In order to avoid blocking network interrupts for the entire time here,
3145 * we splx() while doing the actual copy to user space.
3146 * Although the sockbuf is locked, new data may still be appended,
3147 * and thus we must maintain consistency of the sockbuf during that time.
3148 *
3149 * The caller may receive the data as a single mbuf chain by supplying
3150 * an mbuf **mp0 for use in returning the chain. The uio is then used
3151 * only for the count in uio_resid.
3152 *
3153 * Returns: 0 Success
3154 * ENOBUFS
3155 * ENOTCONN
3156 * EWOULDBLOCK
3157 * uiomove:EFAULT
3158 * sblock:EWOULDBLOCK
3159 * sblock:EINTR
3160 * sbwait:EBADF
3161 * sbwait:EINTR
3162 * sodelayed_copy:EFAULT
3163 * <pru_rcvoob>:EINVAL[TCP]
3164 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3165 * <pru_rcvoob>:???
3166 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3167 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3168 * <pr_domain->dom_externalize>:???
3169 *
3170 * Notes: Additional return values from calls through <pru_rcvoob> and
3171 * <pr_domain->dom_externalize> depend on protocols other than
3172 * TCP or AF_UNIX, which are documented above.
3173 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;              /* tail pointer of the chain handed back via mp0 */
	mbuf_ref_t ml = NULL;           /* last mbuf linked onto free_list */
	mbuf_ref_t nextrecord, free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;             /* moff: byte offset into current mbuf (MSG_PEEK) */
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;  /* bytes queued on free_list awaiting copyout */
	int can_delay;                  /* ok to defer uiomove until the lock is dropped */
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinitely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/* Copy the OOB data out with the socket unlocked */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* OOB data or a record boundary means we can deliver now */
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns. Therefore, we only sleep in
		 * sbwait() below if and only if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	/* There is data (or an error/EOF) to deliver; sockbuf is still locked */
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above. In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/* Delayed copy is only worthwhile when consuming a sizable request */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		/* Never mix OOB and normal data within one receive call */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		/* Never read across the out-of-band mark */
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints; moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* Entire mbuf consumed (or peeked past) */
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					/* Batch mbufs on free_list for later release */
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error. Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns. Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				/* sbwait interrupted: return short count, no error */
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	/* Atomic protocols: a leftover partial record is truncated or flagged */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP(). Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	/* Flush any remaining delayed-copy bytes before deciding to restart */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/* Nothing was transferred and no terminating condition: try again */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
3929
3930 /*
3931 * Returns: 0 Success
3932 * uiomove:EFAULT
3933 */
3934 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)3935 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3936 user_ssize_t *resid)
3937 {
3938 int error = 0;
3939 struct mbuf *m;
3940
3941 m = *free_list;
3942
3943 socket_unlock(so, 0);
3944
3945 while (m != NULL && error == 0) {
3946 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3947 m = m->m_next;
3948 }
3949 m_freem_list(*free_list);
3950
3951 *free_list = NULL;
3952 *resid = 0;
3953
3954 socket_lock(so, 0);
3955
3956 return error;
3957 }
3958
int
soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;              /* tail pointer of the packet list built in *mp0 */
	mbuf_ref_t nextrecord;
	int flags, error;
	struct protosw *pr = so->so_proto;
	struct proc *p = current_proc();
	u_int npkts = 0;                /* packets delivered so far */
	mbuf_ref_t free_list = NULL;    /* mbufs to release after unlocking */
	int sblocked = 0;               /* whether we hold the sockbuf lock */

	/*
	 * Sanity check on the parameters passed by caller
	 */
	if (mp0 == NULL || pktcntp == NULL) {
		return EINVAL;
	}
	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
		return EINVAL;
	}

	mp = mp0;
	*mp0 = NULL;
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (maddrp != NULL) {
		*maddrp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

	*mp = NULL;

restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* A record boundary means a full datagram can be delivered now */
		for (; m != NULL; m = m->m_next) {
			if (m->m_flags & M_EOR) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error != 0) {
			goto release;
		}
		goto restart;
	}
dontblock:
	/* Reload the head; it may have changed while the lock was dropped */
	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		mbuf_ref_t maddr = NULL;

		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
		    &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}

		if (maddr != NULL) {
			/* Hand the address back or queue it for freeing */
			maddr->m_nextpkt = NULL;
			maddr->m_next = NULL;
			if (maddrp != NULL) {
				*maddrp = maddr;
				maddrp = &maddr->m_nextpkt;
			} else {
				maddr->m_next = free_list;
				free_list = maddr;
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.
	 * We call into the protocol to perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		mbuf_ref_t control = NULL;

		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		if (control != NULL) {
			/* Hand the control data back or queue it for freeing */
			control->m_nextpkt = NULL;
			control->m_next = NULL;
			if (controlp != NULL) {
				*controlp = control;
				controlp = &control->m_nextpkt;
			} else {
				control->m_next = free_list;
				free_list = control;
			}
		}
	}

	/*
	 * Link the packet to the list
	 */
	if (m != NULL) {
		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
		}
		m->m_nextpkt = NULL;
		*mp = m;
		mp = &m->m_nextpkt;
	}
	/* Account for the bytes being removed from the receive buffer */
	while (m != NULL) {
		sbfree(&so->so_rcv, m);

		m = m->m_next;
	}

	so->so_rcv.sb_mb = nextrecord;
	/*
	 * First part is an inline SB_EMPTY_FIXUP(). Second
	 * part makes sure sb_lastrecord is up-to-date if
	 * there is still data in the socket buffer.
	 */
	if (so->so_rcv.sb_mb == NULL) {
		so->so_rcv.sb_mbtail = NULL;
		so->so_rcv.sb_lastrecord = NULL;
	} else if (nextrecord->m_nextpkt == NULL) {
		so->so_rcv.sb_lastrecord = nextrecord;
	}
	SB_MB_CHECK(&so->so_rcv);

	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	npkts += 1;

	/*
	 * We continue as long as we have fewer packets than requested
	 * and the socket buffer is not empty
	 */
	if (npkts < *pktcntp) {
		if (so->so_rcv.sb_mb != NULL) {
			goto dontblock;
		}
		if ((flags & MSG_WAITALL) != 0) {
			goto restart;
		}
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	*pktcntp = npkts;
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4266
4267 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4268 so_statistics_event_to_nstat_event(int64_t *input_options,
4269 uint64_t *nstat_event)
4270 {
4271 int error = 0;
4272 switch (*input_options) {
4273 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4274 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4275 break;
4276 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4277 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4278 break;
4279 #if (DEBUG || DEVELOPMENT)
4280 case SO_STATISTICS_EVENT_RESERVED_1:
4281 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4282 break;
4283 case SO_STATISTICS_EVENT_RESERVED_2:
4284 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4285 break;
4286 #endif /* (DEBUG || DEVELOPMENT) */
4287 default:
4288 error = EINVAL;
4289 break;
4290 }
4291 return error;
4292 }
4293
4294 /*
4295 * Returns: 0 Success
4296 * EINVAL
4297 * ENOTCONN
4298 * <pru_shutdown>:EINVAL
4299 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4300 * <pru_shutdown>:ENOBUFS[TCP]
4301 * <pru_shutdown>:EMSGSIZE[TCP]
4302 * <pru_shutdown>:EHOSTUNREACH[TCP]
4303 * <pru_shutdown>:ENETUNREACH[TCP]
4304 * <pru_shutdown>:ENETDOWN[TCP]
4305 * <pru_shutdown>:ENOMEM[TCP]
4306 * <pru_shutdown>:EACCES[TCP]
4307 * <pru_shutdown>:EMSGSIZE[TCP]
4308 * <pru_shutdown>:ENOBUFS[TCP]
4309 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4310 * <pru_shutdown>:??? [other protocol families]
4311 */
4312 int
soshutdown(struct socket * so,int how)4313 soshutdown(struct socket *so, int how)
4314 {
4315 int error;
4316
4317 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4318
4319 switch (how) {
4320 case SHUT_RD:
4321 case SHUT_WR:
4322 case SHUT_RDWR:
4323 socket_lock(so, 1);
4324 if ((so->so_state &
4325 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4326 error = ENOTCONN;
4327 } else {
4328 error = soshutdownlock(so, how);
4329 }
4330 socket_unlock(so, 1);
4331 break;
4332 default:
4333 error = EINVAL;
4334 break;
4335 }
4336
4337 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4338
4339 return error;
4340 }
4341
/*
 * Perform the actual shutdown with the socket already locked:
 * notify attached socket filters, then shut the read side
 * (SHUT_RD/SHUT_RDWR) and/or the write side (SHUT_WR/SHUT_RDWR).
 * Returns ENOTCONN if the requested side is already shut down,
 * otherwise the protocol's pru_shutdown result (write side only).
 */
int
soshutdownlock_final(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error = 0;

	/* Let socket filters observe (and possibly react to) the shutdown */
	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		/* Read side: SHUT_RD or SHUT_RDWR */
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
			error = ENOTCONN;
			goto done;
		}
		/* Discard any pending receive data */
		sorflush(so);
	}
	if (how != SHUT_RD) {
		/* Write side: SHUT_WR or SHUT_RDWR */
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
			error = ENOTCONN;
			goto done;
		}
		/* Hand the write-side shutdown to the protocol */
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
	}
done:
	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
	return error;
}
4370
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data.  EJUSTRETURN from the filter
	 * means it will complete the shutdown later, so report success
	 * without shutting down now.
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			return 0;
		}
		if (error != 0) {
			return error;
		}
	}
#endif /* CONTENT_FILTER */

	return soshutdownlock_final(so, how);
}
4397
4398 void
sowflush(struct socket * so)4399 sowflush(struct socket *so)
4400 {
4401 struct sockbuf *sb = &so->so_snd;
4402
4403 /*
4404 * Obtain lock on the socket buffer (SB_LOCK). This is required
4405 * to prevent the socket buffer from being unexpectedly altered
4406 * while it is used by another thread in socket send/receive.
4407 *
4408 * sblock() must not fail here, hence the assertion.
4409 */
4410 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4411 VERIFY(sb->sb_flags & SB_LOCK);
4412
4413 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4414 sb->sb_flags |= SB_DROP;
4415 sb->sb_upcall = NULL;
4416 sb->sb_upcallarg = NULL;
4417
4418 sbunlock(sb, TRUE); /* keep socket locked */
4419
4420 selthreadclear(&sb->sb_sel);
4421 sbrelease(sb);
4422 }
4423
/*
 * Flush the receive side of a socket: mark it unable to receive more
 * data, snapshot the receive buffer into a local copy, reset the live
 * buffer, and then dispose of the snapshot (including any in-flight
 * access rights for PR_RIGHTS protocols).  Called with the socket
 * locked; also reached via sofreelastref() (see the "notyet" note).
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/* Let socket filters observe the read-side flush */
	sflt_notify(so, sock_evt_flush_read, NULL);

	/* Mark the socket as unable to receive any more data */
	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function. In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented. Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let protocols with access rights (e.g. AF_UNIX) dispose of them */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	/* Free the snapshotted mbuf chain and buffer accounting */
	sbrelease(&asb);
}
4524
4525 /*
4526 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4527 * an additional variant to handle the case where the option value needs
4528 * to be some kind of integer, but not a specific size.
4529 * In addition to their use here, these functions are also called by the
4530 * protocol-level pr_ctloutput() routines.
4531 *
4532 * Returns: 0 Success
4533 * EINVAL
4534 * copyin:EFAULT
4535 */
4536 int
sooptcopyin(struct sockopt * sopt,void * __sized_by (len)buf,size_t len,size_t minlen)4537 sooptcopyin(struct sockopt *sopt, void *__sized_by(len) buf, size_t len, size_t minlen)
4538 {
4539 size_t valsize;
4540
4541 /*
4542 * If the user gives us more than we wanted, we ignore it,
4543 * but if we don't get the minimum length the caller
4544 * wants, we return EINVAL. On success, sopt->sopt_valsize
4545 * is set to however much we actually retrieved.
4546 */
4547 if ((valsize = sopt->sopt_valsize) < minlen) {
4548 return EINVAL;
4549 }
4550 if (valsize > len) {
4551 sopt->sopt_valsize = valsize = len;
4552 }
4553
4554 if (sopt->sopt_p != kernproc) {
4555 return copyin(sopt->sopt_val, buf, valsize);
4556 }
4557
4558 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4559 CAST_DOWN(caddr_t, sopt->sopt_val),
4560 valsize);
4561 bcopy(tmp, buf, valsize);
4562
4563 return 0;
4564 }
4565
4566 /*
4567 * sooptcopyin_timeval
 * Copy in a timeval value into tv_p, and take into account whether
 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4570 * code here so that we can verify the 64-bit tv_sec value before we lose
4571 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4572 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		/* 64-bit caller: option value is a struct user64_timeval */
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			/* Kernel caller: sopt_val is a kernel pointer */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv64));
			bcopy(tmp, &tv64, sizeof(tv64));
		}
		/*
		 * Validate the 64-bit tv_sec before it is narrowed to
		 * __darwin_time_t below, and reject out-of-range tv_usec.
		 */
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		/* 32-bit caller: option value is a struct user32_timeval */
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			/* Kernel caller: sopt_val is a kernel pointer */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv32));
			bcopy(tmp, &tv32, sizeof(tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
4638
4639 int
sooptcopyin_bindtodevice(struct sockopt * sopt,char * __sized_by (bufsize)buf,size_t bufsize)4640 sooptcopyin_bindtodevice(struct sockopt *sopt, char * __sized_by(bufsize) buf, size_t bufsize)
4641 {
4642 #define MIN_BINDTODEVICE_NAME_SIZE 2
4643 size_t maxlen = bufsize - 1; /* the max string length that fits in the buffer */
4644
4645 if (bufsize < MIN_BINDTODEVICE_NAME_SIZE) {
4646 #if DEBUG || DEVELOPMENT
4647 os_log(OS_LOG_DEFAULT, "%s: bufsize %lu < MIN_BINDTODEVICE_NAME_SIZE %d",
4648 __func__, bufsize, MIN_BINDTODEVICE_NAME_SIZE);
4649 #endif /* DEBUG || DEVELOPMENT */
4650 return EINVAL;
4651 }
4652
4653 memset(buf, 0, bufsize);
4654
4655 /*
4656 * bufsize includes the end-of-string because of the uncertainty wether
4657 * interface names are passed as strings or byte buffers.
4658 * If the user gives us more than the max string length return EINVAL.
4659 * On success, sopt->sopt_valsize is not modified
4660 */
4661 maxlen = bufsize - 1;
4662 if (sopt->sopt_valsize > maxlen) {
4663 os_log(OS_LOG_DEFAULT, "%s: sopt_valsize %lu > maxlen %lu",
4664 __func__, sopt->sopt_valsize, maxlen);
4665 return EINVAL;
4666 }
4667
4668 if (sopt->sopt_p != kernproc) {
4669 return copyin(sopt->sopt_val, buf, sopt->sopt_valsize);
4670 } else {
4671 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4672 CAST_DOWN(caddr_t, sopt->sopt_val),
4673 sopt->sopt_valsize);
4674 bcopy(tmp, buf, sopt->sopt_valsize);
4675 }
4676
4677 return 0;
4678 #undef MIN_BINDTODEVICE_NAME_SIZE
4679 }
4680
4681 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4682 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4683 boolean_t ignore_delegate)
4684 {
4685 kauth_cred_t cred = NULL;
4686 proc_t ep = PROC_NULL;
4687 uid_t uid;
4688 int error = 0;
4689
4690 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4691 ep = proc_find(so->e_pid);
4692 if (ep) {
4693 cred = kauth_cred_proc_ref(ep);
4694 }
4695 }
4696
4697 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4698
4699 /* uid is 0 for root */
4700 if (uid != 0 || !allow_root) {
4701 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4702 }
4703 if (cred) {
4704 kauth_cred_unref(&cred);
4705 }
4706 if (ep != PROC_NULL) {
4707 proc_rele(ep);
4708 }
4709
4710 return error;
4711 }
4712
4713 /*
4714 * Returns: 0 Success
4715 * EINVAL
4716 * ENOPROTOOPT
4717 * ENOBUFS
4718 * EDOM
4719 * sooptcopyin:EINVAL
4720 * sooptcopyin:EFAULT
4721 * sooptcopyin_timeval:EINVAL
4722 * sooptcopyin_timeval:EFAULT
4723 * sooptcopyin_timeval:EDOM
4724 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *		<pr_ctloutput>:???
4726 * sflt_attach_private:??? [whatever a filter author chooses]
4727 * <sf_setoption>:??? [whatever a filter author chooses]
4728 *
4729 * Notes: Other <pru_listen> returns depend on the protocol family; all
4730 * <sf_listen> returns depend on what the filter author causes
4731 * their filter to return.
4732 */
4733 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4734 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4735 {
4736 int error, optval;
4737 int64_t long_optval;
4738 struct linger l;
4739 struct timeval tv;
4740
4741 if (sopt->sopt_dir != SOPT_SET) {
4742 sopt->sopt_dir = SOPT_SET;
4743 }
4744
4745 if (dolock) {
4746 socket_lock(so, 1);
4747 }
4748
4749 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4750 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4751 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4752 /* the socket has been shutdown, no more sockopt's */
4753 error = EINVAL;
4754 goto out;
4755 }
4756
4757 error = sflt_setsockopt(so, sopt);
4758 if (error != 0) {
4759 if (error == EJUSTRETURN) {
4760 error = 0;
4761 }
4762 goto out;
4763 }
4764
4765 if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
4766 if (so->so_proto != NULL &&
4767 so->so_proto->pr_ctloutput != NULL) {
4768 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4769 goto out;
4770 }
4771 error = ENOPROTOOPT;
4772 } else {
4773 /*
4774 * Allow socket-level (SOL_SOCKET) options to be filtered by
4775 * the protocol layer, if needed. A zero value returned from
4776 * the handler means use default socket-level processing as
4777 * done by the rest of this routine. Otherwise, any other
4778 * return value indicates that the option is unsupported.
4779 */
4780 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4781 pru_socheckopt(so, sopt)) != 0) {
4782 goto out;
4783 }
4784
4785 error = 0;
4786 switch (sopt->sopt_name) {
4787 case SO_LINGER:
4788 case SO_LINGER_SEC: {
4789 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4790 if (error != 0) {
4791 goto out;
4792 }
4793 /* Make sure to use sane values */
4794 if (sopt->sopt_name == SO_LINGER) {
4795 so->so_linger = (short)l.l_linger;
4796 } else {
4797 so->so_linger = (short)((long)l.l_linger * hz);
4798 }
4799 if (l.l_onoff != 0) {
4800 so->so_options |= SO_LINGER;
4801 } else {
4802 so->so_options &= ~SO_LINGER;
4803 }
4804 break;
4805 }
4806 case SO_DEBUG:
4807 case SO_KEEPALIVE:
4808 case SO_DONTROUTE:
4809 case SO_USELOOPBACK:
4810 case SO_BROADCAST:
4811 case SO_REUSEADDR:
4812 case SO_REUSEPORT:
4813 case SO_OOBINLINE:
4814 case SO_TIMESTAMP:
4815 case SO_TIMESTAMP_MONOTONIC:
4816 case SO_TIMESTAMP_CONTINUOUS:
4817 case SO_DONTTRUNC:
4818 case SO_WANTMORE:
4819 case SO_WANTOOBFLAG:
4820 case SO_NOWAKEFROMSLEEP:
4821 case SO_NOAPNFALLBK:
4822 error = sooptcopyin(sopt, &optval, sizeof(optval),
4823 sizeof(optval));
4824 if (error != 0) {
4825 goto out;
4826 }
4827 if (optval) {
4828 so->so_options |= sopt->sopt_name;
4829 } else {
4830 so->so_options &= ~sopt->sopt_name;
4831 }
4832 #if SKYWALK
4833 inp_update_netns_flags(so);
4834 #endif /* SKYWALK */
4835 break;
4836
4837 case SO_SNDBUF:
4838 case SO_RCVBUF:
4839 case SO_SNDLOWAT:
4840 case SO_RCVLOWAT:
4841 error = sooptcopyin(sopt, &optval, sizeof(optval),
4842 sizeof(optval));
4843 if (error != 0) {
4844 goto out;
4845 }
4846
4847 /*
4848 * Values < 1 make no sense for any of these
4849 * options, so disallow them.
4850 */
4851 if (optval < 1) {
4852 error = EINVAL;
4853 goto out;
4854 }
4855
4856 switch (sopt->sopt_name) {
4857 case SO_SNDBUF:
4858 case SO_RCVBUF: {
4859 struct sockbuf *sb =
4860 (sopt->sopt_name == SO_SNDBUF) ?
4861 &so->so_snd : &so->so_rcv;
4862 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4863 error = ENOBUFS;
4864 goto out;
4865 }
4866 sb->sb_flags |= SB_USRSIZE;
4867 sb->sb_flags &= ~SB_AUTOSIZE;
4868 sb->sb_idealsize = (u_int32_t)optval;
4869 break;
4870 }
4871 /*
4872 * Make sure the low-water is never greater than
4873 * the high-water.
4874 */
4875 case SO_SNDLOWAT: {
4876 int space = sbspace(&so->so_snd);
4877 uint32_t hiwat = so->so_snd.sb_hiwat;
4878
4879 if (so->so_snd.sb_flags & SB_UNIX) {
4880 struct unpcb *unp =
4881 (struct unpcb *)(so->so_pcb);
4882 if (unp != NULL &&
4883 unp->unp_conn != NULL) {
4884 struct socket *so2 = unp->unp_conn->unp_socket;
4885 hiwat += unp->unp_conn->unp_cc;
4886 space = sbspace(&so2->so_rcv);
4887 }
4888 }
4889
4890 so->so_snd.sb_lowat =
4891 (optval > hiwat) ?
4892 hiwat : optval;
4893
4894 if (space >= so->so_snd.sb_lowat) {
4895 sowwakeup(so);
4896 }
4897 break;
4898 }
4899 case SO_RCVLOWAT: {
4900 int64_t data_len;
4901 so->so_rcv.sb_lowat =
4902 (optval > so->so_rcv.sb_hiwat) ?
4903 so->so_rcv.sb_hiwat : optval;
4904 if (so->so_rcv.sb_flags & SB_UNIX) {
4905 struct unpcb *unp =
4906 (struct unpcb *)(so->so_pcb);
4907 if (unp != NULL &&
4908 unp->unp_conn != NULL) {
4909 struct socket *so2 = unp->unp_conn->unp_socket;
4910 data_len = so2->so_snd.sb_cc
4911 - so2->so_snd.sb_ctl;
4912 } else {
4913 data_len = so->so_rcv.sb_cc
4914 - so->so_rcv.sb_ctl;
4915 }
4916 } else {
4917 data_len = so->so_rcv.sb_cc
4918 - so->so_rcv.sb_ctl;
4919 }
4920
4921 if (data_len >= so->so_rcv.sb_lowat) {
4922 sorwakeup(so);
4923 }
4924 break;
4925 }
4926 }
4927 break;
4928
4929 case SO_SNDTIMEO:
4930 case SO_RCVTIMEO:
4931 error = sooptcopyin_timeval(sopt, &tv);
4932 if (error != 0) {
4933 goto out;
4934 }
4935
4936 switch (sopt->sopt_name) {
4937 case SO_SNDTIMEO:
4938 so->so_snd.sb_timeo = tv;
4939 break;
4940 case SO_RCVTIMEO:
4941 so->so_rcv.sb_timeo = tv;
4942 break;
4943 }
4944 break;
4945
4946 case SO_NKE: {
4947 struct so_nke nke;
4948
4949 error = sooptcopyin(sopt, &nke, sizeof(nke),
4950 sizeof(nke));
4951 if (error != 0) {
4952 goto out;
4953 }
4954
4955 error = sflt_attach_internal(so, nke.nke_handle);
4956 break;
4957 }
4958
4959 case SO_NOSIGPIPE:
4960 error = sooptcopyin(sopt, &optval, sizeof(optval),
4961 sizeof(optval));
4962 if (error != 0) {
4963 goto out;
4964 }
4965 if (optval != 0) {
4966 so->so_flags |= SOF_NOSIGPIPE;
4967 } else {
4968 so->so_flags &= ~SOF_NOSIGPIPE;
4969 }
4970 break;
4971
4972 case SO_NOADDRERR:
4973 error = sooptcopyin(sopt, &optval, sizeof(optval),
4974 sizeof(optval));
4975 if (error != 0) {
4976 goto out;
4977 }
4978 if (optval != 0) {
4979 so->so_flags |= SOF_NOADDRAVAIL;
4980 } else {
4981 so->so_flags &= ~SOF_NOADDRAVAIL;
4982 }
4983 break;
4984
4985 case SO_REUSESHAREUID:
4986 error = sooptcopyin(sopt, &optval, sizeof(optval),
4987 sizeof(optval));
4988 if (error != 0) {
4989 goto out;
4990 }
4991 if (optval != 0) {
4992 so->so_flags |= SOF_REUSESHAREUID;
4993 } else {
4994 so->so_flags &= ~SOF_REUSESHAREUID;
4995 }
4996 break;
4997
4998 case SO_NOTIFYCONFLICT:
4999 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5000 error = EPERM;
5001 goto out;
5002 }
5003 error = sooptcopyin(sopt, &optval, sizeof(optval),
5004 sizeof(optval));
5005 if (error != 0) {
5006 goto out;
5007 }
5008 if (optval != 0) {
5009 so->so_flags |= SOF_NOTIFYCONFLICT;
5010 } else {
5011 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5012 }
5013 break;
5014
5015 case SO_RESTRICTIONS:
5016 error = sooptcopyin(sopt, &optval, sizeof(optval),
5017 sizeof(optval));
5018 if (error != 0) {
5019 goto out;
5020 }
5021
5022 error = so_set_restrictions(so, optval);
5023 break;
5024
5025 case SO_AWDL_UNRESTRICTED:
5026 if (SOCK_DOM(so) != PF_INET &&
5027 SOCK_DOM(so) != PF_INET6) {
5028 error = EOPNOTSUPP;
5029 goto out;
5030 }
5031 error = sooptcopyin(sopt, &optval, sizeof(optval),
5032 sizeof(optval));
5033 if (error != 0) {
5034 goto out;
5035 }
5036 if (optval != 0) {
5037 error = soopt_cred_check(so,
5038 PRIV_NET_RESTRICTED_AWDL, false, false);
5039 if (error == 0) {
5040 inp_set_awdl_unrestricted(
5041 sotoinpcb(so));
5042 }
5043 } else {
5044 inp_clear_awdl_unrestricted(sotoinpcb(so));
5045 }
5046 break;
5047 case SO_INTCOPROC_ALLOW:
5048 if (SOCK_DOM(so) != PF_INET6) {
5049 error = EOPNOTSUPP;
5050 goto out;
5051 }
5052 error = sooptcopyin(sopt, &optval, sizeof(optval),
5053 sizeof(optval));
5054 if (error != 0) {
5055 goto out;
5056 }
5057 if (optval != 0 &&
5058 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5059 error = soopt_cred_check(so,
5060 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5061 if (error == 0) {
5062 inp_set_intcoproc_allowed(
5063 sotoinpcb(so));
5064 }
5065 } else if (optval == 0) {
5066 inp_clear_intcoproc_allowed(sotoinpcb(so));
5067 }
5068 break;
5069
5070 case SO_LABEL:
5071 error = EOPNOTSUPP;
5072 break;
5073
5074 case SO_UPCALLCLOSEWAIT:
5075 error = sooptcopyin(sopt, &optval, sizeof(optval),
5076 sizeof(optval));
5077 if (error != 0) {
5078 goto out;
5079 }
5080 if (optval != 0) {
5081 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5082 } else {
5083 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5084 }
5085 break;
5086
5087 case SO_RANDOMPORT:
5088 error = sooptcopyin(sopt, &optval, sizeof(optval),
5089 sizeof(optval));
5090 if (error != 0) {
5091 goto out;
5092 }
5093 if (optval != 0) {
5094 so->so_flags |= SOF_BINDRANDOMPORT;
5095 } else {
5096 so->so_flags &= ~SOF_BINDRANDOMPORT;
5097 }
5098 break;
5099
5100 case SO_NP_EXTENSIONS: {
5101 struct so_np_extensions sonpx;
5102
5103 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5104 sizeof(sonpx));
5105 if (error != 0) {
5106 goto out;
5107 }
5108 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5109 error = EINVAL;
5110 goto out;
5111 }
5112 /*
5113 * Only one bit defined for now
5114 */
5115 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5116 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5117 so->so_flags |= SOF_NPX_SETOPTSHUT;
5118 } else {
5119 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5120 }
5121 }
5122 break;
5123 }
5124
5125 case SO_TRAFFIC_CLASS: {
5126 error = sooptcopyin(sopt, &optval, sizeof(optval),
5127 sizeof(optval));
5128 if (error != 0) {
5129 goto out;
5130 }
5131 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5132 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5133 error = so_set_net_service_type(so, netsvc);
5134 goto out;
5135 }
5136 error = so_set_traffic_class(so, optval);
5137 if (error != 0) {
5138 goto out;
5139 }
5140 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5141 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5142 break;
5143 }
5144
5145 case SO_RECV_TRAFFIC_CLASS: {
5146 error = sooptcopyin(sopt, &optval, sizeof(optval),
5147 sizeof(optval));
5148 if (error != 0) {
5149 goto out;
5150 }
5151 if (optval == 0) {
5152 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5153 } else {
5154 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5155 }
5156 break;
5157 }
5158
5159 #if (DEVELOPMENT || DEBUG)
5160 case SO_TRAFFIC_CLASS_DBG: {
5161 struct so_tcdbg so_tcdbg;
5162
5163 error = sooptcopyin(sopt, &so_tcdbg,
5164 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5165 if (error != 0) {
5166 goto out;
5167 }
5168 error = so_set_tcdbg(so, &so_tcdbg);
5169 if (error != 0) {
5170 goto out;
5171 }
5172 break;
5173 }
5174 #endif /* (DEVELOPMENT || DEBUG) */
5175
5176 case SO_PRIVILEGED_TRAFFIC_CLASS:
5177 error = priv_check_cred(kauth_cred_get(),
5178 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5179 if (error != 0) {
5180 goto out;
5181 }
5182 error = sooptcopyin(sopt, &optval, sizeof(optval),
5183 sizeof(optval));
5184 if (error != 0) {
5185 goto out;
5186 }
5187 if (optval == 0) {
5188 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5189 } else {
5190 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5191 }
5192 break;
5193
5194 #if (DEVELOPMENT || DEBUG)
5195 case SO_DEFUNCTIT:
5196 error = sosetdefunct(current_proc(), so, 0, FALSE);
5197 if (error == 0) {
5198 error = sodefunct(current_proc(), so, 0);
5199 }
5200
5201 break;
5202 #endif /* (DEVELOPMENT || DEBUG) */
5203
5204 case SO_DEFUNCTOK:
5205 error = sooptcopyin(sopt, &optval, sizeof(optval),
5206 sizeof(optval));
5207 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5208 if (error == 0) {
5209 error = EBADF;
5210 }
5211 goto out;
5212 }
5213 /*
5214 * Any process can set SO_DEFUNCTOK (clear
5215 * SOF_NODEFUNCT), but only root can clear
5216 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5217 */
5218 if (optval == 0 &&
5219 kauth_cred_issuser(kauth_cred_get()) == 0) {
5220 error = EPERM;
5221 goto out;
5222 }
5223 if (optval) {
5224 so->so_flags &= ~SOF_NODEFUNCT;
5225 } else {
5226 so->so_flags |= SOF_NODEFUNCT;
5227 }
5228
5229 if (SOCK_DOM(so) == PF_INET ||
5230 SOCK_DOM(so) == PF_INET6) {
5231 char s[MAX_IPv6_STR_LEN];
5232 char d[MAX_IPv6_STR_LEN];
5233 struct inpcb *inp = sotoinpcb(so);
5234
5235 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5236 "[%s %s:%d -> %s:%d] is now marked "
5237 "as %seligible for "
5238 "defunct\n", __func__, proc_selfpid(),
5239 proc_best_name(current_proc()),
5240 so->so_gencnt,
5241 (SOCK_TYPE(so) == SOCK_STREAM) ?
5242 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5243 ((SOCK_DOM(so) == PF_INET) ?
5244 (void *)&inp->inp_laddr.s_addr :
5245 (void *)&inp->in6p_laddr), s, sizeof(s)),
5246 ntohs(inp->in6p_lport),
5247 inet_ntop(SOCK_DOM(so),
5248 (SOCK_DOM(so) == PF_INET) ?
5249 (void *)&inp->inp_faddr.s_addr :
5250 (void *)&inp->in6p_faddr, d, sizeof(d)),
5251 ntohs(inp->in6p_fport),
5252 (so->so_flags & SOF_NODEFUNCT) ?
5253 "not " : "");
5254 } else {
5255 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5256 "is now marked as %seligible for "
5257 "defunct\n",
5258 __func__, proc_selfpid(),
5259 proc_best_name(current_proc()),
5260 so->so_gencnt,
5261 SOCK_DOM(so), SOCK_TYPE(so),
5262 (so->so_flags & SOF_NODEFUNCT) ?
5263 "not " : "");
5264 }
5265 break;
5266
5267 case SO_ISDEFUNCT:
5268 /* This option is not settable */
5269 error = EINVAL;
5270 break;
5271
5272 case SO_OPPORTUNISTIC:
5273 error = sooptcopyin(sopt, &optval, sizeof(optval),
5274 sizeof(optval));
5275 if (error == 0) {
5276 error = so_set_opportunistic(so, optval);
5277 }
5278 break;
5279
5280 case SO_FLUSH:
5281 /* This option is handled by lower layer(s) */
5282 error = 0;
5283 break;
5284
5285 case SO_RECV_ANYIF:
5286 error = sooptcopyin(sopt, &optval, sizeof(optval),
5287 sizeof(optval));
5288 if (error == 0) {
5289 error = so_set_recv_anyif(so, optval);
5290 }
5291 break;
5292
5293 case SO_TRAFFIC_MGT_BACKGROUND: {
5294 /* This option is handled by lower layer(s) */
5295 error = 0;
5296 break;
5297 }
5298
5299 #if FLOW_DIVERT
5300 case SO_FLOW_DIVERT_TOKEN:
5301 error = flow_divert_token_set(so, sopt);
5302 break;
5303 #endif /* FLOW_DIVERT */
5304
5305
5306 case SO_DELEGATED:
5307 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5308 sizeof(optval))) != 0) {
5309 break;
5310 }
5311
5312 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5313 break;
5314
5315 case SO_DELEGATED_UUID: {
5316 uuid_t euuid;
5317
5318 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5319 sizeof(euuid))) != 0) {
5320 break;
5321 }
5322
5323 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5324 break;
5325 }
5326
5327 #if NECP
5328 case SO_NECP_ATTRIBUTES:
5329 if (SOCK_DOM(so) == PF_MULTIPATH) {
5330 /* Handled by MPTCP itself */
5331 break;
5332 }
5333
5334 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5335 error = EINVAL;
5336 goto out;
5337 }
5338
5339 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5340 break;
5341
5342 case SO_NECP_CLIENTUUID: {
5343 if (SOCK_DOM(so) == PF_MULTIPATH) {
5344 /* Handled by MPTCP itself */
5345 break;
5346 }
5347
5348 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5349 error = EINVAL;
5350 goto out;
5351 }
5352
5353 struct inpcb *inp = sotoinpcb(so);
5354 if (!uuid_is_null(inp->necp_client_uuid)) {
5355 // Clear out the old client UUID if present
5356 necp_inpcb_remove_cb(inp);
5357 }
5358
5359 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5360 sizeof(uuid_t), sizeof(uuid_t));
5361 if (error != 0) {
5362 goto out;
5363 }
5364
5365 if (uuid_is_null(inp->necp_client_uuid)) {
5366 error = EINVAL;
5367 goto out;
5368 }
5369
5370 pid_t current_pid = proc_pid(current_proc());
5371 error = necp_client_register_socket_flow(current_pid,
5372 inp->necp_client_uuid, inp);
5373 if (error != 0) {
5374 uuid_clear(inp->necp_client_uuid);
5375 goto out;
5376 }
5377
5378 if (inp->inp_lport != 0) {
5379 // There is a bound local port, so this is not
5380 // a fresh socket. Assign to the client.
5381 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5382 }
5383
5384 break;
5385 }
5386 case SO_NECP_LISTENUUID: {
5387 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5388 error = EINVAL;
5389 goto out;
5390 }
5391
5392 struct inpcb *inp = sotoinpcb(so);
5393 if (!uuid_is_null(inp->necp_client_uuid)) {
5394 error = EINVAL;
5395 goto out;
5396 }
5397
5398 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5399 sizeof(uuid_t), sizeof(uuid_t));
5400 if (error != 0) {
5401 goto out;
5402 }
5403
5404 if (uuid_is_null(inp->necp_client_uuid)) {
5405 error = EINVAL;
5406 goto out;
5407 }
5408
5409 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5410 inp->necp_client_uuid, inp);
5411 if (error != 0) {
5412 uuid_clear(inp->necp_client_uuid);
5413 goto out;
5414 }
5415
5416 // Mark that the port registration is held by NECP
5417 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5418
5419 break;
5420 }
5421
5422 case SO_RESOLVER_SIGNATURE: {
5423 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5424 error = EINVAL;
5425 goto out;
5426 }
5427 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5428 break;
5429 }
5430 #endif /* NECP */
5431
5432 case SO_EXTENDED_BK_IDLE:
5433 error = sooptcopyin(sopt, &optval, sizeof(optval),
5434 sizeof(optval));
5435 if (error == 0) {
5436 error = so_set_extended_bk_idle(so, optval);
5437 }
5438 break;
5439
5440 case SO_MARK_CELLFALLBACK:
5441 error = sooptcopyin(sopt, &optval, sizeof(optval),
5442 sizeof(optval));
5443 if (error != 0) {
5444 goto out;
5445 }
5446 if (optval < 0) {
5447 error = EINVAL;
5448 goto out;
5449 }
5450 if (optval == 0) {
5451 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5452 } else {
5453 so->so_flags1 |= SOF1_CELLFALLBACK;
5454 }
5455 break;
5456
5457 case SO_MARK_CELLFALLBACK_UUID:
5458 {
5459 struct so_mark_cellfallback_uuid_args args;
5460
5461 error = sooptcopyin(sopt, &args, sizeof(args),
5462 sizeof(args));
5463 if (error != 0) {
5464 goto out;
5465 }
5466 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5467 args.flow_cellfallback);
5468 break;
5469 }
5470
5471 case SO_FALLBACK_MODE:
5472 error = sooptcopyin(sopt, &optval, sizeof(optval),
5473 sizeof(optval));
5474 if (error != 0) {
5475 goto out;
5476 }
5477 if (optval < SO_FALLBACK_MODE_NONE ||
5478 optval > SO_FALLBACK_MODE_PREFER) {
5479 error = EINVAL;
5480 goto out;
5481 }
5482 so->so_fallback_mode = (u_int8_t)optval;
5483 break;
5484
5485 case SO_MARK_KNOWN_TRACKER: {
5486 error = sooptcopyin(sopt, &optval, sizeof(optval),
5487 sizeof(optval));
5488 if (error != 0) {
5489 goto out;
5490 }
5491 if (optval < 0) {
5492 error = EINVAL;
5493 goto out;
5494 }
5495 if (optval == 0) {
5496 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5497 } else {
5498 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5499 }
5500 break;
5501 }
5502
5503 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5504 error = sooptcopyin(sopt, &optval, sizeof(optval),
5505 sizeof(optval));
5506 if (error != 0) {
5507 goto out;
5508 }
5509 if (optval < 0) {
5510 error = EINVAL;
5511 goto out;
5512 }
5513 if (optval == 0) {
5514 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5515 } else {
5516 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5517 }
5518 break;
5519 }
5520
5521 case SO_MARK_APPROVED_APP_DOMAIN: {
5522 error = sooptcopyin(sopt, &optval, sizeof(optval),
5523 sizeof(optval));
5524 if (error != 0) {
5525 goto out;
5526 }
5527 if (optval < 0) {
5528 error = EINVAL;
5529 goto out;
5530 }
5531 if (optval == 0) {
5532 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5533 } else {
5534 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5535 }
5536 break;
5537 }
5538
5539 case SO_STATISTICS_EVENT:
5540 error = sooptcopyin(sopt, &long_optval,
5541 sizeof(long_optval), sizeof(long_optval));
5542 if (error != 0) {
5543 goto out;
5544 }
5545 u_int64_t nstat_event = 0;
5546 error = so_statistics_event_to_nstat_event(
5547 &long_optval, &nstat_event);
5548 if (error != 0) {
5549 goto out;
5550 }
5551 nstat_pcb_event(sotoinpcb(so), nstat_event);
5552 break;
5553
5554 case SO_NET_SERVICE_TYPE: {
5555 error = sooptcopyin(sopt, &optval, sizeof(optval),
5556 sizeof(optval));
5557 if (error != 0) {
5558 goto out;
5559 }
5560 error = so_set_net_service_type(so, optval);
5561 break;
5562 }
5563
5564 case SO_QOSMARKING_POLICY_OVERRIDE:
5565 error = priv_check_cred(kauth_cred_get(),
5566 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5567 if (error != 0) {
5568 goto out;
5569 }
5570 error = sooptcopyin(sopt, &optval, sizeof(optval),
5571 sizeof(optval));
5572 if (error != 0) {
5573 goto out;
5574 }
5575 if (optval == 0) {
5576 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5577 } else {
5578 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5579 }
5580 break;
5581
5582 case SO_MPKL_SEND_INFO: {
5583 struct so_mpkl_send_info so_mpkl_send_info;
5584
5585 error = sooptcopyin(sopt, &so_mpkl_send_info,
5586 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5587 if (error != 0) {
5588 goto out;
5589 }
5590 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5591 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5592
5593 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5594 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5595 } else {
5596 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5597 }
5598 break;
5599 }
5600 case SO_WANT_KEV_SOCKET_CLOSED: {
5601 error = sooptcopyin(sopt, &optval, sizeof(optval),
5602 sizeof(optval));
5603 if (error != 0) {
5604 goto out;
5605 }
5606 if (optval == 0) {
5607 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5608 } else {
5609 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5610 }
5611 break;
5612 }
5613 case SO_MARK_WAKE_PKT: {
5614 error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 sizeof(optval));
5616 if (error != 0) {
5617 goto out;
5618 }
5619 if (optval == 0) {
5620 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5621 } else {
5622 so->so_flags |= SOF_MARK_WAKE_PKT;
5623 }
5624 break;
5625 }
5626 case SO_RECV_WAKE_PKT: {
5627 error = sooptcopyin(sopt, &optval, sizeof(optval),
5628 sizeof(optval));
5629 if (error != 0) {
5630 goto out;
5631 }
5632 if (optval == 0) {
5633 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5634 } else {
5635 so->so_flags |= SOF_RECV_WAKE_PKT;
5636 }
5637 break;
5638 }
5639 case SO_APPLICATION_ID: {
5640 so_application_id_t application_id = { 0 };
5641
5642 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5643 error = EINVAL;
5644 goto out;
5645 }
5646 error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5647 sizeof(application_id));
5648 if (error != 0) {
5649 goto out;
5650 }
5651
5652 // The user needs to match
5653 if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5654 error = EINVAL;
5655 printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5656 goto out;
5657 }
5658 error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5659 if (error != 0) {
5660 printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5661 goto out;
5662 }
5663 if (application_id.persona_id != PERSONA_ID_NONE) {
5664 so->so_persona_id = application_id.persona_id;
5665 }
5666 break;
5667 }
5668 case SO_MARK_DOMAIN_INFO_SILENT:
5669 error = sooptcopyin(sopt, &optval, sizeof(optval),
5670 sizeof(optval));
5671 if (error != 0) {
5672 goto out;
5673 }
5674 if (optval < 0) {
5675 error = EINVAL;
5676 goto out;
5677 }
5678 if (optval == 0) {
5679 so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5680 } else {
5681 so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5682 }
5683 break;
5684 case SO_MAX_PACING_RATE: {
5685 uint64_t pacingrate;
5686
5687 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5688 error = EINVAL;
5689 goto out;
5690 }
5691
5692 error = sooptcopyin(sopt, &pacingrate,
5693 sizeof(pacingrate), sizeof(pacingrate));
5694 if (error != 0) {
5695 goto out;
5696 }
5697
5698 if (pacingrate == 0) {
5699 error = EINVAL;
5700 goto out;
5701 }
5702 sotoinpcb(so)->inp_max_pacing_rate = pacingrate;
5703 break;
5704 }
5705 case SO_CONNECTION_IDLE: {
5706 int is_idle;
5707
5708 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5709 error = EINVAL;
5710 goto out;
5711 }
5712
5713 error = sooptcopyin(sopt, &is_idle,
5714 sizeof(is_idle), sizeof(is_idle));
5715 if (error != 0) {
5716 goto out;
5717 }
5718
5719 if (is_idle != 0) {
5720 sotoinpcb(so)->inp_flags2 |= INP2_CONNECTION_IDLE;
5721 } else {
5722 sotoinpcb(so)->inp_flags2 &= ~INP2_CONNECTION_IDLE;
5723 }
5724 break;
5725 }
5726 default:
5727 error = ENOPROTOOPT;
5728 break;
5729 }
5730 if (error == 0 && so->so_proto != NULL &&
5731 so->so_proto->pr_ctloutput != NULL) {
5732 (void) so->so_proto->pr_ctloutput(so, sopt);
5733 }
5734 }
5735 out:
5736 if (dolock) {
5737 socket_unlock(so, 1);
5738 }
5739 return error;
5740 }
5741
5742 /* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *__sized_by(len) buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Copy an option result from the kernel buffer `buf' (of `len'
	 * bytes) out to the destination described by `sopt'.
	 *
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for them.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	/* Truncate to the smaller of what we have and what the caller asked for. */
	valsize = MIN(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_valsize != 0 && sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc) {
			/* user-space destination: copy across the boundary */
			error = copyout(buf, sopt->sopt_val, valsize);
		} else {
			/*
			 * Kernel-resident sockopt: sopt_val is really a
			 * kernel pointer; forge an indexable pointer for
			 * -fbounds-safety and copy directly.
			 */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    valsize);
			bcopy(buf, tmp, valsize);
		}
	}
	return error;
}
5774
5775 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5776 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5777 {
5778 int error;
5779 size_t len;
5780 struct user64_timeval tv64 = {};
5781 struct user32_timeval tv32 = {};
5782 const void * val;
5783 size_t valsize;
5784
5785 error = 0;
5786 if (proc_is64bit(sopt->sopt_p)) {
5787 len = sizeof(tv64);
5788 tv64.tv_sec = tv_p->tv_sec;
5789 tv64.tv_usec = tv_p->tv_usec;
5790 val = &tv64;
5791 } else {
5792 len = sizeof(tv32);
5793 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5794 tv32.tv_usec = tv_p->tv_usec;
5795 val = &tv32;
5796 }
5797 valsize = MIN(len, sopt->sopt_valsize);
5798 sopt->sopt_valsize = valsize;
5799 if (sopt->sopt_val != USER_ADDR_NULL) {
5800 if (sopt->sopt_p != kernproc) {
5801 error = copyout(val, sopt->sopt_val, valsize);
5802 } else {
5803 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5804 CAST_DOWN(caddr_t, sopt->sopt_val),
5805 valsize);
5806 bcopy(val, tmp, valsize);
5807 }
5808 }
5809 return error;
5810 }
5811
/*
 * sogetoptlock: kernel-side implementation of getsockopt(2).
 *
 * Parameters:
 *	so	socket whose option is being queried
 *	sopt	option descriptor; sopt_name selects the option and results
 *		are written back with sooptcopyout()
 *	dolock	when non-zero, acquire/release the socket lock around the
 *		operation (callers that already hold the lock pass 0)
 *
 * Return: 0 Success
 * ENOPROTOOPT
 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 * <pr_ctloutput>:???
 * <sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Give socket filters first crack; EJUSTRETURN means "handled". */
	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
		/* Non-socket-level options are delegated to the protocol. */
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed. A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine. Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			/* SO_LINGER reports ticks, SO_LINGER_SEC seconds */
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		/*
		 * Boolean options stored directly in so_options; the option
		 * name doubles as the bit mask.
		 */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			optval = so->so_options & sopt->sopt_name;
/* Common exit for all cases that return a single int in `optval'. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				/*
				 * Datagram-style socket: sum the data bytes
				 * of the queued records, skipping control.
				 */
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				/* count queued records via the m_nextpkt chain */
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				/* only meaningful for record-oriented sockets */
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			/* reading the pending error clears it */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			/*
			 * For connected UNIX-domain sockets the effective send
			 * space also includes the peer's receive buffer.
			 */
			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;


		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			/* only meaningful for IPv4/IPv6 sockets */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_PEERLABEL:
			error = EOPNOTSUPP;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			/* note the inversion: SOF_NODEFUNCT set => not OK */
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif  /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			/* the client UUID lives in a different pcb per domain */
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			/* only valid when the port registration is held by NECP */
			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}

		case SO_RESOLVER_SIGNATURE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
			break;
		}

#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif  /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;
		case SO_MARK_CELLFALLBACK:
			/* normalize flag bit to 0/1 */
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_FALLBACK_MODE:
			optval = so->so_fallback_mode;
			goto integer;
		case SO_MARK_KNOWN_TRACKER: {
			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_APPROVED_APP_DOMAIN: {
			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_NET_SERVICE_TYPE: {
			/* default to best-effort unless explicitly set */
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}
		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}
		case SO_MARK_WAKE_PKT:
			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
			goto integer;
		case SO_RECV_WAKE_PKT:
			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
			goto integer;
		case SO_APPLICATION_ID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			/* report uid, effective UUID (or last UUID) and persona */
			so_application_id_t application_id = { 0 };
			application_id.uid = kauth_cred_getuid(so->so_cred);
			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
			application_id.persona_id = so->so_persona_id;
			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
			break;
		}
		case SO_MARK_DOMAIN_INFO_SILENT:
			optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_MAX_PACING_RATE: {
			uint64_t pacingrate;

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			pacingrate = sotoinpcb(so)->inp_max_pacing_rate;

			error = sooptcopyout(sopt, &pacingrate, sizeof(pacingrate));
			break;
		}
		case SO_CONNECTION_IDLE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			optval = sotoinpcb(so)->inp_flags2 & INP2_CONNECTION_IDLE ?
			    1 : 0;
			goto integer;
		}
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
6274
6275 /*
6276 * The size limits on our soopt_getm is different from that on FreeBSD.
6277 * We limit the size of options to MCLBYTES. This will have to change
6278 * if we need to define options that need more space than MCLBYTES.
6279 */
6280 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6281 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6282 {
6283 struct mbuf *m, *m_prev;
6284 int sopt_size = (int)sopt->sopt_valsize;
6285 int how;
6286
6287 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6288 return EMSGSIZE;
6289 }
6290
6291 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6292 MGET(m, how, MT_DATA);
6293 if (m == NULL) {
6294 return ENOBUFS;
6295 }
6296 if (sopt_size > MLEN) {
6297 MCLGET(m, how);
6298 if ((m->m_flags & M_EXT) == 0) {
6299 m_free(m);
6300 return ENOBUFS;
6301 }
6302 m->m_len = min(MCLBYTES, sopt_size);
6303 } else {
6304 m->m_len = min(MLEN, sopt_size);
6305 }
6306 sopt_size -= m->m_len;
6307 *mp = m;
6308 m_prev = m;
6309
6310 while (sopt_size > 0) {
6311 MGET(m, how, MT_DATA);
6312 if (m == NULL) {
6313 m_freem(*mp);
6314 return ENOBUFS;
6315 }
6316 if (sopt_size > MLEN) {
6317 MCLGET(m, how);
6318 if ((m->m_flags & M_EXT) == 0) {
6319 m_freem(*mp);
6320 m_freem(m);
6321 return ENOBUFS;
6322 }
6323 m->m_len = min(MCLBYTES, sopt_size);
6324 } else {
6325 m->m_len = min(MLEN, sopt_size);
6326 }
6327 sopt_size -= m->m_len;
6328 m_prev->m_next = m;
6329 m_prev = m;
6330 }
6331 return 0;
6332 }
6333
6334 /* copyin sopt data into mbuf chain */
6335 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6336 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6337 {
6338 struct mbuf *m0 = m;
6339
6340 if (sopt->sopt_val == USER_ADDR_NULL) {
6341 return 0;
6342 }
6343 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6344 if (sopt->sopt_p != kernproc) {
6345 int error;
6346
6347 error = copyin(sopt->sopt_val, mtod(m, char *),
6348 m->m_len);
6349 if (error != 0) {
6350 m_freem(m0);
6351 return error;
6352 }
6353 } else {
6354 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6355 CAST_DOWN(caddr_t, sopt->sopt_val),
6356 m->m_len);
6357 bcopy(tmp, mtod(m, char *), m->m_len);
6358 }
6359 sopt->sopt_valsize -= m->m_len;
6360 sopt->sopt_val += m->m_len;
6361 m = m->m_next;
6362 }
6363 /* should be allocated enoughly at ip6_sooptmcopyin() */
6364 if (m != NULL) {
6365 panic("soopt_mcopyin");
6366 /* NOTREACHED */
6367 }
6368 return 0;
6369 }
6370
6371 /* copyout mbuf chain data into soopt */
6372 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6373 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6374 {
6375 struct mbuf *m0 = m;
6376 size_t valsize = 0;
6377
6378 if (sopt->sopt_val == USER_ADDR_NULL) {
6379 return 0;
6380 }
6381 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6382 if (sopt->sopt_p != kernproc) {
6383 int error;
6384
6385 error = copyout(mtod(m, char *), sopt->sopt_val,
6386 m->m_len);
6387 if (error != 0) {
6388 m_freem(m0);
6389 return error;
6390 }
6391 } else {
6392 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6393 CAST_DOWN(caddr_t, sopt->sopt_val),
6394 m->m_len);
6395
6396 bcopy(mtod(m, char *), tmp, m->m_len);
6397 }
6398 sopt->sopt_valsize -= m->m_len;
6399 sopt->sopt_val += m->m_len;
6400 valsize += m->m_len;
6401 m = m->m_next;
6402 }
6403 if (m != NULL) {
6404 /* enough soopt buffer should be given from user-land */
6405 m_freem(m0);
6406 return EINVAL;
6407 }
6408 sopt->sopt_valsize = valsize;
6409 return 0;
6410 }
6411
6412 void
sohasoutofband(struct socket * so)6413 sohasoutofband(struct socket *so)
6414 {
6415 if (so->so_pgid < 0) {
6416 gsignal(-so->so_pgid, SIGURG);
6417 } else if (so->so_pgid > 0) {
6418 proc_signal(so->so_pgid, SIGURG);
6419 }
6420 selwakeup(&so->so_rcv.sb_sel);
6421 if (so->so_rcv.sb_flags & SB_KNOTE) {
6422 KNOTE(&so->so_rcv.sb_sel.si_note,
6423 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6424 }
6425 }
6426
6427 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6428 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6429 {
6430 #pragma unused(cred)
6431 struct proc *p = current_proc();
6432 int revents = 0;
6433
6434 socket_lock(so, 1);
6435 so_update_last_owner_locked(so, PROC_NULL);
6436 so_update_policy(so);
6437
6438 if (events & (POLLIN | POLLRDNORM)) {
6439 if (soreadable(so)) {
6440 revents |= events & (POLLIN | POLLRDNORM);
6441 }
6442 }
6443
6444 if (events & (POLLOUT | POLLWRNORM)) {
6445 if (sowriteable(so)) {
6446 revents |= events & (POLLOUT | POLLWRNORM);
6447 }
6448 }
6449
6450 if (events & (POLLPRI | POLLRDBAND)) {
6451 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6452 revents |= events & (POLLPRI | POLLRDBAND);
6453 }
6454 }
6455
6456 if (revents == 0) {
6457 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6458 /*
6459 * Darwin sets the flag first,
6460 * BSD calls selrecord first
6461 */
6462 so->so_rcv.sb_flags |= SB_SEL;
6463 selrecord(p, &so->so_rcv.sb_sel, wql);
6464 }
6465
6466 if (events & (POLLOUT | POLLWRNORM)) {
6467 /*
6468 * Darwin sets the flag first,
6469 * BSD calls selrecord first
6470 */
6471 so->so_snd.sb_flags |= SB_SEL;
6472 selrecord(p, &so->so_snd.sb_sel, wql);
6473 }
6474 }
6475
6476 socket_unlock(so, 1);
6477 return revents;
6478 }
6479
6480 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6481 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6482 {
6483 struct socket *so = (struct socket *)fp_get_data(fp);
6484 int result;
6485
6486 socket_lock(so, 1);
6487 so_update_last_owner_locked(so, PROC_NULL);
6488 so_update_policy(so);
6489
6490 switch (kn->kn_filter) {
6491 case EVFILT_READ:
6492 kn->kn_filtid = EVFILTID_SOREAD;
6493 break;
6494 case EVFILT_WRITE:
6495 kn->kn_filtid = EVFILTID_SOWRITE;
6496 break;
6497 case EVFILT_SOCK:
6498 kn->kn_filtid = EVFILTID_SCK;
6499 break;
6500 case EVFILT_EXCEPT:
6501 kn->kn_filtid = EVFILTID_SOEXCEPT;
6502 break;
6503 default:
6504 socket_unlock(so, 1);
6505 knote_set_error(kn, EINVAL);
6506 return 0;
6507 }
6508
6509 /*
6510 * call the appropriate sub-filter attach
6511 * with the socket still locked
6512 */
6513 result = knote_fops(kn)->f_attach(kn, kev);
6514
6515 socket_unlock(so, 1);
6516
6517 return result;
6518 }
6519
/*
 * Shared readiness check for the socket read filter.  Determines whether
 * the EVFILT_READ knote should fire for `so' and, when `kev' is non-NULL
 * and the event fired, fills in the kevent with the computed data value
 * (bytes readable, or queue length for listeners).
 *
 * Caller must hold the socket lock.  Returns non-zero if the event fired.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* listener: ready when the completed-connection queue is non-empty */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			/* OOB data pending: fire with bytes up to the mark excluded */
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		/* peer closed (and no content-filter data is still buffered) */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6598
6599 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6600 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6601 {
6602 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6603
6604 /* socket locked */
6605
6606 /*
6607 * If the caller explicitly asked for OOB results (e.g. poll())
6608 * from EVFILT_READ, then save that off in the hookid field
6609 * and reserve the kn_flags EV_OOBAND bit for output only.
6610 */
6611 if (kn->kn_filter == EVFILT_READ &&
6612 kn->kn_flags & EV_OOBAND) {
6613 kn->kn_flags &= ~EV_OOBAND;
6614 kn->kn_hook32 = EV_OOBAND;
6615 } else {
6616 kn->kn_hook32 = 0;
6617 }
6618 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6619 so->so_rcv.sb_flags |= SB_KNOTE;
6620 }
6621
6622 /* indicate if event is already fired */
6623 return filt_soread_common(kn, NULL, so);
6624 }
6625
6626 static void
filt_sordetach(struct knote * kn)6627 filt_sordetach(struct knote *kn)
6628 {
6629 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6630
6631 socket_lock(so, 1);
6632 if (so->so_rcv.sb_flags & SB_KNOTE) {
6633 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6634 so->so_rcv.sb_flags &= ~SB_KNOTE;
6635 }
6636 }
6637 socket_unlock(so, 1);
6638 }
6639
6640 /*ARGSUSED*/
6641 static int
filt_soread(struct knote * kn,long hint)6642 filt_soread(struct knote *kn, long hint)
6643 {
6644 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6645 int retval;
6646
6647 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6648 socket_lock(so, 1);
6649 }
6650
6651 retval = filt_soread_common(kn, NULL, so);
6652
6653 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6654 socket_unlock(so, 1);
6655 }
6656
6657 return retval;
6658 }
6659
6660 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6661 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6662 {
6663 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6664 int retval;
6665
6666 socket_lock(so, 1);
6667
6668 /* save off the new input fflags and data */
6669 kn->kn_sfflags = kev->fflags;
6670 kn->kn_sdata = kev->data;
6671
6672 /* determine if changes result in fired events */
6673 retval = filt_soread_common(kn, NULL, so);
6674
6675 socket_unlock(so, 1);
6676
6677 return retval;
6678 }
6679
6680 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6681 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6682 {
6683 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6684 int retval;
6685
6686 socket_lock(so, 1);
6687 retval = filt_soread_common(kn, kev, so);
6688 socket_unlock(so, 1);
6689
6690 return retval;
6691 }
6692
6693 int
so_wait_for_if_feedback(struct socket * so)6694 so_wait_for_if_feedback(struct socket *so)
6695 {
6696 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6697 (so->so_state & SS_ISCONNECTED)) {
6698 struct inpcb *inp = sotoinpcb(so);
6699 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6700 return 1;
6701 }
6702 }
6703 return 0;
6704 }
6705
/*
 * Common evaluation for the EVFILT_WRITE socket filter.
 *
 * Returns non-zero when the socket is writable (or has an EOF/error
 * condition to report) and, when `kev' is non-NULL, fills the outgoing
 * kevent with the amount of send-buffer space in `data'.
 * Caller must hold the socket lock.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	/* Free space in the send buffer; reported to userspace as kn_data. */
	int64_t data = sbspace(&so->so_snd);

	/* Write side shut down: report EOF with any pending error. */
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	/* Not yet connected (and not a connectionless/preconnect case). */
	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	/* Preconnect data allowed: writable even before connect completes. */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write. Meanwhile, another thread
			 *    fills up the socket buffer. This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable. Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* NOTE_LOWAT raises the threshold, clamped to the high-water mark. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/* TCP_NOTSENT_LOWAT: defer to the protocol check. */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress writability while waiting for interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6817
6818 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6819 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6820 {
6821 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6822
6823 /* socket locked */
6824 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6825 so->so_snd.sb_flags |= SB_KNOTE;
6826 }
6827
6828 /* determine if its already fired */
6829 return filt_sowrite_common(kn, NULL, so);
6830 }
6831
6832 static void
filt_sowdetach(struct knote * kn)6833 filt_sowdetach(struct knote *kn)
6834 {
6835 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6836 socket_lock(so, 1);
6837
6838 if (so->so_snd.sb_flags & SB_KNOTE) {
6839 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6840 so->so_snd.sb_flags &= ~SB_KNOTE;
6841 }
6842 }
6843 socket_unlock(so, 1);
6844 }
6845
6846 /*ARGSUSED*/
6847 static int
filt_sowrite(struct knote * kn,long hint)6848 filt_sowrite(struct knote *kn, long hint)
6849 {
6850 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6851 int ret;
6852
6853 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6854 socket_lock(so, 1);
6855 }
6856
6857 ret = filt_sowrite_common(kn, NULL, so);
6858
6859 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6860 socket_unlock(so, 1);
6861 }
6862
6863 return ret;
6864 }
6865
6866 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6867 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6868 {
6869 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6870 int ret;
6871
6872 socket_lock(so, 1);
6873
6874 /*save off the new input fflags and data */
6875 kn->kn_sfflags = kev->fflags;
6876 kn->kn_sdata = kev->data;
6877
6878 /* determine if these changes result in a triggered event */
6879 ret = filt_sowrite_common(kn, NULL, so);
6880
6881 socket_unlock(so, 1);
6882
6883 return ret;
6884 }
6885
6886 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6887 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6888 {
6889 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6890 int ret;
6891
6892 socket_lock(so, 1);
6893 ret = filt_sowrite_common(kn, kev, so);
6894 socket_unlock(so, 1);
6895
6896 return ret;
6897 }
6898
/*
 * Common evaluation for the EVFILT_SOCK filter.
 *
 * Translates the hint bits (SO_FILT_HINT_*) and the socket's current
 * state into NOTE_* fflags, tracks already-delivered level-triggered
 * events in kn_hook32 so they are not re-delivered while still active,
 * and fills the outgoing kevent when `kev' is non-NULL and an event
 * fired.  Caller must hold the socket lock.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered events: set purely from the hint bits. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/*
	 * Level-triggered events: set from the hint OR the socket's
	 * current state, and recorded in level_trigger for dedup below.
	 */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	/* Only protocols that opt in (PR_EVCONNINFO) report conninfo. */
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	/*
	 * Read side closed — with content filters, only once no
	 * filtered data remains pending on the receive buffer.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* SUSPEND and RESUME are mutually exclusive; keep only the latest. */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* kn_data carries the socket error, or the SOCKEV_* state word. */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7039
7040 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7041 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7042 {
7043 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7044
7045 /* socket locked */
7046 kn->kn_hook32 = 0;
7047 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7048 so->so_flags |= SOF_KNOTE;
7049 }
7050
7051 /* determine if event already fired */
7052 return filt_sockev_common(kn, NULL, so, 0);
7053 }
7054
7055 static void
filt_sockdetach(struct knote * kn)7056 filt_sockdetach(struct knote *kn)
7057 {
7058 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7059 socket_lock(so, 1);
7060
7061 if ((so->so_flags & SOF_KNOTE) != 0) {
7062 if (KNOTE_DETACH(&so->so_klist, kn)) {
7063 so->so_flags &= ~SOF_KNOTE;
7064 }
7065 }
7066 socket_unlock(so, 1);
7067 }
7068
7069 static int
filt_sockev(struct knote * kn,long hint)7070 filt_sockev(struct knote *kn, long hint)
7071 {
7072 int ret = 0, locked = 0;
7073 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7074 long ev_hint = (hint & SO_FILT_HINT_EV);
7075
7076 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7077 socket_lock(so, 1);
7078 locked = 1;
7079 }
7080
7081 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7082
7083 if (locked) {
7084 socket_unlock(so, 1);
7085 }
7086
7087 return ret;
7088 }
7089
7090
7091
7092 /*
7093 * filt_socktouch - update event state
7094 */
7095 static int
filt_socktouch(struct knote * kn,struct kevent_qos_s * kev)7096 filt_socktouch(
7097 struct knote *kn,
7098 struct kevent_qos_s *kev)
7099 {
7100 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7101 uint32_t changed_flags;
7102 int ret;
7103
7104 socket_lock(so, 1);
7105
7106 /* save off the [result] data and fflags */
7107 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7108
7109 /* save off the new input fflags and data */
7110 kn->kn_sfflags = kev->fflags;
7111 kn->kn_sdata = kev->data;
7112
7113 /* restrict the current results to the (smaller?) set of new interest */
7114 /*
7115 * For compatibility with previous implementations, we leave kn_fflags
7116 * as they were before.
7117 */
7118 //kn->kn_fflags &= kev->fflags;
7119
7120 /*
7121 * Since we keep track of events that are already
7122 * delivered, if any of those events are not requested
7123 * anymore the state related to them can be reset
7124 */
7125 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7126
7127 /* determine if we have events to deliver */
7128 ret = filt_sockev_common(kn, NULL, so, 0);
7129
7130 socket_unlock(so, 1);
7131
7132 return ret;
7133 }
7134
7135 /*
7136 * filt_sockprocess - query event fired state and return data
7137 */
7138 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7139 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7140 {
7141 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7142 int ret = 0;
7143
7144 socket_lock(so, 1);
7145
7146 ret = filt_sockev_common(kn, kev, so, 0);
7147
7148 socket_unlock(so, 1);
7149
7150 return ret;
7151 }
7152
7153 void
get_sockev_state(struct socket * so,u_int32_t * statep)7154 get_sockev_state(struct socket *so, u_int32_t *statep)
7155 {
7156 u_int32_t state = *(statep);
7157
7158 /*
7159 * If the state variable is already used by a previous event,
7160 * reset it.
7161 */
7162 if (state != 0) {
7163 return;
7164 }
7165
7166 if (so->so_state & SS_ISCONNECTED) {
7167 state |= SOCKEV_CONNECTED;
7168 } else {
7169 state &= ~(SOCKEV_CONNECTED);
7170 }
7171 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7172 *(statep) = state;
7173 }
7174
/*
 * Worst-case length of the formatted lock history: 2 * SO_LCKDBG_MAX
 * pointer pairs, each "%p:%p " (2-char "0x" prefix + 2 hex digits per
 * byte of a pointer + separator), plus the trailing NUL.
 */
#define SO_LOCK_HISTORY_STR_LEN \
    (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

/*
 * Format the socket's recorded lock/unlock return addresses
 * (lock_lr/unlock_lr ring buffers), most recent entry first.
 * Returns a pointer to a static buffer — callers must not hold the
 * result across another call; not reentrant.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	/* Walk both ring buffers backwards from the next-write index. */
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return __unsafe_null_terminated_from_indexable(lock_history_str);
}
7194
7195 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7196 socket_getlock(struct socket *so, int flags)
7197 {
7198 if (so->so_proto->pr_getlock != NULL) {
7199 return (*so->so_proto->pr_getlock)(so, flags);
7200 } else {
7201 return so->so_proto->pr_domain->dom_mtx;
7202 }
7203 }
7204
/*
 * Lock a socket, optionally taking a reference (so_usecount) when
 * `refcount' is non-zero.  Protocols with a pr_lock callback do their
 * own locking and history recording; otherwise the shared domain mutex
 * is used and the caller's return address is logged in the lock_lr
 * ring buffer for debugging (see solockhistory_nr()).
 */
void
socket_lock(struct socket *so, int refcount)
{
	/* Caller's return address, recorded for lock-history debugging. */
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7225
7226 void
socket_lock_assert_owned(struct socket * so)7227 socket_lock_assert_owned(struct socket *so)
7228 {
7229 lck_mtx_t *mutex_held;
7230
7231 if (so->so_proto->pr_getlock != NULL) {
7232 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7233 } else {
7234 mutex_held = so->so_proto->pr_domain->dom_mtx;
7235 }
7236
7237 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7238 }
7239
7240 int
socket_try_lock(struct socket * so)7241 socket_try_lock(struct socket *so)
7242 {
7243 lck_mtx_t *mtx;
7244
7245 if (so->so_proto->pr_getlock != NULL) {
7246 mtx = (*so->so_proto->pr_getlock)(so, 0);
7247 } else {
7248 mtx = so->so_proto->pr_domain->dom_mtx;
7249 }
7250
7251 return lck_mtx_try_lock(mtx);
7252 }
7253
/*
 * Unlock a socket, optionally dropping a reference (so_usecount) when
 * `refcount' is non-zero.  When the last reference is dropped the
 * socket is freed via sofreelastref().  Mirrors socket_lock(): the
 * caller's return address is recorded in the unlock_lr ring buffer.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	lck_mtx_t *mutex_held;
	/* Caller's return address, recorded for lock-history debugging. */
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* Dropping below zero indicates a refcount bug. */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			/* Last reference gone: release the socket. */
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7292
/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	/*
	 * Resolve the mutex protecting this socket so we can assert
	 * that the caller really holds it.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Release the last reference; per the contract above,
	 * sofreelastref() is expected to drop the lock — confirm in
	 * callee (not visible here).
	 */
	sofreelastref(so, 0);
}
7308
void
soreference(struct socket *so)
{
	/*
	 * Lock with a refcount bump, then unlock without dropping it:
	 * the net effect is +1 on so_usecount.
	 */
	socket_lock(so, 1);
	socket_unlock(so, 0);
}
7315
void
sodereference(struct socket *so)
{
	/*
	 * Lock without taking a reference, then unlock with a refcount
	 * drop: the net effect is -1 on so_usecount (socket_unlock()
	 * frees the socket if this was the last reference).
	 */
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7322
7323 /*
7324 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7325 * possibility of using jumbo clusters. Caller must ensure to hold
7326 * the socket lock.
7327 */
7328 void
somultipages(struct socket * so,boolean_t set)7329 somultipages(struct socket *so, boolean_t set)
7330 {
7331 if (set) {
7332 so->so_flags |= SOF_MULTIPAGES;
7333 } else {
7334 so->so_flags &= ~SOF_MULTIPAGES;
7335 }
7336 }
7337
7338 void
soif2kcl(struct socket * so,boolean_t set)7339 soif2kcl(struct socket *so, boolean_t set)
7340 {
7341 if (set) {
7342 so->so_flags1 |= SOF1_IF_2KCL;
7343 } else {
7344 so->so_flags1 &= ~SOF1_IF_2KCL;
7345 }
7346 }
7347
7348 int
so_isdstlocal(struct socket * so)7349 so_isdstlocal(struct socket *so)
7350 {
7351 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7352
7353 if (SOCK_DOM(so) == PF_INET) {
7354 return inaddr_local(inp->inp_faddr);
7355 } else if (SOCK_DOM(so) == PF_INET6) {
7356 return in6addr_local(&inp->in6p_faddr);
7357 }
7358
7359 return 0;
7360 }
7361
/*
 * Mark a socket as defunct (SOF_DEFUNCT) and drop any buffered data.
 *
 * `level' is reported in the logs; `noforce' makes SOF_NODEFUNCT and
 * extended-background-idle sockets exempt (returning EOPNOTSUPP)
 * instead of being forced.  Returns 0 on success or when the socket
 * was already defunct.  Caller must hold the socket lock.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both buffers must already carry SB_DROP. */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		/* Opted out: honor it unless the caller forces defunct. */
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* Forced: strip the opt-out flag and proceed. */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * Extended background idle requested: grant a grace
		 * period (EOPNOTSUPP) when eligible, otherwise count
		 * why it was denied and fall through to defunct.
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			/* Schedule the lazy timer to expire the grace period. */
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7471
/*
 * Carry out the actual defunct of a socket previously marked with
 * SOF_DEFUNCT by sosetdefunct(): notify the protocol, unwedge and
 * unlock blocked threads, shut down both directions, disconnect, and
 * release any remaining buffered data.  Idempotent once SS_DEFUNCT is
 * set.  Caller must hold the socket lock.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	/* Already fully defunct: nothing more to do. */
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	/* Any later I/O on the socket reports EBADF. */
	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7584
/*
 * Resume a socket that is in the extended-background-idle grace
 * period (SOF1_EXTEND_BK_IDLE_INPROG), clearing the per-socket and
 * per-process markers and updating the statistics.  `locked' is
 * non-zero when the caller already holds the socket lock.
 * Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7614
7615 /*
7616 * Does not attempt to account for sockets that are delegated from
7617 * the current process
7618 */
7619 int
so_set_extended_bk_idle(struct socket * so,int optval)7620 so_set_extended_bk_idle(struct socket *so, int optval)
7621 {
7622 int error = 0;
7623
7624 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7625 SOCK_PROTO(so) != IPPROTO_TCP) {
7626 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7627 error = EOPNOTSUPP;
7628 } else if (optval == 0) {
7629 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7630
7631 soresume(current_proc(), so, 1);
7632 } else {
7633 struct proc *p = current_proc();
7634 struct fileproc *fp;
7635 int count = 0;
7636
7637 /*
7638 * Unlock socket to avoid lock ordering issue with
7639 * the proc fd table lock
7640 */
7641 socket_unlock(so, 0);
7642
7643 proc_fdlock(p);
7644 fdt_foreach(fp, p) {
7645 struct socket *so2;
7646
7647 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7648 continue;
7649 }
7650
7651 so2 = (struct socket *)fp_get_data(fp);
7652 if (so != so2 &&
7653 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7654 count++;
7655 }
7656 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7657 break;
7658 }
7659 }
7660 proc_fdunlock(p);
7661
7662 socket_lock(so, 0);
7663
7664 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7665 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7666 error = EBUSY;
7667 } else if (so->so_flags & SOF_DELEGATED) {
7668 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7669 error = EBUSY;
7670 } else {
7671 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7672 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7673 }
7674 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7675 "%s marked for extended bk idle\n",
7676 __func__, proc_selfpid(), proc_best_name(current_proc()),
7677 so->so_gencnt,
7678 SOCK_DOM(so), SOCK_TYPE(so),
7679 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7680 "is" : "not");
7681 }
7682
7683 return error;
7684 }
7685
/*
 * End a socket's extended-background-idle grace period and force it
 * into the defunct state.  Caller must hold the socket lock.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	/* One fewer socket in the grace period. */
	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7704
7705 void
so_drain_extended_bk_idle(struct socket * so)7706 so_drain_extended_bk_idle(struct socket *so)
7707 {
7708 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7709 /*
7710 * Only penalize sockets that have outstanding data
7711 */
7712 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7713 so_stop_extended_bk_idle(so);
7714
7715 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7716 }
7717 }
7718 }
7719
7720 /*
7721 * Return values tells if socket is still in extended background idle
7722 */
7723 int
so_check_extended_bk_idle_time(struct socket * so)7724 so_check_extended_bk_idle_time(struct socket *so)
7725 {
7726 int ret = 1;
7727
7728 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7729 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7730 __func__, proc_selfpid(), proc_best_name(current_proc()),
7731 so->so_gencnt,
7732 SOCK_DOM(so), SOCK_TYPE(so));
7733 if (net_uptime() - so->so_extended_bk_start >
7734 soextbkidlestat.so_xbkidle_time) {
7735 so_stop_extended_bk_idle(so);
7736
7737 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7738
7739 ret = 0;
7740 } else {
7741 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7742
7743 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7744 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7745 }
7746 }
7747
7748 return ret;
7749 }
7750
7751 void
resume_proc_sockets(proc_t p)7752 resume_proc_sockets(proc_t p)
7753 {
7754 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7755 struct fileproc *fp;
7756 struct socket *so;
7757
7758 proc_fdlock(p);
7759 fdt_foreach(fp, p) {
7760 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7761 continue;
7762 }
7763
7764 so = (struct socket *)fp_get_data(fp);
7765 (void) soresume(p, so, 0);
7766 }
7767 proc_fdunlock(p);
7768
7769 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7770 }
7771 }
7772
7773 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7774 so_set_recv_anyif(struct socket *so, int optval)
7775 {
7776 int ret = 0;
7777
7778 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7779 if (optval) {
7780 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7781 } else {
7782 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7783 }
7784 #if SKYWALK
7785 inp_update_netns_flags(so);
7786 #endif /* SKYWALK */
7787 }
7788
7789
7790 return ret;
7791 }
7792
7793 __private_extern__ int
so_get_recv_anyif(struct socket * so)7794 so_get_recv_anyif(struct socket *so)
7795 {
7796 int ret = 0;
7797
7798 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7799 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7800 }
7801
7802 return ret;
7803 }
7804
/*
 * Latch deny-type restriction bits onto the socket and propagate the
 * cellular/expensive/constrained ones to the INPCB (or MPTCP) layer
 * the first time each is set.  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	/* OR-in only the recognized deny bits; others in `vals' are ignored. */
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	/* MPTCP propagates restrictions to its subflows itself. */
	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7863
7864 uint32_t
so_get_restrictions(struct socket * so)7865 so_get_restrictions(struct socket *so)
7866 {
7867 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7868 SO_RESTRICT_DENY_OUT |
7869 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7870 }
7871
/*
 * Associate an "effective" (delegate) process with a socket by pid.
 *
 * On success the socket's e_pid/e_upid/e_uuid identify the process on
 * whose behalf traffic is attributed, and SOF_DELEGATED is set —
 * unless the socket is being delegated back to the issuing process
 * itself, which clears the association.  The socket's policy (and
 * NECP) state is refreshed on success.
 *
 * Parameters:
 *	so		socket being delegated
 *	epid		pid of the effective process; 0 is reserved for
 *			the kernel and rejected
 *	p		process issuing the socket option (may be kernproc)
 *	check_cred	when TRUE, require PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *			unless the caller already owns the association
 *
 * Returns: 0 on success; EINVAL, EACCES or ESRCH on failure.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with "||" below, the privilege check is skipped
	 * only when epid matches BOTH so->last_pid and proc_pid(p);
	 * confirm that this (rather than either/or, as the text above
	 * suggests) is the intended policy.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/*
		 * On macOS, also record the delegate's "responsible"
		 * process (uuid + pid) when it differs from the delegate
		 * itself; -1/null-uuid when it cannot be found.
		 */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	/* Let the protocol refresh its notion of the socket's owner */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Flip the generation count so the cached policy no longer
		 * matches and gets re-evaluated — presumably; verify
		 * against so_update_policy().
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	/* drop the reference taken by proc_find() above */
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
7987
/*
 * Associate an "effective" (delegate) process with a socket by
 * executable UUID — the UUID analogue of so_set_effective_pid().
 *
 * Unlike the pid variant, only the UUID is known here, so on
 * delegation the socket inherits its real {pid,upid} as the
 * effective ones and copies euuid into e_uuid.  Delegating to the
 * issuer's own UUID clears the association.  Policy (and NECP)
 * state is refreshed on success.
 *
 * Parameters:
 *	so		socket being delegated
 *	euuid		effective executable UUID; the null UUID is
 *			reserved for the kernel and rejected
 *	p		process issuing the socket option (may be kernproc)
 *	check_cred	when TRUE, require PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *			unless the caller already owns the association
 *
 * Returns: 0 on success; EINVAL or EACCES on failure.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): as in so_set_effective_pid(), the "||" means
	 * the privilege check is skipped only when euuid matches BOTH
	 * so->last_uuid and the issuer's uuid; confirm intended.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Flip the generation count so the cached policy no longer
		 * matches and gets re-evaluated — presumably; verify
		 * against so_update_policy().
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8091
8092 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8093 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8094 uint32_t ev_datalen)
8095 {
8096 struct kev_msg ev_msg;
8097
8098 /*
8099 * A netpolicy event always starts with a netpolicy_event_data
8100 * structure, but the caller can provide for a longer event
8101 * structure to post, depending on the event code.
8102 */
8103 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8104
8105 bzero(&ev_msg, sizeof(ev_msg));
8106 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8107 ev_msg.kev_class = KEV_NETWORK_CLASS;
8108 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8109 ev_msg.event_code = ev_code;
8110
8111 ev_msg.dv[0].data_ptr = ev_data;
8112 ev_msg.dv[0].data_length = ev_datalen;
8113
8114 kev_post_msg(&ev_msg);
8115 }
8116
8117 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8118 socket_post_kev_msg(uint32_t ev_code,
8119 struct kev_socket_event_data *ev_data,
8120 uint32_t ev_datalen)
8121 {
8122 struct kev_msg ev_msg;
8123
8124 bzero(&ev_msg, sizeof(ev_msg));
8125 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8126 ev_msg.kev_class = KEV_NETWORK_CLASS;
8127 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8128 ev_msg.event_code = ev_code;
8129
8130 ev_msg.dv[0].data_ptr = ev_data;
8131 ev_msg.dv[0].data_length = ev_datalen;
8132
8133 kev_post_msg(&ev_msg);
8134 }
8135
8136 void
socket_post_kev_msg_closed(struct socket * so)8137 socket_post_kev_msg_closed(struct socket *so)
8138 {
8139 struct kev_socket_closed ev = {};
8140 struct sockaddr *__single socksa = NULL, *__single peersa = NULL;
8141 int err;
8142
8143 if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
8144 return;
8145 }
8146 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8147 if (err == 0) {
8148 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8149 &peersa);
8150 if (err == 0) {
8151 SOCKADDR_COPY(socksa, &ev.ev_data.kev_sockname,
8152 min(socksa->sa_len,
8153 sizeof(ev.ev_data.kev_sockname)));
8154 SOCKADDR_COPY(peersa, &ev.ev_data.kev_peername,
8155 min(peersa->sa_len,
8156 sizeof(ev.ev_data.kev_peername)));
8157 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8158 &ev.ev_data, sizeof(ev));
8159 }
8160 }
8161 free_sockaddr(socksa);
8162 free_sockaddr(peersa);
8163 }
8164
8165 void
sock_parse_cm_info(struct mbuf * control,struct sock_cm_info * sockcminfo)8166 sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo)
8167 {
8168 struct cmsghdr *cm;
8169
8170 for (cm = M_FIRST_CMSGHDR(control);
8171 is_cmsg_valid(control, cm);
8172 cm = M_NXT_CMSGHDR(control, cm)) {
8173 int val;
8174
8175 if (cm->cmsg_level != SOL_SOCKET) {
8176 continue;
8177 }
8178
8179 if (cm->cmsg_len == CMSG_LEN(sizeof(int))) {
8180 val = *(int *)(void *)CMSG_DATA(cm);
8181 }
8182
8183 switch (cm->cmsg_type) {
8184 case SO_TRAFFIC_CLASS:
8185 if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
8186 break;
8187 }
8188 if (SO_VALID_TC(val)) {
8189 sockcminfo->sotc = val;
8190 break;
8191 } else if (val < SO_TC_NET_SERVICE_OFFSET) {
8192 break;
8193 }
8194 /*
8195 * Handle the case SO_NET_SERVICE_TYPE values are
8196 * passed using SO_TRAFFIC_CLASS
8197 */
8198 val = val - SO_TC_NET_SERVICE_OFFSET;
8199
8200 OS_FALLTHROUGH;
8201 case SO_NET_SERVICE_TYPE:
8202 if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
8203 break;
8204 }
8205
8206 if (!IS_VALID_NET_SERVICE_TYPE(val)) {
8207 break;
8208 }
8209 sockcminfo->netsvctype = val;
8210 sockcminfo->sotc = sotc_by_netservicetype[val];
8211 break;
8212 case SCM_TXTIME:
8213 if (cm->cmsg_len != CMSG_LEN(sizeof(uint64_t))) {
8214 break;
8215 }
8216
8217 sockcminfo->tx_time = *(uint64_t *)(void *)CMSG_DATA(cm);
8218 break;
8219 default:
8220 break;
8221 }
8222 }
8223 }
8224
/*
 * Report a failed VERIFY()/assertion-style check and panic.
 *
 * Declared to return int only so it can appear in expression context
 * at call sites; it never returns (noreturn + __builtin_unreachable).
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8233