1 /*
2 * Copyright (c) 1998-2022, 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <net/sockaddr_utils.h>
106 #include <netinet/in.h>
107 #include <netinet/in_pcb.h>
108 #include <netinet/in_tclass.h>
109 #include <netinet/in_var.h>
110 #include <netinet/tcp_var.h>
111 #include <netinet/ip6.h>
112 #include <netinet6/ip6_var.h>
113 #include <netinet/flow_divert.h>
114 #include <kern/assert.h>
115 #include <kern/locks.h>
116 #include <kern/mem_acct.h>
117 #include <kern/policy_internal.h>
118 #include <kern/uipc_domain.h>
119 #include <kern/uipc_socket.h>
120 #include <kern/task.h>
121 #include <kern/zalloc.h>
122 #include <machine/limits.h>
123 #include <libkern/OSAtomic.h>
124 #include <pexpert/pexpert.h>
125
126 #include <sys/kpi_mbuf.h>
127 #include <sys/mcache.h>
128 #include <sys/unpcb.h>
129 #include <libkern/section_keywords.h>
130
131 #include <os/log.h>
132
133 #if CONFIG_MACF
134 #include <security/mac_framework.h>
135 #endif /* MAC */
136
137 #if MULTIPATH
138 #include <netinet/mp_pcb.h>
139 #include <netinet/mptcp_var.h>
140 #endif /* MULTIPATH */
141
142 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
143
144 #if DEBUG || DEVELOPMENT
145 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
146 #else
147 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
148 #endif
149
150 /* TODO: this should be in a header file somewhere */
151 extern char *proc_name_address(void *p);
152
153 static int socketinit_done;
154 struct mem_acct *socket_memacct;
155
156 #include <machine/limits.h>
157
158 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
159 static void filt_sordetach(struct knote *kn);
160 static int filt_soread(struct knote *kn, long hint);
161 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
162 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
163
164 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
165 static void filt_sowdetach(struct knote *kn);
166 static int filt_sowrite(struct knote *kn, long hint);
167 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
168 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
169
170 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
171 static void filt_sockdetach(struct knote *kn);
172 static int filt_sockev(struct knote *kn, long hint);
173 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
174 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
175
176 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
177 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
178
179 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
180 .f_isfd = 1,
181 .f_attach = filt_sorattach,
182 .f_detach = filt_sordetach,
183 .f_event = filt_soread,
184 .f_touch = filt_sortouch,
185 .f_process = filt_sorprocess,
186 };
187
188 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
189 .f_isfd = 1,
190 .f_attach = filt_sowattach,
191 .f_detach = filt_sowdetach,
192 .f_event = filt_sowrite,
193 .f_touch = filt_sowtouch,
194 .f_process = filt_sowprocess,
195 };
196
197 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
198 .f_isfd = 1,
199 .f_attach = filt_sockattach,
200 .f_detach = filt_sockdetach,
201 .f_event = filt_sockev,
202 .f_touch = filt_socktouch,
203 .f_process = filt_sockprocess,
204 };
205
206 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
207 .f_isfd = 1,
208 .f_attach = filt_sorattach,
209 .f_detach = filt_sordetach,
210 .f_event = filt_soread,
211 .f_touch = filt_sortouch,
212 .f_process = filt_sorprocess,
213 };
214
215 SYSCTL_DECL(_kern_ipc);
216
217 #define EVEN_MORE_LOCKING_DEBUG 0
218
219 int socket_debug = 0;
220 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
221 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
222
223 #if (DEBUG || DEVELOPMENT)
224 #define DEFAULT_SOSEND_ASSERT_PANIC 1
225 #else
226 #define DEFAULT_SOSEND_ASSERT_PANIC 0
227 #endif /* (DEBUG || DEVELOPMENT) */
228
229 int sosend_assert_panic = 0;
230 SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
231 CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
232
233 static unsigned long sodefunct_calls = 0;
234 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
235 &sodefunct_calls, "");
236
237 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
238 so_gen_t so_gencnt; /* generation count for sockets */
239
240 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
241 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
242 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
243 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
244 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
245 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
246 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
247 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
248 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
265 * writes on the socket for all protocols on any network interfaces.
266 * Be extra careful when setting this to 1, because sending down packets with
267 * clusters larger that 2 KB might lead to system panics or data corruption.
268 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
269 * on the outgoing interface
270 * Set this to 1 for testing/debugging purposes only.
271 */
272 int sosendbigcl_ignore_capab = 0;
273 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
274 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
275
276 int sodefunctlog = 0;
277 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
278 &sodefunctlog, 0, "");
279
280 int sothrottlelog = 0;
281 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
282 &sothrottlelog, 0, "");
283
284 int sorestrictrecv = 1;
285 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
286 &sorestrictrecv, 0, "Enable inbound interface restrictions");
287
288 int sorestrictsend = 1;
289 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
290 &sorestrictsend, 0, "Enable outbound interface restrictions");
291
292 int soreserveheadroom = 1;
293 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
294 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
295
296 #if (DEBUG || DEVELOPMENT)
297 int so_notsent_lowat_check = 1;
298 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
299 &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
300 #endif /* DEBUG || DEVELOPMENT */
301
302 int so_accept_list_waits = 0;
303 #if (DEBUG || DEVELOPMENT)
304 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &so_accept_list_waits, 0, "number of waits for listener incomp list");
306 #endif /* DEBUG || DEVELOPMENT */
307
308 extern struct inpcbinfo tcbinfo;
309
310 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
311 user_ssize_t *);
312
313 /*
314 * Maximum of extended background idle sockets per process
315 * Set to zero to disable further setting of the option
316 */
317
318 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
319 #define SO_IDLE_BK_IDLE_TIME 600
320 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
321
322 struct soextbkidlestat soextbkidlestat;
323
324 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
325 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
326 "Maximum of extended background idle sockets per process");
327
328 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
329 &soextbkidlestat.so_xbkidle_time, 0,
330 "Time in seconds to keep extended background idle sockets");
331
332 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
333 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
334 "High water mark for extended background idle sockets");
335
336 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
337 &soextbkidlestat, soextbkidlestat, "");
338
339 int so_set_extended_bk_idle(struct socket *, int);
340
341 #define SO_MAX_MSG_X 1024
342
343 /*
344 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
345 * setting the DSCP code on the packet based on the service class; see
346 * <rdar://problem/11277343> for details.
347 */
348 __private_extern__ u_int32_t sotcdb = 0;
349 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
350 &sotcdb, 0, "");
351
/*
 * One-time initialization of the socket layer.
 *
 * Validates (at compile time) that the user-visible sa_endpoints layout
 * matches the corresponding userNN_sa_endpoints ABI struct, consumes the
 * socket-related boot-args, seeds the extended-background-idle defaults,
 * initializes the inpcb subsystem and registers the socket memory
 * accounting bucket.  Safe to call more than once; subsequent calls are
 * no-ops (with a console warning).
 */
void
socketinit(void)
{
	/* so_gencnt is bumped via 64-bit atomics; verify size and alignment. */
	static_assert(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	/*
	 * sa_endpoints must be layout-compatible with the fixed-width user
	 * ABI struct for this address-space model, since it is copied
	 * directly to/from userland.
	 */
#ifdef __LP64__
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	static_assert(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	/* Guard against repeated initialization. */
	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow boot-args to override the debug/panic knobs. */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	/* Seed extended-background-idle defaults (see SO_IDLE_BK_* above). */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();

	/* Memory accounting for socket allocations; required to proceed. */
	socket_memacct = mem_acct_register("SOCKET", 0, 0);
	if (socket_memacct == NULL) {
		panic("mem_acct_register returned NULL");
	}
}
398
399 void
so_update_last_owner_locked(struct socket * so,proc_t self)400 so_update_last_owner_locked(struct socket *so, proc_t self)
401 {
402 if (so->last_pid != 0) {
403 /*
404 * last_pid and last_upid should remain zero for sockets
405 * created using sock_socket. The check above achieves that
406 */
407 if (self == PROC_NULL) {
408 self = current_proc();
409 }
410
411 if (so->last_upid != proc_uniqueid(self) ||
412 so->last_pid != proc_pid(self)) {
413 so->last_upid = proc_uniqueid(self);
414 so->last_pid = proc_pid(self);
415 proc_getexecutableuuid(self, so->last_uuid,
416 sizeof(so->last_uuid));
417 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
418 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
419 }
420 }
421 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
422 }
423 }
424
425 void
so_update_policy(struct socket * so)426 so_update_policy(struct socket *so)
427 {
428 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
429 (void) inp_update_policy(sotoinpcb(so));
430 }
431 }
432
433 #if NECP
434 static void
so_update_necp_policy(struct socket * so,struct sockaddr * override_local_addr,struct sockaddr * override_remote_addr)435 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
436 struct sockaddr *override_remote_addr)
437 {
438 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
439 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
440 override_remote_addr, 0);
441 }
442 }
443 #endif /* NECP */
444
445 /*
446 * Get a socket structure from our zone, and initialize it.
447 *
448 * Note that it would probably be better to allocate socket
449 * and PCB at the same time, but I'm not convinced that all
450 * the protocols can be easily modified to do this.
451 */
452 struct socket *
soalloc(void)453 soalloc(void)
454 {
455 struct socket *__single so;
456
457 so = zalloc_flags(socket_zone, Z_WAITOK_ZERO);
458 if (so != NULL) {
459 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
460
461 /*
462 * Increment the socket allocation statistics
463 */
464 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
465 }
466
467 return so;
468 }
469
/*
 * Create and initialize a socket for the given domain/type/protocol.
 *
 * Resolves the protocol switch entry, allocates a socket from the zone,
 * records ownership (and delegation, when ep differs from p), then hands
 * the socket to the protocol via pru_attach.  On success, *aso holds the
 * new socket with a creation use-count reference.  On failure, the socket
 * is torn down here and *aso remains NULL.
 *
 * p  — the creating process; ep — the effective (delegated) process, or
 * PROC_NULL.  flags — SOCF_* creation flags (e.g. SOCF_MPTCP).
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;

	VERIFY(aso != NULL);
	*aso = NULL;

	/*
	 * proto == 0 means "pick by type within the domain"; otherwise look
	 * up the exact (domain, protocol, type) triple.
	 */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	/*
	 * Distinguish the failure modes: unknown domain (EAFNOSUPPORT),
	 * protocol exists but not with this type (EPROTOTYPE), or the
	 * protocol simply isn't supported (EPROTONOSUPPORT).
	 */
	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	/* Refuse creation when the protocol's memory budget is exhausted. */
	if (proto_memacct_hardlimit(prp)) {
		return ENOBUFS;
	}
	so = soalloc();
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain creation counters for net API statistics. */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflows start in non-blocking mode. */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	/* Record the creating process as the last owner. */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	/*
	 * When the effective process differs from the creator, this is a
	 * delegated socket: record the effective identity and, if the
	 * effective process is itself managed by a responsible process,
	 * record that responsible identity as well.
	 */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	/* Fall back to the creator's responsible process if none was set. */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	/* Hold the creator's credential for the socket's lifetime. */
	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	proto_memacct_add(so->so_proto, sizeof(struct socket));

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
680
681 /*
682 * Returns: 0 Success
683 * EAFNOSUPPORT
684 * EPROTOTYPE
685 * EPROTONOSUPPORT
686 * ENOBUFS
687 * <pru_attach>:ENOBUFS[AF_UNIX]
688 * <pru_attach>:ENOBUFS[TCP]
689 * <pru_attach>:ENOMEM[TCP]
690 * <pru_attach>:??? [other protocol families, IPSEC]
691 */
692 int
socreate(int dom,struct socket ** aso,int type,int proto)693 socreate(int dom, struct socket **aso, int type, int proto)
694 {
695 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
696 PROC_NULL);
697 }
698
699 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)700 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
701 {
702 int error = 0;
703 struct proc *ep = PROC_NULL;
704
705 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
706 error = ESRCH;
707 goto done;
708 }
709
710 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
711
712 /*
713 * It might not be wise to hold the proc reference when calling
714 * socreate_internal since it calls soalloc with M_WAITOK
715 */
716 done:
717 if (ep != PROC_NULL) {
718 proc_rele(ep);
719 }
720
721 return error;
722 }
723
724 /*
725 * Returns: 0 Success
726 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
727 * <pru_bind>:EAFNOSUPPORT Address family not supported
728 * <pru_bind>:EADDRNOTAVAIL Address not available.
729 * <pru_bind>:EINVAL Invalid argument
730 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
731 * <pru_bind>:EACCES Permission denied
732 * <pru_bind>:EADDRINUSE Address in use
733 * <pru_bind>:EAGAIN Resource unavailable, try again
734 * <pru_bind>:EPERM Operation not permitted
735 * <pru_bind>:???
736 * <sf_bind>:???
737 *
738 * Notes: It's not possible to fully enumerate the return codes above,
739 * since socket filter authors and protocol family authors may
740 * not choose to limit their error returns to those listed, even
741 * though this may result in some software operating incorrectly.
742 *
743 * The error codes which are enumerated above are those known to
744 * be returned by the tcp_usr_bind function supplied.
745 */
746 int
sobindlock(struct socket * so,struct sockaddr * nam,int dolock)747 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
748 {
749 struct proc *p = current_proc();
750 int error = 0;
751
752 if (dolock) {
753 socket_lock(so, 1);
754 }
755
756 so_update_last_owner_locked(so, p);
757 so_update_policy(so);
758
759 #if NECP
760 so_update_necp_policy(so, nam, NULL);
761 #endif /* NECP */
762
763 /*
764 * If this is a bind request on a socket that has been marked
765 * as inactive, reject it now before we go any further.
766 */
767 if (so->so_flags & SOF_DEFUNCT) {
768 error = EINVAL;
769 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
770 __func__, proc_pid(p), proc_best_name(p),
771 so->so_gencnt,
772 SOCK_DOM(so), SOCK_TYPE(so), error);
773 goto out;
774 }
775
776 /* Socket filter */
777 error = sflt_bind(so, nam);
778
779 if (error == 0) {
780 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
781 }
782 out:
783 if (dolock) {
784 socket_unlock(so, 1);
785 }
786
787 if (error == EJUSTRETURN) {
788 error = 0;
789 }
790
791 return error;
792 }
793
794 void
sodealloc(struct socket * so)795 sodealloc(struct socket *so)
796 {
797 proto_memacct_sub(so->so_proto, sizeof(struct socket));
798
799 kauth_cred_unref(&so->so_cred);
800
801 /* Remove any filters */
802 sflt_termsock(so);
803
804 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
805
806 zfree(socket_zone, so);
807 }
808
809 /*
810 * Returns: 0 Success
811 * EINVAL
812 * EOPNOTSUPP
813 * <pru_listen>:EINVAL[AF_UNIX]
814 * <pru_listen>:EINVAL[TCP]
815 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
816 * <pru_listen>:EINVAL[TCP] Invalid argument
817 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
818 * <pru_listen>:EACCES[TCP] Permission denied
819 * <pru_listen>:EADDRINUSE[TCP] Address in use
820 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
821 * <pru_listen>:EPERM[TCP] Operation not permitted
822 * <sf_listen>:???
823 *
824 * Notes: Other <pru_listen> returns depend on the protocol family; all
825 * <sf_listen> returns depend on what the filter author causes
826 * their filter to return.
827 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * Tentatively mark the socket as accepting connections; every
	 * error path below undoes this.  Only set it when the completed
	 * queue is empty, so a socket already carrying completed
	 * connections keeps its existing state.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* Only connection-oriented protocols can listen. */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Inbound traffic is administratively denied on this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Socket filter listen hook, then the protocol's pru_listen. */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "stop, but succeed". */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
920
921 /*
922 * The "accept list lock" protects the fields related to the listener queues
923 * because we can unlock a socket to respect the lock ordering between
924 * the listener socket and its clients sockets. The lock ordering is first to
925 * acquire the client socket before the listener socket.
926 *
927 * The accept list lock serializes access to the following fields:
928 * - of the listener socket:
929 * - so_comp
930 * - so_incomp
931 * - so_qlen
932 * - so_inqlen
933 * - of client sockets that are in so_comp or so_incomp:
934 * - so_head
935 * - so_list
936 *
937 * As one can see the accept list lock protects the consistent of the
938 * linkage of the client sockets.
939 *
940 * Note that those fields may be read without holding the accept list lock
941 * for a preflight provided the accept list lock is taken when committing
942 * to take an action based on the result of the preflight. The preflight
943 * saves the cost of doing the unlock/lock dance.
944 */
/*
 * Acquire the listener's "accept list lock" (the SOF1_ACCEPT_LIST_HELD
 * flag) on `head'.  `so' is an optional client socket already locked by
 * the caller; it is temporarily unlocked if we must sleep, then both
 * sockets are re-locked in the canonical client-before-listener order.
 * The listener's mutex must be held on entry and is held on return.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/*
	 * Protocols without per-socket locks are serialized by the
	 * shared domain mutex, so the flag is not needed.
	 */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the list, claim it without unlocking */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/*
	 * We must sleep: drop the client lock first to respect the
	 * client-before-listener lock ordering described above.
	 */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		/* Woken by wakeup(&head->so_incomp) in so_release_accept_list() */
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/* Reacquire in order: client socket first, then the listener */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
975
976 void
so_release_accept_list(struct socket * head)977 so_release_accept_list(struct socket *head)
978 {
979 if (head->so_proto->pr_getlock != NULL) {
980 lck_mtx_t *mutex_held;
981
982 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
983 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
984
985 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
986 wakeup((caddr_t)&head->so_incomp);
987 }
988 }
989
/*
 * Drop the last reference on a socket: detach filters/flow state,
 * unlink it from its listener's accept queues if queued, flush the
 * buffers and optionally deallocate it.  Returns early (without
 * freeing) when the socket is not yet ready to be decommissioned.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready to be freed: the PCB is still attached or a file
	 * descriptor still references the socket.  Just quiesce the
	 * select threads and upcalls and bail.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1078
/*
 * Wait for outstanding socket upcalls to drain before close proceeds.
 * Called with the socket lock held; msleep() drops and reacquires the
 * protocol (or domain) mutex while sleeping on so_upcallusecount.
 * Only waits when the protocol opted in via SOF_UPCALLCLOSEWAIT.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Keep new upcalls from being generated while we wait */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1107
1108 /*
1109 * Close a socket on last file table reference removal.
1110 * Initiate disconnect if connected.
1111 * Free socket when disconnect complete.
1112 */
1113 int
soclose_locked(struct socket * so)1114 soclose_locked(struct socket *so)
1115 {
1116 int error = 0;
1117 struct timespec ts;
1118
1119 if (so->so_usecount == 0) {
1120 panic("soclose: so=%p refcount=0", so);
1121 /* NOTREACHED */
1122 }
1123
1124 sflt_notify(so, sock_evt_closing, NULL);
1125
1126 if (so->so_upcallusecount) {
1127 soclose_wait_locked(so);
1128 }
1129
1130 #if CONTENT_FILTER
1131 /*
1132 * We have to wait until the content filters are done
1133 */
1134 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1135 cfil_sock_close_wait(so);
1136 cfil_sock_is_closed(so);
1137 cfil_sock_detach(so);
1138 }
1139 #endif /* CONTENT_FILTER */
1140
1141 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1142 soflow_detach(so);
1143 }
1144
1145 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1146 soresume(current_proc(), so, 1);
1147 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1148 }
1149
1150 if ((so->so_options & SO_ACCEPTCONN)) {
1151 struct socket *sp, *sonext;
1152 int persocklock = 0;
1153 int incomp_overflow_only;
1154
1155 /*
1156 * We do not want new connection to be added
1157 * to the connection queues
1158 */
1159 so->so_options &= ~SO_ACCEPTCONN;
1160
1161 /*
1162 * We can drop the lock on the listener once
1163 * we've acquired the incoming list
1164 */
1165 if (so->so_proto->pr_getlock != NULL) {
1166 persocklock = 1;
1167 so_acquire_accept_list(so, NULL);
1168 socket_unlock(so, 0);
1169 }
1170 again:
1171 incomp_overflow_only = 1;
1172
1173 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1174 /*
1175 * Radar 5350314
1176 * skip sockets thrown away by tcpdropdropblreq
1177 * they will get cleanup by the garbage collection.
1178 * otherwise, remove the incomp socket from the queue
1179 * and let soabort trigger the appropriate cleanup.
1180 */
1181 if (sp->so_flags & SOF_OVERFLOW) {
1182 continue;
1183 }
1184
1185 if (persocklock != 0) {
1186 socket_lock(sp, 1);
1187 }
1188
1189 /*
1190 * Radar 27945981
1191 * The extra reference for the list insure the
1192 * validity of the socket pointer when we perform the
1193 * unlock of the head above
1194 */
1195 if (sp->so_state & SS_INCOMP) {
1196 sp->so_state &= ~SS_INCOMP;
1197 sp->so_head = NULL;
1198 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1199 so->so_incqlen--;
1200 so->so_qlen--;
1201
1202 (void) soabort(sp);
1203 } else {
1204 panic("%s sp %p in so_incomp but !SS_INCOMP",
1205 __func__, sp);
1206 }
1207
1208 if (persocklock != 0) {
1209 socket_unlock(sp, 1);
1210 }
1211 }
1212
1213 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1214 /* Dequeue from so_comp since sofree() won't do it */
1215 if (persocklock != 0) {
1216 socket_lock(sp, 1);
1217 }
1218
1219 if (sp->so_state & SS_COMP) {
1220 sp->so_state &= ~SS_COMP;
1221 sp->so_head = NULL;
1222 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1223 so->so_qlen--;
1224
1225 (void) soabort(sp);
1226 } else {
1227 panic("%s sp %p in so_comp but !SS_COMP",
1228 __func__, sp);
1229 }
1230
1231 if (persocklock) {
1232 socket_unlock(sp, 1);
1233 }
1234 }
1235
1236 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1237 #if (DEBUG | DEVELOPMENT)
1238 panic("%s head %p so_comp not empty", __func__, so);
1239 #endif /* (DEVELOPMENT || DEBUG) */
1240
1241 goto again;
1242 }
1243
1244 if (!TAILQ_EMPTY(&so->so_comp)) {
1245 #if (DEBUG | DEVELOPMENT)
1246 panic("%s head %p so_comp not empty", __func__, so);
1247 #endif /* (DEVELOPMENT || DEBUG) */
1248
1249 goto again;
1250 }
1251
1252 if (persocklock) {
1253 socket_lock(so, 0);
1254 so_release_accept_list(so);
1255 }
1256 }
1257 if (so->so_pcb == NULL) {
1258 /* 3915887: mark the socket as ready for dealloc */
1259 so->so_flags |= SOF_PCBCLEARING;
1260 goto discard;
1261 }
1262
1263 if (so->so_state & SS_ISCONNECTED) {
1264 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1265 error = sodisconnectlocked(so);
1266 if (error) {
1267 goto drop;
1268 }
1269 }
1270 if (so->so_options & SO_LINGER) {
1271 if ((so->so_state & SS_ISDISCONNECTING) &&
1272 (so->so_state & SS_NBIO)) {
1273 goto drop;
1274 }
1275 while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1276 lck_mtx_t *mutex_held;
1277
1278 if (so->so_proto->pr_getlock != NULL) {
1279 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1280 } else {
1281 mutex_held = so->so_proto->pr_domain->dom_mtx;
1282 }
1283 ts.tv_sec = (so->so_linger / 100);
1284 ts.tv_nsec = (so->so_linger % 100) *
1285 NSEC_PER_USEC * 1000 * 10;
1286 error = msleep((caddr_t)&so->so_timeo,
1287 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1288 if (error) {
1289 /*
1290 * It's OK when the time fires,
1291 * don't report an error
1292 */
1293 if (error == EWOULDBLOCK) {
1294 error = 0;
1295 }
1296 break;
1297 }
1298 }
1299 }
1300 }
1301 drop:
1302 if (so->so_usecount == 0) {
1303 panic("soclose: usecount is zero so=%p", so);
1304 /* NOTREACHED */
1305 }
1306 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1307 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1308 if (error == 0) {
1309 error = error2;
1310 }
1311 }
1312 if (so->so_usecount <= 0) {
1313 panic("soclose: usecount is zero so=%p", so);
1314 /* NOTREACHED */
1315 }
1316 discard:
1317 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1318 (so->so_state & SS_NOFDREF)) {
1319 panic("soclose: NOFDREF");
1320 /* NOTREACHED */
1321 }
1322 so->so_state |= SS_NOFDREF;
1323
1324 if ((so->so_flags & SOF_KNOTE) != 0) {
1325 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1326 }
1327
1328 os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1329
1330 VERIFY(so->so_usecount > 0);
1331 so->so_usecount--;
1332 sofree(so);
1333 return error;
1334 }
1335
1336 int
soclose(struct socket * so)1337 soclose(struct socket *so)
1338 {
1339 int error = 0;
1340 socket_lock(so, 1);
1341
1342 if (so->so_retaincnt == 0) {
1343 error = soclose_locked(so);
1344 } else {
1345 /*
1346 * if the FD is going away, but socket is
1347 * retained in kernel remove its reference
1348 */
1349 so->so_usecount--;
1350 if (so->so_usecount < 2) {
1351 panic("soclose: retaincnt non null and so=%p "
1352 "usecount=%d\n", so, so->so_usecount);
1353 }
1354 }
1355 socket_unlock(so, 1);
1356 return error;
1357 }
1358
1359 /*
1360 * Must be called at splnet...
1361 */
1362 /* Should already be locked */
1363 int
soabort(struct socket * so)1364 soabort(struct socket *so)
1365 {
1366 int error;
1367
1368 #ifdef MORE_LOCKING_DEBUG
1369 lck_mtx_t *mutex_held;
1370
1371 if (so->so_proto->pr_getlock != NULL) {
1372 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1373 } else {
1374 mutex_held = so->so_proto->pr_domain->dom_mtx;
1375 }
1376 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1377 #endif
1378
1379 if ((so->so_flags & SOF_ABORTED) == 0) {
1380 so->so_flags |= SOF_ABORTED;
1381 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1382 if (error) {
1383 sofree(so);
1384 return error;
1385 }
1386 }
1387 return 0;
1388 }
1389
1390 int
soacceptlock(struct socket * so,struct sockaddr ** nam,int dolock)1391 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1392 {
1393 int error;
1394
1395 if (dolock) {
1396 socket_lock(so, 1);
1397 }
1398
1399 so_update_last_owner_locked(so, PROC_NULL);
1400 so_update_policy(so);
1401 #if NECP
1402 so_update_necp_policy(so, NULL, NULL);
1403 #endif /* NECP */
1404
1405 if ((so->so_state & SS_NOFDREF) == 0) {
1406 panic("soaccept: !NOFDREF");
1407 }
1408 so->so_state &= ~SS_NOFDREF;
1409 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1410
1411 if (dolock) {
1412 socket_unlock(so, 1);
1413 }
1414 return error;
1415 }
1416
/* Locking wrapper around soacceptlock(). */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	error = soacceptlock(so, nam, 1);
	return error;
}
1422
/*
 * Give attached socket filters a chance to veto a freshly accepted
 * socket `so' from listener `head'.  On failure the new socket is
 * closed here and an error (ECONNABORTED or the filter's code) is
 * returned so the caller drops it instead of handing it to userland.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *__single local = NULL, *__single remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Clear NOFDREF so soclose() treats this as a normal close */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1478
1479 /*
1480 * Returns: 0 Success
1481 * EOPNOTSUPP Operation not supported on socket
1482 * EISCONN Socket is connected
1483 * <pru_connect>:EADDRNOTAVAIL Address not available.
1484 * <pru_connect>:EINVAL Invalid argument
1485 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1486 * <pru_connect>:EACCES Permission denied
1487 * <pru_connect>:EADDRINUSE Address in use
1488 * <pru_connect>:EAGAIN Resource unavailable, try again
1489 * <pru_connect>:EPERM Operation not permitted
1490 * <sf_connect_out>:??? [anything a filter writer might set]
1491 */
1492 int
soconnectlock(struct socket * so,struct sockaddr * nam,int dolock)1493 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1494 {
1495 int error;
1496 struct proc *p = current_proc();
1497 tracker_metadata_t metadata = { };
1498
1499 if (dolock) {
1500 socket_lock(so, 1);
1501 }
1502
1503 so_update_last_owner_locked(so, p);
1504 so_update_policy(so);
1505
1506 /*
1507 * If this is a listening socket or if this is a previously-accepted
1508 * socket that has been marked as inactive, reject the connect request.
1509 */
1510 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1511 error = EOPNOTSUPP;
1512 if (so->so_flags & SOF_DEFUNCT) {
1513 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1514 "(%d)\n", __func__, proc_pid(p),
1515 proc_best_name(p),
1516 so->so_gencnt,
1517 SOCK_DOM(so), SOCK_TYPE(so), error);
1518 }
1519 if (dolock) {
1520 socket_unlock(so, 1);
1521 }
1522 return error;
1523 }
1524
1525 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1526 if (dolock) {
1527 socket_unlock(so, 1);
1528 }
1529 return EPERM;
1530 }
1531
1532 /*
1533 * If protocol is connection-based, can only connect once.
1534 * Otherwise, if connected, try to disconnect first.
1535 * This allows user to disconnect by connecting to, e.g.,
1536 * a null address.
1537 */
1538 #if NECP
1539 bool set_domain_from_tracker_lookup = false;
1540 #endif /* NECP */
1541 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1542 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1543 (error = sodisconnectlocked(so)))) {
1544 error = EISCONN;
1545 } else {
1546 /*
1547 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
1548 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
1549 */
1550 if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1551 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1552 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1553 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1554 }
1555 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1556 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1557 }
1558 #if NECP
1559 set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
1560 #endif /* NECP */
1561 necp_set_socket_domain_attributes(so,
1562 __unsafe_null_terminated_from_indexable(metadata.domain),
1563 __unsafe_null_terminated_from_indexable(metadata.domain_owner));
1564 }
1565 }
1566
1567 #if NECP
1568 /* Update NECP evaluation after setting any domain via the tracker checks */
1569 so_update_necp_policy(so, NULL, nam);
1570 if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
1571 // Mark extended timeout on tracker lookup to ensure that the entry stays around
1572 tracker_metadata_t update_metadata = { };
1573 update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
1574 (void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata);
1575 }
1576 #endif /* NECP */
1577
1578 /*
1579 * Run connect filter before calling protocol:
1580 * - non-blocking connect returns before completion;
1581 */
1582 error = sflt_connectout(so, nam);
1583 if (error != 0) {
1584 if (error == EJUSTRETURN) {
1585 error = 0;
1586 }
1587 } else {
1588 error = (*so->so_proto->pr_usrreqs->pru_connect)
1589 (so, nam, p);
1590 if (error != 0) {
1591 so->so_state &= ~SS_ISCONNECTING;
1592 }
1593 }
1594 }
1595 if (dolock) {
1596 socket_unlock(so, 1);
1597 }
1598 return error;
1599 }
1600
/* Locking wrapper around soconnectlock(). */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int error;

	error = soconnectlock(so, nam, 1);
	return error;
}
1606
1607 /*
1608 * Returns: 0 Success
1609 * <pru_connect2>:EINVAL[AF_UNIX]
1610 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1611 * <pru_connect2>:??? [other protocol families]
1612 *
1613 * Notes: <pru_connect2> is not supported by [TCP].
1614 */
1615 int
soconnect2(struct socket * so1,struct socket * so2)1616 soconnect2(struct socket *so1, struct socket *so2)
1617 {
1618 int error;
1619
1620 socket_lock(so1, 1);
1621 if (so2->so_proto->pr_lock) {
1622 socket_lock(so2, 1);
1623 }
1624
1625 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1626
1627 socket_unlock(so1, 1);
1628 if (so2->so_proto->pr_lock) {
1629 socket_unlock(so2, 1);
1630 }
1631 return error;
1632 }
1633
/*
 * connectx(2) worker: establish a connection (optionally with
 * preconnect/idempotent data, TFO-style) via the protocol's
 * pru_connectx.  Caller holds the socket lock.  Mirrors
 * soconnectlock()'s defunct/restriction/tracker handling.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic is administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set. Otherwise, if connected,
	 * try to disconnect first. This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
#if NECP
	bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			/* Lookup keys off the effective (delegated) or last-owner UUID */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
#if NECP
				set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
				necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data pending */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}

#if NECP
			if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
				// Mark extended timeout on tracker lookup to ensure that the entry stays around
				tracker_metadata_t update_metadata = { };
				update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
				(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata);
			}
#endif /* NECP */
		}
	}

	return error;
}
1770
1771 int
sodisconnectlocked(struct socket * so)1772 sodisconnectlocked(struct socket *so)
1773 {
1774 int error;
1775
1776 if ((so->so_state & SS_ISCONNECTED) == 0) {
1777 error = ENOTCONN;
1778 goto bad;
1779 }
1780 if (so->so_state & SS_ISDISCONNECTING) {
1781 error = EALREADY;
1782 goto bad;
1783 }
1784
1785 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1786 if (error == 0) {
1787 sflt_notify(so, sock_evt_disconnected, NULL);
1788 }
1789
1790 bad:
1791 return error;
1792 }
1793
/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return error;
}
1805
1806 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1807 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1808 {
1809 int error;
1810
1811 /*
1812 * Call the protocol disconnectx handler; let it handle all
1813 * matters related to the connection state of this session.
1814 */
1815 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1816 if (error == 0) {
1817 /*
1818 * The event applies only for the session, not for
1819 * the disconnection of individual subflows.
1820 */
1821 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1822 sflt_notify(so, sock_evt_disconnected, NULL);
1823 }
1824 }
1825 return error;
1826 }
1827
1828 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1829 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1830 {
1831 int error;
1832
1833 socket_lock(so, 1);
1834 error = sodisconnectxlocked(so, aid, cid);
1835 socket_unlock(so, 1);
1836 return error;
1837 }
1838
/* Map MSG_DONTWAIT into the sblock() wait flag (0 = do not block) */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1840
1841 /*
1842 * sosendcheck will lock the socket buffer if it isn't locked and
1843 * verify that there is space for the data being inserted.
1844 *
1845 * Returns: 0 Success
1846 * EPIPE
1847 * sblock:EWOULDBLOCK
1848 * sblock:EINTR
1849 * sbwait:EBADF
1850 * sbwait:EINTR
1851 * [so_error]:???
1852 */
1853 int
sosendcheck(struct socket * so,struct sockaddr * addr,user_ssize_t resid,int32_t clen,int32_t atomic,int flags,int * sblocked)1854 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1855 int32_t clen, int32_t atomic, int flags, int *sblocked)
1856 {
1857 int assumelock = 0;
1858 int error = 0;
1859 int32_t space;
1860 int ret;
1861
1862 restart:
1863 if (*sblocked == 0) {
1864 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1865 so->so_send_filt_thread != 0 &&
1866 so->so_send_filt_thread == current_thread()) {
1867 /*
1868 * We're being called recursively from a filter,
1869 * allow this to continue. Radar 4150520.
1870 * Don't set sblocked because we don't want
1871 * to perform an unlock later.
1872 */
1873 assumelock = 1;
1874 } else {
1875 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1876 if (error) {
1877 if (so->so_flags & SOF_DEFUNCT) {
1878 goto defunct;
1879 }
1880 return error;
1881 }
1882 *sblocked = 1;
1883 }
1884 }
1885
1886 /*
1887 * If a send attempt is made on a socket that has been marked
1888 * as inactive (disconnected), reject the request.
1889 */
1890 if (so->so_flags & SOF_DEFUNCT) {
1891 defunct:
1892 error = EPIPE;
1893 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
1894 __func__, proc_selfpid(), proc_best_name(current_proc()),
1895 so->so_gencnt,
1896 SOCK_DOM(so), SOCK_TYPE(so), error);
1897 return error;
1898 }
1899
1900 if (so->so_state & SS_CANTSENDMORE) {
1901 #if CONTENT_FILTER
1902 /*
1903 * Can re-inject data of half closed connections
1904 */
1905 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1906 so->so_snd.sb_cfil_thread == current_thread() &&
1907 cfil_sock_data_pending(&so->so_snd) != 0) {
1908 CFIL_LOG(LOG_INFO,
1909 "so %llx ignore SS_CANTSENDMORE",
1910 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1911 } else
1912 #endif /* CONTENT_FILTER */
1913 return EPIPE;
1914 }
1915 if (so->so_error) {
1916 error = so->so_error;
1917 so->so_error = 0;
1918 return error;
1919 }
1920
1921 if ((so->so_state & SS_ISCONNECTED) == 0) {
1922 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1923 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1924 (resid != 0 || clen == 0) &&
1925 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1926 return ENOTCONN;
1927 }
1928 } else if (addr == 0) {
1929 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1930 ENOTCONN : EDESTADDRREQ;
1931 }
1932 }
1933
1934 space = sbspace(&so->so_snd);
1935
1936 if (flags & MSG_OOB) {
1937 space += 1024;
1938 }
1939 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1940 clen > so->so_snd.sb_hiwat) {
1941 return EMSGSIZE;
1942 }
1943
1944 if ((space < resid + clen &&
1945 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1946 space < clen)) ||
1947 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1948 /*
1949 * don't block the connectx call when there's more data
1950 * than can be copied.
1951 */
1952 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1953 if (space == 0) {
1954 return EWOULDBLOCK;
1955 }
1956 if (space < (int32_t)so->so_snd.sb_lowat) {
1957 return 0;
1958 }
1959 }
1960 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1961 assumelock) {
1962 return EWOULDBLOCK;
1963 }
1964 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1965 *sblocked = 0;
1966 error = sbwait(&so->so_snd);
1967 if (error) {
1968 if (so->so_flags & SOF_DEFUNCT) {
1969 goto defunct;
1970 }
1971 return error;
1972 }
1973 goto restart;
1974 }
1975
1976 ret = proto_memacct_limited(so->so_proto);
1977 if (ret == MEMACCT_HARDLIMIT ||
1978 (ret == MEMACCT_SOFTLIMIT && so->so_snd.sb_cc > 0)) {
1979 return ENOMEM;
1980 }
1981 return 0;
1982 }
1983
1984 /*
1985 * Send on a socket.
1986 * If send must go all at once and message is larger than
1987 * send buffering, then hard error.
1988 * Lock against other senders.
1989 * If must go all at once and not enough room now, then
1990 * inform user that this would block and do nothing.
1991 * Otherwise, if nonblocking, send as much as possible.
1992 * The data to be sent is described by "uio" if nonzero,
1993 * otherwise by the mbuf chain "top" (which must be null
1994 * if uio is not). Data provided in mbuf chain must be small
1995 * enough to send all at once.
1996 *
1997 * Returns nonzero on error, timeout or signal; callers
1998 * must check for short counts if EINTR/ERESTART are returned.
1999 * Data and control buffers are freed on return.
2000 *
2001 * Returns: 0 Success
2002 * EOPNOTSUPP
2003 * EINVAL
2004 * ENOBUFS
2005 * uiomove:EFAULT
2006 * sosendcheck:EPIPE
2007 * sosendcheck:EWOULDBLOCK
2008 * sosendcheck:EINTR
2009 * sosendcheck:EBADF
2010 * sosendcheck:EINTR
2011 * sosendcheck:??? [value from so_error]
2012 * <pru_send>:ECONNRESET[TCP]
2013 * <pru_send>:EINVAL[TCP]
2014 * <pru_send>:ENOBUFS[TCP]
2015 * <pru_send>:EADDRINUSE[TCP]
2016 * <pru_send>:EADDRNOTAVAIL[TCP]
2017 * <pru_send>:EAFNOSUPPORT[TCP]
2018 * <pru_send>:EACCES[TCP]
2019 * <pru_send>:EAGAIN[TCP]
2020 * <pru_send>:EPERM[TCP]
2021 * <pru_send>:EMSGSIZE[TCP]
2022 * <pru_send>:EHOSTUNREACH[TCP]
2023 * <pru_send>:ENETUNREACH[TCP]
2024 * <pru_send>:ENETDOWN[TCP]
2025 * <pru_send>:ENOMEM[TCP]
2026 * <pru_send>:ENOBUFS[TCP]
2027 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2028 * <pru_send>:EINVAL[AF_UNIX]
2029 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2030 * <pru_send>:EPIPE[AF_UNIX]
2031 * <pru_send>:ENOTCONN[AF_UNIX]
2032 * <pru_send>:EISCONN[AF_UNIX]
2033 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2034 * <sf_data_out>:??? [whatever a filter author chooses]
2035 *
2036 * Notes: Other <pru_send> returns depend on the protocol family; all
2037 * <sf_data_out> returns depend on what the filter author causes
2038 * their filter to return.
2039 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	mbuf_ref_ref_t mp;
	mbuf_ref_t m, freelist = NULL;
	struct soflow_hash_entry *__single dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/*
	 * Total bytes to send: from "uio" when copying in from the caller,
	 * otherwise from the prepackaged mbuf chain in "top".
	 */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}
	orig_resid = resid;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/* Datagram flow tracking for this outbound send, if required */
	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, SOFLOW_DIRECTION_OUTBOUND, 0);
	}

	/*
	 * trace if tracing, and network (vs. unix) sockets, and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only supported on stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	/* Optionally reserve per-socket headroom in the first mbuf/cluster */
	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	do {
		/*
		 * Validate socket state and wait for send-buffer space;
		 * may sleep and may temporarily drop/re-take locks.
		 */
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0 &&
				    bigcl;

				/* Drop the socket lock while allocating/copying in */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalculate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					/* First choice: 16 KB jumbo clusters */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					/* Second choice: 4 KB big clusters */
					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Detach one mbuf from the local freelist */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf/cluster */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy caller data into the mbuf */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						/* Filter swallowed the packet */
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						/* Content filter swallowed the packet */
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Hand the packet (and control mbufs) to the protocol */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);
			if (error == EJUSTRETURN) {
				error = 0;
			}

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/*
			 * Ownership of top/control has passed downstream;
			 * do not free them in the cleanup path below.
			 */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);


out_locked:
	/* Sanity check: remaining bytes must never exceed the original count */
	if (resid > orig_resid) {
		char pname[MAXCOMLEN] = {};
		pid_t current_pid = proc_pid(current_proc());
		proc_name(current_pid, pname, sizeof(pname));

		if (sosend_assert_panic != 0) {
			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
			    so, resid, orig_resid, pname, current_pid);
		} else {
			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
			    so->so_gencnt, resid, orig_resid, pname, current_pid);
		}
	}

	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2511
/*
 * Hand a packet chain directly to the protocol's pru_send entry point.
 * The caller must already hold the socket lock (asserted below); unlike
 * sosend(), no filters, policy updates, or buffer-space checks are run.
 */
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				// First packet-header mbuf marks the data
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		// Sever the control chain from the data mbufs that follow it
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());
	if (error == EJUSTRETURN) {
		// Packet was consumed downstream; not an error for the caller
		error = 0;
	}

	return error;
}
2560
2561 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp,struct mbuf ** last_control)2562 mbuf_detach_control_from_list(struct mbuf **mp, struct mbuf **last_control)
2563 {
2564 struct mbuf *control = NULL;
2565 struct mbuf *m = *mp;
2566
2567 if (m->m_type == MT_CONTROL) {
2568 struct mbuf *control_end;
2569 struct mbuf *n;
2570
2571 n = control_end = control = m;
2572
2573 /*
2574 * Break the chain per mbuf type
2575 */
2576 while (n != NULL && n->m_type == MT_CONTROL) {
2577 control_end = n;
2578 n = n->m_next;
2579 }
2580 control_end->m_next = NULL;
2581 *mp = n;
2582 if (last_control != NULL) {
2583 *last_control = control_end;
2584 }
2585 }
2586 VERIFY(*mp != NULL);
2587
2588 return control;
2589 }
2590
2591 /*
2592 * Supported only connected sockets (no address) without ancillary data
2593 * (control mbuf) for atomic protocols
2594 */
2595 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2596 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2597 {
2598 mbuf_ref_t m, control = NULL;
2599 struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2600 int error, dontroute;
2601 int atomic = sosendallatonce(so);
2602 int sblocked = 0;
2603 struct proc *p = current_proc();
2604 struct mbuf *top = pktlist;
2605 bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2606
2607 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2608 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2609
2610 if (so->so_type != SOCK_DGRAM) {
2611 error = EINVAL;
2612 os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2613 error);
2614 goto out;
2615 }
2616 if (atomic == 0) {
2617 error = EINVAL;
2618 os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2619 error);
2620 goto out;
2621 }
2622 if ((so->so_state & SS_ISCONNECTED) == 0) {
2623 error = ENOTCONN;
2624 os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2625 error);
2626 goto out;
2627 }
2628 if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2629 error = EINVAL;
2630 os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2631 flags, error);
2632 goto out;
2633 }
2634
2635 socket_lock(so, 1);
2636 so_update_last_owner_locked(so, p);
2637 so_update_policy(so);
2638
2639 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2640 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, SOFLOW_DIRECTION_OUTBOUND, 0);
2641 }
2642
2643 #if NECP
2644 so_update_necp_policy(so, NULL, NULL);
2645 #endif /* NECP */
2646
2647 dontroute = (flags & MSG_DONTROUTE) &&
2648 (so->so_options & SO_DONTROUTE) == 0 &&
2649 (so->so_proto->pr_flags & PR_ATOMIC);
2650 if (dontroute) {
2651 so->so_options |= SO_DONTROUTE;
2652 }
2653
2654 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2655
2656 error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2657 if (error) {
2658 os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2659 error);
2660 goto release;
2661 }
2662
2663 if (!skip_filt) {
2664 mbuf_ref_ref_t prevnextp = NULL;
2665
2666 for (m = top; m != NULL; m = m->m_nextpkt) {
2667 mbuf_ref_t nextpkt, last_control;
2668
2669 /*
2670 * Remove packet from the list of packets
2671 */
2672 nextpkt = m->m_nextpkt;
2673 if (prevnextp != NULL) {
2674 *prevnextp = nextpkt;
2675 } else {
2676 top = nextpkt;
2677 }
2678 m->m_nextpkt = NULL;
2679
2680 /*
2681 * Break the chain per mbuf type
2682 */
2683 if (m->m_type == MT_CONTROL) {
2684 control = mbuf_detach_control_from_list(&m, &last_control);
2685 }
2686 /*
2687 * Socket filter processing
2688 */
2689 error = sflt_data_out(so, NULL, &m,
2690 &control, 0);
2691 if (error != 0 && error != EJUSTRETURN) {
2692 os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2693 error);
2694 m_freem(m);
2695 goto release;
2696 }
2697
2698 #if CONTENT_FILTER
2699 if (error == 0) {
2700 /*
2701 * Content filter processing
2702 */
2703 error = cfil_sock_data_out(so, NULL, m,
2704 control, 0, dgram_flow_entry);
2705 if (error != 0 && error != EJUSTRETURN) {
2706 os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2707 error);
2708 m_freem(m);
2709 goto release;
2710 }
2711 }
2712 #endif /* CONTENT_FILTER */
2713 if (error == EJUSTRETURN) {
2714 /*
2715 * When swallowed by a filter, the packet is not
2716 * in the list anymore
2717 */
2718 error = 0;
2719 } else {
2720 /*
2721 * Rebuild the mbuf chain of the packet
2722 */
2723 if (control != NULL) {
2724 last_control->m_next = m;
2725 m = control;
2726 }
2727 /*
2728 * Reinsert the packet in the list of packets
2729 */
2730 m->m_nextpkt = nextpkt;
2731 if (prevnextp != NULL) {
2732 *prevnextp = m;
2733 } else {
2734 top = m;
2735 }
2736 prevnextp = &m->m_nextpkt;
2737 }
2738 control = NULL;
2739 }
2740 }
2741
2742 if (top != NULL) {
2743 if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2744 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2745 (so, top, pktcnt, flags);
2746 if (error != 0 && error != ENOBUFS) {
2747 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2748 error);
2749 }
2750 top = NULL;
2751 } else {
2752 *pktcnt = 0;
2753 control = NULL;
2754 for (m = top; m != NULL; m = top) {
2755 top = m->m_nextpkt;
2756 m->m_nextpkt = NULL;
2757
2758 /*
2759 * Break the chain per mbuf type
2760 */
2761 if (m->m_type == MT_CONTROL) {
2762 control = mbuf_detach_control_from_list(&m, NULL);
2763 }
2764
2765 error = (*so->so_proto->pr_usrreqs->pru_send)
2766 (so, 0, m, NULL, control, current_proc());
2767 if (error == EJUSTRETURN) {
2768 error = 0;
2769 }
2770 if (error != 0) {
2771 if (error != ENOBUFS) {
2772 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2773 error);
2774 }
2775 control = NULL;
2776 goto release;
2777 }
2778 *pktcnt += 1;
2779 control = NULL;
2780 }
2781 }
2782 }
2783
2784 release:
2785 if (dontroute) {
2786 so->so_options &= ~SO_DONTROUTE;
2787 }
2788 if (sblocked) {
2789 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2790 } else {
2791 socket_unlock(so, 1);
2792 }
2793 out:
2794 if (control != NULL) {
2795 m_freem(control);
2796 }
2797 if (top != NULL) {
2798 if (error != ENOBUFS) {
2799 os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2800 error);
2801 }
2802 m_freem_list(top);
2803 }
2804
2805 if (dgram_flow_entry != NULL) {
2806 soflow_free_flow(dgram_flow_entry);
2807 }
2808
2809 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2810 so->so_snd.sb_cc, 0, error);
2811
2812 return error;
2813 }
2814
2815 /*
2816 * May return ERESTART when packet is dropped by MAC policy check
2817 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    struct mbuf **maddrp,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	/* The first mbuf of the record must carry the sender's address */
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list. Upon MAC policy failure, the record
		 * will be freed. Otherwise, we'll add it back to
		 * the head of the list. We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		/* Unlocked across the MAC policy callout */
		socket_unlock(so, 0);

		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available). We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE);    /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		/* Account for the final mbuf of the record as well */
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	/*
	 * Hand the address back either as a duplicated sockaddr (*psa)
	 * or as the raw MT_SONAME mbuf (*maddrp).
	 */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			/* Caller required the address but we couldn't allocate it */
			error = EWOULDBLOCK;
			goto done;
		}
	} else if (maddrp != NULL) {
		*maddrp = m;
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the record intact, just step past the name */
		m = m->m_next;
	} else {
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		if (maddrp == NULL) {
			MFREE(m, so->so_rcv.sb_mb);
		} else {
			/* Ownership of the name mbuf passes to the caller */
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
		}
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
2944
2945 /*
2946 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
2947 * so clear the data portion in order not to leak the file pointers
2948 */
2949 static void
sopeek_scm_rights(struct mbuf * rights)2950 sopeek_scm_rights(struct mbuf *rights)
2951 {
2952 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
2953
2954 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
2955 VERIFY(cm->cmsg_len <= rights->m_len);
2956 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
2957 }
2958 }
2959
2960 /*
2961 * Process one or more MT_CONTROL mbufs present before any data mbufs
2962 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2963 * just copy the data; if !MSG_PEEK, we call into the protocol to
2964 * perform externalization.
2965 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	mbuf_ref_t cm = NULL, cmn;
	mbuf_ref_ref_t cme = &cm;
	struct sockbuf *sb_rcv = &so->so_rcv;
	mbuf_ref_ref_t msgpcm = NULL;
	mbuf_ref_t m = *mp;
	mbuf_ref_t nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below. Once we re-acquire the
	 * lock, the mbuf chain might change. In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* Remember head of the copied chain for cleanup */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/* Scrub SCM_RIGHTS payload in the peeked copy */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Unlink this control mbuf from the sockbuf onto "cm" */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	if (!(flags & MSG_PEEK)) {
		/* Re-link the remainder of the record into the sockbuf */
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	/* Walk the detached chain; "cm" is only non-NULL when !MSG_PEEK */
	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success. Otherwise, all other control messages are
		 * returned unmodified to the caller. Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171. This
			 * would also allow more records to be appended
			 * to the socket buffer. We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed: drop it */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3100
3101 /*
3102 * If we have less data than requested, block awaiting more
3103 * (subject to any timeout) if:
3104 * 1. the current count is less than the low water mark, or
3105 * 2. MSG_WAITALL is set, and it is possible to do the entire
3106 * receive operation at once if we block (resid <= hiwat).
3107 * 3. MSG_DONTWAIT is not set
3108 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3109 * we have to do the receive in sections, and thus risk returning
3110 * a short count if a timeout or signal occurs after we start.
3111 */
3112 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3113 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3114 {
3115 struct protosw *pr = so->so_proto;
3116
3117 /* No mbufs in the receive-queue? Wait! */
3118 if (m == NULL) {
3119 return true;
3120 }
3121
3122 /* Not enough data in the receive socket-buffer - we may have to wait */
3123 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3124 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3125 /*
3126 * Application did set the lowater-mark, so we should wait for
3127 * this data to be present.
3128 */
3129 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3130 return true;
3131 }
3132
3133 /*
3134 * Application wants all the data - so let's try to do the
3135 * receive-operation at once by waiting for everything to
3136 * be there.
3137 */
3138 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3139 return true;
3140 }
3141 }
3142
3143 return false;
3144 }
3145
3146 /*
3147 * Implement receive operations on a socket.
3148 * We depend on the way that records are added to the sockbuf
3149 * by sbappend*. In particular, each record (mbufs linked through m_next)
3150 * must begin with an address if the protocol so specifies,
3151 * followed by an optional mbuf or mbufs containing ancillary data,
3152 * and then zero or more mbufs of data.
3153 * In order to avoid blocking network interrupts for the entire time here,
3154 * we splx() while doing the actual copy to user space.
3155 * Although the sockbuf is locked, new data may still be appended,
3156 * and thus we must maintain consistency of the sockbuf during that time.
3157 *
3158 * The caller may receive the data as a single mbuf chain by supplying
3159 * an mbuf **mp0 for use in returning the chain. The uio is then used
3160 * only for the count in uio_resid.
3161 *
3162 * Returns: 0 Success
3163 * ENOBUFS
3164 * ENOTCONN
3165 * EWOULDBLOCK
3166 * uiomove:EFAULT
3167 * sblock:EWOULDBLOCK
3168 * sblock:EINTR
3169 * sbwait:EBADF
3170 * sbwait:EINTR
3171 * sodelayed_copy:EFAULT
3172 * <pru_rcvoob>:EINVAL[TCP]
3173 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3174 * <pru_rcvoob>:???
3175 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3176 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3177 * <pr_domain->dom_externalize>:???
3178 *
3179 * Notes: Additional return values from calls through <pru_rcvoob> and
3180 * <pr_domain->dom_externalize> depend on protocols other than
3181 * TCP or AF_UNIX, which are documented above.
3182 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;		/* tail pointer for the caller's mbuf chain (mp0) */
	mbuf_ref_t ml = NULL;		/* last mbuf appended to free_list */
	mbuf_ref_t nextrecord, free_list;	/* free_list: consumed mbufs, freed in bulk */
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;	/* bytes queued on free_list awaiting sodelayed_copy() */
	int can_delay;
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinitely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/* Drop the lock only for the copyout; the funnel ref is kept. */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* OOB data or a terminated record lets us proceed immediately. */
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns. Therefore, we only sleep in
		 * sbwait() below if and only if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		/* Re-evaluate with a fresh sb_mb: pru_rcvd may have let data in. */
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	/* Data (or error/EOF with data) is available; start consuming it. */
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above. In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/* Delayed copy pays off only for requests larger than sorecvmincopy. */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	/*
	 * Main receive loop: walk the data mbufs of the current record,
	 * copying them out (possibly deferred via free_list) or handing
	 * them to the caller through mp0.
	 */
	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		/* Never read past the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints; moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* Whole mbuf consumed: unlink it (unless peeking). */
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed: trim it in place. */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error. Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns. Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			/* Signal/timeout: return short count without error. */
			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	/* Leftover data in an atomic record is truncated or flagged. */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP(). Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	/* Flush any remaining delayed copy before deciding to loop again. */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/* Nothing consumed and no EOR/EOF: go back and wait for data. */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
3938
3939 /*
3940 * Returns: 0 Success
3941 * uiomove:EFAULT
3942 */
3943 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)3944 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3945 user_ssize_t *resid)
3946 {
3947 int error = 0;
3948 struct mbuf *m;
3949
3950 m = *free_list;
3951
3952 socket_unlock(so, 0);
3953
3954 while (m != NULL && error == 0) {
3955 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3956 m = m->m_next;
3957 }
3958 m_freem_list(*free_list);
3959
3960 *free_list = NULL;
3961 *resid = 0;
3962
3963 socket_lock(so, 0);
3964
3965 return error;
3966 }
3967
int
soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;		/* tail pointer for the packet list returned in mp0 */
	mbuf_ref_t nextrecord;
	int flags, error;
	struct protosw *pr = so->so_proto;
	struct proc *p = current_proc();
	u_int npkts = 0;		/* number of packets delivered so far */
	mbuf_ref_t free_list = NULL;	/* unwanted addr/control mbufs, freed in bulk */
	int sblocked = 0;		/* 1 while we own the receive sockbuf lock */

	/*
	 * Sanity check on the parameters passed by caller
	 */
	if (mp0 == NULL || pktcntp == NULL) {
		return EINVAL;
	}
	if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
		return EINVAL;
	}

	mp = mp0;
	*mp0 = NULL;
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (maddrp != NULL) {
		*maddrp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
	    *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

	*mp = NULL;

restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* A terminated record (M_EOR) lets us proceed immediately. */
		for (; m != NULL; m = m->m_next) {
			if (m->m_flags & M_EOR) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error != 0) {
			goto release;
		}
		goto restart;
	}
dontblock:
	/* Re-read the queue head: sbwait/restart may have changed it. */
	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		mbuf_ref_t maddr = NULL;

		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
		    &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}

		/* Hand the address to the caller, or queue it for freeing. */
		if (maddr != NULL) {
			maddr->m_nextpkt = NULL;
			maddr->m_next = NULL;
			if (maddrp != NULL) {
				*maddrp = maddr;
				maddrp = &maddr->m_nextpkt;
			} else {
				maddr->m_next = free_list;
				free_list = maddr;
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.
	 * We call into the protocol to perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		mbuf_ref_t control = NULL;

		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		/* Hand the control data to the caller, or queue it for freeing. */
		if (control != NULL) {
			control->m_nextpkt = NULL;
			control->m_next = NULL;
			if (controlp != NULL) {
				*controlp = control;
				controlp = &control->m_nextpkt;
			} else {
				control->m_next = free_list;
				free_list = control;
			}
		}
	}

	/*
	 * Link the packet to the list
	 */
	if (m != NULL) {
		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
		}
		m->m_nextpkt = NULL;
		*mp = m;
		mp = &m->m_nextpkt;
	}
	/* Account for every data mbuf of the record leaving the sockbuf. */
	while (m != NULL) {
		sbfree(&so->so_rcv, m);

		m = m->m_next;
	}

	so->so_rcv.sb_mb = nextrecord;
	/*
	 * First part is an inline SB_EMPTY_FIXUP(). Second
	 * part makes sure sb_lastrecord is up-to-date if
	 * there is still data in the socket buffer.
	 */
	if (so->so_rcv.sb_mb == NULL) {
		so->so_rcv.sb_mbtail = NULL;
		so->so_rcv.sb_lastrecord = NULL;
	} else if (nextrecord->m_nextpkt == NULL) {
		so->so_rcv.sb_lastrecord = nextrecord;
	}
	SB_MB_CHECK(&so->so_rcv);

	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	npkts += 1;

	/*
	 * Keep going while we have delivered fewer packets than requested:
	 * consume the next record if one is queued, or (with MSG_WAITALL)
	 * go back and wait for more to arrive.
	 */
	if (npkts < *pktcntp) {
		if (so->so_rcv.sb_mb != NULL) {
			goto dontblock;
		}
		if ((flags & MSG_WAITALL) != 0) {
			goto restart;
		}
	}

	/*
	 * NOTE(review): flags is stored to *flagsp here, before MSG_HAVEMORE
	 * is set below at "release" — so the caller never sees MSG_HAVEMORE,
	 * unlike soreceive(). Presumably intentional for this list variant,
	 * but worth confirming.
	 */
	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	/* Report the number of packets actually delivered. */
	*pktcntp = npkts;
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
4275
4276 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4277 so_statistics_event_to_nstat_event(int64_t *input_options,
4278 uint64_t *nstat_event)
4279 {
4280 int error = 0;
4281 switch (*input_options) {
4282 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4283 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4284 break;
4285 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4286 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4287 break;
4288 #if (DEBUG || DEVELOPMENT)
4289 case SO_STATISTICS_EVENT_RESERVED_1:
4290 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4291 break;
4292 case SO_STATISTICS_EVENT_RESERVED_2:
4293 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4294 break;
4295 #endif /* (DEBUG || DEVELOPMENT) */
4296 default:
4297 error = EINVAL;
4298 break;
4299 }
4300 return error;
4301 }
4302
4303 /*
4304 * Returns: 0 Success
4305 * EINVAL
4306 * ENOTCONN
4307 * <pru_shutdown>:EINVAL
4308 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4309 * <pru_shutdown>:ENOBUFS[TCP]
4310 * <pru_shutdown>:EMSGSIZE[TCP]
4311 * <pru_shutdown>:EHOSTUNREACH[TCP]
4312 * <pru_shutdown>:ENETUNREACH[TCP]
4313 * <pru_shutdown>:ENETDOWN[TCP]
4314 * <pru_shutdown>:ENOMEM[TCP]
4315 * <pru_shutdown>:EACCES[TCP]
4316 * <pru_shutdown>:EMSGSIZE[TCP]
4317 * <pru_shutdown>:ENOBUFS[TCP]
4318 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4319 * <pru_shutdown>:??? [other protocol families]
4320 */
4321 int
soshutdown(struct socket * so,int how)4322 soshutdown(struct socket *so, int how)
4323 {
4324 int error;
4325
4326 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4327
4328 switch (how) {
4329 case SHUT_RD:
4330 case SHUT_WR:
4331 case SHUT_RDWR:
4332 socket_lock(so, 1);
4333 if ((so->so_state &
4334 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4335 error = ENOTCONN;
4336 } else {
4337 error = soshutdownlock(so, how);
4338 }
4339 socket_unlock(so, 1);
4340 break;
4341 default:
4342 error = EINVAL;
4343 break;
4344 }
4345
4346 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4347
4348 return error;
4349 }
4350
4351 int
soshutdownlock_final(struct socket * so,int how)4352 soshutdownlock_final(struct socket *so, int how)
4353 {
4354 struct protosw *pr = so->so_proto;
4355 int error = 0;
4356
4357 sflt_notify(so, sock_evt_shutdown, &how);
4358
4359 if (how != SHUT_WR) {
4360 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4361 /* read already shut down */
4362 error = ENOTCONN;
4363 goto done;
4364 }
4365 sorflush(so);
4366 }
4367 if (how != SHUT_RD) {
4368 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4369 /* write already shut down */
4370 error = ENOTCONN;
4371 goto done;
4372 }
4373 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4374 }
4375 done:
4376 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4377 return error;
4378 }
4379
int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* the filter will finish the shutdown itself */
			return 0;
		}
		if (error != 0) {
			return error;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

	return error;
}
4406
/*
 * Flush the send buffer of "so": disable select and upcall
 * notifications, mark the buffer to drop further appends, and
 * release its mbufs and accounting.  Called with the socket locked;
 * the socket remains locked on return.
 */
void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/* Stop select wakeups and upcalls; SB_DROP blocks further appends */
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE); /* keep socket locked */

	/* Clear any threads still waiting on this buffer, then free it */
	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}
4432
/*
 * Flush the receive buffer of "so": mark the socket as unable to
 * receive more data, dispose of queued mbufs (handing them to the
 * domain's dom_dispose routine first if the protocol carries access
 * rights), and reset the buffer's accounting fields.  Called with
 * the socket locked.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function. In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE); /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented. Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let protocols with access rights dispose of the queued mbufs */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4533
4534 /*
4535 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4536 * an additional variant to handle the case where the option value needs
4537 * to be some kind of integer, but not a specific size.
4538 * In addition to their use here, these functions are also called by the
4539 * protocol-level pr_ctloutput() routines.
4540 *
4541 * Returns: 0 Success
4542 * EINVAL
4543 * copyin:EFAULT
4544 */
4545 int
sooptcopyin(struct sockopt * sopt,void * __sized_by (len)buf,size_t len,size_t minlen)4546 sooptcopyin(struct sockopt *sopt, void *__sized_by(len) buf, size_t len, size_t minlen)
4547 {
4548 size_t valsize;
4549
4550 /*
4551 * If the user gives us more than we wanted, we ignore it,
4552 * but if we don't get the minimum length the caller
4553 * wants, we return EINVAL. On success, sopt->sopt_valsize
4554 * is set to however much we actually retrieved.
4555 */
4556 if ((valsize = sopt->sopt_valsize) < minlen) {
4557 return EINVAL;
4558 }
4559 if (valsize > len) {
4560 sopt->sopt_valsize = valsize = len;
4561 }
4562
4563 if (sopt->sopt_p != kernproc) {
4564 return copyin(sopt->sopt_val, buf, valsize);
4565 }
4566
4567 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4568 CAST_DOWN(caddr_t, sopt->sopt_val),
4569 valsize);
4570 bcopy(tmp, buf, valsize);
4571
4572 return 0;
4573 }
4574
4575 /*
4576 * sooptcopyin_timeval
 * Copy in a timeval value into tv_p, and take into account whether
 * the calling process is 64-bit or 32-bit.  Moved the sanity checking
4579 * code here so that we can verify the 64-bit tv_sec value before we lose
4580 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4581 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		/* Require at least a full 64-bit timeval from the caller */
		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			/* In-kernel caller: sopt_val is a kernel pointer */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv64));
			bcopy(tmp, &tv64, sizeof(tv64));
		}
		/*
		 * Validate the 64-bit tv_sec before it is narrowed to
		 * __darwin_time_t below; out-of-range values are EDOM.
		 */
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		/* Require at least a full 32-bit timeval from the caller */
		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			/* In-kernel caller: sopt_val is a kernel pointer */
			caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
			    CAST_DOWN(caddr_t, sopt->sopt_val),
			    sizeof(tv32));
			bcopy(tmp, &tv32, sizeof(tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
4647
4648 int
sooptcopyin_bindtodevice(struct sockopt * sopt,char * __sized_by (bufsize)buf,size_t bufsize)4649 sooptcopyin_bindtodevice(struct sockopt *sopt, char * __sized_by(bufsize) buf, size_t bufsize)
4650 {
4651 #define MIN_BINDTODEVICE_NAME_SIZE 2
4652 size_t maxlen = bufsize - 1; /* the max string length that fits in the buffer */
4653
4654 if (bufsize < MIN_BINDTODEVICE_NAME_SIZE) {
4655 #if DEBUG || DEVELOPMENT
4656 os_log(OS_LOG_DEFAULT, "%s: bufsize %lu < MIN_BINDTODEVICE_NAME_SIZE %d",
4657 __func__, bufsize, MIN_BINDTODEVICE_NAME_SIZE);
4658 #endif /* DEBUG || DEVELOPMENT */
4659 return EINVAL;
4660 }
4661
4662 memset(buf, 0, bufsize);
4663
4664 /*
4665 * bufsize includes the end-of-string because of the uncertainty wether
4666 * interface names are passed as strings or byte buffers.
4667 * If the user gives us more than the max string length return EINVAL.
4668 * On success, sopt->sopt_valsize is not modified
4669 */
4670 maxlen = bufsize - 1;
4671 if (sopt->sopt_valsize > maxlen) {
4672 os_log(OS_LOG_DEFAULT, "%s: sopt_valsize %lu > maxlen %lu",
4673 __func__, sopt->sopt_valsize, maxlen);
4674 return EINVAL;
4675 }
4676
4677 if (sopt->sopt_p != kernproc) {
4678 return copyin(sopt->sopt_val, buf, sopt->sopt_valsize);
4679 } else {
4680 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4681 CAST_DOWN(caddr_t, sopt->sopt_val),
4682 sopt->sopt_valsize);
4683 bcopy(tmp, buf, sopt->sopt_valsize);
4684 }
4685
4686 return 0;
4687 #undef MIN_BINDTODEVICE_NAME_SIZE
4688 }
4689
4690 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4691 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4692 boolean_t ignore_delegate)
4693 {
4694 kauth_cred_t cred = NULL;
4695 proc_t ep = PROC_NULL;
4696 uid_t uid;
4697 int error = 0;
4698
4699 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4700 ep = proc_find(so->e_pid);
4701 if (ep) {
4702 cred = kauth_cred_proc_ref(ep);
4703 }
4704 }
4705
4706 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4707
4708 /* uid is 0 for root */
4709 if (uid != 0 || !allow_root) {
4710 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4711 }
4712 if (cred) {
4713 kauth_cred_unref(&cred);
4714 }
4715 if (ep != PROC_NULL) {
4716 proc_rele(ep);
4717 }
4718
4719 return error;
4720 }
4721
4722 /*
4723 * Returns: 0 Success
4724 * EINVAL
4725 * ENOPROTOOPT
4726 * ENOBUFS
4727 * EDOM
4728 * sooptcopyin:EINVAL
4729 * sooptcopyin:EFAULT
4730 * sooptcopyin_timeval:EINVAL
4731 * sooptcopyin_timeval:EFAULT
4732 * sooptcopyin_timeval:EDOM
4733 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *		<pr_ctloutput>:???
4735 * sflt_attach_private:??? [whatever a filter author chooses]
4736 * <sf_setoption>:??? [whatever a filter author chooses]
4737 *
4738 * Notes: Other <pru_listen> returns depend on the protocol family; all
4739 * <sf_listen> returns depend on what the filter author causes
4740 * their filter to return.
4741 */
4742 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4743 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4744 {
4745 int error, optval;
4746 int64_t long_optval;
4747 struct linger l;
4748 struct timeval tv;
4749
4750 if (sopt->sopt_dir != SOPT_SET) {
4751 sopt->sopt_dir = SOPT_SET;
4752 }
4753
4754 if (dolock) {
4755 socket_lock(so, 1);
4756 }
4757
4758 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4759 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4760 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4761 /* the socket has been shutdown, no more sockopt's */
4762 error = EINVAL;
4763 goto out;
4764 }
4765
4766 error = sflt_setsockopt(so, sopt);
4767 if (error != 0) {
4768 if (error == EJUSTRETURN) {
4769 error = 0;
4770 }
4771 goto out;
4772 }
4773
4774 if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
4775 if (so->so_proto != NULL &&
4776 so->so_proto->pr_ctloutput != NULL) {
4777 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4778 goto out;
4779 }
4780 error = ENOPROTOOPT;
4781 } else {
4782 /*
4783 * Allow socket-level (SOL_SOCKET) options to be filtered by
4784 * the protocol layer, if needed. A zero value returned from
4785 * the handler means use default socket-level processing as
4786 * done by the rest of this routine. Otherwise, any other
4787 * return value indicates that the option is unsupported.
4788 */
4789 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4790 pru_socheckopt(so, sopt)) != 0) {
4791 goto out;
4792 }
4793
4794 error = 0;
4795 switch (sopt->sopt_name) {
4796 case SO_LINGER:
4797 case SO_LINGER_SEC: {
4798 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4799 if (error != 0) {
4800 goto out;
4801 }
4802 /* Make sure to use sane values */
4803 if (sopt->sopt_name == SO_LINGER) {
4804 so->so_linger = (short)l.l_linger;
4805 } else {
4806 so->so_linger = (short)((long)l.l_linger * hz);
4807 }
4808 if (l.l_onoff != 0) {
4809 so->so_options |= SO_LINGER;
4810 } else {
4811 so->so_options &= ~SO_LINGER;
4812 }
4813 break;
4814 }
4815 case SO_DEBUG:
4816 case SO_KEEPALIVE:
4817 case SO_DONTROUTE:
4818 case SO_USELOOPBACK:
4819 case SO_BROADCAST:
4820 case SO_REUSEADDR:
4821 case SO_REUSEPORT:
4822 case SO_OOBINLINE:
4823 case SO_TIMESTAMP:
4824 case SO_TIMESTAMP_MONOTONIC:
4825 case SO_TIMESTAMP_CONTINUOUS:
4826 case SO_DONTTRUNC:
4827 case SO_WANTMORE:
4828 case SO_WANTOOBFLAG:
4829 case SO_NOWAKEFROMSLEEP:
4830 case SO_NOAPNFALLBK:
4831 error = sooptcopyin(sopt, &optval, sizeof(optval),
4832 sizeof(optval));
4833 if (error != 0) {
4834 goto out;
4835 }
4836 if (optval) {
4837 so->so_options |= sopt->sopt_name;
4838 } else {
4839 so->so_options &= ~sopt->sopt_name;
4840 }
4841 #if SKYWALK
4842 inp_update_netns_flags(so);
4843 #endif /* SKYWALK */
4844 break;
4845
4846 case SO_SNDBUF:
4847 case SO_RCVBUF:
4848 case SO_SNDLOWAT:
4849 case SO_RCVLOWAT:
4850 error = sooptcopyin(sopt, &optval, sizeof(optval),
4851 sizeof(optval));
4852 if (error != 0) {
4853 goto out;
4854 }
4855
4856 /*
4857 * Values < 1 make no sense for any of these
4858 * options, so disallow them.
4859 */
4860 if (optval < 1) {
4861 error = EINVAL;
4862 goto out;
4863 }
4864
4865 switch (sopt->sopt_name) {
4866 case SO_SNDBUF:
4867 case SO_RCVBUF: {
4868 struct sockbuf *sb =
4869 (sopt->sopt_name == SO_SNDBUF) ?
4870 &so->so_snd : &so->so_rcv;
4871 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4872 error = ENOBUFS;
4873 goto out;
4874 }
4875 sb->sb_flags |= SB_USRSIZE;
4876 sb->sb_flags &= ~SB_AUTOSIZE;
4877 sb->sb_idealsize = (u_int32_t)optval;
4878 break;
4879 }
4880 /*
4881 * Make sure the low-water is never greater than
4882 * the high-water.
4883 */
4884 case SO_SNDLOWAT: {
4885 int space = sbspace(&so->so_snd);
4886 uint32_t hiwat = so->so_snd.sb_hiwat;
4887
4888 if (so->so_snd.sb_flags & SB_UNIX) {
4889 struct unpcb *unp =
4890 (struct unpcb *)(so->so_pcb);
4891 if (unp != NULL &&
4892 unp->unp_conn != NULL) {
4893 struct socket *so2 = unp->unp_conn->unp_socket;
4894 hiwat += unp->unp_conn->unp_cc;
4895 space = sbspace(&so2->so_rcv);
4896 }
4897 }
4898
4899 so->so_snd.sb_lowat =
4900 (optval > hiwat) ?
4901 hiwat : optval;
4902
4903 if (space >= so->so_snd.sb_lowat) {
4904 sowwakeup(so);
4905 }
4906 break;
4907 }
4908 case SO_RCVLOWAT: {
4909 int64_t data_len;
4910 so->so_rcv.sb_lowat =
4911 (optval > so->so_rcv.sb_hiwat) ?
4912 so->so_rcv.sb_hiwat : optval;
4913 if (so->so_rcv.sb_flags & SB_UNIX) {
4914 struct unpcb *unp =
4915 (struct unpcb *)(so->so_pcb);
4916 if (unp != NULL &&
4917 unp->unp_conn != NULL) {
4918 struct socket *so2 = unp->unp_conn->unp_socket;
4919 data_len = so2->so_snd.sb_cc
4920 - so2->so_snd.sb_ctl;
4921 } else {
4922 data_len = so->so_rcv.sb_cc
4923 - so->so_rcv.sb_ctl;
4924 }
4925 } else {
4926 data_len = so->so_rcv.sb_cc
4927 - so->so_rcv.sb_ctl;
4928 }
4929
4930 if (data_len >= so->so_rcv.sb_lowat) {
4931 sorwakeup(so);
4932 }
4933 break;
4934 }
4935 }
4936 break;
4937
4938 case SO_SNDTIMEO:
4939 case SO_RCVTIMEO:
4940 error = sooptcopyin_timeval(sopt, &tv);
4941 if (error != 0) {
4942 goto out;
4943 }
4944
4945 switch (sopt->sopt_name) {
4946 case SO_SNDTIMEO:
4947 so->so_snd.sb_timeo = tv;
4948 break;
4949 case SO_RCVTIMEO:
4950 so->so_rcv.sb_timeo = tv;
4951 break;
4952 }
4953 break;
4954
4955 case SO_NKE: {
4956 struct so_nke nke;
4957
4958 error = sooptcopyin(sopt, &nke, sizeof(nke),
4959 sizeof(nke));
4960 if (error != 0) {
4961 goto out;
4962 }
4963
4964 error = sflt_attach_internal(so, nke.nke_handle);
4965 break;
4966 }
4967
4968 case SO_NOSIGPIPE:
4969 error = sooptcopyin(sopt, &optval, sizeof(optval),
4970 sizeof(optval));
4971 if (error != 0) {
4972 goto out;
4973 }
4974 if (optval != 0) {
4975 so->so_flags |= SOF_NOSIGPIPE;
4976 } else {
4977 so->so_flags &= ~SOF_NOSIGPIPE;
4978 }
4979 break;
4980
4981 case SO_NOADDRERR:
4982 error = sooptcopyin(sopt, &optval, sizeof(optval),
4983 sizeof(optval));
4984 if (error != 0) {
4985 goto out;
4986 }
4987 if (optval != 0) {
4988 so->so_flags |= SOF_NOADDRAVAIL;
4989 } else {
4990 so->so_flags &= ~SOF_NOADDRAVAIL;
4991 }
4992 break;
4993
4994 case SO_REUSESHAREUID:
4995 error = sooptcopyin(sopt, &optval, sizeof(optval),
4996 sizeof(optval));
4997 if (error != 0) {
4998 goto out;
4999 }
5000 if (optval != 0) {
5001 so->so_flags |= SOF_REUSESHAREUID;
5002 } else {
5003 so->so_flags &= ~SOF_REUSESHAREUID;
5004 }
5005 break;
5006
5007 case SO_NOTIFYCONFLICT:
5008 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5009 error = EPERM;
5010 goto out;
5011 }
5012 error = sooptcopyin(sopt, &optval, sizeof(optval),
5013 sizeof(optval));
5014 if (error != 0) {
5015 goto out;
5016 }
5017 if (optval != 0) {
5018 so->so_flags |= SOF_NOTIFYCONFLICT;
5019 } else {
5020 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5021 }
5022 break;
5023
5024 case SO_RESTRICTIONS:
5025 error = sooptcopyin(sopt, &optval, sizeof(optval),
5026 sizeof(optval));
5027 if (error != 0) {
5028 goto out;
5029 }
5030
5031 error = so_set_restrictions(so, optval);
5032 break;
5033
5034 case SO_AWDL_UNRESTRICTED:
5035 if (SOCK_DOM(so) != PF_INET &&
5036 SOCK_DOM(so) != PF_INET6) {
5037 error = EOPNOTSUPP;
5038 goto out;
5039 }
5040 error = sooptcopyin(sopt, &optval, sizeof(optval),
5041 sizeof(optval));
5042 if (error != 0) {
5043 goto out;
5044 }
5045 if (optval != 0) {
5046 error = soopt_cred_check(so,
5047 PRIV_NET_RESTRICTED_AWDL, false, false);
5048 if (error == 0) {
5049 inp_set_awdl_unrestricted(
5050 sotoinpcb(so));
5051 }
5052 } else {
5053 inp_clear_awdl_unrestricted(sotoinpcb(so));
5054 }
5055 break;
5056 case SO_INTCOPROC_ALLOW:
5057 if (SOCK_DOM(so) != PF_INET6) {
5058 error = EOPNOTSUPP;
5059 goto out;
5060 }
5061 error = sooptcopyin(sopt, &optval, sizeof(optval),
5062 sizeof(optval));
5063 if (error != 0) {
5064 goto out;
5065 }
5066 if (optval != 0 &&
5067 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5068 error = soopt_cred_check(so,
5069 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5070 if (error == 0) {
5071 inp_set_intcoproc_allowed(
5072 sotoinpcb(so));
5073 }
5074 } else if (optval == 0) {
5075 inp_clear_intcoproc_allowed(sotoinpcb(so));
5076 }
5077 break;
5078
5079 case SO_LABEL:
5080 error = EOPNOTSUPP;
5081 break;
5082
5083 case SO_UPCALLCLOSEWAIT:
5084 error = sooptcopyin(sopt, &optval, sizeof(optval),
5085 sizeof(optval));
5086 if (error != 0) {
5087 goto out;
5088 }
5089 if (optval != 0) {
5090 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5091 } else {
5092 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5093 }
5094 break;
5095
5096 case SO_RANDOMPORT:
5097 error = sooptcopyin(sopt, &optval, sizeof(optval),
5098 sizeof(optval));
5099 if (error != 0) {
5100 goto out;
5101 }
5102 if (optval != 0) {
5103 so->so_flags |= SOF_BINDRANDOMPORT;
5104 } else {
5105 so->so_flags &= ~SOF_BINDRANDOMPORT;
5106 }
5107 break;
5108
5109 case SO_NP_EXTENSIONS: {
5110 struct so_np_extensions sonpx;
5111
5112 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5113 sizeof(sonpx));
5114 if (error != 0) {
5115 goto out;
5116 }
5117 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5118 error = EINVAL;
5119 goto out;
5120 }
5121 /*
5122 * Only one bit defined for now
5123 */
5124 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5125 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5126 so->so_flags |= SOF_NPX_SETOPTSHUT;
5127 } else {
5128 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5129 }
5130 }
5131 break;
5132 }
5133
5134 case SO_TRAFFIC_CLASS: {
5135 error = sooptcopyin(sopt, &optval, sizeof(optval),
5136 sizeof(optval));
5137 if (error != 0) {
5138 goto out;
5139 }
5140 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5141 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5142 error = so_set_net_service_type(so, netsvc);
5143 goto out;
5144 }
5145 error = so_set_traffic_class(so, optval);
5146 if (error != 0) {
5147 goto out;
5148 }
5149 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5150 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5151 break;
5152 }
5153
5154 case SO_RECV_TRAFFIC_CLASS: {
5155 error = sooptcopyin(sopt, &optval, sizeof(optval),
5156 sizeof(optval));
5157 if (error != 0) {
5158 goto out;
5159 }
5160 if (optval == 0) {
5161 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5162 } else {
5163 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5164 }
5165 break;
5166 }
5167
5168 #if (DEVELOPMENT || DEBUG)
5169 case SO_TRAFFIC_CLASS_DBG: {
5170 struct so_tcdbg so_tcdbg;
5171
5172 error = sooptcopyin(sopt, &so_tcdbg,
5173 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5174 if (error != 0) {
5175 goto out;
5176 }
5177 error = so_set_tcdbg(so, &so_tcdbg);
5178 if (error != 0) {
5179 goto out;
5180 }
5181 break;
5182 }
5183 #endif /* (DEVELOPMENT || DEBUG) */
5184
5185 case SO_PRIVILEGED_TRAFFIC_CLASS:
5186 error = priv_check_cred(kauth_cred_get(),
5187 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5188 if (error != 0) {
5189 goto out;
5190 }
5191 error = sooptcopyin(sopt, &optval, sizeof(optval),
5192 sizeof(optval));
5193 if (error != 0) {
5194 goto out;
5195 }
5196 if (optval == 0) {
5197 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5198 } else {
5199 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5200 }
5201 break;
5202
5203 #if (DEVELOPMENT || DEBUG)
5204 case SO_DEFUNCTIT:
5205 error = sosetdefunct(current_proc(), so, 0, FALSE);
5206 if (error == 0) {
5207 error = sodefunct(current_proc(), so, 0);
5208 }
5209
5210 break;
5211 #endif /* (DEVELOPMENT || DEBUG) */
5212
5213 case SO_DEFUNCTOK:
5214 error = sooptcopyin(sopt, &optval, sizeof(optval),
5215 sizeof(optval));
5216 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5217 if (error == 0) {
5218 error = EBADF;
5219 }
5220 goto out;
5221 }
5222 /*
5223 * Any process can set SO_DEFUNCTOK (clear
5224 * SOF_NODEFUNCT), but only root can clear
5225 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5226 */
5227 if (optval == 0 &&
5228 kauth_cred_issuser(kauth_cred_get()) == 0) {
5229 error = EPERM;
5230 goto out;
5231 }
5232 if (optval) {
5233 so->so_flags &= ~SOF_NODEFUNCT;
5234 } else {
5235 so->so_flags |= SOF_NODEFUNCT;
5236 }
5237
5238 if (SOCK_DOM(so) == PF_INET ||
5239 SOCK_DOM(so) == PF_INET6) {
5240 char s[MAX_IPv6_STR_LEN];
5241 char d[MAX_IPv6_STR_LEN];
5242 struct inpcb *inp = sotoinpcb(so);
5243
5244 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5245 "[%s %s:%d -> %s:%d] is now marked "
5246 "as %seligible for "
5247 "defunct\n", __func__, proc_selfpid(),
5248 proc_best_name(current_proc()),
5249 so->so_gencnt,
5250 (SOCK_TYPE(so) == SOCK_STREAM) ?
5251 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5252 ((SOCK_DOM(so) == PF_INET) ?
5253 (void *)&inp->inp_laddr.s_addr :
5254 (void *)&inp->in6p_laddr), s, sizeof(s)),
5255 ntohs(inp->in6p_lport),
5256 inet_ntop(SOCK_DOM(so),
5257 (SOCK_DOM(so) == PF_INET) ?
5258 (void *)&inp->inp_faddr.s_addr :
5259 (void *)&inp->in6p_faddr, d, sizeof(d)),
5260 ntohs(inp->in6p_fport),
5261 (so->so_flags & SOF_NODEFUNCT) ?
5262 "not " : "");
5263 } else {
5264 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5265 "is now marked as %seligible for "
5266 "defunct\n",
5267 __func__, proc_selfpid(),
5268 proc_best_name(current_proc()),
5269 so->so_gencnt,
5270 SOCK_DOM(so), SOCK_TYPE(so),
5271 (so->so_flags & SOF_NODEFUNCT) ?
5272 "not " : "");
5273 }
5274 break;
5275
5276 case SO_ISDEFUNCT:
5277 /* This option is not settable */
5278 error = EINVAL;
5279 break;
5280
5281 case SO_OPPORTUNISTIC:
5282 error = sooptcopyin(sopt, &optval, sizeof(optval),
5283 sizeof(optval));
5284 if (error == 0) {
5285 error = so_set_opportunistic(so, optval);
5286 }
5287 break;
5288
5289 case SO_FLUSH:
5290 /* This option is handled by lower layer(s) */
5291 error = 0;
5292 break;
5293
5294 case SO_RECV_ANYIF:
5295 error = sooptcopyin(sopt, &optval, sizeof(optval),
5296 sizeof(optval));
5297 if (error == 0) {
5298 error = so_set_recv_anyif(so, optval);
5299 }
5300 break;
5301
5302 case SO_TRAFFIC_MGT_BACKGROUND: {
5303 /* This option is handled by lower layer(s) */
5304 error = 0;
5305 break;
5306 }
5307
5308 #if FLOW_DIVERT
5309 case SO_FLOW_DIVERT_TOKEN:
5310 error = flow_divert_token_set(so, sopt);
5311 break;
5312 #endif /* FLOW_DIVERT */
5313
5314
5315 case SO_DELEGATED:
5316 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5317 sizeof(optval))) != 0) {
5318 break;
5319 }
5320
5321 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5322 break;
5323
5324 case SO_DELEGATED_UUID: {
5325 uuid_t euuid;
5326
5327 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5328 sizeof(euuid))) != 0) {
5329 break;
5330 }
5331
5332 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5333 break;
5334 }
5335
5336 #if NECP
5337 case SO_NECP_ATTRIBUTES:
5338 if (SOCK_DOM(so) == PF_MULTIPATH) {
5339 /* Handled by MPTCP itself */
5340 break;
5341 }
5342
5343 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5344 error = EINVAL;
5345 goto out;
5346 }
5347
5348 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5349 break;
5350
5351 case SO_NECP_CLIENTUUID: {
5352 if (SOCK_DOM(so) == PF_MULTIPATH) {
5353 /* Handled by MPTCP itself */
5354 break;
5355 }
5356
5357 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5358 error = EINVAL;
5359 goto out;
5360 }
5361
5362 struct inpcb *inp = sotoinpcb(so);
5363 if (!uuid_is_null(inp->necp_client_uuid)) {
5364 // Clear out the old client UUID if present
5365 necp_inpcb_remove_cb(inp);
5366 }
5367
5368 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5369 sizeof(uuid_t), sizeof(uuid_t));
5370 if (error != 0) {
5371 goto out;
5372 }
5373
5374 if (uuid_is_null(inp->necp_client_uuid)) {
5375 error = EINVAL;
5376 goto out;
5377 }
5378
5379 pid_t current_pid = proc_pid(current_proc());
5380 error = necp_client_register_socket_flow(current_pid,
5381 inp->necp_client_uuid, inp);
5382 if (error != 0) {
5383 uuid_clear(inp->necp_client_uuid);
5384 goto out;
5385 }
5386
5387 if (inp->inp_lport != 0) {
5388 // There is a bound local port, so this is not
5389 // a fresh socket. Assign to the client.
5390 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5391 }
5392
5393 break;
5394 }
5395 case SO_NECP_LISTENUUID: {
5396 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5397 error = EINVAL;
5398 goto out;
5399 }
5400
5401 struct inpcb *inp = sotoinpcb(so);
5402 if (!uuid_is_null(inp->necp_client_uuid)) {
5403 error = EINVAL;
5404 goto out;
5405 }
5406
5407 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5408 sizeof(uuid_t), sizeof(uuid_t));
5409 if (error != 0) {
5410 goto out;
5411 }
5412
5413 if (uuid_is_null(inp->necp_client_uuid)) {
5414 error = EINVAL;
5415 goto out;
5416 }
5417
5418 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5419 inp->necp_client_uuid, inp);
5420 if (error != 0) {
5421 uuid_clear(inp->necp_client_uuid);
5422 goto out;
5423 }
5424
5425 // Mark that the port registration is held by NECP
5426 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5427
5428 break;
5429 }
5430
5431 case SO_RESOLVER_SIGNATURE: {
5432 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5433 error = EINVAL;
5434 goto out;
5435 }
5436 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5437 break;
5438 }
5439 #endif /* NECP */
5440
5441 case SO_EXTENDED_BK_IDLE:
5442 error = sooptcopyin(sopt, &optval, sizeof(optval),
5443 sizeof(optval));
5444 if (error == 0) {
5445 error = so_set_extended_bk_idle(so, optval);
5446 }
5447 break;
5448
5449 case SO_MARK_CELLFALLBACK:
5450 error = sooptcopyin(sopt, &optval, sizeof(optval),
5451 sizeof(optval));
5452 if (error != 0) {
5453 goto out;
5454 }
5455 if (optval < 0) {
5456 error = EINVAL;
5457 goto out;
5458 }
5459 if (optval == 0) {
5460 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5461 } else {
5462 so->so_flags1 |= SOF1_CELLFALLBACK;
5463 }
5464 break;
5465
5466 case SO_MARK_CELLFALLBACK_UUID:
5467 {
5468 struct so_mark_cellfallback_uuid_args args;
5469
5470 error = sooptcopyin(sopt, &args, sizeof(args),
5471 sizeof(args));
5472 if (error != 0) {
5473 goto out;
5474 }
5475 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5476 args.flow_cellfallback);
5477 break;
5478 }
5479
5480 case SO_FALLBACK_MODE:
5481 error = sooptcopyin(sopt, &optval, sizeof(optval),
5482 sizeof(optval));
5483 if (error != 0) {
5484 goto out;
5485 }
5486 if (optval < SO_FALLBACK_MODE_NONE ||
5487 optval > SO_FALLBACK_MODE_PREFER) {
5488 error = EINVAL;
5489 goto out;
5490 }
5491 so->so_fallback_mode = (u_int8_t)optval;
5492 break;
5493
5494 case SO_MARK_KNOWN_TRACKER: {
5495 error = sooptcopyin(sopt, &optval, sizeof(optval),
5496 sizeof(optval));
5497 if (error != 0) {
5498 goto out;
5499 }
5500 if (optval < 0) {
5501 error = EINVAL;
5502 goto out;
5503 }
5504 if (optval == 0) {
5505 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5506 } else {
5507 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5508 }
5509 break;
5510 }
5511
5512 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5513 error = sooptcopyin(sopt, &optval, sizeof(optval),
5514 sizeof(optval));
5515 if (error != 0) {
5516 goto out;
5517 }
5518 if (optval < 0) {
5519 error = EINVAL;
5520 goto out;
5521 }
5522 if (optval == 0) {
5523 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5524 } else {
5525 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5526 }
5527 break;
5528 }
5529
5530 case SO_MARK_APPROVED_APP_DOMAIN: {
5531 error = sooptcopyin(sopt, &optval, sizeof(optval),
5532 sizeof(optval));
5533 if (error != 0) {
5534 goto out;
5535 }
5536 if (optval < 0) {
5537 error = EINVAL;
5538 goto out;
5539 }
5540 if (optval == 0) {
5541 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5542 } else {
5543 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5544 }
5545 break;
5546 }
5547
5548 case SO_STATISTICS_EVENT:
5549 error = sooptcopyin(sopt, &long_optval,
5550 sizeof(long_optval), sizeof(long_optval));
5551 if (error != 0) {
5552 goto out;
5553 }
5554 u_int64_t nstat_event = 0;
5555 error = so_statistics_event_to_nstat_event(
5556 &long_optval, &nstat_event);
5557 if (error != 0) {
5558 goto out;
5559 }
5560 nstat_pcb_event(sotoinpcb(so), nstat_event);
5561 break;
5562
5563 case SO_NET_SERVICE_TYPE: {
5564 error = sooptcopyin(sopt, &optval, sizeof(optval),
5565 sizeof(optval));
5566 if (error != 0) {
5567 goto out;
5568 }
5569 error = so_set_net_service_type(so, optval);
5570 break;
5571 }
5572
5573 case SO_QOSMARKING_POLICY_OVERRIDE:
5574 error = priv_check_cred(kauth_cred_get(),
5575 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5576 if (error != 0) {
5577 goto out;
5578 }
5579 error = sooptcopyin(sopt, &optval, sizeof(optval),
5580 sizeof(optval));
5581 if (error != 0) {
5582 goto out;
5583 }
5584 if (optval == 0) {
5585 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5586 } else {
5587 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5588 }
5589 break;
5590
5591 case SO_MPKL_SEND_INFO: {
5592 struct so_mpkl_send_info so_mpkl_send_info;
5593
5594 error = sooptcopyin(sopt, &so_mpkl_send_info,
5595 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5596 if (error != 0) {
5597 goto out;
5598 }
5599 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5600 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5601
5602 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5603 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5604 } else {
5605 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5606 }
5607 break;
5608 }
5609 case SO_WANT_KEV_SOCKET_CLOSED: {
5610 error = sooptcopyin(sopt, &optval, sizeof(optval),
5611 sizeof(optval));
5612 if (error != 0) {
5613 goto out;
5614 }
5615 if (optval == 0) {
5616 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5617 } else {
5618 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5619 }
5620 break;
5621 }
5622 case SO_MARK_WAKE_PKT: {
5623 error = sooptcopyin(sopt, &optval, sizeof(optval),
5624 sizeof(optval));
5625 if (error != 0) {
5626 goto out;
5627 }
5628 if (optval == 0) {
5629 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5630 } else {
5631 so->so_flags |= SOF_MARK_WAKE_PKT;
5632 }
5633 break;
5634 }
5635 case SO_RECV_WAKE_PKT: {
5636 error = sooptcopyin(sopt, &optval, sizeof(optval),
5637 sizeof(optval));
5638 if (error != 0) {
5639 goto out;
5640 }
5641 if (optval == 0) {
5642 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5643 } else {
5644 so->so_flags |= SOF_RECV_WAKE_PKT;
5645 }
5646 break;
5647 }
5648 case SO_APPLICATION_ID: {
5649 so_application_id_t application_id = { 0 };
5650
5651 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5652 error = EINVAL;
5653 goto out;
5654 }
5655 error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5656 sizeof(application_id));
5657 if (error != 0) {
5658 goto out;
5659 }
5660
5661 // The user needs to match
5662 if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5663 error = EINVAL;
5664 printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5665 goto out;
5666 }
5667 error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5668 if (error != 0) {
5669 printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5670 goto out;
5671 }
5672 if (application_id.persona_id != PERSONA_ID_NONE) {
5673 so->so_persona_id = application_id.persona_id;
5674 }
5675 break;
5676 }
5677 case SO_MARK_DOMAIN_INFO_SILENT:
5678 error = sooptcopyin(sopt, &optval, sizeof(optval),
5679 sizeof(optval));
5680 if (error != 0) {
5681 goto out;
5682 }
5683 if (optval < 0) {
5684 error = EINVAL;
5685 goto out;
5686 }
5687 if (optval == 0) {
5688 so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5689 } else {
5690 so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5691 }
5692 break;
5693 case SO_MAX_PACING_RATE: {
5694 uint64_t pacingrate;
5695
5696 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5697 error = EINVAL;
5698 goto out;
5699 }
5700
5701 error = sooptcopyin(sopt, &pacingrate,
5702 sizeof(pacingrate), sizeof(pacingrate));
5703 if (error != 0) {
5704 goto out;
5705 }
5706
5707 if (pacingrate == 0) {
5708 error = EINVAL;
5709 goto out;
5710 }
5711 sotoinpcb(so)->inp_max_pacing_rate = pacingrate;
5712 break;
5713 }
5714 case SO_CONNECTION_IDLE: {
5715 int is_idle;
5716
5717 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5718 error = EINVAL;
5719 goto out;
5720 }
5721
5722 error = sooptcopyin(sopt, &is_idle,
5723 sizeof(is_idle), sizeof(is_idle));
5724 if (error != 0) {
5725 goto out;
5726 }
5727
5728 if (is_idle != 0) {
5729 sotoinpcb(so)->inp_flags2 |= INP2_CONNECTION_IDLE;
5730 } else {
5731 sotoinpcb(so)->inp_flags2 &= ~INP2_CONNECTION_IDLE;
5732 }
5733 break;
5734 }
5735 default:
5736 error = ENOPROTOOPT;
5737 break;
5738 }
5739 if (error == 0 && so->so_proto != NULL &&
5740 so->so_proto->pr_ctloutput != NULL) {
5741 (void) so->so_proto->pr_ctloutput(so, sopt);
5742 }
5743 }
5744 out:
5745 if (dolock) {
5746 socket_unlock(so, 1);
5747 }
5748 return error;
5749 }
5750
5751 /* Helper routines for getsockopt */
5752 int
sooptcopyout(struct sockopt * sopt,void * __sized_by (len)buf,size_t len)5753 sooptcopyout(struct sockopt *sopt, void *__sized_by(len) buf, size_t len)
5754 {
5755 int error;
5756 size_t valsize;
5757
5758 error = 0;
5759
5760 /*
5761 * Documented get behavior is that we always return a value,
5762 * possibly truncated to fit in the user's buffer.
5763 * Traditional behavior is that we always tell the user
5764 * precisely how much we copied, rather than something useful
5765 * like the total amount we had available for her.
5766 * Note that this interface is not idempotent; the entire answer must
5767 * generated ahead of time.
5768 */
5769 valsize = MIN(len, sopt->sopt_valsize);
5770 sopt->sopt_valsize = valsize;
5771 if (sopt->sopt_valsize != 0 && sopt->sopt_val != USER_ADDR_NULL) {
5772 if (sopt->sopt_p != kernproc) {
5773 error = copyout(buf, sopt->sopt_val, valsize);
5774 } else {
5775 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5776 CAST_DOWN(caddr_t, sopt->sopt_val),
5777 valsize);
5778 bcopy(buf, tmp, valsize);
5779 }
5780 }
5781 return error;
5782 }
5783
5784 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5785 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5786 {
5787 int error;
5788 size_t len;
5789 struct user64_timeval tv64 = {};
5790 struct user32_timeval tv32 = {};
5791 const void * val;
5792 size_t valsize;
5793
5794 error = 0;
5795 if (proc_is64bit(sopt->sopt_p)) {
5796 len = sizeof(tv64);
5797 tv64.tv_sec = tv_p->tv_sec;
5798 tv64.tv_usec = tv_p->tv_usec;
5799 val = &tv64;
5800 } else {
5801 len = sizeof(tv32);
5802 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5803 tv32.tv_usec = tv_p->tv_usec;
5804 val = &tv32;
5805 }
5806 valsize = MIN(len, sopt->sopt_valsize);
5807 sopt->sopt_valsize = valsize;
5808 if (sopt->sopt_val != USER_ADDR_NULL) {
5809 if (sopt->sopt_p != kernproc) {
5810 error = copyout(val, sopt->sopt_val, valsize);
5811 } else {
5812 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5813 CAST_DOWN(caddr_t, sopt->sopt_val),
5814 valsize);
5815 bcopy(val, tmp, valsize);
5816 }
5817 }
5818 return error;
5819 }
5820
5821 /*
5822 * Return: 0 Success
5823 * ENOPROTOOPT
5824 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5825 * <pr_ctloutput>:???
5826 * <sf_getoption>:???
5827 */
/*
 * Get a socket option.  When `dolock' is non-zero the socket lock is
 * acquired (and released) here; otherwise the caller must already hold it.
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	/* Some callers hand us a sockopt set up for SET; force GET direction. */
	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Socket filters get first crack; EJUSTRETURN means "already handled". */
	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
		/* Non-SOL_SOCKET options are handled by the protocol layer. */
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed. A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine. Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			/* SO_LINGER reports ticks, SO_LINGER_SEC seconds. */
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		/*
		 * Boolean options stored directly in so_options; the option
		 * name doubles as the bit mask.
		 */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			optval = so->so_options & sopt->sopt_name;
		/* Common exit for all cases that return a plain int. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			/*
			 * For atomic (record-oriented) protocols, sum only the
			 * data/header/OOB mbufs of the receive buffer; otherwise
			 * report byte count excluding control data.
			 */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			/* Only meaningful for record-oriented protocols. */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			/* Reading the error clears it, per convention. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			/*
			 * Connected UNIX domain sockets send into the peer's
			 * receive buffer; include the peer's buffered bytes.
			 */
			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;


		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			/* Only meaningful for IPv4/IPv6 sockets. */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			/* Only meaningful for IPv6 sockets. */
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_PEERLABEL:
			error = EOPNOTSUPP;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			/* Inverted: flag set means defunct is NOT ok. */
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif  /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			/* The client UUID lives in the MPTCP pcb or the inpcb. */
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			/* Only valid when NECP holds the port registration. */
			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}

		case SO_RESOLVER_SIGNATURE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
			break;
		}

#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif  /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;
		case SO_MARK_CELLFALLBACK:
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_FALLBACK_MODE:
			optval = so->so_fallback_mode;
			goto integer;
		case SO_MARK_KNOWN_TRACKER: {
			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_APPROVED_APP_DOMAIN: {
			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_NET_SERVICE_TYPE: {
			/* Default to best-effort unless explicitly set. */
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}
		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}
		case SO_MARK_WAKE_PKT:
			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
			goto integer;
		case SO_RECV_WAKE_PKT:
			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
			goto integer;
		case SO_APPLICATION_ID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			/* Prefer the effective UUID; fall back to last UUID. */
			so_application_id_t application_id = { 0 };
			application_id.uid = kauth_cred_getuid(so->so_cred);
			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
			application_id.persona_id = so->so_persona_id;
			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
			break;
		}
		case SO_MARK_DOMAIN_INFO_SILENT:
			optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_MAX_PACING_RATE: {
			uint64_t pacingrate;

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			pacingrate = sotoinpcb(so)->inp_max_pacing_rate;

			error = sooptcopyout(sopt, &pacingrate, sizeof(pacingrate));
			break;
		}
		case SO_CONNECTION_IDLE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			optval = sotoinpcb(so)->inp_flags2 & INP2_CONNECTION_IDLE ?
			    1 : 0;
			goto integer;
		}
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
6283
6284 /*
6285 * The size limits on our soopt_getm is different from that on FreeBSD.
6286 * We limit the size of options to MCLBYTES. This will have to change
6287 * if we need to define options that need more space than MCLBYTES.
6288 */
6289 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6290 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6291 {
6292 struct mbuf *m, *m_prev;
6293 int sopt_size = (int)sopt->sopt_valsize;
6294 int how;
6295
6296 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6297 return EMSGSIZE;
6298 }
6299
6300 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6301 MGET(m, how, MT_DATA);
6302 if (m == NULL) {
6303 return ENOBUFS;
6304 }
6305 if (sopt_size > MLEN) {
6306 MCLGET(m, how);
6307 if ((m->m_flags & M_EXT) == 0) {
6308 m_free(m);
6309 return ENOBUFS;
6310 }
6311 m->m_len = min(MCLBYTES, sopt_size);
6312 } else {
6313 m->m_len = min(MLEN, sopt_size);
6314 }
6315 sopt_size -= m->m_len;
6316 *mp = m;
6317 m_prev = m;
6318
6319 while (sopt_size > 0) {
6320 MGET(m, how, MT_DATA);
6321 if (m == NULL) {
6322 m_freem(*mp);
6323 return ENOBUFS;
6324 }
6325 if (sopt_size > MLEN) {
6326 MCLGET(m, how);
6327 if ((m->m_flags & M_EXT) == 0) {
6328 m_freem(*mp);
6329 m_freem(m);
6330 return ENOBUFS;
6331 }
6332 m->m_len = min(MCLBYTES, sopt_size);
6333 } else {
6334 m->m_len = min(MLEN, sopt_size);
6335 }
6336 sopt_size -= m->m_len;
6337 m_prev->m_next = m;
6338 m_prev = m;
6339 }
6340 return 0;
6341 }
6342
6343 /* copyin sopt data into mbuf chain */
6344 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6345 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6346 {
6347 struct mbuf *m0 = m;
6348
6349 if (sopt->sopt_val == USER_ADDR_NULL) {
6350 return 0;
6351 }
6352 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6353 if (sopt->sopt_p != kernproc) {
6354 int error;
6355
6356 error = copyin(sopt->sopt_val, mtod(m, char *),
6357 m->m_len);
6358 if (error != 0) {
6359 m_freem(m0);
6360 return error;
6361 }
6362 } else {
6363 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6364 CAST_DOWN(caddr_t, sopt->sopt_val),
6365 m->m_len);
6366 bcopy(tmp, mtod(m, char *), m->m_len);
6367 }
6368 sopt->sopt_valsize -= m->m_len;
6369 sopt->sopt_val += m->m_len;
6370 m = m->m_next;
6371 }
6372 /* should be allocated enoughly at ip6_sooptmcopyin() */
6373 if (m != NULL) {
6374 panic("soopt_mcopyin");
6375 /* NOTREACHED */
6376 }
6377 return 0;
6378 }
6379
6380 /* copyout mbuf chain data into soopt */
6381 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6382 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6383 {
6384 struct mbuf *m0 = m;
6385 size_t valsize = 0;
6386
6387 if (sopt->sopt_val == USER_ADDR_NULL) {
6388 return 0;
6389 }
6390 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6391 if (sopt->sopt_p != kernproc) {
6392 int error;
6393
6394 error = copyout(mtod(m, char *), sopt->sopt_val,
6395 m->m_len);
6396 if (error != 0) {
6397 m_freem(m0);
6398 return error;
6399 }
6400 } else {
6401 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6402 CAST_DOWN(caddr_t, sopt->sopt_val),
6403 m->m_len);
6404
6405 bcopy(mtod(m, char *), tmp, m->m_len);
6406 }
6407 sopt->sopt_valsize -= m->m_len;
6408 sopt->sopt_val += m->m_len;
6409 valsize += m->m_len;
6410 m = m->m_next;
6411 }
6412 if (m != NULL) {
6413 /* enough soopt buffer should be given from user-land */
6414 m_freem(m0);
6415 return EINVAL;
6416 }
6417 sopt->sopt_valsize = valsize;
6418 return 0;
6419 }
6420
6421 void
sohasoutofband(struct socket * so)6422 sohasoutofband(struct socket *so)
6423 {
6424 if (so->so_pgid < 0) {
6425 gsignal(-so->so_pgid, SIGURG);
6426 } else if (so->so_pgid > 0) {
6427 proc_signal(so->so_pgid, SIGURG);
6428 }
6429 selwakeup(&so->so_rcv.sb_sel);
6430 if (so->so_rcv.sb_flags & SB_KNOTE) {
6431 KNOTE(&so->so_rcv.sb_sel.si_note,
6432 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6433 }
6434 }
6435
6436 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6437 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6438 {
6439 #pragma unused(cred)
6440 struct proc *p = current_proc();
6441 int revents = 0;
6442
6443 socket_lock(so, 1);
6444 so_update_last_owner_locked(so, PROC_NULL);
6445 so_update_policy(so);
6446
6447 if (events & (POLLIN | POLLRDNORM)) {
6448 if (soreadable(so)) {
6449 revents |= events & (POLLIN | POLLRDNORM);
6450 }
6451 }
6452
6453 if (events & (POLLOUT | POLLWRNORM)) {
6454 if (sowriteable(so)) {
6455 revents |= events & (POLLOUT | POLLWRNORM);
6456 }
6457 }
6458
6459 if (events & (POLLPRI | POLLRDBAND)) {
6460 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6461 revents |= events & (POLLPRI | POLLRDBAND);
6462 }
6463 }
6464
6465 if (revents == 0) {
6466 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6467 /*
6468 * Darwin sets the flag first,
6469 * BSD calls selrecord first
6470 */
6471 so->so_rcv.sb_flags |= SB_SEL;
6472 selrecord(p, &so->so_rcv.sb_sel, wql);
6473 }
6474
6475 if (events & (POLLOUT | POLLWRNORM)) {
6476 /*
6477 * Darwin sets the flag first,
6478 * BSD calls selrecord first
6479 */
6480 so->so_snd.sb_flags |= SB_SEL;
6481 selrecord(p, &so->so_snd.sb_sel, wql);
6482 }
6483 }
6484
6485 socket_unlock(so, 1);
6486 return revents;
6487 }
6488
6489 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6490 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6491 {
6492 struct socket *so = (struct socket *)fp_get_data(fp);
6493 int result;
6494
6495 socket_lock(so, 1);
6496 so_update_last_owner_locked(so, PROC_NULL);
6497 so_update_policy(so);
6498
6499 switch (kn->kn_filter) {
6500 case EVFILT_READ:
6501 kn->kn_filtid = EVFILTID_SOREAD;
6502 break;
6503 case EVFILT_WRITE:
6504 kn->kn_filtid = EVFILTID_SOWRITE;
6505 break;
6506 case EVFILT_SOCK:
6507 kn->kn_filtid = EVFILTID_SCK;
6508 break;
6509 case EVFILT_EXCEPT:
6510 kn->kn_filtid = EVFILTID_SOEXCEPT;
6511 break;
6512 default:
6513 socket_unlock(so, 1);
6514 knote_set_error(kn, EINVAL);
6515 return 0;
6516 }
6517
6518 /*
6519 * call the appropriate sub-filter attach
6520 * with the socket still locked
6521 */
6522 result = knote_fops(kn)->f_attach(kn, kev);
6523
6524 socket_unlock(so, 1);
6525
6526 return result;
6527 }
6528
/*
 * Shared readiness check for the socket read filters.  Returns non-zero
 * when the EVFILT_READ knote should fire.  When `kev' is non-NULL and the
 * event fired, the kevent is filled in with `data' (bytes readable, or
 * backlog length for listeners).  Caller must hold the socket lock.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* Listener: ready when the completed-connection queue is non-empty. */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		/* OOB request: fire as soon as the OOB mark is pending. */
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	/* EOF: peer closed and no content-filter data still buffered. */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6607
6608 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6609 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6610 {
6611 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6612
6613 /* socket locked */
6614
6615 /*
6616 * If the caller explicitly asked for OOB results (e.g. poll())
6617 * from EVFILT_READ, then save that off in the hookid field
6618 * and reserve the kn_flags EV_OOBAND bit for output only.
6619 */
6620 if (kn->kn_filter == EVFILT_READ &&
6621 kn->kn_flags & EV_OOBAND) {
6622 kn->kn_flags &= ~EV_OOBAND;
6623 kn->kn_hook32 = EV_OOBAND;
6624 } else {
6625 kn->kn_hook32 = 0;
6626 }
6627 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6628 so->so_rcv.sb_flags |= SB_KNOTE;
6629 }
6630
6631 /* indicate if event is already fired */
6632 return filt_soread_common(kn, NULL, so);
6633 }
6634
6635 static void
filt_sordetach(struct knote * kn)6636 filt_sordetach(struct knote *kn)
6637 {
6638 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6639
6640 socket_lock(so, 1);
6641 if (so->so_rcv.sb_flags & SB_KNOTE) {
6642 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6643 so->so_rcv.sb_flags &= ~SB_KNOTE;
6644 }
6645 }
6646 socket_unlock(so, 1);
6647 }
6648
6649 /*ARGSUSED*/
6650 static int
filt_soread(struct knote * kn,long hint)6651 filt_soread(struct knote *kn, long hint)
6652 {
6653 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6654 int retval;
6655
6656 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6657 socket_lock(so, 1);
6658 }
6659
6660 retval = filt_soread_common(kn, NULL, so);
6661
6662 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6663 socket_unlock(so, 1);
6664 }
6665
6666 return retval;
6667 }
6668
6669 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6670 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6671 {
6672 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6673 int retval;
6674
6675 socket_lock(so, 1);
6676
6677 /* save off the new input fflags and data */
6678 kn->kn_sfflags = kev->fflags;
6679 kn->kn_sdata = kev->data;
6680
6681 /* determine if changes result in fired events */
6682 retval = filt_soread_common(kn, NULL, so);
6683
6684 socket_unlock(so, 1);
6685
6686 return retval;
6687 }
6688
6689 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6690 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6691 {
6692 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6693 int retval;
6694
6695 socket_lock(so, 1);
6696 retval = filt_soread_common(kn, kev, so);
6697 socket_unlock(so, 1);
6698
6699 return retval;
6700 }
6701
6702 int
so_wait_for_if_feedback(struct socket * so)6703 so_wait_for_if_feedback(struct socket *so)
6704 {
6705 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6706 (so->so_state & SS_ISCONNECTED)) {
6707 struct inpcb *inp = sotoinpcb(so);
6708 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6709 return 1;
6710 }
6711 }
6712 return 0;
6713 }
6714
/*
 * filt_sowrite_common - shared EVFILT_WRITE evaluation for a socket.
 *
 * Returns non-zero if the knote should fire. Caller must hold the
 * socket lock. When `kev' is non-NULL and the filter fires, the event
 * data (free send-buffer space) is copied out via knote_fill_kevent().
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	/* Free space in the send buffer is the event's data payload. */
	int64_t data = sbspace(&so->so_snd);

	/* Writes can no longer succeed: report EOF plus any error. */
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	/* Not (yet) in a state where writes are possible. */
	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	/* Sockets allowing pre-connect data are always writable. */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* NOTE_LOWAT lets the caller override the low-water mark. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat; /* clamp to the high-water mark */
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/*
			 * "Not sent low-water" mode: let the transport
			 * decide based on its unsent byte count.
			 */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress the event while awaiting interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6826
6827 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6828 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6829 {
6830 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6831
6832 /* socket locked */
6833 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6834 so->so_snd.sb_flags |= SB_KNOTE;
6835 }
6836
6837 /* determine if its already fired */
6838 return filt_sowrite_common(kn, NULL, so);
6839 }
6840
6841 static void
filt_sowdetach(struct knote * kn)6842 filt_sowdetach(struct knote *kn)
6843 {
6844 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6845 socket_lock(so, 1);
6846
6847 if (so->so_snd.sb_flags & SB_KNOTE) {
6848 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6849 so->so_snd.sb_flags &= ~SB_KNOTE;
6850 }
6851 }
6852 socket_unlock(so, 1);
6853 }
6854
6855 /*ARGSUSED*/
6856 static int
filt_sowrite(struct knote * kn,long hint)6857 filt_sowrite(struct knote *kn, long hint)
6858 {
6859 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6860 int ret;
6861
6862 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6863 socket_lock(so, 1);
6864 }
6865
6866 ret = filt_sowrite_common(kn, NULL, so);
6867
6868 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6869 socket_unlock(so, 1);
6870 }
6871
6872 return ret;
6873 }
6874
6875 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6876 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6877 {
6878 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6879 int ret;
6880
6881 socket_lock(so, 1);
6882
6883 /*save off the new input fflags and data */
6884 kn->kn_sfflags = kev->fflags;
6885 kn->kn_sdata = kev->data;
6886
6887 /* determine if these changes result in a triggered event */
6888 ret = filt_sowrite_common(kn, NULL, so);
6889
6890 socket_unlock(so, 1);
6891
6892 return ret;
6893 }
6894
6895 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6896 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6897 {
6898 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6899 int ret;
6900
6901 socket_lock(so, 1);
6902 ret = filt_sowrite_common(kn, kev, so);
6903 socket_unlock(so, 1);
6904
6905 return ret;
6906 }
6907
/*
 * filt_sockev_common - shared EVFILT_SOCK evaluation.
 *
 * Translates socket state plus the hint bits in `ev_hint' into NOTE_*
 * filter flags and decides whether the knote fires (non-zero return).
 * Level-triggered events already delivered to userspace are tracked in
 * kn_hook32 so they are not delivered again while still active.
 * Caller must hold the socket lock.  When `kev' is non-NULL and the
 * filter fires, the event is copied out via knote_fill_kevent() and
 * the delivered level-triggered state is recorded.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered notifications carried by the hint bits. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/*
	 * The following are level-triggered: they stay asserted for as
	 * long as the corresponding socket state holds.
	 */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	/*
	 * Read side counts as closed only once any content-filter
	 * buffered data has drained.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* NOTE_SUSPEND and NOTE_RESUME are mutually exclusive. */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* Event data is the socket error, or else the connection state. */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7048
7049 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7050 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7051 {
7052 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7053
7054 /* socket locked */
7055 kn->kn_hook32 = 0;
7056 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7057 so->so_flags |= SOF_KNOTE;
7058 }
7059
7060 /* determine if event already fired */
7061 return filt_sockev_common(kn, NULL, so, 0);
7062 }
7063
7064 static void
filt_sockdetach(struct knote * kn)7065 filt_sockdetach(struct knote *kn)
7066 {
7067 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7068 socket_lock(so, 1);
7069
7070 if ((so->so_flags & SOF_KNOTE) != 0) {
7071 if (KNOTE_DETACH(&so->so_klist, kn)) {
7072 so->so_flags &= ~SOF_KNOTE;
7073 }
7074 }
7075 socket_unlock(so, 1);
7076 }
7077
7078 static int
filt_sockev(struct knote * kn,long hint)7079 filt_sockev(struct knote *kn, long hint)
7080 {
7081 int ret = 0, locked = 0;
7082 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7083 long ev_hint = (hint & SO_FILT_HINT_EV);
7084
7085 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7086 socket_lock(so, 1);
7087 locked = 1;
7088 }
7089
7090 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7091
7092 if (locked) {
7093 socket_unlock(so, 1);
7094 }
7095
7096 return ret;
7097 }
7098
7099
7100
/*
 * filt_socktouch - update event state
 *
 * Called when userspace re-registers the knote with new fflags/data.
 * Clears the delivered-event bookkeeping (kn_hook32) for any
 * level-triggered event whose interest changed, then re-evaluates
 * whether the knote should fire.
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}
7143
7144 /*
7145 * filt_sockprocess - query event fired state and return data
7146 */
7147 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7148 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7149 {
7150 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7151 int ret = 0;
7152
7153 socket_lock(so, 1);
7154
7155 ret = filt_sockev_common(kn, kev, so, 0);
7156
7157 socket_unlock(so, 1);
7158
7159 return ret;
7160 }
7161
7162 void
get_sockev_state(struct socket * so,u_int32_t * statep)7163 get_sockev_state(struct socket *so, u_int32_t *statep)
7164 {
7165 u_int32_t state = *(statep);
7166
7167 /*
7168 * If the state variable is already used by a previous event,
7169 * reset it.
7170 */
7171 if (state != 0) {
7172 return;
7173 }
7174
7175 if (so->so_state & SS_ISCONNECTED) {
7176 state |= SOCKEV_CONNECTED;
7177 } else {
7178 state &= ~(SOCKEV_CONNECTED);
7179 }
7180 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7181 *(statep) = state;
7182 }
7183
/* Worst-case length of the formatted "%p:%p " history string. */
#define SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

/*
 * Format the socket's recorded lock/unlock caller addresses as a
 * "lock:unlock " pair per history slot, most recent last.
 * Returns a pointer to a static buffer, so the result is only valid
 * until the next call and the function is not safe for concurrent use
 * (acceptable for its panic/debug callers).
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	/*
	 * Walk the ring starting at the oldest entry; next_*_lr indexes
	 * the slot that will be written next, i.e. the oldest record.
	 */
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return __unsafe_null_terminated_from_indexable(lock_history_str);
}
7203
7204 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7205 socket_getlock(struct socket *so, int flags)
7206 {
7207 if (so->so_proto->pr_getlock != NULL) {
7208 return (*so->so_proto->pr_getlock)(so, flags);
7209 } else {
7210 return so->so_proto->pr_domain->dom_mtx;
7211 }
7212 }
7213
/*
 * Lock the socket, optionally taking a use-count reference
 * (refcount != 0).
 *
 * Protocols that supply pr_lock do their own locking and reference
 * accounting; otherwise the domain mutex is taken, so_usecount is
 * bumped here, and the caller's return address is recorded in the
 * lock-history ring for debugging (see solockhistory_nr()).
 */
void
socket_lock(struct socket *so, int refcount)
{
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		/* Record the caller for lock-debug history. */
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7234
7235 void
socket_lock_assert_owned(struct socket * so)7236 socket_lock_assert_owned(struct socket *so)
7237 {
7238 lck_mtx_t *mutex_held;
7239
7240 if (so->so_proto->pr_getlock != NULL) {
7241 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7242 } else {
7243 mutex_held = so->so_proto->pr_domain->dom_mtx;
7244 }
7245
7246 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7247 }
7248
7249 int
socket_try_lock(struct socket * so)7250 socket_try_lock(struct socket *so)
7251 {
7252 lck_mtx_t *mtx;
7253
7254 if (so->so_proto->pr_getlock != NULL) {
7255 mtx = (*so->so_proto->pr_getlock)(so, 0);
7256 } else {
7257 mtx = so->so_proto->pr_domain->dom_mtx;
7258 }
7259
7260 return lck_mtx_try_lock(mtx);
7261 }
7262
/*
 * Unlock the socket, optionally dropping a use-count reference
 * (refcount != 0).  Mirror of socket_lock().
 *
 * Protocols that supply pr_unlock do their own accounting; otherwise
 * the caller's return address is recorded in the unlock-history ring,
 * a reference is dropped when requested, and the socket is released
 * via sofreelastref() when the last reference goes away.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	lck_mtx_t *mutex_held;
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* Record the caller for lock-debug history. */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* Over-release is a bug; dump the lock history. */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				/* Last reference: release the socket. */
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7301
/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* Assert the caller indeed holds this socket's mutex. */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * NOTE(review): no explicit unlock here — the unlock promised by
	 * the header comment presumably happens inside sofreelastref();
	 * confirm against its definition.
	 */
	sofreelastref(so, 0);
}
7317
/*
 * Take an additional use-count reference on the socket.  Implemented
 * as a lock with refcount=1 (which bumps so_usecount) followed by an
 * unlock with refcount=0 (which drops nothing).
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7324
/*
 * Drop one use-count reference on the socket.  The unlock with
 * refcount=1 decrements so_usecount and may free the socket if this
 * was the last reference (see socket_unlock()).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);     /* lock only, no new reference */
	socket_unlock(so, 1);   /* unlock and drop one reference */
}
7331
7332 /*
7333 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7334 * possibility of using jumbo clusters. Caller must ensure to hold
7335 * the socket lock.
7336 */
7337 void
somultipages(struct socket * so,boolean_t set)7338 somultipages(struct socket *so, boolean_t set)
7339 {
7340 if (set) {
7341 so->so_flags |= SOF_MULTIPAGES;
7342 } else {
7343 so->so_flags &= ~SOF_MULTIPAGES;
7344 }
7345 }
7346
7347 void
soif2kcl(struct socket * so,boolean_t set)7348 soif2kcl(struct socket *so, boolean_t set)
7349 {
7350 if (set) {
7351 so->so_flags1 |= SOF1_IF_2KCL;
7352 } else {
7353 so->so_flags1 &= ~SOF1_IF_2KCL;
7354 }
7355 }
7356
7357 int
so_isdstlocal(struct socket * so)7358 so_isdstlocal(struct socket *so)
7359 {
7360 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7361
7362 if (SOCK_DOM(so) == PF_INET) {
7363 return inaddr_local(inp->inp_faddr);
7364 } else if (SOCK_DOM(so) == PF_INET6) {
7365 return in6addr_local(&inp->in6p_faddr);
7366 }
7367
7368 return 0;
7369 }
7370
/*
 * Mark a socket as defunct: set SOF_DEFUNCT, put both socket buffers
 * into SB_DROP mode (no further data appended) and flush queued data.
 *
 * `p' is the requesting process (PROC_NULL suppresses logging of the
 * target), `level' is only used for logging, and `noforce' makes the
 * request advisory: sockets that opted out (SOF_NODEFUNCT) or want
 * extended background idle time then get EOPNOTSUPP instead of being
 * defuncted.  Returns 0 when the socket was (or already is) defunct.
 * Caller must hold the socket lock.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both buffers must already be dropping data. */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		/* Opted out: honor it unless the caller forces. */
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * Socket asked for extended background idle time; decide
		 * whether to grant it or defunct anyway, updating the
		 * corresponding statistic.
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* Grant: start the idle period and defer defunct. */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7480
/*
 * Carry out the defunct of a socket previously marked by
 * sosetdefunct(): tell the protocol, wake blocked threads, shut down
 * both directions, disconnect, and release all buffered data.
 *
 * Idempotent: returns immediately once SS_DEFUNCT is set.  `p' and
 * `level' are used only for logging.  Always returns 0.  Caller must
 * hold the socket lock, and SOF_DEFUNCT must already be set (panics
 * otherwise).
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	/* Already fully defuncted: nothing more to do. */
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	/* Subsequent operations on the socket will fail with EBADF. */
	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* Release any data still queued in either buffer. */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7593
/*
 * Take a socket out of the extended-background-idle state, if it is in
 * it: clear SOF1_EXTEND_BK_IDLE_INPROG and its start timestamp, clear
 * the per-process in-progress flag, and update the global statistics.
 *
 * Pass locked=0 to have the socket lock taken and dropped here;
 * locked!=0 means the caller already holds it.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7623
7624 /*
7625 * Does not attempt to account for sockets that are delegated from
7626 * the current process
7627 */
7628 int
so_set_extended_bk_idle(struct socket * so,int optval)7629 so_set_extended_bk_idle(struct socket *so, int optval)
7630 {
7631 int error = 0;
7632
7633 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7634 SOCK_PROTO(so) != IPPROTO_TCP) {
7635 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7636 error = EOPNOTSUPP;
7637 } else if (optval == 0) {
7638 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7639
7640 soresume(current_proc(), so, 1);
7641 } else {
7642 struct proc *p = current_proc();
7643 struct fileproc *fp;
7644 int count = 0;
7645
7646 /*
7647 * Unlock socket to avoid lock ordering issue with
7648 * the proc fd table lock
7649 */
7650 socket_unlock(so, 0);
7651
7652 proc_fdlock(p);
7653 fdt_foreach(fp, p) {
7654 struct socket *so2;
7655
7656 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7657 continue;
7658 }
7659
7660 so2 = (struct socket *)fp_get_data(fp);
7661 if (so != so2 &&
7662 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7663 count++;
7664 }
7665 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7666 break;
7667 }
7668 }
7669 proc_fdunlock(p);
7670
7671 socket_lock(so, 0);
7672
7673 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7674 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7675 error = EBUSY;
7676 } else if (so->so_flags & SOF_DELEGATED) {
7677 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7678 error = EBUSY;
7679 } else {
7680 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7681 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7682 }
7683 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7684 "%s marked for extended bk idle\n",
7685 __func__, proc_selfpid(), proc_best_name(current_proc()),
7686 so->so_gencnt,
7687 SOCK_DOM(so), SOCK_TYPE(so),
7688 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7689 "is" : "not");
7690 }
7691
7692 return error;
7693 }
7694
7695 static void
so_stop_extended_bk_idle(struct socket * so)7696 so_stop_extended_bk_idle(struct socket *so)
7697 {
7698 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7699 so->so_extended_bk_start = 0;
7700
7701 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7702 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7703 /*
7704 * Force defunct
7705 */
7706 sosetdefunct(current_proc(), so,
7707 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7708 if (so->so_flags & SOF_DEFUNCT) {
7709 sodefunct(current_proc(), so,
7710 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7711 }
7712 }
7713
7714 void
so_drain_extended_bk_idle(struct socket * so)7715 so_drain_extended_bk_idle(struct socket *so)
7716 {
7717 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7718 /*
7719 * Only penalize sockets that have outstanding data
7720 */
7721 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7722 so_stop_extended_bk_idle(so);
7723
7724 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7725 }
7726 }
7727 }
7728
7729 /*
7730 * Return values tells if socket is still in extended background idle
7731 */
7732 int
so_check_extended_bk_idle_time(struct socket * so)7733 so_check_extended_bk_idle_time(struct socket *so)
7734 {
7735 int ret = 1;
7736
7737 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7738 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7739 __func__, proc_selfpid(), proc_best_name(current_proc()),
7740 so->so_gencnt,
7741 SOCK_DOM(so), SOCK_TYPE(so));
7742 if (net_uptime() - so->so_extended_bk_start >
7743 soextbkidlestat.so_xbkidle_time) {
7744 so_stop_extended_bk_idle(so);
7745
7746 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7747
7748 ret = 0;
7749 } else {
7750 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7751
7752 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7753 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7754 }
7755 }
7756
7757 return ret;
7758 }
7759
7760 void
resume_proc_sockets(proc_t p)7761 resume_proc_sockets(proc_t p)
7762 {
7763 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7764 struct fileproc *fp;
7765 struct socket *so;
7766
7767 proc_fdlock(p);
7768 fdt_foreach(fp, p) {
7769 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7770 continue;
7771 }
7772
7773 so = (struct socket *)fp_get_data(fp);
7774 (void) soresume(p, so, 0);
7775 }
7776 proc_fdunlock(p);
7777
7778 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7779 }
7780 }
7781
7782 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7783 so_set_recv_anyif(struct socket *so, int optval)
7784 {
7785 int ret = 0;
7786
7787 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7788 if (optval) {
7789 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7790 } else {
7791 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7792 }
7793 #if SKYWALK
7794 inp_update_netns_flags(so);
7795 #endif /* SKYWALK */
7796 }
7797
7798
7799 return ret;
7800 }
7801
7802 __private_extern__ int
so_get_recv_anyif(struct socket * so)7803 so_get_recv_anyif(struct socket *so)
7804 {
7805 int ret = 0;
7806
7807 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7808 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7809 }
7810
7811 return ret;
7812 }
7813
/*
 * Apply deny-type restriction bits from `vals' to the socket.
 * Restrictions are one-way: bits can be set but never cleared (see the
 * "trapdoor" comment below).  Newly-set cellular/expensive/constrained
 * bits are propagated to the inpcb (INET/INET6) or the MPTCP layer.
 * Always returns 0.  Caller must hold the socket lock.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		/* Nothing newly set: no propagation needed. */
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7872
/*
 * Return the deny-type restriction bits currently set on the socket.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED can be set via
 * so_set_restrictions() but is not included in the mask returned here,
 * so callers never see it — confirm whether that omission is intended.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
7880
/*
 * Associate an "effective" (delegate) process with a socket by pid.
 *
 * On success the socket carries the delegate's {upid, pid, uuid} in
 * its e_* fields and SOF_DELEGATED is set; delegating a socket to the
 * issuing process itself clears any existing delegation instead.
 * The socket's policy state is refreshed on success.
 *
 * Parameters:
 *   so         - socket to (un)delegate
 *   epid       - pid of the effective process; 0 is rejected (kernel)
 *   p          - process issuing the option (may be kernproc)
 *   check_cred - when TRUE, require PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *                unless the caller already matches the socket's owner
 *
 * Returns: 0 on success, EINVAL/EACCES/ESRCH on failure.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): the comment above says "or", but the || below
	 * means the privilege check is skipped only when epid matches
	 * BOTH so->last_pid and proc_pid(p) — confirm this is intended.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		/* Self-delegation: clear any existing delegation */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* Track the delegate's "responsible" process, if different */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	/* Let the protocol refresh its cached owner information */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Negating the generation count presumably marks the cached
		 * policy stale so so_update_policy() re-evaluates — TODO
		 * confirm against so_update_policy().
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	/* Drop the reference taken by proc_find() above */
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
7996
/*
 * Associate an "effective" (delegate) process with a socket by UUID.
 *
 * Unlike so_set_effective_pid(), only the UUID is known here, so the
 * delegate {upid, pid} are inherited from the socket's real owner.
 * Delegating to the issuing process's own UUID clears any existing
 * delegation.  The socket's policy state is refreshed on success.
 *
 * Parameters:
 *   so         - socket to (un)delegate
 *   euuid      - UUID of the effective process; all-zeroes is rejected
 *   p          - process issuing the option (may be kernproc)
 *   check_cred - when TRUE, require PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *                unless the caller already matches the socket's owner
 *
 * Returns: 0 on success, EINVAL/EACCES on failure.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): as in so_set_effective_pid(), the || below skips
	 * the privilege check only when euuid matches BOTH last_uuid and
	 * the issuer's own uuid — confirm this matches the comment.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		/* Self-delegation: clear any existing delegation */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Negating the generation count presumably marks the cached
		 * policy stale so so_update_policy() re-evaluates — TODO
		 * confirm against so_update_policy().
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8100
8101 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8102 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8103 uint32_t ev_datalen)
8104 {
8105 struct kev_msg ev_msg;
8106
8107 /*
8108 * A netpolicy event always starts with a netpolicy_event_data
8109 * structure, but the caller can provide for a longer event
8110 * structure to post, depending on the event code.
8111 */
8112 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8113
8114 bzero(&ev_msg, sizeof(ev_msg));
8115 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8116 ev_msg.kev_class = KEV_NETWORK_CLASS;
8117 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8118 ev_msg.event_code = ev_code;
8119
8120 ev_msg.dv[0].data_ptr = ev_data;
8121 ev_msg.dv[0].data_length = ev_datalen;
8122
8123 kev_post_msg(&ev_msg);
8124 }
8125
8126 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8127 socket_post_kev_msg(uint32_t ev_code,
8128 struct kev_socket_event_data *ev_data,
8129 uint32_t ev_datalen)
8130 {
8131 struct kev_msg ev_msg;
8132
8133 bzero(&ev_msg, sizeof(ev_msg));
8134 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8135 ev_msg.kev_class = KEV_NETWORK_CLASS;
8136 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8137 ev_msg.event_code = ev_code;
8138
8139 ev_msg.dv[0].data_ptr = ev_data;
8140 ev_msg.dv[0].data_length = ev_datalen;
8141
8142 kev_post_msg(&ev_msg);
8143 }
8144
/*
 * Post a KEV_SOCKET_CLOSED event for a socket that opted in via
 * SOF1_WANT_KEV_SOCK_CLOSED.  The event carries the socket's local
 * and peer addresses; if either address cannot be obtained, no event
 * is posted.
 */
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *__single socksa = NULL, *__single peersa = NULL;
	int err;

	/* Only sockets that explicitly requested the event get one */
	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* Copy at most the event field's size of each address */
			SOCKADDR_COPY(socksa, &ev.ev_data.kev_sockname,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			SOCKADDR_COPY(peersa, &ev.ev_data.kev_peername,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			/*
			 * NOTE(review): posts &ev.ev_data with sizeof(ev) —
			 * fine if ev_data is the sole member of
			 * struct kev_socket_closed; confirm the layout.
			 */
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* pru_sockaddr/pru_peeraddr allocate; release on all paths */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8173
8174 void
sock_parse_cm_info(struct mbuf * control,struct sock_cm_info * sockcminfo)8175 sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo)
8176 {
8177 struct cmsghdr *cm;
8178
8179 for (cm = M_FIRST_CMSGHDR(control);
8180 is_cmsg_valid(control, cm);
8181 cm = M_NXT_CMSGHDR(control, cm)) {
8182 int val;
8183
8184 if (cm->cmsg_level != SOL_SOCKET) {
8185 continue;
8186 }
8187
8188 if (cm->cmsg_len == CMSG_LEN(sizeof(int))) {
8189 val = *(int *)(void *)CMSG_DATA(cm);
8190 }
8191
8192 switch (cm->cmsg_type) {
8193 case SO_TRAFFIC_CLASS:
8194 if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
8195 break;
8196 }
8197 if (SO_VALID_TC(val)) {
8198 sockcminfo->sotc = val;
8199 break;
8200 } else if (val < SO_TC_NET_SERVICE_OFFSET) {
8201 break;
8202 }
8203 /*
8204 * Handle the case SO_NET_SERVICE_TYPE values are
8205 * passed using SO_TRAFFIC_CLASS
8206 */
8207 val = val - SO_TC_NET_SERVICE_OFFSET;
8208
8209 OS_FALLTHROUGH;
8210 case SO_NET_SERVICE_TYPE:
8211 if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
8212 break;
8213 }
8214
8215 if (!IS_VALID_NET_SERVICE_TYPE(val)) {
8216 break;
8217 }
8218 sockcminfo->netsvctype = val;
8219 sockcminfo->sotc = sotc_by_netservicetype[val];
8220 break;
8221 case SCM_TXTIME:
8222 if (cm->cmsg_len != CMSG_LEN(sizeof(uint64_t))) {
8223 break;
8224 }
8225
8226 sockcminfo->tx_time = *(uint64_t *)(void *)CMSG_DATA(cm);
8227 break;
8228 default:
8229 break;
8230 }
8231 }
8232 }
8233
/*
 * Assertion-failure trampoline: panic with the failed expression text,
 * file, and line.  Never returns; the int return type exists only to
 * satisfy callers' expression contexts (e.g. assertion macros).
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8242