/*
 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <os/log.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

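/*
 * Round 'a' up to the next multiple of 'b', where 'b' must be a power of
 * two; e.g. ROUNDUP(10, 8) == 16.
 */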
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

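/*
 * On DEBUG/DEVELOPMENT builds, log kernel pointers verbatim; on release
 * builds, VM_KERNEL_ADDRPERM() permutes them so that logs do not leak
 * kernel addresses.
 */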
#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
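/*
 * so_cache_mtx guards the cached-socket free list (so_cache_head) and its
 * counters; see cached_sock_alloc(), cached_sock_free() and so_cache_timer().
 */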
static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
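/*
 * Note that soexcept_filtops deliberately reuses the read-filter callbacks:
 * exception events (e.g. out-of-band data) are detected through the same
 * filt_soread() path, distinguished by the knote's filter/flags.
 */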

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

int sosend_assert_panic = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;     /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
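/* Defaults: at most 1 such socket per process, a 600 s idle window, and a 128 KB receive high-water mark */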

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	STAILQ_INIT(&so_cache_head);

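	/*
	 * Each cache element holds a socket plus its saved inpcb and tcpcb;
	 * the two extra 4-byte pads leave room for the longword alignment
	 * performed in cached_sock_alloc() below.
	 */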
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}

static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket; the check above achieves that.
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
	struct socket *so;

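	/*
	 * Only PF_INET/SOCK_STREAM (TCP) sockets go through the socket
	 * cache, whose elements carry the saved inpcb/tcpcb in the same
	 * allocation; everything else comes straight from socket_zone.
	 */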
	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, how);
	} else {
		so = zalloc_flags(socket_zone, how | Z_ZERO);
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}
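
/*
 * Example (a minimal sketch, mirroring what the socket(2) path does for an
 * in-kernel caller):
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket ...
 *		soclose(so);
 *	}
 */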

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket.  If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
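	/*
	 * Note that this implementation clamps non-positive and oversized
	 * backlog values to somaxconn rather than to a minimum value.
	 */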
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets.  The lock ordering is to
 * acquire the client socket first, before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight.  The preflight
 * saves the cost of doing the unlock/lock dance.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
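	/*
	 * We had to drop the client's lock before sleeping; reacquire it
	 * first and then retake the listener's lock, preserving the
	 * client-before-listener lock ordering described above.
	 */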
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connection to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq
			 * they will get cleanup by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference from the list ensures the
			 * validity of the socket pointer across the unlock
			 * of the head above.
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG || DEVELOPMENT)
			panic("%s head %p so_incomp not empty", __func__, so);
#endif /* (DEBUG || DEVELOPMENT) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG || DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEBUG || DEVELOPMENT) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}

	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
				lck_mtx_t *mutex_held;

				if (so->so_proto->pr_getlock != NULL) {
					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
				} else {
					mutex_held = so->so_proto->pr_domain->dom_mtx;
				}
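				/*
				 * so_linger is kept in ticks of 1/100 s;
				 * convert to a timespec (10 ms per tick)
				 * for the bounded sleep below.
				 */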
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the timer fires;
					 * don't report an error.
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away, but the socket is
		 * retained in the kernel, remove its reference.
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

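	/*
	 * SOF_ABORTED makes repeated soabort() calls idempotent: pru_abort
	 * is issued at most once per socket.
	 */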
	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check whether the destination
		 * address is associated with a domain name and whether it is a
		 * tracker domain, and mark the socket accordingly.  Skip the
		 * lookup if the socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return soconnectlock(so, nam, 1);
}

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1791
1792 int
soconnectxlocked(struct socket * so,struct sockaddr * src,struct sockaddr * dst,struct proc * p,uint32_t ifscope,sae_associd_t aid,sae_connid_t * pcid,uint32_t flags,void * arg,uint32_t arglen,uio_t auio,user_ssize_t * bytes_written)1793 soconnectxlocked(struct socket *so, struct sockaddr *src,
1794 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1795 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1796 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1797 {
1798 int error;
1799 tracker_metadata_t metadata = { };
1800
1801 so_update_last_owner_locked(so, p);
1802 so_update_policy(so);
1803
1804 /*
1805 * If this is a listening socket or if this is a previously-accepted
1806 * socket that has been marked as inactive, reject the connect request.
1807 */
1808 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1809 error = EOPNOTSUPP;
1810 if (so->so_flags & SOF_DEFUNCT) {
1811 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1812 "(%d)\n", __func__, proc_pid(p),
1813 proc_best_name(p),
1814 so->so_gencnt,
1815 SOCK_DOM(so), SOCK_TYPE(so), error);
1816 }
1817 return error;
1818 }
1819
1820 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1821 return EPERM;
1822 }
1823
1824 /*
1825 * If protocol is connection-based, can only connect once
1826 * unless PR_MULTICONN is set. Otherwise, if connected,
1827 * try to disconnect first. This allows user to disconnect
1828 * by connecting to, e.g., a null address.
1829 */
1830 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1831 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1832 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1833 (error = sodisconnectlocked(so)) != 0)) {
1834 error = EISCONN;
1835 } else {
1836 /*
1837 * For TCP, check if destination address is a tracker and mark the socket accordingly
1838 * (only if it hasn't been marked yet).
1839 */
1840 if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1841 !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1842 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1843 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1844 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1845 }
1846 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1847 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1848 }
1849 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1850 printf("connectx() - failed necp_set_socket_domain_attributes");
1851 }
1852 }
1853 }
1854
1855 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1856 (flags & CONNECT_DATA_IDEMPOTENT)) {
1857 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1858
1859 if (flags & CONNECT_DATA_AUTHENTICATED) {
1860 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1861 }
1862 }
1863
1864 /*
1865 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1866 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1867 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1868 * Case 3 allows user to combine write with connect even if they have
1869 * no use for TFO (such as regular TCP or UDP).
1870 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1871 */
1872 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1873 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1874 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1875 }
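/*
 * Hedged sketch of the user-space view (addresses and buffers are
 * hypothetical): a caller reaches this code via connectx(2), e.g. for
 * a TCP Fast Open style connect carrying idempotent preconnect data
 * (Case 3 above, with CONNECT_DATA_IDEMPOTENT):
 *
 *	struct iovec iov = { .iov_base = req, .iov_len = reqlen };
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = (struct sockaddr *)&sin,
 *		.sae_dstaddrlen = sizeof(sin),
 *	};
 *	size_t sent = 0;
 *	sae_connid_t cid = SAE_CONNID_ANY;
 *	int rv = connectx(fd, &sae, SAE_ASSOCID_ANY,
 *	    CONNECT_DATA_IDEMPOTENT, &iov, 1, &sent, &cid);
 */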
1876
1877 /*
1878 * If a user sets data idempotent but passes neither an uio nor
1879 * CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1880 * SOF1_DATA_IDEMPOTENT.
1881 */
1882 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1883 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1884 /* We should return EINVAL instead perhaps. */
1885 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1886 }
1887
1888 /*
1889 * Run connect filter before calling protocol:
1890 * - non-blocking connect returns before completion;
1891 */
1892 error = sflt_connectout(so, dst);
1893 if (error != 0) {
1894 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1895 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1896 if (error == EJUSTRETURN) {
1897 error = 0;
1898 }
1899 } else {
1900 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1901 (so, src, dst, p, ifscope, aid, pcid,
1902 flags, arg, arglen, auio, bytes_written);
1903 if (error != 0) {
1904 so->so_state &= ~SS_ISCONNECTING;
1905 if (error != EINPROGRESS) {
1906 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1907 }
1908 }
1909 }
1910 }
1911
1912 return error;
1913 }
1914
1915 int
1916 sodisconnectlocked(struct socket *so)
1917 {
1918 int error;
1919
1920 if ((so->so_state & SS_ISCONNECTED) == 0) {
1921 error = ENOTCONN;
1922 goto bad;
1923 }
1924 if (so->so_state & SS_ISDISCONNECTING) {
1925 error = EALREADY;
1926 goto bad;
1927 }
1928
1929 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1930 if (error == 0) {
1931 sflt_notify(so, sock_evt_disconnected, NULL);
1932 }
1933
1934 bad:
1935 return error;
1936 }
1937
1938 /* Locking version */
1939 int
1940 sodisconnect(struct socket *so)
1941 {
1942 int error;
1943
1944 socket_lock(so, 1);
1945 error = sodisconnectlocked(so);
1946 socket_unlock(so, 1);
1947 return error;
1948 }
1949
1950 int
1951 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1952 {
1953 int error;
1954
1955 /*
1956 * Call the protocol disconnectx handler; let it handle all
1957 * matters related to the connection state of this session.
1958 */
1959 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1960 if (error == 0) {
1961 /*
1962 * The event applies only for the session, not for
1963 * the disconnection of individual subflows.
1964 */
1965 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1966 sflt_notify(so, sock_evt_disconnected, NULL);
1967 }
1968 }
1969 return error;
1970 }
1971
1972 int
1973 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1974 {
1975 int error;
1976
1977 socket_lock(so, 1);
1978 error = sodisconnectxlocked(so, aid, cid);
1979 socket_unlock(so, 1);
1980 return error;
1981 }
1982
1983 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1984
1985 /*
1986 * sosendcheck will lock the socket buffer if it isn't locked and
1987 * verify that there is space for the data being inserted.
1988 *
1989 * Returns: 0 Success
1990 * EPIPE
1991 * sblock:EWOULDBLOCK
1992 * sblock:EINTR
1993 * sbwait:EBADF
1994 * sbwait:EINTR
1995 * [so_error]:???
1996 */
1997 int
1998 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1999 int32_t clen, int32_t atomic, int flags, int *sblocked)
2000 {
2001 int error = 0;
2002 int32_t space;
2003 int assumelock = 0;
2004
2005 restart:
2006 if (*sblocked == 0) {
2007 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2008 so->so_send_filt_thread != 0 &&
2009 so->so_send_filt_thread == current_thread()) {
2010 /*
2011 * We're being called recursively from a filter,
2012 * allow this to continue. Radar 4150520.
2013 * Don't set sblocked because we don't want
2014 * to perform an unlock later.
2015 */
2016 assumelock = 1;
2017 } else {
2018 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2019 if (error) {
2020 if (so->so_flags & SOF_DEFUNCT) {
2021 goto defunct;
2022 }
2023 return error;
2024 }
2025 *sblocked = 1;
2026 }
2027 }
2028
2029 /*
2030 * If a send attempt is made on a socket that has been marked
2031 * as inactive (disconnected), reject the request.
2032 */
2033 if (so->so_flags & SOF_DEFUNCT) {
2034 defunct:
2035 error = EPIPE;
2036 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2037 __func__, proc_selfpid(), proc_best_name(current_proc()),
2038 so->so_gencnt,
2039 SOCK_DOM(so), SOCK_TYPE(so), error);
2040 return error;
2041 }
2042
2043 if (so->so_state & SS_CANTSENDMORE) {
2044 #if CONTENT_FILTER
2045 /*
2046 * Can re-inject data on half-closed connections
2047 */
2048 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2049 so->so_snd.sb_cfil_thread == current_thread() &&
2050 cfil_sock_data_pending(&so->so_snd) != 0) {
2051 CFIL_LOG(LOG_INFO,
2052 "so %llx ignore SS_CANTSENDMORE",
2053 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2054 } else
2055 #endif /* CONTENT_FILTER */
2056 return EPIPE;
2057 }
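/*
 * User-space note (illustrative): the EPIPE above is usually
 * accompanied by SIGPIPE delivery to the writing process; a caller
 * that prefers to see only the error code can opt out per socket:
 *
 *	int on = 1;
 *	(void) setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on));
 */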
2058 if (so->so_error) {
2059 error = so->so_error;
2060 so->so_error = 0;
2061 return error;
2062 }
2063
2064 if ((so->so_state & SS_ISCONNECTED) == 0) {
2065 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2066 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2067 (resid != 0 || clen == 0) &&
2068 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2069 return ENOTCONN;
2070 }
2071 } else if (addr == 0) {
2072 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2073 ENOTCONN : EDESTADDRREQ;
2074 }
2075 }
2076
2077 space = sbspace(&so->so_snd);
2078
2079 if (flags & MSG_OOB) {
2080 space += 1024;
2081 }
2082 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2083 clen > so->so_snd.sb_hiwat) {
2084 return EMSGSIZE;
2085 }
2086
2087 if ((space < resid + clen &&
2088 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2089 space < clen)) ||
2090 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2091 /*
2092 * don't block the connectx call when there's more data
2093 * than can be copied.
2094 */
2095 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2096 if (space == 0) {
2097 return EWOULDBLOCK;
2098 }
2099 if (space < (int32_t)so->so_snd.sb_lowat) {
2100 return 0;
2101 }
2102 }
2103 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2104 assumelock) {
2105 return EWOULDBLOCK;
2106 }
2107 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2108 *sblocked = 0;
2109 error = sbwait(&so->so_snd);
2110 if (error) {
2111 if (so->so_flags & SOF_DEFUNCT) {
2112 goto defunct;
2113 }
2114 return error;
2115 }
2116 goto restart;
2117 }
2118 return 0;
2119 }
2120
2121 /*
2122 * Send on a socket.
2123 * If send must go all at once and message is larger than
2124 * send buffering, then hard error.
2125 * Lock against other senders.
2126 * If must go all at once and not enough room now, then
2127 * inform user that this would block and do nothing.
2128 * Otherwise, if nonblocking, send as much as possible.
2129 * The data to be sent is described by "uio" if nonzero,
2130 * otherwise by the mbuf chain "top" (which must be null
2131 * if uio is not). Data provided in mbuf chain must be small
2132 * enough to send all at once.
2133 *
2134 * Returns nonzero on error, timeout or signal; callers
2135 * must check for short counts if EINTR/ERESTART are returned.
2136 * Data and control buffers are freed on return.
2137 *
2138 * Returns: 0 Success
2139 * EOPNOTSUPP
2140 * EINVAL
2141 * ENOBUFS
2142 * uiomove:EFAULT
2143 * sosendcheck:EPIPE
2144 * sosendcheck:EWOULDBLOCK
2145 * sosendcheck:EINTR
2146 * sosendcheck:EBADF
2147 * sosendcheck:EINTR
2148 * sosendcheck:??? [value from so_error]
2149 * <pru_send>:ECONNRESET[TCP]
2150 * <pru_send>:EINVAL[TCP]
2151 * <pru_send>:ENOBUFS[TCP]
2152 * <pru_send>:EADDRINUSE[TCP]
2153 * <pru_send>:EADDRNOTAVAIL[TCP]
2154 * <pru_send>:EAFNOSUPPORT[TCP]
2155 * <pru_send>:EACCES[TCP]
2156 * <pru_send>:EAGAIN[TCP]
2157 * <pru_send>:EPERM[TCP]
2158 * <pru_send>:EMSGSIZE[TCP]
2159 * <pru_send>:EHOSTUNREACH[TCP]
2160 * <pru_send>:ENETUNREACH[TCP]
2161 * <pru_send>:ENETDOWN[TCP]
2162 * <pru_send>:ENOMEM[TCP]
2163 * <pru_send>:ENOBUFS[TCP]
2164 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2165 * <pru_send>:EINVAL[AF_UNIX]
2166 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2167 * <pru_send>:EPIPE[AF_UNIX]
2168 * <pru_send>:ENOTCONN[AF_UNIX]
2169 * <pru_send>:EISCONN[AF_UNIX]
2170 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2171 * <sf_data_out>:??? [whatever a filter author chooses]
2172 *
2173 * Notes: Other <pru_send> returns depend on the protocol family; all
2174 * <sf_data_out> returns depend on what the filter author causes
2175 * their filter to return.
2176 */
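/*
 * Caller-side sketch (user space; the helper is hypothetical): since a
 * send can be cut short by a signal, a robust writer loops on short
 * counts and retries EINTR:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *
 *	static ssize_t
 *	send_all(int fd, const char *buf, size_t len)
 *	{
 *		size_t off = 0;
 *
 *		while (off < len) {
 *			ssize_t n = send(fd, buf + off, len - off, 0);
 *			if (n < 0) {
 *				if (errno == EINTR)
 *					continue;
 *				return -1;
 *			}
 *			off += (size_t)n;
 *		}
 *		return (ssize_t)off;
 *	}
 */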
2177 int
2178 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2179 struct mbuf *top, struct mbuf *control, int flags)
2180 {
2181 struct mbuf **mp;
2182 struct mbuf *m, *freelist = NULL;
2183 struct soflow_hash_entry *dgram_flow_entry = NULL;
2184 user_ssize_t space, len, resid, orig_resid;
2185 int clen = 0, error, dontroute, sendflags;
2186 int atomic = sosendallatonce(so) || top;
2187 int sblocked = 0;
2188 struct proc *p = current_proc();
2189 uint16_t headroom = 0;
2190 ssize_t mlen;
2191 boolean_t en_tracing = FALSE;
2192
2193 if (uio != NULL) {
2194 resid = uio_resid(uio);
2195 } else {
2196 resid = top->m_pkthdr.len;
2197 }
2198 orig_resid = resid;
2199
2200 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2201 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2202
2203 socket_lock(so, 1);
2204
2205 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2206 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2207 }
2208
2209 /*
2210 * trace only if tracing is enabled, for network (vs. unix)
2211 * sockets, and for non-loopback traffic
2212 */
2213 if (ENTR_SHOULDTRACE &&
2214 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2215 struct inpcb *inp = sotoinpcb(so);
2216 if (inp->inp_last_outifp != NULL &&
2217 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2218 en_tracing = TRUE;
2219 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2220 VM_KERNEL_ADDRPERM(so),
2221 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2222 (int64_t)resid);
2223 }
2224 }
2225
2226 /*
2227 * Re-injection should not affect process accounting
2228 */
2229 if ((flags & MSG_SKIPCFIL) == 0) {
2230 so_update_last_owner_locked(so, p);
2231 so_update_policy(so);
2232
2233 #if NECP
2234 so_update_necp_policy(so, NULL, addr);
2235 #endif /* NECP */
2236 }
2237
2238 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2239 error = EOPNOTSUPP;
2240 goto out_locked;
2241 }
2242
2243 /*
2244 * In theory resid should be unsigned.
2245 * However, space must be signed, as it might be less than 0
2246 * if we over-committed, and we must use a signed comparison
2247 * of space and resid. On the other hand, a negative resid
2248 * causes us to loop sending 0-length segments to the protocol.
2249 *
2250 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2251 *
2252 * Note: We limit resid to be a positive int value as we use
2253 * imin() to set bytes_to_copy -- radr://14558484
2254 */
2255 if (resid < 0 || resid > INT_MAX ||
2256 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2257 error = EINVAL;
2258 goto out_locked;
2259 }
2260
2261 dontroute = (flags & MSG_DONTROUTE) &&
2262 (so->so_options & SO_DONTROUTE) == 0 &&
2263 (so->so_proto->pr_flags & PR_ATOMIC);
2264 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2265
2266 if (control != NULL) {
2267 clen = control->m_len;
2268 }
2269
2270 if (soreserveheadroom != 0) {
2271 headroom = so->so_pktheadroom;
2272 }
2273
2274 do {
2275 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2276 &sblocked);
2277 if (error) {
2278 goto out_locked;
2279 }
2280
2281 mp = ⊤
2282 space = sbspace(&so->so_snd) - clen;
2283 space += ((flags & MSG_OOB) ? 1024 : 0);
2284
2285 do {
2286 if (uio == NULL) {
2287 /*
2288 * Data is prepackaged in "top".
2289 */
2290 resid = 0;
2291 if (flags & MSG_EOR) {
2292 top->m_flags |= M_EOR;
2293 }
2294 } else {
2295 int chainlength;
2296 int bytes_to_copy;
2297 boolean_t jumbocl;
2298 boolean_t bigcl;
2299 int bytes_to_alloc;
2300
2301 bytes_to_copy = imin((int)resid, (int)space);
2302
2303 bytes_to_alloc = bytes_to_copy;
2304 if (top == NULL) {
2305 bytes_to_alloc += headroom;
2306 }
2307
2308 if (sosendminchain > 0) {
2309 chainlength = 0;
2310 } else {
2311 chainlength = sosendmaxchain;
2312 }
2313
2314 /*
2315 * Use big 4 KB clusters when the outgoing interface
2316 * does not prefer 2 KB clusters
2317 */
2318 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2319 sosendbigcl_ignore_capab;
2320
2321 /*
2322 * Attempt to use larger than system page-size
2323 * clusters for large writes only if there is
2324 * a jumbo cluster pool and if the socket is
2325 * marked accordingly.
2326 */
2327 jumbocl = sosendjcl && njcl > 0 &&
2328 ((so->so_flags & SOF_MULTIPAGES) ||
2329 sosendjcl_ignore_capab) &&
2330 bigcl;
2331
2332 socket_unlock(so, 0);
2333
2334 do {
2335 int num_needed;
2336 int hdrs_needed = (top == NULL) ? 1 : 0;
2337
2338 /*
2339 * Try to maintain a local cache of mbuf
2340 * clusters needed to complete this
2341 * write. The list is further limited to
2342 * the number that are currently needed
2343 * to fill the socket; this mechanism
2344 * allows a large number of mbufs/
2345 * clusters to be grabbed under a single
2346 * mbuf lock. If we can't get any
2347 * clusters, then fall back to trying
2348 * for mbufs. If we fail early (or
2349 * miscalculate the number needed), make
2350 * sure to release any clusters we
2351 * haven't yet consumed.
2352 */
2353 if (freelist == NULL &&
2354 bytes_to_alloc > MBIGCLBYTES &&
2355 jumbocl) {
2356 num_needed =
2357 bytes_to_alloc / M16KCLBYTES;
2358
2359 if ((bytes_to_alloc -
2360 (num_needed * M16KCLBYTES))
2361 >= MINCLSIZE) {
2362 num_needed++;
2363 }
2364
2365 freelist =
2366 m_getpackets_internal(
2367 (unsigned int *)&num_needed,
2368 hdrs_needed, M_WAIT, 0,
2369 M16KCLBYTES);
2370 /*
2371 * Fall back to 4K cluster size
2372 * if allocation failed
2373 */
2374 }
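/*
 * Worked example (hypothetical write size): for bytes_to_alloc of
 * 100 KB (102400 bytes) with 16 KB jumbo clusters, the division above
 * yields num_needed = 102400 / M16KCLBYTES = 6, leaving a 4096-byte
 * remainder; since that remainder is at least MINCLSIZE, num_needed
 * is bumped to 7 before calling m_getpackets_internal().
 */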
2375
2376 if (freelist == NULL &&
2377 bytes_to_alloc > MCLBYTES &&
2378 bigcl) {
2379 num_needed =
2380 bytes_to_alloc / MBIGCLBYTES;
2381
2382 if ((bytes_to_alloc -
2383 (num_needed * MBIGCLBYTES)) >=
2384 MINCLSIZE) {
2385 num_needed++;
2386 }
2387
2388 freelist =
2389 m_getpackets_internal(
2390 (unsigned int *)&num_needed,
2391 hdrs_needed, M_WAIT, 0,
2392 MBIGCLBYTES);
2393 /*
2394 * Fall back to cluster size
2395 * if allocation failed
2396 */
2397 }
2398
2399 /*
2400 * Allocate a cluster, as we want to
2401 * avoid splitting the data into more
2402 * than one segment; using MINCLSIZE
2403 * would lead us to allocate two mbufs
2404 */
2405 if (soreserveheadroom != 0 &&
2406 freelist == NULL &&
2407 ((top == NULL &&
2408 bytes_to_alloc > _MHLEN) ||
2409 bytes_to_alloc > _MLEN)) {
2410 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2411 MCLBYTES;
2412 freelist =
2413 m_getpackets_internal(
2414 (unsigned int *)&num_needed,
2415 hdrs_needed, M_WAIT, 0,
2416 MCLBYTES);
2417 /*
2418 * Fall back to a single mbuf
2419 * if allocation failed
2420 */
2421 } else if (freelist == NULL &&
2422 bytes_to_alloc > MINCLSIZE) {
2423 num_needed =
2424 bytes_to_alloc / MCLBYTES;
2425
2426 if ((bytes_to_alloc -
2427 (num_needed * MCLBYTES)) >=
2428 MINCLSIZE) {
2429 num_needed++;
2430 }
2431
2432 freelist =
2433 m_getpackets_internal(
2434 (unsigned int *)&num_needed,
2435 hdrs_needed, M_WAIT, 0,
2436 MCLBYTES);
2437 /*
2438 * Fall back to a single mbuf
2439 * if allocation failed
2440 */
2441 }
2442 /*
2443 * For datagram protocols, leave
2444 * headroom for protocol headers
2445 * in the first cluster of the chain
2446 */
2447 if (freelist != NULL && atomic &&
2448 top == NULL && headroom > 0) {
2449 freelist->m_data += headroom;
2450 }
2451
2452 /*
2453 * Fall back to regular mbufs without
2454 * reserving the socket headroom
2455 */
2456 if (freelist == NULL) {
2457 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2458 if (top == NULL) {
2459 MGETHDR(freelist,
2460 M_WAIT, MT_DATA);
2461 } else {
2462 MGET(freelist,
2463 M_WAIT, MT_DATA);
2464 }
2465 }
2466
2467 if (freelist == NULL) {
2468 error = ENOBUFS;
2469 socket_lock(so, 0);
2470 goto out_locked;
2471 }
2472 /*
2473 * For datagram protocols,
2474 * leave room for protocol
2475 * headers in first mbuf.
2476 */
2477 if (atomic && top == NULL &&
2478 bytes_to_copy > 0 &&
2479 bytes_to_copy < MHLEN) {
2480 MH_ALIGN(freelist,
2481 bytes_to_copy);
2482 }
2483 }
2484 m = freelist;
2485 freelist = m->m_next;
2486 m->m_next = NULL;
2487
2488 if ((m->m_flags & M_EXT)) {
2489 mlen = m->m_ext.ext_size -
2490 M_LEADINGSPACE(m);
2491 } else if ((m->m_flags & M_PKTHDR)) {
2492 mlen = MHLEN - M_LEADINGSPACE(m);
2493 m_add_crumb(m, PKT_CRUMB_SOSEND);
2494 } else {
2495 mlen = MLEN - M_LEADINGSPACE(m);
2496 }
2497 len = imin((int)mlen, bytes_to_copy);
2498
2499 chainlength += len;
2500
2501 space -= len;
2502
2503 error = uiomove(mtod(m, caddr_t),
2504 (int)len, uio);
2505
2506 resid = uio_resid(uio);
2507
2508 m->m_len = (int32_t)len;
2509 *mp = m;
2510 top->m_pkthdr.len += len;
2511 if (error) {
2512 break;
2513 }
2514 mp = &m->m_next;
2515 if (resid <= 0) {
2516 if (flags & MSG_EOR) {
2517 top->m_flags |= M_EOR;
2518 }
2519 break;
2520 }
2521 bytes_to_copy = imin((int)resid, (int)space);
2522 } while (space > 0 &&
2523 (chainlength < sosendmaxchain || atomic ||
2524 resid < MINCLSIZE));
2525
2526 socket_lock(so, 0);
2527
2528 if (error) {
2529 goto out_locked;
2530 }
2531 }
2532
2533 if (dontroute) {
2534 so->so_options |= SO_DONTROUTE;
2535 }
2536
2537 /*
2538 * Compute flags here, for pru_send and NKEs
2539 *
2540 * If the user set MSG_EOF, the protocol
2541 * understands this flag, and there is nothing left to
2542 * send, then use PRU_SEND_EOF instead of PRU_SEND.
2543 */
2544 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2545 ((flags & MSG_EOF) &&
2546 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2547 (resid <= 0)) ? PRUS_EOF :
2548 /* If there is more to send set PRUS_MORETOCOME */
2549 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2550
2551 if ((flags & MSG_SKIPCFIL) == 0) {
2552 /*
2553 * Socket filter processing
2554 */
2555 error = sflt_data_out(so, addr, &top,
2556 &control, (sendflags & MSG_OOB) ?
2557 sock_data_filt_flag_oob : 0);
2558 if (error) {
2559 if (error == EJUSTRETURN) {
2560 error = 0;
2561 goto packet_consumed;
2562 }
2563 goto out_locked;
2564 }
2565 #if CONTENT_FILTER
2566 /*
2567 * Content filter processing
2568 */
2569 error = cfil_sock_data_out(so, addr, top,
2570 control, sendflags, dgram_flow_entry);
2571 if (error) {
2572 if (error == EJUSTRETURN) {
2573 error = 0;
2574 goto packet_consumed;
2575 }
2576 goto out_locked;
2577 }
2578 #endif /* CONTENT_FILTER */
2579 }
2580 error = (*so->so_proto->pr_usrreqs->pru_send)
2581 (so, sendflags, top, addr, control, p);
2582
2583 packet_consumed:
2584 if (dontroute) {
2585 so->so_options &= ~SO_DONTROUTE;
2586 }
2587
2588 clen = 0;
2589 control = NULL;
2590 top = NULL;
2591 mp = ⊤
2592 if (error) {
2593 goto out_locked;
2594 }
2595 } while (resid && space > 0);
2596 } while (resid);
2597
2598
2599 out_locked:
2600 if (resid > orig_resid) {
2601 char pname[MAXCOMLEN] = {};
2602 pid_t current_pid = proc_pid(current_proc());
2603 proc_name(current_pid, pname, sizeof(pname));
2604
2605 if (sosend_assert_panic != 0) {
2606 panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2607 so, resid, orig_resid, pname, current_pid);
2608 } else {
2609 os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2610 so->so_gencnt, resid, orig_resid, pname, current_pid);
2611 }
2612 }
2613
2614 if (sblocked) {
2615 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2616 } else {
2617 socket_unlock(so, 1);
2618 }
2619 if (top != NULL) {
2620 m_freem(top);
2621 }
2622 if (control != NULL) {
2623 m_freem(control);
2624 }
2625 if (freelist != NULL) {
2626 m_freem_list(freelist);
2627 }
2628
2629 if (dgram_flow_entry != NULL) {
2630 soflow_free_flow(dgram_flow_entry);
2631 }
2632
2633 soclearfastopen(so);
2634
2635 if (en_tracing) {
2636 /* resid passed here is the bytes left in uio */
2637 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2638 VM_KERNEL_ADDRPERM(so),
2639 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2640 (int64_t)(orig_resid - resid));
2641 }
2642 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2643 so->so_snd.sb_cc, space, error);
2644
2645 return error;
2646 }
2647
2648 int
2649 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2650 {
2651 struct mbuf *m0 = NULL, *control_end = NULL;
2652
2653 socket_lock_assert_owned(so);
2654
2655 /*
2656 * top must point to the mbuf chain to be sent.
2657 * If control is not NULL, top must have a packet header.
2658 */
2659 VERIFY(top != NULL &&
2660 (control == NULL || top->m_flags & M_PKTHDR));
2661
2662 /*
2663 * If control is not passed in, see if we can get it
2664 * from top.
2665 */
2666 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2667 // Locate start of control if present and start of data
2668 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2669 if (m0->m_flags & M_PKTHDR) {
2670 top = m0;
2671 break;
2672 } else if (m0->m_type == MT_CONTROL) {
2673 if (control == NULL) {
2674 // Found start of control
2675 control = m0;
2676 }
2677 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2678 // Found end of control
2679 control_end = m0;
2680 }
2681 }
2682 }
2683 if (control_end != NULL) {
2684 control_end->m_next = NULL;
2685 }
2686 }
2687
2688 int error = (*so->so_proto->pr_usrreqs->pru_send)
2689 (so, sendflags, top, addr, control, current_proc());
2690
2691 return error;
2692 }
2693
2694 /*
2695 * Supports only connected sockets (no address) without ancillary data
2696 * (control mbuf), for atomic protocols
2697 */
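/*
 * Kernel-caller sketch (hedged; buffers and lengths are hypothetical):
 * each datagram gets its own uio built with the uio KPI, and the array
 * is handed over in a single call. Per the checks below, the socket
 * must be a SOCK_DGRAM socket whose protocol provides pru_send_list.
 *
 *	uio_t uios[2];
 *
 *	uios[0] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
 *	uio_addiov(uios[0], (user_addr_t)pkt0, pkt0len);
 *	uios[1] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
 *	uio_addiov(uios[1], (user_addr_t)pkt1, pkt1len);
 *	error = sosend_list(so, uios, 2, 0);
 *	uio_free(uios[0]);
 *	uio_free(uios[1]);
 */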
2698 int
2699 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2700 {
2701 struct mbuf *m, *freelist = NULL;
2702 struct soflow_hash_entry *dgram_flow_entry = NULL;
2703 user_ssize_t len, resid;
2704 int error, dontroute;
2705 int atomic = sosendallatonce(so);
2706 int sblocked = 0;
2707 struct proc *p = current_proc();
2708 u_int uiofirst = 0;
2709 u_int uiolast = 0;
2710 struct mbuf *top = NULL;
2711 uint16_t headroom = 0;
2712 ssize_t mlen;
2713 boolean_t bigcl;
2714
2715 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2716 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2717
2718 if (so->so_type != SOCK_DGRAM) {
2719 error = EINVAL;
2720 goto out;
2721 }
2722 if (atomic == 0) {
2723 error = EINVAL;
2724 goto out;
2725 }
2726 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2727 error = EPROTONOSUPPORT;
2728 goto out;
2729 }
2730 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2731 error = EINVAL;
2732 goto out;
2733 }
2734 resid = uio_array_resid(uioarray, uiocnt);
2735
2736 /*
2737 * In theory resid should be unsigned.
2738 * However, space must be signed, as it might be less than 0
2739 * if we over-committed, and we must use a signed comparison
2740 * of space and resid. On the other hand, a negative resid
2741 * causes us to loop sending 0-length segments to the protocol.
2742 *
2743 * Note: We limit resid to be a positive int value as we use
2744 * imin() to set bytes_to_copy -- radr://14558484
2745 */
2746 if (resid < 0 || resid > INT_MAX) {
2747 error = EINVAL;
2748 goto out;
2749 }
2750
2751 socket_lock(so, 1);
2752 so_update_last_owner_locked(so, p);
2753 so_update_policy(so);
2754
2755 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2756 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2757 }
2758
2759 #if NECP
2760 so_update_necp_policy(so, NULL, NULL);
2761 #endif /* NECP */
2762
2763 dontroute = (flags & MSG_DONTROUTE) &&
2764 (so->so_options & SO_DONTROUTE) == 0 &&
2765 (so->so_proto->pr_flags & PR_ATOMIC);
2766 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2767
2768 error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2769 if (error) {
2770 goto release;
2771 }
2772
2773 /*
2774 * Use big 4 KB clusters when the outgoing interface does not prefer
2775 * 2 KB clusters
2776 */
2777 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2778
2779 if (soreserveheadroom != 0) {
2780 headroom = so->so_pktheadroom;
2781 }
2782
2783 do {
2784 int i;
2785 int num_needed = 0;
2786 int chainlength;
2787 size_t maxpktlen = 0;
2788 int bytes_to_alloc;
2789
2790 if (sosendminchain > 0) {
2791 chainlength = 0;
2792 } else {
2793 chainlength = sosendmaxchain;
2794 }
2795
2796 socket_unlock(so, 0);
2797
2798 /*
2799 * Find a set of uios that fits in a reasonable number
2800 * of mbuf packets
2801 */
2802 for (i = uiofirst; i < uiocnt; i++) {
2803 struct uio *auio = uioarray[i];
2804
2805 len = uio_resid(auio);
2806
2807 /* Do nothing for empty messages */
2808 if (len == 0) {
2809 continue;
2810 }
2811
2812 num_needed += 1;
2813 uiolast += 1;
2814
2815 if (len > maxpktlen) {
2816 maxpktlen = len;
2817 }
2818
2819 chainlength += len;
2820 if (chainlength > sosendmaxchain) {
2821 break;
2822 }
2823 }
2824 /*
2825 * Nothing left to send
2826 */
2827 if (num_needed == 0) {
2828 socket_lock(so, 0);
2829 break;
2830 }
2831 /*
2832 * Allocate a buffer large enough to include headroom space for
2833 * network and link headers
2835 */
2836 bytes_to_alloc = (int) maxpktlen + headroom;
2837
2838 /*
2839 * Allocate a single contiguous buffer of the smallest available
2840 * size when possible
2841 */
2842 if (bytes_to_alloc > MCLBYTES &&
2843 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2844 freelist = m_getpackets_internal(
2845 (unsigned int *)&num_needed,
2846 num_needed, M_WAIT, 1,
2847 MBIGCLBYTES);
2848 } else if (bytes_to_alloc > _MHLEN &&
2849 bytes_to_alloc <= MCLBYTES) {
2850 freelist = m_getpackets_internal(
2851 (unsigned int *)&num_needed,
2852 num_needed, M_WAIT, 1,
2853 MCLBYTES);
2854 } else {
2855 freelist = m_allocpacket_internal(
2856 (unsigned int *)&num_needed,
2857 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2858 }
2859
2860 if (freelist == NULL) {
2861 socket_lock(so, 0);
2862 error = ENOMEM;
2863 goto release;
2864 }
2865 /*
2866 * Copy each uio of the set into its own mbuf packet
2867 */
2868 for (i = uiofirst, m = freelist;
2869 i < uiolast && m != NULL;
2870 i++) {
2871 int bytes_to_copy;
2872 struct mbuf *n;
2873 struct uio *auio = uioarray[i];
2874
2875 bytes_to_copy = (int)uio_resid(auio);
2876
2877 /* Do nothing for empty messages */
2878 if (bytes_to_copy == 0) {
2879 continue;
2880 }
2881 /*
2882 * Leave headroom for protocol headers
2883 * in the first mbuf of the chain
2884 */
2885 m->m_data += headroom;
2886
2887 for (n = m; n != NULL; n = n->m_next) {
2888 if ((m->m_flags & M_EXT)) {
2889 mlen = m->m_ext.ext_size -
2890 M_LEADINGSPACE(m);
2891 } else if ((m->m_flags & M_PKTHDR)) {
2892 mlen =
2893 MHLEN - M_LEADINGSPACE(m);
2894 } else {
2895 mlen = MLEN - M_LEADINGSPACE(m);
2896 }
2897 len = imin((int)mlen, bytes_to_copy);
2898
2899 /*
2900 * Note: uiomove() decrements the iovec
2901 * length
2902 */
2903 error = uiomove(mtod(n, caddr_t),
2904 (int)len, auio);
2905 if (error != 0) {
2906 break;
2907 }
2908 n->m_len = (int32_t)len;
2909 m->m_pkthdr.len += len;
2910
2911 VERIFY(m->m_pkthdr.len <= maxpktlen);
2912
2913 bytes_to_copy -= len;
2914 resid -= len;
2915 }
2916 if (m->m_pkthdr.len == 0) {
2917 printf(
2918 "%s:%d so %llx pkt %llx type %u len null\n",
2919 __func__, __LINE__,
2920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2921 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2922 m->m_type);
2923 }
2924 if (error != 0) {
2925 break;
2926 }
2927 m = m->m_nextpkt;
2928 }
2929
2930 socket_lock(so, 0);
2931
2932 if (error) {
2933 goto release;
2934 }
2935 top = freelist;
2936 freelist = NULL;
2937
2938 if (dontroute) {
2939 so->so_options |= SO_DONTROUTE;
2940 }
2941
2942 if ((flags & MSG_SKIPCFIL) == 0) {
2943 struct mbuf **prevnextp = NULL;
2944
2945 for (i = uiofirst, m = top;
2946 i < uiolast && m != NULL;
2947 i++) {
2948 struct mbuf *nextpkt = m->m_nextpkt;
2949
2950 /*
2951 * Socket filter processing
2952 */
2953 error = sflt_data_out(so, NULL, &m,
2954 NULL, 0);
2955 if (error != 0 && error != EJUSTRETURN) {
2956 goto release;
2957 }
2958
2959 #if CONTENT_FILTER
2960 if (error == 0) {
2961 /*
2962 * Content filter processing
2963 */
2964 error = cfil_sock_data_out(so, NULL, m,
2965 NULL, 0, dgram_flow_entry);
2966 if (error != 0 && error != EJUSTRETURN) {
2967 goto release;
2968 }
2969 }
2970 #endif /* CONTENT_FILTER */
2971 /*
2972 * Remove packet from the list when
2973 * swallowed by a filter
2974 */
2975 if (error == EJUSTRETURN) {
2976 error = 0;
2977 if (prevnextp != NULL) {
2978 *prevnextp = nextpkt;
2979 } else {
2980 top = nextpkt;
2981 }
2982 }
2983
2984 m = nextpkt;
2985 if (m != NULL) {
2986 prevnextp = &m->m_nextpkt;
2987 }
2988 }
2989 }
2990 if (top != NULL) {
2991 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2992 (so, 0, top, NULL, NULL, p);
2993 }
2994
2995 if (dontroute) {
2996 so->so_options &= ~SO_DONTROUTE;
2997 }
2998
2999 top = NULL;
3000 uiofirst = uiolast;
3001 } while (resid > 0 && error == 0);
3002 release:
3003 if (sblocked) {
3004 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
3005 } else {
3006 socket_unlock(so, 1);
3007 }
3008 out:
3009 if (top != NULL) {
3010 m_freem(top);
3011 }
3012 if (freelist != NULL) {
3013 m_freem_list(freelist);
3014 }
3015
3016 if (dgram_flow_entry != NULL) {
3017 soflow_free_flow(dgram_flow_entry);
3018 }
3019
3020 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
3021 so->so_snd.sb_cc, 0, error);
3022
3023 return error;
3024 }
3025
3026 /*
3027 * May return ERESTART when packet is dropped by MAC policy check
3028 */
3029 static int
3030 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
3031 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
3032 {
3033 int error = 0;
3034 struct mbuf *m = *mp;
3035 struct mbuf *nextrecord = *nextrecordp;
3036
3037 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3038 #if CONFIG_MACF_SOCKET_SUBSET
3039 /*
3040 * Call the MAC framework for policy checking if we're in
3041 * the user process context and the socket isn't connected.
3042 */
3043 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3044 struct mbuf *m0 = m;
3045 /*
3046 * Dequeue this record (temporarily) from the receive
3047 * list since we're about to drop the socket's lock
3048 * where a new record may arrive and be appended to
3049 * the list. Upon MAC policy failure, the record
3050 * will be freed. Otherwise, we'll add it back to
3051 * the head of the list. We cannot rely on SB_LOCK
3052 * because append operation uses the socket's lock.
3053 */
3054 do {
3055 m->m_nextpkt = NULL;
3056 sbfree(&so->so_rcv, m);
3057 m = m->m_next;
3058 } while (m != NULL);
3059 m = m0;
3060 so->so_rcv.sb_mb = nextrecord;
3061 SB_EMPTY_FIXUP(&so->so_rcv);
3062 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3063 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3064 socket_unlock(so, 0);
3065
3066 error = mac_socket_check_received(kauth_cred_get(), so,
3067 mtod(m, struct sockaddr *));
3068
3069 if (error != 0) {
3070 /*
3071 * MAC policy failure; free this record and
3072 * process the next record (or block until
3073 * one is available). We have adjusted sb_cc
3074 * and sb_mbcnt above so there is no need to
3075 * call sbfree() again.
3076 */
3077 m_freem(m);
3078 /*
3079 * Clear SB_LOCK but don't unlock the socket.
3080 * Process the next record or wait for one.
3081 */
3082 socket_lock(so, 0);
3083 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3084 error = ERESTART;
3085 goto done;
3086 }
3087 socket_lock(so, 0);
3088 /*
3089 * If the socket has been defunct'd, drop it.
3090 */
3091 if (so->so_flags & SOF_DEFUNCT) {
3092 m_freem(m);
3093 error = ENOTCONN;
3094 goto done;
3095 }
3096 /*
3097 * Re-adjust the socket receive list and re-enqueue
3098 * the record in front of any packets which may have
3099 * been appended while we dropped the lock.
3100 */
3101 for (m = m0; m->m_next != NULL; m = m->m_next) {
3102 sballoc(&so->so_rcv, m);
3103 }
3104 sballoc(&so->so_rcv, m);
3105 if (so->so_rcv.sb_mb == NULL) {
3106 so->so_rcv.sb_lastrecord = m0;
3107 so->so_rcv.sb_mbtail = m;
3108 }
3109 m = m0;
3110 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3111 so->so_rcv.sb_mb = m;
3112 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3113 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3114 }
3115 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3116 if (psa != NULL) {
3117 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3118 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3119 error = EWOULDBLOCK;
3120 goto done;
3121 }
3122 }
3123 if (flags & MSG_PEEK) {
3124 m = m->m_next;
3125 } else {
3126 sbfree(&so->so_rcv, m);
3127 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3128 panic("%s: about to create invalid socketbuf",
3129 __func__);
3130 /* NOTREACHED */
3131 }
3132 MFREE(m, so->so_rcv.sb_mb);
3133 m = so->so_rcv.sb_mb;
3134 if (m != NULL) {
3135 m->m_nextpkt = nextrecord;
3136 } else {
3137 so->so_rcv.sb_mb = nextrecord;
3138 SB_EMPTY_FIXUP(&so->so_rcv);
3139 }
3140 }
3141 done:
3142 *mp = m;
3143 *nextrecordp = nextrecord;
3144
3145 return error;
3146 }
3147
3148 /*
3149 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3150 * so clear the data portion in order not to leak the file pointers
3151 */
3152 static void
3153 sopeek_scm_rights(struct mbuf *rights)
3154 {
3155 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3156
3157 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3158 VERIFY(cm->cmsg_len <= rights->m_len);
3159 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3160 }
3161 }
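/*
 * User-space sketch of the case being protected (the descriptor fd is
 * hypothetical): receiving a descriptor via SCM_RIGHTS with
 * recvmsg(2). If the caller peeks (MSG_PEEK), the cmsg data portion
 * arrives zeroed by the function above instead of carrying kernel
 * file pointers:
 *
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} u;
 *	struct msghdr msg;
 *	struct cmsghdr *cm;
 *	int newfd = -1;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_control = u.buf;
 *	msg.msg_controllen = sizeof(u.buf);
 *	if (recvmsg(fd, &msg, 0) >= 0 &&
 *	    (cm = CMSG_FIRSTHDR(&msg)) != NULL &&
 *	    cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS)
 *		memcpy(&newfd, CMSG_DATA(cm), sizeof(newfd));
 */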
3162
3163 /*
3164 * Process one or more MT_CONTROL mbufs present before any data mbufs
3165 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3166 * just copy the data; if !MSG_PEEK, we call into the protocol to
3167 * perform externalization.
3168 */
3169 static int
3170 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3171 struct mbuf **mp, struct mbuf **nextrecordp)
3172 {
3173 int error = 0;
3174 struct mbuf *cm = NULL, *cmn;
3175 struct mbuf **cme = &cm;
3176 struct sockbuf *sb_rcv = &so->so_rcv;
3177 struct mbuf **msgpcm = NULL;
3178 struct mbuf *m = *mp;
3179 struct mbuf *nextrecord = *nextrecordp;
3180 struct protosw *pr = so->so_proto;
3181
3182 /*
3183 * Externalizing the control messages would require us to
3184 * drop the socket's lock below. Once we re-acquire the
3185 * lock, the mbuf chain might change. In order to preserve
3186 * consistency, we unlink all control messages from the
3187 * first mbuf chain in one shot and link them separately
3188 * onto a different chain.
3189 */
3190 do {
3191 if (flags & MSG_PEEK) {
3192 if (controlp != NULL) {
3193 if (*controlp == NULL) {
3194 msgpcm = controlp;
3195 }
3196 *controlp = m_copy(m, 0, m->m_len);
3197
3198 /*
3199 * If we failed to allocate an mbuf,
3200 * release any previously allocated
3201 * mbufs for control data. Return
3202 * an error. Keep the mbufs in the
3203 * socket as this is using
3204 * MSG_PEEK flag.
3205 */
3206 if (*controlp == NULL) {
3207 m_freem(*msgpcm);
3208 error = ENOBUFS;
3209 goto done;
3210 }
3211
3212 if (pr->pr_domain->dom_externalize != NULL) {
3213 sopeek_scm_rights(*controlp);
3214 }
3215
3216 controlp = &(*controlp)->m_next;
3217 }
3218 m = m->m_next;
3219 } else {
3220 m->m_nextpkt = NULL;
3221 sbfree(sb_rcv, m);
3222 sb_rcv->sb_mb = m->m_next;
3223 m->m_next = NULL;
3224 *cme = m;
3225 cme = &(*cme)->m_next;
3226 m = sb_rcv->sb_mb;
3227 }
3228 } while (m != NULL && m->m_type == MT_CONTROL);
3229
3230 if (!(flags & MSG_PEEK)) {
3231 if (sb_rcv->sb_mb != NULL) {
3232 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3233 } else {
3234 sb_rcv->sb_mb = nextrecord;
3235 SB_EMPTY_FIXUP(sb_rcv);
3236 }
3237 if (nextrecord == NULL) {
3238 sb_rcv->sb_lastrecord = m;
3239 }
3240 }
3241
3242 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3243 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3244
3245 while (cm != NULL) {
3246 int cmsg_level;
3247 int cmsg_type;
3248
3249 cmn = cm->m_next;
3250 cm->m_next = NULL;
3251 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3252 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3253
3254 /*
3255 * Call the protocol to externalize SCM_RIGHTS message
3256 * and return the modified message to the caller upon
3257 * success. Otherwise, all other control messages are
3258 * returned unmodified to the caller. Note that we
3259 * only get into this loop if MSG_PEEK is not set.
3260 */
3261 if (pr->pr_domain->dom_externalize != NULL &&
3262 cmsg_level == SOL_SOCKET &&
3263 cmsg_type == SCM_RIGHTS) {
3264 /*
3265 * Release socket lock: see 3903171. This
3266 * would also allow more records to be appended
3267 * to the socket buffer. We still have SB_LOCK
3268 * set on it, so we can be sure that the head
3269 * of the mbuf chain won't change.
3270 */
3271 socket_unlock(so, 0);
3272 error = (*pr->pr_domain->dom_externalize)(cm);
3273 socket_lock(so, 0);
3274 } else {
3275 error = 0;
3276 }
3277
3278 if (controlp != NULL && error == 0) {
3279 *controlp = cm;
3280 controlp = &(*controlp)->m_next;
3281 } else {
3282 (void) m_free(cm);
3283 }
3284 cm = cmn;
3285 }
3286 /*
3287 * Update the value of nextrecord in case we received new
3288 * records when the socket was unlocked above for
3289 * externalizing SCM_RIGHTS.
3290 */
3291 if (m != NULL) {
3292 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3293 } else {
3294 nextrecord = sb_rcv->sb_mb;
3295 }
3296
3297 done:
3298 *mp = m;
3299 *nextrecordp = nextrecord;
3300
3301 return error;
3302 }
3303
3304 /*
3305 * If we have less data than requested, block awaiting more
3306 * (subject to any timeout) if:
3307 * 1. the current count is less than the low water mark, or
3308 * 2. MSG_WAITALL is set, and it is possible to do the entire
3309 * receive operation at once if we block (resid <= hiwat);
3310 * in either case, only when MSG_DONTWAIT is not set.
3311 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3312 * we have to do the receive in sections, and thus risk returning
3313 * a short count if a timeout or signal occurs after we start.
3314 */
3315 static boolean_t
3316 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3317 {
3318 struct protosw *pr = so->so_proto;
3319
3320 /* No mbufs in the receive-queue? Wait! */
3321 if (m == NULL) {
3322 return true;
3323 }
3324
3325 /* Not enough data in the receive socket-buffer - we may have to wait */
3326 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3327 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3328 /*
3329 * The application did set the low-water mark, so we should wait
3330 * for that much data to be present.
3331 */
3332 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3333 return true;
3334 }
3335
3336 /*
3337 * Application wants all the data - so let's try to do the
3338 * receive-operation at once by waiting for everything to
3339 * be there.
3340 */
3341 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3342 return true;
3343 }
3344 }
3345
3346 return false;
3347 }
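/*
 * User-space view (illustrative values): the two wait conditions above
 * correspond to SO_RCVLOWAT (condition 1) and MSG_WAITALL (condition 2):
 *
 *	int lowat = 4096;
 *	(void) setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *	n = recv(fd, buf, sizeof(buf), 0);
 *	n = recv(fd, buf, need, MSG_WAITALL);
 *
 * The first recv() blocks until at least 4 KB is queued; the second
 * blocks until all 'need' bytes have arrived, provided 'need' does not
 * exceed the receive buffer's high-water mark.
 */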
3348
3349 /*
3350 * Implement receive operations on a socket.
3351 * We depend on the way that records are added to the sockbuf
3352 * by sbappend*. In particular, each record (mbufs linked through m_next)
3353 * must begin with an address if the protocol so specifies,
3354 * followed by an optional mbuf or mbufs containing ancillary data,
3355 * and then zero or more mbufs of data.
3356 * In order to avoid blocking network interrupts for the entire time here,
3357 * we splx() while doing the actual copy to user space.
3358 * Although the sockbuf is locked, new data may still be appended,
3359 * and thus we must maintain consistency of the sockbuf during that time.
3360 *
3361 * The caller may receive the data as a single mbuf chain by supplying
3362 * an mbuf **mp0 for use in returning the chain. The uio is then used
3363 * only for the count in uio_resid.
3364 *
3365 * Returns: 0 Success
3366 * ENOBUFS
3367 * ENOTCONN
3368 * EWOULDBLOCK
3369 * uiomove:EFAULT
3370 * sblock:EWOULDBLOCK
3371 * sblock:EINTR
3372 * sbwait:EBADF
3373 * sbwait:EINTR
3374 * sodelayed_copy:EFAULT
3375 * <pru_rcvoob>:EINVAL[TCP]
3376 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3377 * <pru_rcvoob>:???
3378 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3379 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3380 * <pr_domain->dom_externalize>:???
3381 *
3382 * Notes: Additional return values from calls through <pru_rcvoob> and
3383 * <pr_domain->dom_externalize> depend on protocols other than
3384 * TCP or AF_UNIX, which are documented above.
3385 */
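/*
 * Sketch tying the record layout above to recvmsg(2) on a datagram
 * socket (buffer sizes are hypothetical): the MT_SONAME, MT_CONTROL,
 * and data mbufs of one record come back as the address, ancillary
 * data, and payload respectively:
 *
 *	struct sockaddr_storage ss;
 *	char cbuf[256], data[2048];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_name = &ss, .msg_namelen = sizeof(ss),
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	ssize_t n = recvmsg(fd, &msg, 0);
 */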
3386 int
3387 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3388 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3389 {
3390 struct mbuf *m, **mp, *ml = NULL;
3391 struct mbuf *nextrecord, *free_list;
3392 int flags, error, offset;
3393 user_ssize_t len;
3394 struct protosw *pr = so->so_proto;
3395 int moff, type = 0;
3396 user_ssize_t orig_resid = uio_resid(uio);
3397 user_ssize_t delayed_copy_len;
3398 int can_delay;
3399 struct proc *p = current_proc();
3400 boolean_t en_tracing = FALSE;
3401
3402 /*
3403 * Sanity check on the length passed by caller as we are making 'int'
3404 * comparisons
3405 */
3406 if (orig_resid < 0 || orig_resid > INT_MAX) {
3407 return EINVAL;
3408 }
3409
3410 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3411 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3412 so->so_rcv.sb_hiwat);
3413
3414 socket_lock(so, 1);
3415 so_update_last_owner_locked(so, p);
3416 so_update_policy(so);
3417
3418 #ifdef MORE_LOCKING_DEBUG
3419 if (so->so_usecount == 1) {
3420 panic("%s: so=%x no other reference on socket", __func__, so);
3421 /* NOTREACHED */
3422 }
3423 #endif
3424 mp = mp0;
3425 if (psa != NULL) {
3426 *psa = NULL;
3427 }
3428 if (controlp != NULL) {
3429 *controlp = NULL;
3430 }
3431 if (flagsp != NULL) {
3432 flags = *flagsp & ~MSG_EOR;
3433 } else {
3434 flags = 0;
3435 }
3436
3437 /*
3438 * If a recv attempt is made on a previously-accepted socket
3439 * that has been marked as inactive (disconnected), reject
3440 * the request.
3441 */
3442 if (so->so_flags & SOF_DEFUNCT) {
3443 struct sockbuf *sb = &so->so_rcv;
3444
3445 error = ENOTCONN;
3446 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3447 __func__, proc_pid(p), proc_best_name(p),
3448 so->so_gencnt,
3449 SOCK_DOM(so), SOCK_TYPE(so), error);
3450 /*
3451 * This socket should have been disconnected and flushed
3452 * prior to being returned from sodefunct(); there should
3453 * be no data on its receive list, so panic otherwise.
3454 */
3455 if (so->so_state & SS_DEFUNCT) {
3456 sb_empty_assert(sb, __func__);
3457 }
3458 socket_unlock(so, 1);
3459 return error;
3460 }
3461
3462 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3463 pr->pr_usrreqs->pru_preconnect) {
3464 /*
3465 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3466 * call write() right after this. *If* the app calls a read,
3467 * we do not want to block that read indefinitely. Thus,
3468 * we trigger a connect so that the session gets initiated.
3469 */
3470 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3471
3472 if (error) {
3473 socket_unlock(so, 1);
3474 return error;
3475 }
3476 }
3477
3478 if (ENTR_SHOULDTRACE &&
3479 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3480 /*
3481 * enable energy tracing for inet sockets that go over
3482 * non-loopback interfaces only.
3483 */
3484 struct inpcb *inp = sotoinpcb(so);
3485 if (inp->inp_last_outifp != NULL &&
3486 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3487 en_tracing = TRUE;
3488 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3489 VM_KERNEL_ADDRPERM(so),
3490 ((so->so_state & SS_NBIO) ?
3491 kEnTrFlagNonBlocking : 0),
3492 (int64_t)orig_resid);
3493 }
3494 }
3495
3496 /*
3497 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3498 * regardless of the flags argument. Here is the case where
3499 * out-of-band data is not inline.
3500 */
3501 if ((flags & MSG_OOB) ||
3502 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3503 (so->so_options & SO_OOBINLINE) == 0 &&
3504 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3505 m = m_get(M_WAIT, MT_DATA);
3506 if (m == NULL) {
3507 socket_unlock(so, 1);
3508 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3509 ENOBUFS, 0, 0, 0, 0);
3510 return ENOBUFS;
3511 }
3512 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3513 if (error) {
3514 goto bad;
3515 }
3516 socket_unlock(so, 0);
3517 do {
3518 error = uiomove(mtod(m, caddr_t),
3519 imin((int)uio_resid(uio), m->m_len), uio);
3520 m = m_free(m);
3521 } while (uio_resid(uio) && error == 0 && m != NULL);
3522 socket_lock(so, 0);
3523 bad:
3524 if (m != NULL) {
3525 m_freem(m);
3526 }
3527
3528 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3529 if (error == EWOULDBLOCK || error == EINVAL) {
3530 /*
3531 * Let's try to get normal data:
3532 * EWOULDBLOCK: out-of-band data not
3533 * received yet. EINVAL: out-of-band data
3534 * already read.
3535 */
3536 error = 0;
3537 goto nooob;
3538 } else if (error == 0 && flagsp != NULL) {
3539 *flagsp |= MSG_OOB;
3540 }
3541 }
3542 socket_unlock(so, 1);
3543 if (en_tracing) {
3544 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3545 VM_KERNEL_ADDRPERM(so), 0,
3546 (int64_t)(orig_resid - uio_resid(uio)));
3547 }
3548 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3549 0, 0, 0, 0);
3550
3551 return error;
3552 }
3553 nooob:
3554 if (mp != NULL) {
3555 *mp = NULL;
3556 }
3557
3558 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3559 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3560 }
3561
3562 free_list = NULL;
3563 delayed_copy_len = 0;
3564 restart:
3565 #ifdef MORE_LOCKING_DEBUG
3566 if (so->so_usecount <= 1) {
3567 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3568 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3569 }
3570 #endif
3571 /*
3572 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3573 * and if so just return to the caller. This could happen when
3574 * soreceive() is called by a socket upcall function during the
3575 * time the socket is freed. The socket buffer would have been
3576 * locked across the upcall, therefore we cannot put this thread
3577 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3578 * we may livelock), because the lock on the socket buffer will
3579 * only be released when the upcall routine returns to its caller.
3580 * Because the socket has been officially closed, there can be
3581 * no further read on it.
3582 *
3583 * A multipath subflow socket would have its SS_NOFDREF set by
3584 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3585 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3586 */
3587 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3588 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3589 socket_unlock(so, 1);
3590 return 0;
3591 }
3592
3593 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3594 if (error) {
3595 socket_unlock(so, 1);
3596 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3597 0, 0, 0, 0);
3598 if (en_tracing) {
3599 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3600 VM_KERNEL_ADDRPERM(so), 0,
3601 (int64_t)(orig_resid - uio_resid(uio)));
3602 }
3603 return error;
3604 }
3605
3606 m = so->so_rcv.sb_mb;
3607 if (so_should_wait(so, uio, m, flags)) {
3608 /*
3609 * Panic if we notice inconsistencies in the socket's
3610 * receive list; both sb_mb and sb_cc should correctly
3611 * reflect the contents of the list, otherwise we may
3612 * end up with false positives during select() or poll()
3613 * which could put the application in a bad state.
3614 */
3615 SB_MB_CHECK(&so->so_rcv);
3616
3617 if (so->so_error) {
3618 if (m != NULL) {
3619 goto dontblock;
3620 }
3621 error = so->so_error;
3622 if ((flags & MSG_PEEK) == 0) {
3623 so->so_error = 0;
3624 }
3625 goto release;
3626 }
3627 if (so->so_state & SS_CANTRCVMORE) {
3628 #if CONTENT_FILTER
3629 /*
3630 * Deal with half-closed connections
3631 */
3632 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3633 cfil_sock_data_pending(&so->so_rcv) != 0) {
3634 CFIL_LOG(LOG_INFO,
3635 "so %llx ignore SS_CANTRCVMORE",
3636 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3637 } else
3638 #endif /* CONTENT_FILTER */
3639 if (m != NULL) {
3640 goto dontblock;
3641 } else {
3642 goto release;
3643 }
3644 }
3645 for (; m != NULL; m = m->m_next) {
3646 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3647 m = so->so_rcv.sb_mb;
3648 goto dontblock;
3649 }
3650 }
3651 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3652 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3653 error = ENOTCONN;
3654 goto release;
3655 }
3656 if (uio_resid(uio) == 0) {
3657 goto release;
3658 }
3659
3660 if ((so->so_state & SS_NBIO) ||
3661 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3662 error = EWOULDBLOCK;
3663 goto release;
3664 }
3665 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3666 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3667 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3668 #if EVEN_MORE_LOCKING_DEBUG
3669 if (socket_debug) {
3670 printf("Waiting for socket data\n");
3671 }
3672 #endif
3673
3674 /*
3675 * Depending on the protocol (e.g. TCP), the following
3676 * might cause the socket lock to be dropped and later
3677 * be reacquired, and more data could have arrived and
3678 * have been appended to the receive socket buffer by
3679 * the time it returns. Therefore, we sleep in
3680 * sbwait() below only if the wait condition is still
3681 * true.
3682 */
3683 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3684 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3685 }
3686
3687 error = 0;
3688 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3689 error = sbwait(&so->so_rcv);
3690 }
3691
3692 #if EVEN_MORE_LOCKING_DEBUG
3693 if (socket_debug) {
3694 printf("SORECEIVE - sbwait returned %d\n", error);
3695 }
3696 #endif
3697 if (so->so_usecount < 1) {
3698 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3699 __func__, so, so->so_usecount);
3700 /* NOTREACHED */
3701 }
3702 if (error) {
3703 socket_unlock(so, 1);
3704 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3705 0, 0, 0, 0);
3706 if (en_tracing) {
3707 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3708 VM_KERNEL_ADDRPERM(so), 0,
3709 (int64_t)(orig_resid - uio_resid(uio)));
3710 }
3711 return error;
3712 }
3713 goto restart;
3714 }
3715 dontblock:
3716 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3717 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3718 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3719 nextrecord = m->m_nextpkt;
3720
3721 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3722 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3723 mp0 == NULL);
3724 if (error == ERESTART) {
3725 goto restart;
3726 } else if (error != 0) {
3727 goto release;
3728 }
3729 orig_resid = 0;
3730 }
3731
3732 /*
3733 * Process one or more MT_CONTROL mbufs present before any data mbufs
3734 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3735 * just copy the data; if !MSG_PEEK, we call into the protocol to
3736 * perform externalization.
3737 */
3738 if (m != NULL && m->m_type == MT_CONTROL) {
3739 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3740 if (error != 0) {
3741 goto release;
3742 }
3743 orig_resid = 0;
3744 }
3745
3746 if (m != NULL) {
3747 if (!(flags & MSG_PEEK)) {
3748 /*
3749 * We get here because m points to an mbuf following
3750 * any MT_SONAME or MT_CONTROL mbufs which have been
3751 * processed above. In any case, m should be pointing
3752 * to the head of the mbuf chain, and the nextrecord
3753 * should be either NULL or equal to m->m_nextpkt.
3754 * See comments above about SB_LOCK.
3755 */
3756 if (m != so->so_rcv.sb_mb ||
3757 m->m_nextpkt != nextrecord) {
3758 panic("%s: post-control !sync so=%p m=%p "
3759 "nextrecord=%p\n", __func__, so, m,
3760 nextrecord);
3761 /* NOTREACHED */
3762 }
3763 if (nextrecord == NULL) {
3764 so->so_rcv.sb_lastrecord = m;
3765 }
3766 }
3767 type = m->m_type;
3768 if (type == MT_OOBDATA) {
3769 flags |= MSG_OOB;
3770 }
3771 } else {
3772 if (!(flags & MSG_PEEK)) {
3773 SB_EMPTY_FIXUP(&so->so_rcv);
3774 }
3775 }
3776 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3777 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3778
3779 moff = 0;
3780 offset = 0;
3781
3782 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3783 can_delay = 1;
3784 } else {
3785 can_delay = 0;
3786 }
3787
3788 while (m != NULL &&
3789 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3790 if (m->m_type == MT_OOBDATA) {
3791 if (type != MT_OOBDATA) {
3792 break;
3793 }
3794 } else if (type == MT_OOBDATA) {
3795 break;
3796 }
3797
3798 if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3799 m->m_type != MT_HEADER) {
3800 break;
3801 }
3802 /*
3803 * Make sure to always set the MSG_OOB flag when getting
3804 * out-of-band data inline.
3805 */
3806 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3807 (so->so_options & SO_OOBINLINE) != 0 &&
3808 (so->so_state & SS_RCVATMARK) != 0) {
3809 flags |= MSG_OOB;
3810 }
3811 so->so_state &= ~SS_RCVATMARK;
3812 len = uio_resid(uio) - delayed_copy_len;
3813 if (so->so_oobmark && len > so->so_oobmark - offset) {
3814 len = so->so_oobmark - offset;
3815 }
3816 if (len > m->m_len - moff) {
3817 len = m->m_len - moff;
3818 }
3819 /*
3820 * If mp is set, just pass back the mbufs.
3821 * Otherwise copy them out via the uio, then free.
3822 * The sockbuf must be consistent here (sb_mb points to the
3823 * current mbuf, m_nextpkt to the next record) when we drop
3824 * the lock; we must note any additions to the sockbuf when
3825 * we reacquire it.
3826 */
3827 if (mp == NULL) {
3828 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3829 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3830 if (can_delay && len == m->m_len) {
3831 /*
3832 * Only delay the copy if we're consuming the
3833 * mbuf and we're NOT in MSG_PEEK mode,
3834 * and we have enough data to make it worthwhile
3835 * to drop and retake the lock; can_delay
3836 * reflects the state of the latter two
3837 * constraints. moff should always be zero
3838 * in these cases.
3839 */
3840 delayed_copy_len += len;
3841 } else {
3842 if (delayed_copy_len) {
3843 error = sodelayed_copy(so, uio,
3844 &free_list, &delayed_copy_len);
3845
3846 if (error) {
3847 goto release;
3848 }
3849 /*
3850 * We can only get here if MSG_PEEK is not
3851 * set; therefore, m should point at the
3852 * head of the rcv queue. If it doesn't,
3853 * something drastically changed while we
3854 * were out from behind the lock in
3855 * sodelayed_copy, perhaps a RST on the
3856 * stream. In any event, the stream has
3857 * been interrupted. It's probably best
3858 * just to return whatever data we've
3859 * moved and let the caller sort it
3860 * out...
3861 */
3862 if (m != so->so_rcv.sb_mb) {
3863 break;
3864 }
3865 }
3866 socket_unlock(so, 0);
3867 error = uiomove(mtod(m, caddr_t) + moff,
3868 (int)len, uio);
3869 socket_lock(so, 0);
3870
3871 if (error) {
3872 goto release;
3873 }
3874 }
3875 } else {
3876 uio_setresid(uio, (uio_resid(uio) - len));
3877 }
3878 if (len == m->m_len - moff) {
3879 if (m->m_flags & M_EOR) {
3880 flags |= MSG_EOR;
3881 }
3882 if (flags & MSG_PEEK) {
3883 m = m->m_next;
3884 moff = 0;
3885 } else {
3886 nextrecord = m->m_nextpkt;
3887 sbfree(&so->so_rcv, m);
3888 m->m_nextpkt = NULL;
3889
3890 if (mp != NULL) {
3891 *mp = m;
3892 mp = &m->m_next;
3893 so->so_rcv.sb_mb = m = m->m_next;
3894 *mp = NULL;
3895 } else {
3896 if (free_list == NULL) {
3897 free_list = m;
3898 } else {
3899 ml->m_next = m;
3900 }
3901 ml = m;
3902 so->so_rcv.sb_mb = m = m->m_next;
3903 ml->m_next = NULL;
3904 }
3905 if (m != NULL) {
3906 m->m_nextpkt = nextrecord;
3907 if (nextrecord == NULL) {
3908 so->so_rcv.sb_lastrecord = m;
3909 }
3910 } else {
3911 so->so_rcv.sb_mb = nextrecord;
3912 SB_EMPTY_FIXUP(&so->so_rcv);
3913 }
3914 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3915 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3916 }
3917 } else {
3918 if (flags & MSG_PEEK) {
3919 moff += len;
3920 } else {
3921 if (mp != NULL) {
3922 int copy_flag;
3923
3924 if (flags & MSG_DONTWAIT) {
3925 copy_flag = M_DONTWAIT;
3926 } else {
3927 copy_flag = M_WAIT;
3928 }
3929 *mp = m_copym(m, 0, (int)len, copy_flag);
3930 /*
3931 * Failed to allocate an mbuf?
3932 * Adjust uio_resid back, it was
3933 * adjusted down by len bytes which
3934 * we didn't copy over.
3935 */
3936 if (*mp == NULL) {
3937 uio_setresid(uio,
3938 (uio_resid(uio) + len));
3939 break;
3940 }
3941 }
3942 m->m_data += len;
3943 m->m_len -= len;
3944 so->so_rcv.sb_cc -= len;
3945 }
3946 }
3947 if (so->so_oobmark) {
3948 if ((flags & MSG_PEEK) == 0) {
3949 so->so_oobmark -= len;
3950 if (so->so_oobmark == 0) {
3951 so->so_state |= SS_RCVATMARK;
3952 break;
3953 }
3954 } else {
3955 offset += len;
3956 if (offset == so->so_oobmark) {
3957 break;
3958 }
3959 }
3960 }
3961 if (flags & MSG_EOR) {
3962 break;
3963 }
3964 /*
3965 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3966 * (for a non-atomic socket), we must not quit until
3967 * "uio->uio_resid == 0" or an error termination.
3968 * If a signal/timeout occurs, return with a short
3969 * count but without error. Keep sockbuf locked
3970 * against other readers.
3971 */
3972 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3973 (uio_resid(uio) - delayed_copy_len) > 0 &&
3974 !sosendallatonce(so) && !nextrecord) {
3975 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3976 #if CONTENT_FILTER
3977 && cfil_sock_data_pending(&so->so_rcv) == 0
3978 #endif /* CONTENT_FILTER */
3979 )) {
3980 goto release;
3981 }
3982
3983 /*
3984 * Depending on the protocol (e.g. TCP), the following
3985 * might cause the socket lock to be dropped and later
3986 * be reacquired, and more data could have arrived and
3987 * have been appended to the receive socket buffer by
3988 * the time it returns. Therefore, we sleep in
3989 * sbwait() below only if the socket buffer is
3990 * empty, in order to avoid a false sleep.
3991 */
3992 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3993 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3994 }
3995
3996 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3997 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3998
3999 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
4000 error = 0;
4001 goto release;
4002 }
4003 /*
4004 * We have to wait until after we get back from sbwait
4005 * to do the copy, because we will drop the lock if we
4006 * have enough data that has been delayed. By dropping
4007 * the lock we open up a window allowing the netisr
4008 * thread to process the incoming packets and to change
4009 * the state of this socket. We're issuing the sbwait
4010 * because the socket is empty and we're expecting the
4011 * netisr thread to wake us up when more packets arrive;
4012 * if we allowed that processing to happen and then did
4013 * the sbwait, we could stall forever with packets
4014 * sitting in the socket if no further packets arrive
4015 * from the remote side.
4016 *
4017 * We want to copy before we've collected all the data
4018 * to satisfy this request, to allow the copy to overlap
4019 * the incoming packet processing on an MP system.
4020 */
4021 if (delayed_copy_len > sorecvmincopy &&
4022 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
4023 error = sodelayed_copy(so, uio,
4024 &free_list, &delayed_copy_len);
4025
4026 if (error) {
4027 goto release;
4028 }
4029 }
4030 m = so->so_rcv.sb_mb;
4031 if (m != NULL) {
4032 nextrecord = m->m_nextpkt;
4033 }
4034 SB_MB_CHECK(&so->so_rcv);
4035 }
4036 }
4037 #ifdef MORE_LOCKING_DEBUG
4038 if (so->so_usecount <= 1) {
4039 panic("%s: after big while so=%p ref=%d on socket",
4040 __func__, so, so->so_usecount);
4041 /* NOTREACHED */
4042 }
4043 #endif
4044
4045 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4046 if (so->so_options & SO_DONTTRUNC) {
4047 flags |= MSG_RCVMORE;
4048 } else {
4049 flags |= MSG_TRUNC;
4050 if ((flags & MSG_PEEK) == 0) {
4051 (void) sbdroprecord(&so->so_rcv);
4052 }
4053 }
4054 }
4055
4056 /*
4057 * pru_rcvd below (for TCP) may cause more data to be received
4058 * if the socket lock is dropped prior to sending the ACK; some
4059 * legacy OpenTransport applications don't handle this well
4060 * (if it receives less data than requested while MSG_HAVEMORE
4061 * is set), and so we set the flag now based on what we know
4062 * prior to calling pru_rcvd.
4063 */
4064 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4065 flags |= MSG_HAVEMORE;
4066 }
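
/*
 * Hedged user-space sketch of the SO_WANTMORE/MSG_HAVEMORE behavior
 * described above (Darwin-specific options; not part of the kernel
 * build, and the descriptor "fd" is assumed):
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_WANTMORE, &on, sizeof(on));
 *
 *	struct msghdr msg;
 *	// ... fill in msg_iov et al. ...
 *	if (recvmsg(fd, &msg, 0) >= 0 &&
 *	    (msg.msg_flags & MSG_HAVEMORE)) {
 *		// more data was already buffered; read again
 *	}
 */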
4067
4068 if ((flags & MSG_PEEK) == 0) {
4069 if (m == NULL) {
4070 so->so_rcv.sb_mb = nextrecord;
4071 /*
4072 * First part is an inline SB_EMPTY_FIXUP(). Second
4073 * part makes sure sb_lastrecord is up-to-date if
4074 * there is still data in the socket buffer.
4075 */
4076 if (so->so_rcv.sb_mb == NULL) {
4077 so->so_rcv.sb_mbtail = NULL;
4078 so->so_rcv.sb_lastrecord = NULL;
4079 } else if (nextrecord->m_nextpkt == NULL) {
4080 so->so_rcv.sb_lastrecord = nextrecord;
4081 }
4082 SB_MB_CHECK(&so->so_rcv);
4083 }
4084 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4085 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4086 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4087 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4088 }
4089 }
4090
4091 if (delayed_copy_len) {
4092 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4093 if (error) {
4094 goto release;
4095 }
4096 }
4097 if (free_list != NULL) {
4098 m_freem_list(free_list);
4099 free_list = NULL;
4100 }
4101
4102 if (orig_resid == uio_resid(uio) && orig_resid &&
4103 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4104 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4105 goto restart;
4106 }
4107
4108 if (flagsp != NULL) {
4109 *flagsp |= flags;
4110 }
4111 release:
4112 #ifdef MORE_LOCKING_DEBUG
4113 if (so->so_usecount <= 1) {
4114 panic("%s: release so=%p ref=%d on socket", __func__,
4115 so, so->so_usecount);
4116 /* NOTREACHED */
4117 }
4118 #endif
4119 if (delayed_copy_len) {
4120 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4121 }
4122
4123 if (free_list != NULL) {
4124 m_freem_list(free_list);
4125 }
4126
4127 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4128
4129 if (en_tracing) {
4130 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4131 VM_KERNEL_ADDRPERM(so),
4132 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4133 (int64_t)(orig_resid - uio_resid(uio)));
4134 }
4135 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4136 so->so_rcv.sb_cc, 0, error);
4137
4138 return error;
4139 }
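
/*
 * Illustrative user-space sketch (not part of the kernel build): the
 * MSG_WAITALL handling above is what lets a caller block until the
 * whole buffer is filled, returning a short count without error only
 * on EOF, a signal/timeout, or a pending socket error. The descriptor
 * "fd" is assumed to be a connected stream socket.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	char buf[4096];
 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *	if (n < 0)
 *		perror("recv");
 *	else if ((size_t)n < sizeof(buf))
 *		printf("short read: EOF, signal, or timeout\n");
 */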
4140
4141 /*
4142 * Returns: 0 Success
4143 * uiomove:EFAULT
4144 */
4145 static int
4146 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4147 user_ssize_t *resid)
4148 {
4149 int error = 0;
4150 struct mbuf *m;
4151
4152 m = *free_list;
4153
4154 socket_unlock(so, 0);
4155
4156 while (m != NULL && error == 0) {
4157 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4158 m = m->m_next;
4159 }
4160 m_freem_list(*free_list);
4161
4162 *free_list = NULL;
4163 *resid = 0;
4164
4165 socket_lock(so, 0);
4166
4167 return error;
4168 }
4169
4170 static int
4171 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4172 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4173 {
4174 #pragma unused(so)
4175 int error = 0;
4176 struct mbuf *ml, *m;
4177 int i = 0;
4178 struct uio *auio;
4179
4180 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4181 ml = ml->m_nextpkt, i++) {
4182 auio = msgarray[i].uio;
4183 for (m = ml; m != NULL; m = m->m_next) {
4184 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4185 if (error != 0) {
4186 goto out;
4187 }
4188 }
4189 }
4190 out:
4191 m_freem_list(*free_list);
4192
4193 *free_list = NULL;
4194 *resid = 0;
4195
4196 return error;
4197 }
4198
4199 int
4200 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4201 int *flagsp)
4202 {
4203 struct mbuf *m;
4204 struct mbuf *nextrecord;
4205 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4206 int error;
4207 user_ssize_t len, pktlen, delayed_copy_len = 0;
4208 struct protosw *pr = so->so_proto;
4209 user_ssize_t resid;
4210 struct proc *p = current_proc();
4211 struct uio *auio = NULL;
4212 int npkts = 0;
4213 int sblocked = 0;
4214 struct sockaddr **psa = NULL;
4215 struct mbuf **controlp = NULL;
4216 int can_delay;
4217 int flags;
4218 struct mbuf *free_others = NULL;
4219
4220 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4221 so, uiocnt,
4222 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4223
4224 /*
4225 * Sanity checks:
4226 * - Only supports "don't wait" flags
4227 * - Only supports datagram sockets (could be extended to raw)
4228 * - Must be atomic
4229 * - Protocol must support packet chains
4230 * - The uio array must not be NULL (should we panic?)
4231 */
4232 if (flagsp != NULL) {
4233 flags = *flagsp;
4234 } else {
4235 flags = 0;
4236 }
4237 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4238 MSG_NBIO)) {
4239 printf("%s invalid flags 0x%x\n", __func__, flags);
4240 error = EINVAL;
4241 goto out;
4242 }
4243 if (so->so_type != SOCK_DGRAM) {
4244 error = EINVAL;
4245 goto out;
4246 }
4247 if (sosendallatonce(so) == 0) {
4248 error = EINVAL;
4249 goto out;
4250 }
4251 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4252 error = EPROTONOSUPPORT;
4253 goto out;
4254 }
4255 if (msgarray == NULL) {
4256 printf("%s uioarray is NULL\n", __func__);
4257 error = EINVAL;
4258 goto out;
4259 }
4260 if (uiocnt == 0) {
4261 printf("%s uiocnt is 0\n", __func__);
4262 error = EINVAL;
4263 goto out;
4264 }
4265 /*
4266 * Sanity check on the length passed by caller as we are making 'int'
4267 * comparisons
4268 */
4269 resid = recv_msg_array_resid(msgarray, uiocnt);
4270 if (resid < 0 || resid > INT_MAX) {
4271 error = EINVAL;
4272 goto out;
4273 }
4274
4275 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4276 can_delay = 1;
4277 } else {
4278 can_delay = 0;
4279 }
4280
4281 socket_lock(so, 1);
4282 so_update_last_owner_locked(so, p);
4283 so_update_policy(so);
4284
4285 #if NECP
4286 so_update_necp_policy(so, NULL, NULL);
4287 #endif /* NECP */
4288
4289 /*
4290 * If a recv attempt is made on a previously-accepted socket
4291 * that has been marked as inactive (disconnected), reject
4292 * the request.
4293 */
4294 if (so->so_flags & SOF_DEFUNCT) {
4295 struct sockbuf *sb = &so->so_rcv;
4296
4297 error = ENOTCONN;
4298 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
4299 __func__, proc_pid(p), proc_best_name(p),
4300 so->so_gencnt,
4301 SOCK_DOM(so), SOCK_TYPE(so), error);
4302 /*
4303 * This socket should have been disconnected and flushed
4304 * prior to being returned from sodefunct(); there should
4305 * be no data on its receive list, so panic otherwise.
4306 */
4307 if (so->so_state & SS_DEFUNCT) {
4308 sb_empty_assert(sb, __func__);
4309 }
4310 goto release;
4311 }
4312
4313 next:
4314 /*
4315 * The uio may be empty
4316 */
4317 if (npkts >= uiocnt) {
4318 error = 0;
4319 goto release;
4320 }
4321 restart:
4322 /*
4323 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4324 * and if so just return to the caller. This could happen when
4325 * soreceive() is called by a socket upcall function during the
4326 * time the socket is freed. The socket buffer would have been
4327 * locked across the upcall, therefore we cannot put this thread
4328 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4329 * we may livelock), because the lock on the socket buffer will
4330 * only be released when the upcall routine returns to its caller.
4331 * Because the socket has been officially closed, there can be
4332 * no further read on it.
4333 */
4334 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4335 (SS_NOFDREF | SS_CANTRCVMORE)) {
4336 error = 0;
4337 goto release;
4338 }
4339
4340 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4341 if (error) {
4342 goto release;
4343 }
4344 sblocked = 1;
4345
4346 m = so->so_rcv.sb_mb;
4347 /*
4348 * Block awaiting more datagrams if needed
4349 */
4350 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4351 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4352 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4353 /*
4354 * Panic if we notice inconsistencies in the socket's
4355 * receive list; both sb_mb and sb_cc should correctly
4356 * reflect the contents of the list, otherwise we may
4357 * end up with false positives during select() or poll()
4358 * which could put the application in a bad state.
4359 */
4360 SB_MB_CHECK(&so->so_rcv);
4361
4362 if (so->so_error) {
4363 error = so->so_error;
4364 if ((flags & MSG_PEEK) == 0) {
4365 so->so_error = 0;
4366 }
4367 goto release;
4368 }
4369 if (so->so_state & SS_CANTRCVMORE) {
4370 goto release;
4371 }
4372 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4373 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4374 error = ENOTCONN;
4375 goto release;
4376 }
4377 if ((so->so_state & SS_NBIO) ||
4378 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4379 error = EWOULDBLOCK;
4380 goto release;
4381 }
4382 /*
4383 * Do not block if we got some data
4384 */
4385 if (free_list != NULL) {
4386 error = 0;
4387 goto release;
4388 }
4389
4390 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4391 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4392
4393 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4394 sblocked = 0;
4395
4396 error = sbwait(&so->so_rcv);
4397 if (error) {
4398 goto release;
4399 }
4400 goto restart;
4401 }
4402
4403 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4404 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4405 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4406
4407 /*
4408 * Consume the current uio index as we have a datagram
4409 */
4410 auio = msgarray[npkts].uio;
4411 resid = uio_resid(auio);
4412 msgarray[npkts].which |= SOCK_MSG_DATA;
4413 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4414 &msgarray[npkts].psa : NULL;
4415 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4416 &msgarray[npkts].controlp : NULL;
4417 npkts += 1;
4418 nextrecord = m->m_nextpkt;
4419
4420 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4421 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4422 if (error == ERESTART) {
4423 goto restart;
4424 } else if (error != 0) {
4425 goto release;
4426 }
4427 }
4428
4429 if (m != NULL && m->m_type == MT_CONTROL) {
4430 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4431 if (error != 0) {
4432 goto release;
4433 }
4434 }
4435
4436 if (m->m_pkthdr.len == 0) {
4437 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4438 __func__, __LINE__,
4439 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4440 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4441 m->m_type);
4442 }
4443
4444 /*
4445 * Loop to copy the mbufs of the current record.
4446 * Supports zero-length packets.
4447 */
4448 ml = NULL;
4449 pktlen = 0;
4450 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4451 if (m->m_len == 0) {
4452 panic("%p m_len zero", m);
4453 }
4454 if (m->m_type == 0) {
4455 panic("%p m_type zero", m);
4456 }
4457 /*
4458 * Clip to the residual length
4459 */
4460 if (len > m->m_len) {
4461 len = m->m_len;
4462 }
4463 pktlen += len;
4464 /*
4465 * Copy the mbufs via the uio or delay the copy.
4466 * The sockbuf must be consistent here (sb_mb points to the
4467 * current mbuf, m_nextpkt to the next record) when we drop
4468 * the lock; we must note any additions to the sockbuf when
4469 * we reacquire it.
4470 */
4471 if (len > 0 && can_delay == 0) {
4472 socket_unlock(so, 0);
4473 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4474 socket_lock(so, 0);
4475 if (error) {
4476 goto release;
4477 }
4478 } else {
4479 delayed_copy_len += len;
4480 }
4481
4482 if (len == m->m_len) {
4483 /*
4484 * m was entirely copied
4485 */
4486 sbfree(&so->so_rcv, m);
4487 nextrecord = m->m_nextpkt;
4488 m->m_nextpkt = NULL;
4489
4490 /*
4491 * Set the first packet to the head of the free list
4492 */
4493 if (free_list == NULL) {
4494 free_list = m;
4495 }
4496 /*
4497 * Link current packet to tail of free list
4498 */
4499 if (ml == NULL) {
4500 if (free_tail != NULL) {
4501 free_tail->m_nextpkt = m;
4502 }
4503 free_tail = m;
4504 }
4505 /*
4506 * Link current mbuf to last mbuf of current packet
4507 */
4508 if (ml != NULL) {
4509 ml->m_next = m;
4510 }
4511 ml = m;
4512
4513 /*
4514 * Move next buf to head of socket buffer
4515 */
4516 so->so_rcv.sb_mb = m = ml->m_next;
4517 ml->m_next = NULL;
4518
4519 if (m != NULL) {
4520 m->m_nextpkt = nextrecord;
4521 if (nextrecord == NULL) {
4522 so->so_rcv.sb_lastrecord = m;
4523 }
4524 } else {
4525 so->so_rcv.sb_mb = nextrecord;
4526 SB_EMPTY_FIXUP(&so->so_rcv);
4527 }
4528 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4529 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4530 } else {
4531 /*
4532 * Stop the loop on partial copy
4533 */
4534 break;
4535 }
4536 }
4537 #ifdef MORE_LOCKING_DEBUG
4538 if (so->so_usecount <= 1) {
4539 panic("%s: after big while so=%llx ref=%d on socket",
4540 __func__,
4541 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4542 /* NOTREACHED */
4543 }
4544 #endif
4545 /*
4546 * Tell the caller we made a partial copy
4547 */
4548 if (m != NULL) {
4549 if (so->so_options & SO_DONTTRUNC) {
4550 /*
4551 * Copy out the free list first, then the partial mbuf
4552 */
4553 socket_unlock(so, 0);
4554 if (delayed_copy_len) {
4555 error = sodelayed_copy_list(so, msgarray,
4556 uiocnt, &free_list, &delayed_copy_len);
4557 }
4558
4559 if (error == 0) {
4560 error = uiomove(mtod(m, caddr_t), (int)len,
4561 auio);
4562 }
4563 socket_lock(so, 0);
4564 if (error) {
4565 goto release;
4566 }
4567
4568 m->m_data += len;
4569 m->m_len -= len;
4570 so->so_rcv.sb_cc -= len;
4571 flags |= MSG_RCVMORE;
4572 } else {
4573 (void) sbdroprecord(&so->so_rcv);
4574 nextrecord = so->so_rcv.sb_mb;
4575 m = NULL;
4576 flags |= MSG_TRUNC;
4577 }
4578 }
4579
4580 if (m == NULL) {
4581 so->so_rcv.sb_mb = nextrecord;
4582 /*
4583 * First part is an inline SB_EMPTY_FIXUP(). Second
4584 * part makes sure sb_lastrecord is up-to-date if
4585 * there is still data in the socket buffer.
4586 */
4587 if (so->so_rcv.sb_mb == NULL) {
4588 so->so_rcv.sb_mbtail = NULL;
4589 so->so_rcv.sb_lastrecord = NULL;
4590 } else if (nextrecord->m_nextpkt == NULL) {
4591 so->so_rcv.sb_lastrecord = nextrecord;
4592 }
4593 SB_MB_CHECK(&so->so_rcv);
4594 }
4595 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4596 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4597
4598 /*
4599 * We can continue to the next packet as long as:
4600 * - We haven't exhausted the uio array
4601 * - There was no error
4602 * - A packet was not truncated
4603 * - We can still receive more data
4604 */
4605 if (npkts < uiocnt && error == 0 &&
4606 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4607 (so->so_state & SS_CANTRCVMORE) == 0) {
4608 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4609 sblocked = 0;
4610
4611 goto next;
4612 }
4613 if (flagsp != NULL) {
4614 *flagsp |= flags;
4615 }
4616
4617 release:
4618 /*
4619 * pru_rcvd may cause more data to be received if the socket lock
4620 * is dropped so we set MSG_HAVEMORE now based on what we know.
4621 * That way the caller won't be surprised if it receives less data
4622 * than requested.
4623 */
4624 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4625 flags |= MSG_HAVEMORE;
4626 }
4627
4628 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4629 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4630 }
4631
4632 if (sblocked) {
4633 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4634 } else {
4635 socket_unlock(so, 1);
4636 }
4637
4638 if (delayed_copy_len) {
4639 error = sodelayed_copy_list(so, msgarray, uiocnt,
4640 &free_list, &delayed_copy_len);
4641 }
4642 out:
4643 /*
4644 * Amortize the cost of freeing the mbufs
4645 */
4646 if (free_list != NULL) {
4647 m_freem_list(free_list);
4648 }
4649 if (free_others != NULL) {
4650 m_freem_list(free_others);
4651 }
4652
4653 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4654 0, 0, 0, 0);
4655 return error;
4656 }
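
/*
 * Illustrative user-space sketch (hedged: recvmsg_x() is a
 * Darwin-private batch receive syscall that lands in soreceive_list()
 * above; its availability and the msghdr_x layout are assumptions
 * here, not a stable API). One iovec per datagram slot:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	#define NPKTS 8
 *	struct msghdr_x msgs[NPKTS];
 *	struct iovec iovs[NPKTS];
 *	char bufs[NPKTS][2048];
 *
 *	memset(msgs, 0, sizeof(msgs));
 *	for (int i = 0; i < NPKTS; i++) {
 *		iovs[i].iov_base = bufs[i];
 *		iovs[i].iov_len = sizeof(bufs[i]);
 *		msgs[i].msg_iov = &iovs[i];
 *		msgs[i].msg_iovlen = 1;
 *	}
 *	// Returns the number of datagrams received, or -1 on error.
 *	ssize_t n = recvmsg_x(fd, msgs, NPKTS, MSG_DONTWAIT);
 */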
4657
4658 static int
4659 so_statistics_event_to_nstat_event(int64_t *input_options,
4660 uint64_t *nstat_event)
4661 {
4662 int error = 0;
4663 switch (*input_options) {
4664 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4665 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4666 break;
4667 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4668 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4669 break;
4670 #if (DEBUG || DEVELOPMENT)
4671 case SO_STATISTICS_EVENT_RESERVED_1:
4672 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4673 break;
4674 case SO_STATISTICS_EVENT_RESERVED_2:
4675 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4676 break;
4677 #endif /* (DEBUG || DEVELOPMENT) */
4678 default:
4679 error = EINVAL;
4680 break;
4681 }
4682 return error;
4683 }
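
/*
 * Hedged sketch of the corresponding setsockopt() usage (the
 * SO_STATISTICS_EVENT option and its event constants are
 * Darwin-private; shown only to illustrate the int64_t option
 * plumbing handled in sosetoptlock() below):
 *
 *	int64_t ev = SO_STATISTICS_EVENT_ENTER_CELLFALLBACK;
 *	setsockopt(fd, SOL_SOCKET, SO_STATISTICS_EVENT,
 *	    &ev, sizeof(ev));
 */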
4684
4685 /*
4686 * Returns: 0 Success
4687 * EINVAL
4688 * ENOTCONN
4689 * <pru_shutdown>:EINVAL
4690 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4691 * <pru_shutdown>:ENOBUFS[TCP]
4692 * <pru_shutdown>:EMSGSIZE[TCP]
4693 * <pru_shutdown>:EHOSTUNREACH[TCP]
4694 * <pru_shutdown>:ENETUNREACH[TCP]
4695 * <pru_shutdown>:ENETDOWN[TCP]
4696 * <pru_shutdown>:ENOMEM[TCP]
4697 * <pru_shutdown>:EACCES[TCP]
4698 * <pru_shutdown>:EMSGSIZE[TCP]
4699 * <pru_shutdown>:ENOBUFS[TCP]
4700 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4701 * <pru_shutdown>:??? [other protocol families]
4702 */
4703 int
4704 soshutdown(struct socket *so, int how)
4705 {
4706 int error;
4707
4708 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4709
4710 switch (how) {
4711 case SHUT_RD:
4712 case SHUT_WR:
4713 case SHUT_RDWR:
4714 socket_lock(so, 1);
4715 if ((so->so_state &
4716 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4717 error = ENOTCONN;
4718 } else {
4719 error = soshutdownlock(so, how);
4720 }
4721 socket_unlock(so, 1);
4722 break;
4723 default:
4724 error = EINVAL;
4725 break;
4726 }
4727
4728 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4729
4730 return error;
4731 }
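
/*
 * Illustrative user-space sketch (not part of the kernel build):
 * shutdown(2) reaches soshutdown() above. SHUT_WR half-closes the
 * write side while still allowing reads, and ENOTCONN is returned
 * if the socket was never connected.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	if (shutdown(fd, SHUT_WR) == -1)
 *		perror("shutdown");	// e.g. ENOTCONN, EINVAL
 *	// peer sees EOF; we can still recv() remaining data
 */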
4732
4733 int
4734 soshutdownlock_final(struct socket *so, int how)
4735 {
4736 struct protosw *pr = so->so_proto;
4737 int error = 0;
4738
4739 sflt_notify(so, sock_evt_shutdown, &how);
4740
4741 if (how != SHUT_WR) {
4742 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4743 /* read already shut down */
4744 error = ENOTCONN;
4745 goto done;
4746 }
4747 sorflush(so);
4748 }
4749 if (how != SHUT_RD) {
4750 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4751 /* write already shut down */
4752 error = ENOTCONN;
4753 goto done;
4754 }
4755 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4756 }
4757 done:
4758 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4759 return error;
4760 }
4761
4762 int
4763 soshutdownlock(struct socket *so, int how)
4764 {
4765 int error = 0;
4766
4767 #if CONTENT_FILTER
4768 /*
4769 * A content filter may delay the actual shutdown until it
4770 * has processed the pending data
4771 */
4772 if (so->so_flags & SOF_CONTENT_FILTER) {
4773 error = cfil_sock_shutdown(so, &how);
4774 if (error == EJUSTRETURN) {
4775 error = 0;
4776 goto done;
4777 } else if (error != 0) {
4778 goto done;
4779 }
4780 }
4781 #endif /* CONTENT_FILTER */
4782
4783 error = soshutdownlock_final(so, how);
4784
4785 done:
4786 return error;
4787 }
4788
4789 void
4790 sowflush(struct socket *so)
4791 {
4792 struct sockbuf *sb = &so->so_snd;
4793
4794 /*
4795 * Obtain lock on the socket buffer (SB_LOCK). This is required
4796 * to prevent the socket buffer from being unexpectedly altered
4797 * while it is used by another thread in socket send/receive.
4798 *
4799 * sblock() must not fail here, hence the assertion.
4800 */
4801 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4802 VERIFY(sb->sb_flags & SB_LOCK);
4803
4804 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4805 sb->sb_flags |= SB_DROP;
4806 sb->sb_upcall = NULL;
4807 sb->sb_upcallarg = NULL;
4808
4809 sbunlock(sb, TRUE); /* keep socket locked */
4810
4811 selthreadclear(&sb->sb_sel);
4812 sbrelease(sb);
4813 }
4814
4815 void
4816 sorflush(struct socket *so)
4817 {
4818 struct sockbuf *sb = &so->so_rcv;
4819 struct protosw *pr = so->so_proto;
4820 struct sockbuf asb;
4821 #ifdef notyet
4822 lck_mtx_t *mutex_held;
4823 /*
4824 * XXX: This code is currently commented out, because we may get here
4825 * as part of sofreelastref(), and at that time, pr_getlock() may no
4826 * longer be able to return us the lock; this will be fixed in future.
4827 */
4828 if (so->so_proto->pr_getlock != NULL) {
4829 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4830 } else {
4831 mutex_held = so->so_proto->pr_domain->dom_mtx;
4832 }
4833
4834 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4835 #endif /* notyet */
4836
4837 sflt_notify(so, sock_evt_flush_read, NULL);
4838
4839 socantrcvmore(so);
4840
4841 /*
4842 * Obtain lock on the socket buffer (SB_LOCK). This is required
4843 * to prevent the socket buffer from being unexpectedly altered
4844 * while it is used by another thread in socket send/receive.
4845 *
4846 * sblock() must not fail here, hence the assertion.
4847 */
4848 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4849 VERIFY(sb->sb_flags & SB_LOCK);
4850
4851 /*
4852 * Copy only the relevant fields from "sb" to "asb" which we
4853 * need for sbrelease() to function. In particular, skip
4854 * sb_sel as it contains the wait queue linkage, which would
4855 * wreak havoc if we were to issue selthreadclear() on "asb".
4856 * Make sure to not carry over SB_LOCK in "asb", as we need
4857 * to acquire it later as part of sbrelease().
4858 */
4859 bzero(&asb, sizeof(asb));
4860 asb.sb_cc = sb->sb_cc;
4861 asb.sb_hiwat = sb->sb_hiwat;
4862 asb.sb_mbcnt = sb->sb_mbcnt;
4863 asb.sb_mbmax = sb->sb_mbmax;
4864 asb.sb_ctl = sb->sb_ctl;
4865 asb.sb_lowat = sb->sb_lowat;
4866 asb.sb_mb = sb->sb_mb;
4867 asb.sb_mbtail = sb->sb_mbtail;
4868 asb.sb_lastrecord = sb->sb_lastrecord;
4869 asb.sb_so = sb->sb_so;
4870 asb.sb_flags = sb->sb_flags;
4871 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4872 asb.sb_flags |= SB_DROP;
4873
4874 /*
4875 * Ideally we'd bzero() these and preserve the ones we need;
4876 * but to do that we'd need to shuffle things around in the
4877 * sockbuf, and we can't do it now because there are KEXTS
4878 * that are directly referring to the socket structure.
4879 *
4880 * Setting SB_DROP acts as a barrier to prevent further appends.
4881 * Clearing SB_SEL is done for selthreadclear() below.
4882 */
4883 sb->sb_cc = 0;
4884 sb->sb_hiwat = 0;
4885 sb->sb_mbcnt = 0;
4886 sb->sb_mbmax = 0;
4887 sb->sb_ctl = 0;
4888 sb->sb_lowat = 0;
4889 sb->sb_mb = NULL;
4890 sb->sb_mbtail = NULL;
4891 sb->sb_lastrecord = NULL;
4892 sb->sb_timeo.tv_sec = 0;
4893 sb->sb_timeo.tv_usec = 0;
4894 sb->sb_upcall = NULL;
4895 sb->sb_upcallarg = NULL;
4896 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4897 sb->sb_flags |= SB_DROP;
4898
4899 sbunlock(sb, TRUE); /* keep socket locked */
4900
4901 /*
4902 * Note that selthreadclear() is called on the original "sb" and
4903 * not the local "asb" because of the way wait queue linkage is
4904 * implemented. Given that selwakeup() may be triggered, SB_SEL
4905 * should no longer be set (cleared above.)
4906 */
4907 selthreadclear(&sb->sb_sel);
4908
4909 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4910 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4911 }
4912
4913 sbrelease(&asb);
4914 }
4915
4916 /*
4917 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4918 * an additional variant to handle the case where the option value needs
4919 * to be some kind of integer, but not a specific size.
4920 * In addition to their use here, these functions are also called by the
4921 * protocol-level pr_ctloutput() routines.
4922 *
4923 * Returns: 0 Success
4924 * EINVAL
4925 * copyin:EFAULT
4926 */
4927 int
4928 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4929 {
4930 size_t valsize;
4931
4932 /*
4933 * If the user gives us more than we wanted, we ignore it,
4934 * but if we don't get the minimum length the caller
4935 * wants, we return EINVAL. On success, sopt->sopt_valsize
4936 * is set to however much we actually retrieved.
4937 */
4938 if ((valsize = sopt->sopt_valsize) < minlen) {
4939 return EINVAL;
4940 }
4941 if (valsize > len) {
4942 sopt->sopt_valsize = valsize = len;
4943 }
4944
4945 if (sopt->sopt_p != kernproc) {
4946 return copyin(sopt->sopt_val, buf, valsize);
4947 }
4948
4949 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4950 return 0;
4951 }
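
/*
 * Typical kernel-side use (a minimal sketch; the flag shown is
 * hypothetical, but this is the pattern followed by the SOL_SOCKET
 * cases in sosetoptlock() below):
 *
 *	int optval, error;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval),
 *	    sizeof(optval));
 *	if (error != 0)
 *		return (error);
 *	if (optval != 0)
 *		so->so_flags |= SOF_SOME_FLAG;	// hypothetical flag
 *	else
 *		so->so_flags &= ~SOF_SOME_FLAG;
 */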
4952
4953 /*
4954 * sooptcopyin_timeval
4955 * Copy in a timeval value into tv_p, taking into account whether
4956 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4957 * code here so that we can verify the 64-bit tv_sec value before we lose
4958 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4959 */
4960 static int
4961 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4962 {
4963 int error;
4964
4965 if (proc_is64bit(sopt->sopt_p)) {
4966 struct user64_timeval tv64;
4967
4968 if (sopt->sopt_valsize < sizeof(tv64)) {
4969 return EINVAL;
4970 }
4971
4972 sopt->sopt_valsize = sizeof(tv64);
4973 if (sopt->sopt_p != kernproc) {
4974 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4975 if (error != 0) {
4976 return error;
4977 }
4978 } else {
4979 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4980 sizeof(tv64));
4981 }
4982 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4983 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4984 return EDOM;
4985 }
4986
4987 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4988 tv_p->tv_usec = tv64.tv_usec;
4989 } else {
4990 struct user32_timeval tv32;
4991
4992 if (sopt->sopt_valsize < sizeof(tv32)) {
4993 return EINVAL;
4994 }
4995
4996 sopt->sopt_valsize = sizeof(tv32);
4997 if (sopt->sopt_p != kernproc) {
4998 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4999 if (error != 0) {
5000 return error;
5001 }
5002 } else {
5003 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
5004 sizeof(tv32));
5005 }
5006 #ifndef __LP64__
5007 /*
5008 * K64todo "comparison is always false due to
5009 * limited range of data type"
5010 */
5011 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
5012 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
5013 return EDOM;
5014 }
5015 #endif
5016 tv_p->tv_sec = tv32.tv_sec;
5017 tv_p->tv_usec = tv32.tv_usec;
5018 }
5019 return 0;
5020 }
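
/*
 * Illustrative user-space counterpart (not part of the kernel
 * build): SO_RCVTIMEO/SO_SNDTIMEO values arrive here as a
 * user32/user64 timeval, and tv_usec must be in [0, 1000000) or
 * EDOM is returned.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
 *	    &tv, sizeof(tv)) == -1)
 *		perror("setsockopt");	// EDOM on out-of-range usec
 */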
5021
5022 int
5023 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
5024 boolean_t ignore_delegate)
5025 {
5026 kauth_cred_t cred = NULL;
5027 proc_t ep = PROC_NULL;
5028 uid_t uid;
5029 int error = 0;
5030
5031 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
5032 ep = proc_find(so->e_pid);
5033 if (ep) {
5034 cred = kauth_cred_proc_ref(ep);
5035 }
5036 }
5037
5038 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5039
5040 /* uid is 0 for root */
5041 if (uid != 0 || !allow_root) {
5042 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5043 }
5044 if (cred) {
5045 kauth_cred_unref(&cred);
5046 }
5047 if (ep != PROC_NULL) {
5048 proc_rele(ep);
5049 }
5050
5051 return error;
5052 }
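
/*
 * A minimal sketch of the calling pattern (the privilege constant
 * is hypothetical; see the SO_AWDL_UNRESTRICTED and
 * SO_INTCOPROC_ALLOW cases in sosetoptlock() below for real uses).
 * The check runs against the delegated process's credential when
 * one is present.
 *
 *	error = soopt_cred_check(so, PRIV_NET_SOMETHING,
 *	    false, false);	// hypothetical privilege
 *	if (error == 0) {
 *		// privileged: apply the restricted option
 *	}
 */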
5053
5054 /*
5055 * Returns: 0 Success
5056 * EINVAL
5057 * ENOPROTOOPT
5058 * ENOBUFS
5059 * EDOM
5060 * sooptcopyin:EINVAL
5061 * sooptcopyin:EFAULT
5062 * sooptcopyin_timeval:EINVAL
5063 * sooptcopyin_timeval:EFAULT
5064 * sooptcopyin_timeval:EDOM
5065 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5066 * <pr_ctloutput>:???
5067 * sflt_attach_private:??? [whatever a filter author chooses]
5068 * <sf_setoption>:??? [whatever a filter author chooses]
5069 *
5070 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
5071 * <sf_setoption> returns depend on what the filter author causes
5072 * their filter to return.
5073 */
5074 int
5075 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5076 {
5077 int error, optval;
5078 int64_t long_optval;
5079 struct linger l;
5080 struct timeval tv;
5081
5082 if (sopt->sopt_dir != SOPT_SET) {
5083 sopt->sopt_dir = SOPT_SET;
5084 }
5085
5086 if (dolock) {
5087 socket_lock(so, 1);
5088 }
5089
5090 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5091 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5092 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5093 /* the socket has been shutdown, no more sockopt's */
5094 error = EINVAL;
5095 goto out;
5096 }
5097
5098 error = sflt_setsockopt(so, sopt);
5099 if (error != 0) {
5100 if (error == EJUSTRETURN) {
5101 error = 0;
5102 }
5103 goto out;
5104 }
5105
5106 if (sopt->sopt_level != SOL_SOCKET) {
5107 if (so->so_proto != NULL &&
5108 so->so_proto->pr_ctloutput != NULL) {
5109 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5110 goto out;
5111 }
5112 error = ENOPROTOOPT;
5113 } else {
5114 /*
5115 * Allow socket-level (SOL_SOCKET) options to be filtered by
5116 * the protocol layer, if needed. A zero value returned from
5117 * the handler means use default socket-level processing as
5118 * done by the rest of this routine. Otherwise, any other
5119 * return value indicates that the option is unsupported.
5120 */
5121 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5122 pru_socheckopt(so, sopt)) != 0) {
5123 goto out;
5124 }
5125
5126 error = 0;
5127 switch (sopt->sopt_name) {
5128 case SO_LINGER:
5129 case SO_LINGER_SEC: {
5130 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5131 if (error != 0) {
5132 goto out;
5133 }
5134 /* Make sure to use sane values */
5135 if (sopt->sopt_name == SO_LINGER) {
5136 so->so_linger = (short)l.l_linger;
5137 } else {
5138 so->so_linger = (short)((long)l.l_linger * hz);
5139 }
5140 if (l.l_onoff != 0) {
5141 so->so_options |= SO_LINGER;
5142 } else {
5143 so->so_options &= ~SO_LINGER;
5144 }
5145 break;
5146 }
5147 case SO_DEBUG:
5148 case SO_KEEPALIVE:
5149 case SO_DONTROUTE:
5150 case SO_USELOOPBACK:
5151 case SO_BROADCAST:
5152 case SO_REUSEADDR:
5153 case SO_REUSEPORT:
5154 case SO_OOBINLINE:
5155 case SO_TIMESTAMP:
5156 case SO_TIMESTAMP_MONOTONIC:
5157 case SO_TIMESTAMP_CONTINUOUS:
5158 case SO_DONTTRUNC:
5159 case SO_WANTMORE:
5160 case SO_WANTOOBFLAG:
5161 case SO_NOWAKEFROMSLEEP:
5162 case SO_NOAPNFALLBK:
5163 error = sooptcopyin(sopt, &optval, sizeof(optval),
5164 sizeof(optval));
5165 if (error != 0) {
5166 goto out;
5167 }
5168 if (optval) {
5169 so->so_options |= sopt->sopt_name;
5170 } else {
5171 so->so_options &= ~sopt->sopt_name;
5172 }
5173 #if SKYWALK
5174 inp_update_netns_flags(so);
5175 #endif /* SKYWALK */
5176 break;
5177
5178 case SO_SNDBUF:
5179 case SO_RCVBUF:
5180 case SO_SNDLOWAT:
5181 case SO_RCVLOWAT:
5182 error = sooptcopyin(sopt, &optval, sizeof(optval),
5183 sizeof(optval));
5184 if (error != 0) {
5185 goto out;
5186 }
5187
5188 /*
5189 * Values < 1 make no sense for any of these
5190 * options, so disallow them.
5191 */
5192 if (optval < 1) {
5193 error = EINVAL;
5194 goto out;
5195 }
5196
5197 switch (sopt->sopt_name) {
5198 case SO_SNDBUF:
5199 case SO_RCVBUF: {
5200 struct sockbuf *sb =
5201 (sopt->sopt_name == SO_SNDBUF) ?
5202 &so->so_snd : &so->so_rcv;
5203 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5204 error = ENOBUFS;
5205 goto out;
5206 }
5207 sb->sb_flags |= SB_USRSIZE;
5208 sb->sb_flags &= ~SB_AUTOSIZE;
5209 sb->sb_idealsize = (u_int32_t)optval;
5210 break;
5211 }
5212 /*
5213 * Make sure the low-water is never greater than
5214 * the high-water.
5215 */
5216 case SO_SNDLOWAT: {
5217 int space = sbspace(&so->so_snd);
5218 uint32_t hiwat = so->so_snd.sb_hiwat;
5219
5220 if (so->so_snd.sb_flags & SB_UNIX) {
5221 struct unpcb *unp =
5222 (struct unpcb *)(so->so_pcb);
5223 if (unp != NULL &&
5224 unp->unp_conn != NULL) {
5225 struct socket *so2 = unp->unp_conn->unp_socket;
5226 hiwat += unp->unp_conn->unp_cc;
5227 space = sbspace(&so2->so_rcv);
5228 }
5229 }
5230
5231 so->so_snd.sb_lowat =
5232 (optval > hiwat) ?
5233 hiwat : optval;
5234
5235 if (space >= so->so_snd.sb_lowat) {
5236 sowwakeup(so);
5237 }
5238 break;
5239 }
5240 case SO_RCVLOWAT: {
5241 int64_t data_len;
5242 so->so_rcv.sb_lowat =
5243 (optval > so->so_rcv.sb_hiwat) ?
5244 so->so_rcv.sb_hiwat : optval;
5245 if (so->so_rcv.sb_flags & SB_UNIX) {
5246 struct unpcb *unp =
5247 (struct unpcb *)(so->so_pcb);
5248 if (unp != NULL &&
5249 unp->unp_conn != NULL) {
5250 struct socket *so2 = unp->unp_conn->unp_socket;
5251 data_len = so2->so_snd.sb_cc
5252 - so2->so_snd.sb_ctl;
5253 } else {
5254 data_len = so->so_rcv.sb_cc
5255 - so->so_rcv.sb_ctl;
5256 }
5257 } else {
5258 data_len = so->so_rcv.sb_cc
5259 - so->so_rcv.sb_ctl;
5260 }
5261
5262 if (data_len >= so->so_rcv.sb_lowat) {
5263 sorwakeup(so);
5264 }
5265 break;
5266 }
5267 }
5268 break;
5269
5270 case SO_SNDTIMEO:
5271 case SO_RCVTIMEO:
5272 error = sooptcopyin_timeval(sopt, &tv);
5273 if (error != 0) {
5274 goto out;
5275 }
5276
5277 switch (sopt->sopt_name) {
5278 case SO_SNDTIMEO:
5279 so->so_snd.sb_timeo = tv;
5280 break;
5281 case SO_RCVTIMEO:
5282 so->so_rcv.sb_timeo = tv;
5283 break;
5284 }
5285 break;
5286
5287 case SO_NKE: {
5288 struct so_nke nke;
5289
5290 error = sooptcopyin(sopt, &nke, sizeof(nke),
5291 sizeof(nke));
5292 if (error != 0) {
5293 goto out;
5294 }
5295
5296 error = sflt_attach_internal(so, nke.nke_handle);
5297 break;
5298 }
5299
5300 case SO_NOSIGPIPE:
5301 error = sooptcopyin(sopt, &optval, sizeof(optval),
5302 sizeof(optval));
5303 if (error != 0) {
5304 goto out;
5305 }
5306 if (optval != 0) {
5307 so->so_flags |= SOF_NOSIGPIPE;
5308 } else {
5309 so->so_flags &= ~SOF_NOSIGPIPE;
5310 }
5311 break;
5312
5313 case SO_NOADDRERR:
5314 error = sooptcopyin(sopt, &optval, sizeof(optval),
5315 sizeof(optval));
5316 if (error != 0) {
5317 goto out;
5318 }
5319 if (optval != 0) {
5320 so->so_flags |= SOF_NOADDRAVAIL;
5321 } else {
5322 so->so_flags &= ~SOF_NOADDRAVAIL;
5323 }
5324 break;
5325
5326 case SO_REUSESHAREUID:
5327 error = sooptcopyin(sopt, &optval, sizeof(optval),
5328 sizeof(optval));
5329 if (error != 0) {
5330 goto out;
5331 }
5332 if (optval != 0) {
5333 so->so_flags |= SOF_REUSESHAREUID;
5334 } else {
5335 so->so_flags &= ~SOF_REUSESHAREUID;
5336 }
5337 break;
5338
5339 case SO_NOTIFYCONFLICT:
5340 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5341 error = EPERM;
5342 goto out;
5343 }
5344 error = sooptcopyin(sopt, &optval, sizeof(optval),
5345 sizeof(optval));
5346 if (error != 0) {
5347 goto out;
5348 }
5349 if (optval != 0) {
5350 so->so_flags |= SOF_NOTIFYCONFLICT;
5351 } else {
5352 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5353 }
5354 break;
5355
5356 case SO_RESTRICTIONS:
5357 error = sooptcopyin(sopt, &optval, sizeof(optval),
5358 sizeof(optval));
5359 if (error != 0) {
5360 goto out;
5361 }
5362
5363 error = so_set_restrictions(so, optval);
5364 break;
5365
5366 case SO_AWDL_UNRESTRICTED:
5367 if (SOCK_DOM(so) != PF_INET &&
5368 SOCK_DOM(so) != PF_INET6) {
5369 error = EOPNOTSUPP;
5370 goto out;
5371 }
5372 error = sooptcopyin(sopt, &optval, sizeof(optval),
5373 sizeof(optval));
5374 if (error != 0) {
5375 goto out;
5376 }
5377 if (optval != 0) {
5378 error = soopt_cred_check(so,
5379 PRIV_NET_RESTRICTED_AWDL, false, false);
5380 if (error == 0) {
5381 inp_set_awdl_unrestricted(
5382 sotoinpcb(so));
5383 }
5384 } else {
5385 inp_clear_awdl_unrestricted(sotoinpcb(so));
5386 }
5387 break;
5388 case SO_INTCOPROC_ALLOW:
5389 if (SOCK_DOM(so) != PF_INET6) {
5390 error = EOPNOTSUPP;
5391 goto out;
5392 }
5393 error = sooptcopyin(sopt, &optval, sizeof(optval),
5394 sizeof(optval));
5395 if (error != 0) {
5396 goto out;
5397 }
5398 if (optval != 0 &&
5399 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5400 error = soopt_cred_check(so,
5401 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5402 if (error == 0) {
5403 inp_set_intcoproc_allowed(
5404 sotoinpcb(so));
5405 }
5406 } else if (optval == 0) {
5407 inp_clear_intcoproc_allowed(sotoinpcb(so));
5408 }
5409 break;
5410
5411 case SO_LABEL:
5412 error = EOPNOTSUPP;
5413 break;
5414
5415 case SO_UPCALLCLOSEWAIT:
5416 error = sooptcopyin(sopt, &optval, sizeof(optval),
5417 sizeof(optval));
5418 if (error != 0) {
5419 goto out;
5420 }
5421 if (optval != 0) {
5422 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5423 } else {
5424 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5425 }
5426 break;
5427
5428 case SO_RANDOMPORT:
5429 error = sooptcopyin(sopt, &optval, sizeof(optval),
5430 sizeof(optval));
5431 if (error != 0) {
5432 goto out;
5433 }
5434 if (optval != 0) {
5435 so->so_flags |= SOF_BINDRANDOMPORT;
5436 } else {
5437 so->so_flags &= ~SOF_BINDRANDOMPORT;
5438 }
5439 break;
5440
5441 case SO_NP_EXTENSIONS: {
5442 struct so_np_extensions sonpx;
5443
5444 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5445 sizeof(sonpx));
5446 if (error != 0) {
5447 goto out;
5448 }
5449 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5450 error = EINVAL;
5451 goto out;
5452 }
5453 /*
5454 * Only one bit defined for now
5455 */
5456 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5457 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5458 so->so_flags |= SOF_NPX_SETOPTSHUT;
5459 } else {
5460 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5461 }
5462 }
5463 break;
5464 }
5465
5466 case SO_TRAFFIC_CLASS: {
5467 error = sooptcopyin(sopt, &optval, sizeof(optval),
5468 sizeof(optval));
5469 if (error != 0) {
5470 goto out;
5471 }
5472 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5473 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5474 error = so_set_net_service_type(so, netsvc);
5475 goto out;
5476 }
5477 error = so_set_traffic_class(so, optval);
5478 if (error != 0) {
5479 goto out;
5480 }
5481 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5482 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5483 break;
5484 }
5485
5486 case SO_RECV_TRAFFIC_CLASS: {
5487 error = sooptcopyin(sopt, &optval, sizeof(optval),
5488 sizeof(optval));
5489 if (error != 0) {
5490 goto out;
5491 }
5492 if (optval == 0) {
5493 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5494 } else {
5495 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5496 }
5497 break;
5498 }
5499
5500 #if (DEVELOPMENT || DEBUG)
5501 case SO_TRAFFIC_CLASS_DBG: {
5502 struct so_tcdbg so_tcdbg;
5503
5504 error = sooptcopyin(sopt, &so_tcdbg,
5505 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5506 if (error != 0) {
5507 goto out;
5508 }
5509 error = so_set_tcdbg(so, &so_tcdbg);
5510 if (error != 0) {
5511 goto out;
5512 }
5513 break;
5514 }
5515 #endif /* (DEVELOPMENT || DEBUG) */
5516
5517 case SO_PRIVILEGED_TRAFFIC_CLASS:
5518 error = priv_check_cred(kauth_cred_get(),
5519 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5520 if (error != 0) {
5521 goto out;
5522 }
5523 error = sooptcopyin(sopt, &optval, sizeof(optval),
5524 sizeof(optval));
5525 if (error != 0) {
5526 goto out;
5527 }
5528 if (optval == 0) {
5529 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5530 } else {
5531 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5532 }
5533 break;
5534
5535 #if (DEVELOPMENT || DEBUG)
5536 case SO_DEFUNCTIT:
5537 error = sosetdefunct(current_proc(), so, 0, FALSE);
5538 if (error == 0) {
5539 error = sodefunct(current_proc(), so, 0);
5540 }
5541
5542 break;
5543 #endif /* (DEVELOPMENT || DEBUG) */
5544
5545 case SO_DEFUNCTOK:
5546 error = sooptcopyin(sopt, &optval, sizeof(optval),
5547 sizeof(optval));
5548 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5549 if (error == 0) {
5550 error = EBADF;
5551 }
5552 goto out;
5553 }
5554 /*
5555 * Any process can set SO_DEFUNCTOK (clear
5556 * SOF_NODEFUNCT), but only root can clear
5557 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5558 */
5559 if (optval == 0 &&
5560 kauth_cred_issuser(kauth_cred_get()) == 0) {
5561 error = EPERM;
5562 goto out;
5563 }
5564 if (optval) {
5565 so->so_flags &= ~SOF_NODEFUNCT;
5566 } else {
5567 so->so_flags |= SOF_NODEFUNCT;
5568 }
5569
5570 if (SOCK_DOM(so) == PF_INET ||
5571 SOCK_DOM(so) == PF_INET6) {
5572 char s[MAX_IPv6_STR_LEN];
5573 char d[MAX_IPv6_STR_LEN];
5574 struct inpcb *inp = sotoinpcb(so);
5575
5576 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5577 "[%s %s:%d -> %s:%d] is now marked "
5578 "as %seligible for "
5579 "defunct\n", __func__, proc_selfpid(),
5580 proc_best_name(current_proc()),
5581 so->so_gencnt,
5582 (SOCK_TYPE(so) == SOCK_STREAM) ?
5583 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5584 ((SOCK_DOM(so) == PF_INET) ?
5585 (void *)&inp->inp_laddr.s_addr :
5586 (void *)&inp->in6p_laddr), s, sizeof(s)),
5587 ntohs(inp->in6p_lport),
5588 inet_ntop(SOCK_DOM(so),
5589 (SOCK_DOM(so) == PF_INET) ?
5590 (void *)&inp->inp_faddr.s_addr :
5591 (void *)&inp->in6p_faddr, d, sizeof(d)),
5592 ntohs(inp->in6p_fport),
5593 (so->so_flags & SOF_NODEFUNCT) ?
5594 "not " : "");
5595 } else {
5596 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5597 "is now marked as %seligible for "
5598 "defunct\n",
5599 __func__, proc_selfpid(),
5600 proc_best_name(current_proc()),
5601 so->so_gencnt,
5602 SOCK_DOM(so), SOCK_TYPE(so),
5603 (so->so_flags & SOF_NODEFUNCT) ?
5604 "not " : "");
5605 }
5606 break;
5607
5608 case SO_ISDEFUNCT:
5609 /* This option is not settable */
5610 error = EINVAL;
5611 break;
5612
5613 case SO_OPPORTUNISTIC:
5614 error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 sizeof(optval));
5616 if (error == 0) {
5617 error = so_set_opportunistic(so, optval);
5618 }
5619 break;
5620
5621 case SO_FLUSH:
5622 /* This option is handled by lower layer(s) */
5623 error = 0;
5624 break;
5625
5626 case SO_RECV_ANYIF:
5627 error = sooptcopyin(sopt, &optval, sizeof(optval),
5628 sizeof(optval));
5629 if (error == 0) {
5630 error = so_set_recv_anyif(so, optval);
5631 }
5632 break;
5633
5634 case SO_TRAFFIC_MGT_BACKGROUND: {
5635 /* This option is handled by lower layer(s) */
5636 error = 0;
5637 break;
5638 }
5639
5640 #if FLOW_DIVERT
5641 case SO_FLOW_DIVERT_TOKEN:
5642 error = flow_divert_token_set(so, sopt);
5643 break;
5644 #endif /* FLOW_DIVERT */
5645
5646
5647 case SO_DELEGATED:
5648 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5649 sizeof(optval))) != 0) {
5650 break;
5651 }
5652
5653 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5654 break;
5655
5656 case SO_DELEGATED_UUID: {
5657 uuid_t euuid;
5658
5659 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5660 sizeof(euuid))) != 0) {
5661 break;
5662 }
5663
5664 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5665 break;
5666 }
5667
5668 #if NECP
5669 case SO_NECP_ATTRIBUTES:
5670 if (SOCK_DOM(so) == PF_MULTIPATH) {
5671 /* Handled by MPTCP itself */
5672 break;
5673 }
5674
5675 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5676 error = EINVAL;
5677 goto out;
5678 }
5679
5680 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5681 break;
5682
5683 case SO_NECP_CLIENTUUID: {
5684 if (SOCK_DOM(so) == PF_MULTIPATH) {
5685 /* Handled by MPTCP itself */
5686 break;
5687 }
5688
5689 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5690 error = EINVAL;
5691 goto out;
5692 }
5693
5694 struct inpcb *inp = sotoinpcb(so);
5695 if (!uuid_is_null(inp->necp_client_uuid)) {
5696 // Clear out the old client UUID if present
5697 necp_inpcb_remove_cb(inp);
5698 }
5699
5700 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5701 sizeof(uuid_t), sizeof(uuid_t));
5702 if (error != 0) {
5703 goto out;
5704 }
5705
5706 if (uuid_is_null(inp->necp_client_uuid)) {
5707 error = EINVAL;
5708 goto out;
5709 }
5710
5711 pid_t current_pid = proc_pid(current_proc());
5712 error = necp_client_register_socket_flow(current_pid,
5713 inp->necp_client_uuid, inp);
5714 if (error != 0) {
5715 uuid_clear(inp->necp_client_uuid);
5716 goto out;
5717 }
5718
5719 if (inp->inp_lport != 0) {
5720 // There is a bound local port, so this is not
5721 // a fresh socket. Assign to the client.
5722 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5723 }
5724
5725 break;
5726 }
5727 case SO_NECP_LISTENUUID: {
5728 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5729 error = EINVAL;
5730 goto out;
5731 }
5732
5733 struct inpcb *inp = sotoinpcb(so);
5734 if (!uuid_is_null(inp->necp_client_uuid)) {
5735 error = EINVAL;
5736 goto out;
5737 }
5738
5739 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5740 sizeof(uuid_t), sizeof(uuid_t));
5741 if (error != 0) {
5742 goto out;
5743 }
5744
5745 if (uuid_is_null(inp->necp_client_uuid)) {
5746 error = EINVAL;
5747 goto out;
5748 }
5749
5750 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5751 inp->necp_client_uuid, inp);
5752 if (error != 0) {
5753 uuid_clear(inp->necp_client_uuid);
5754 goto out;
5755 }
5756
5757 // Mark that the port registration is held by NECP
5758 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5759
5760 break;
5761 }
5762
5763 case SO_RESOLVER_SIGNATURE: {
5764 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5765 error = EINVAL;
5766 goto out;
5767 }
5768 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5769 break;
5770 }
5771 #endif /* NECP */
5772
5773 case SO_EXTENDED_BK_IDLE:
5774 error = sooptcopyin(sopt, &optval, sizeof(optval),
5775 sizeof(optval));
5776 if (error == 0) {
5777 error = so_set_extended_bk_idle(so, optval);
5778 }
5779 break;
5780
5781 case SO_MARK_CELLFALLBACK:
5782 error = sooptcopyin(sopt, &optval, sizeof(optval),
5783 sizeof(optval));
5784 if (error != 0) {
5785 goto out;
5786 }
5787 if (optval < 0) {
5788 error = EINVAL;
5789 goto out;
5790 }
5791 if (optval == 0) {
5792 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5793 } else {
5794 so->so_flags1 |= SOF1_CELLFALLBACK;
5795 }
5796 break;
5797
5798 case SO_MARK_CELLFALLBACK_UUID:
5799 {
5800 struct so_mark_cellfallback_uuid_args args;
5801
5802 error = sooptcopyin(sopt, &args, sizeof(args),
5803 sizeof(args));
5804 if (error != 0) {
5805 goto out;
5806 }
5807 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5808 args.flow_cellfallback);
5809 break;
5810 }
5811
5812 case SO_FALLBACK_MODE:
5813 error = sooptcopyin(sopt, &optval, sizeof(optval),
5814 sizeof(optval));
5815 if (error != 0) {
5816 goto out;
5817 }
5818 if (optval < SO_FALLBACK_MODE_NONE ||
5819 optval > SO_FALLBACK_MODE_PREFER) {
5820 error = EINVAL;
5821 goto out;
5822 }
5823 so->so_fallback_mode = (u_int8_t)optval;
5824 break;
5825
5826 case SO_MARK_KNOWN_TRACKER: {
5827 error = sooptcopyin(sopt, &optval, sizeof(optval),
5828 sizeof(optval));
5829 if (error != 0) {
5830 goto out;
5831 }
5832 if (optval < 0) {
5833 error = EINVAL;
5834 goto out;
5835 }
5836 if (optval == 0) {
5837 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5838 } else {
5839 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5840 }
5841 break;
5842 }
5843
5844 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5845 error = sooptcopyin(sopt, &optval, sizeof(optval),
5846 sizeof(optval));
5847 if (error != 0) {
5848 goto out;
5849 }
5850 if (optval < 0) {
5851 error = EINVAL;
5852 goto out;
5853 }
5854 if (optval == 0) {
5855 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5856 } else {
5857 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5858 }
5859 break;
5860 }
5861
5862 case SO_MARK_APPROVED_APP_DOMAIN: {
5863 error = sooptcopyin(sopt, &optval, sizeof(optval),
5864 sizeof(optval));
5865 if (error != 0) {
5866 goto out;
5867 }
5868 if (optval < 0) {
5869 error = EINVAL;
5870 goto out;
5871 }
5872 if (optval == 0) {
5873 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5874 } else {
5875 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5876 }
5877 break;
5878 }
5879
5880 case SO_STATISTICS_EVENT:
5881 error = sooptcopyin(sopt, &long_optval,
5882 sizeof(long_optval), sizeof(long_optval));
5883 if (error != 0) {
5884 goto out;
5885 }
5886 u_int64_t nstat_event = 0;
5887 error = so_statistics_event_to_nstat_event(
5888 &long_optval, &nstat_event);
5889 if (error != 0) {
5890 goto out;
5891 }
5892 nstat_pcb_event(sotoinpcb(so), nstat_event);
5893 break;
5894
5895 case SO_NET_SERVICE_TYPE: {
5896 error = sooptcopyin(sopt, &optval, sizeof(optval),
5897 sizeof(optval));
5898 if (error != 0) {
5899 goto out;
5900 }
5901 error = so_set_net_service_type(so, optval);
5902 break;
5903 }
5904
5905 case SO_QOSMARKING_POLICY_OVERRIDE:
5906 error = priv_check_cred(kauth_cred_get(),
5907 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5908 if (error != 0) {
5909 goto out;
5910 }
5911 error = sooptcopyin(sopt, &optval, sizeof(optval),
5912 sizeof(optval));
5913 if (error != 0) {
5914 goto out;
5915 }
5916 if (optval == 0) {
5917 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5918 } else {
5919 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5920 }
5921 break;
5922
5923 case SO_MPKL_SEND_INFO: {
5924 struct so_mpkl_send_info so_mpkl_send_info;
5925
5926 error = sooptcopyin(sopt, &so_mpkl_send_info,
5927 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5928 if (error != 0) {
5929 goto out;
5930 }
5931 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5932 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5933
5934 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5935 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5936 } else {
5937 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5938 }
5939 break;
5940 }
5941 case SO_WANT_KEV_SOCKET_CLOSED: {
5942 error = sooptcopyin(sopt, &optval, sizeof(optval),
5943 sizeof(optval));
5944 if (error != 0) {
5945 goto out;
5946 }
5947 if (optval == 0) {
5948 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5949 } else {
5950 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5951 }
5952 break;
5953 }
5954 case SO_MARK_WAKE_PKT: {
5955 error = sooptcopyin(sopt, &optval, sizeof(optval),
5956 sizeof(optval));
5957 if (error != 0) {
5958 goto out;
5959 }
5960 if (optval == 0) {
5961 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5962 } else {
5963 so->so_flags |= SOF_MARK_WAKE_PKT;
5964 }
5965 break;
5966 }
5967 case SO_RECV_WAKE_PKT: {
5968 error = sooptcopyin(sopt, &optval, sizeof(optval),
5969 sizeof(optval));
5970 if (error != 0) {
5971 goto out;
5972 }
5973 if (optval == 0) {
5974 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5975 } else {
5976 so->so_flags |= SOF_RECV_WAKE_PKT;
5977 }
5978 break;
5979 }
5980 default:
5981 error = ENOPROTOOPT;
5982 break;
5983 }
5984 if (error == 0 && so->so_proto != NULL &&
5985 so->so_proto->pr_ctloutput != NULL) {
5986 (void) so->so_proto->pr_ctloutput(so, sopt);
5987 }
5988 }
5989 out:
5990 if (dolock) {
5991 socket_unlock(so, 1);
5992 }
5993 return error;
5994 }
5995
5996 /* Helper routines for getsockopt */
5997 int
5998 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5999 {
6000 int error;
6001 size_t valsize;
6002
6003 error = 0;
6004
6005 /*
6006 * Documented get behavior is that we always return a value,
6007 * possibly truncated to fit in the user's buffer.
6008 * Traditional behavior is that we always tell the user
6009 * precisely how much we copied, rather than something useful
6010 * like the total amount we had available.
6011 * Note that this interface is not idempotent; the entire answer must
6012 * be generated ahead of time.
6013 */
6014 valsize = MIN(len, sopt->sopt_valsize);
6015 sopt->sopt_valsize = valsize;
6016 if (sopt->sopt_val != USER_ADDR_NULL) {
6017 if (sopt->sopt_p != kernproc) {
6018 error = copyout(buf, sopt->sopt_val, valsize);
6019 } else {
6020 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6021 }
6022 }
6023 return error;
6024 }
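/*
 * Illustrative userspace sketch (not part of this file): because the
 * routine above truncates to the caller's buffer, getsockopt(2) with a
 * short buffer still succeeds and rewrites the length argument to the
 * number of bytes actually copied.
 *
 *    struct linger l;
 *    socklen_t len = 1;    // deliberately smaller than sizeof(l)
 *    if (getsockopt(s, SOL_SOCKET, SO_LINGER, &l, &len) == 0) {
 *        // len is now 1: only the first byte of the answer was copied
 *    }
 */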
6025
6026 static int
6027 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
6028 {
6029 int error;
6030 size_t len;
6031 struct user64_timeval tv64 = {};
6032 struct user32_timeval tv32 = {};
6033 const void * val;
6034 size_t valsize;
6035
6036 error = 0;
6037 if (proc_is64bit(sopt->sopt_p)) {
6038 len = sizeof(tv64);
6039 tv64.tv_sec = tv_p->tv_sec;
6040 tv64.tv_usec = tv_p->tv_usec;
6041 val = &tv64;
6042 } else {
6043 len = sizeof(tv32);
6044 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
6045 tv32.tv_usec = tv_p->tv_usec;
6046 val = &tv32;
6047 }
6048 valsize = MIN(len, sopt->sopt_valsize);
6049 sopt->sopt_valsize = valsize;
6050 if (sopt->sopt_val != USER_ADDR_NULL) {
6051 if (sopt->sopt_p != kernproc) {
6052 error = copyout(val, sopt->sopt_val, valsize);
6053 } else {
6054 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6055 }
6056 }
6057 return error;
6058 }
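/*
 * Illustrative userspace sketch (assumes a socket descriptor `s'): this
 * helper services getsockopt(2) of SO_RCVTIMEO/SO_SNDTIMEO, choosing the
 * 32-bit or 64-bit timeval layout to match the calling process.
 *
 *    struct timeval tv;
 *    socklen_t len = sizeof(tv);
 *    if (getsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0)
 *        printf("rcv timeout: %ld.%06d\n", (long)tv.tv_sec, (int)tv.tv_usec);
 */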
6059
6060 /*
6061 * Return: 0 Success
6062 * ENOPROTOOPT
6063 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
6064 * <pr_ctloutput>:???
6065 * <sf_getoption>:???
6066 */
6067 int
6068 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
6069 {
6070 int error, optval;
6071 struct linger l;
6072 struct timeval tv;
6073
6074 if (sopt->sopt_dir != SOPT_GET) {
6075 sopt->sopt_dir = SOPT_GET;
6076 }
6077
6078 if (dolock) {
6079 socket_lock(so, 1);
6080 }
6081
6082 error = sflt_getsockopt(so, sopt);
6083 if (error != 0) {
6084 if (error == EJUSTRETURN) {
6085 error = 0;
6086 }
6087 goto out;
6088 }
6089
6090 if (sopt->sopt_level != SOL_SOCKET) {
6091 if (so->so_proto != NULL &&
6092 so->so_proto->pr_ctloutput != NULL) {
6093 error = (*so->so_proto->pr_ctloutput)(so, sopt);
6094 goto out;
6095 }
6096 error = ENOPROTOOPT;
6097 } else {
6098 /*
6099 * Allow socket-level (SOL_SOCKET) options to be filtered by
6100 * the protocol layer, if needed. A zero value returned from
6101 * the handler means use default socket-level processing as
6102 * done by the rest of this routine. Otherwise, any other
6103 * return value indicates that the option is unsupported.
6104 */
6105 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6106 pru_socheckopt(so, sopt)) != 0) {
6107 goto out;
6108 }
6109
6110 error = 0;
6111 switch (sopt->sopt_name) {
6112 case SO_LINGER:
6113 case SO_LINGER_SEC:
6114 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6115 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6116 so->so_linger : so->so_linger / hz;
6117 error = sooptcopyout(sopt, &l, sizeof(l));
6118 break;
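/*
 * Illustrative userspace sketch: SO_LINGER reports the raw so_linger
 * value while SO_LINGER_SEC scales it to seconds (the division by hz
 * above).
 *
 *    struct linger l;
 *    socklen_t len = sizeof(l);
 *    getsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l, &len);
 *    // l.l_linger is expressed in seconds here
 */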
6119
6120 case SO_USELOOPBACK:
6121 case SO_DONTROUTE:
6122 case SO_DEBUG:
6123 case SO_KEEPALIVE:
6124 case SO_REUSEADDR:
6125 case SO_REUSEPORT:
6126 case SO_BROADCAST:
6127 case SO_OOBINLINE:
6128 case SO_TIMESTAMP:
6129 case SO_TIMESTAMP_MONOTONIC:
6130 case SO_TIMESTAMP_CONTINUOUS:
6131 case SO_DONTTRUNC:
6132 case SO_WANTMORE:
6133 case SO_WANTOOBFLAG:
6134 case SO_NOWAKEFROMSLEEP:
6135 case SO_NOAPNFALLBK:
6136 optval = so->so_options & sopt->sopt_name;
6137 integer:
6138 error = sooptcopyout(sopt, &optval, sizeof(optval));
6139 break;
6140
6141 case SO_TYPE:
6142 optval = so->so_type;
6143 goto integer;
6144
6145 case SO_NREAD:
6146 if (so->so_proto->pr_flags & PR_ATOMIC) {
6147 int pkt_total;
6148 struct mbuf *m1;
6149
6150 pkt_total = 0;
6151 m1 = so->so_rcv.sb_mb;
6152 while (m1 != NULL) {
6153 if (m1->m_type == MT_DATA ||
6154 m1->m_type == MT_HEADER ||
6155 m1->m_type == MT_OOBDATA) {
6156 pkt_total += m1->m_len;
6157 }
6158 m1 = m1->m_next;
6159 }
6160 optval = pkt_total;
6161 } else {
6162 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6163 }
6164 goto integer;
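/*
 * Illustrative userspace sketch: SO_NREAD is a Darwin extension that
 * reports the bytes of protocol data ready to read; for atomic
 * (record-oriented) protocols the walk above covers the first record,
 * i.e. the size of the next datagram.
 *
 *    int avail;
 *    socklen_t len = sizeof(avail);
 *    getsockopt(s, SOL_SOCKET, SO_NREAD, &avail, &len);
 */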
6165
6166 case SO_NUMRCVPKT:
6167 if (so->so_proto->pr_flags & PR_ATOMIC) {
6168 int cnt = 0;
6169 struct mbuf *m1;
6170
6171 m1 = so->so_rcv.sb_mb;
6172 while (m1 != NULL) {
6173 cnt += 1;
6174 m1 = m1->m_nextpkt;
6175 }
6176 optval = cnt;
6177 goto integer;
6178 } else {
6179 error = ENOPROTOOPT;
6180 break;
6181 }
6182
6183 case SO_NWRITE:
6184 optval = so->so_snd.sb_cc;
6185 goto integer;
6186
6187 case SO_ERROR:
6188 optval = so->so_error;
6189 so->so_error = 0;
6190 goto integer;
6191
6192 case SO_SNDBUF: {
6193 u_int32_t hiwat = so->so_snd.sb_hiwat;
6194
6195 if (so->so_snd.sb_flags & SB_UNIX) {
6196 struct unpcb *unp =
6197 (struct unpcb *)(so->so_pcb);
6198 if (unp != NULL && unp->unp_conn != NULL) {
6199 hiwat += unp->unp_conn->unp_cc;
6200 }
6201 }
6202
6203 optval = hiwat;
6204 goto integer;
6205 }
6206 case SO_RCVBUF:
6207 optval = so->so_rcv.sb_hiwat;
6208 goto integer;
6209
6210 case SO_SNDLOWAT:
6211 optval = so->so_snd.sb_lowat;
6212 goto integer;
6213
6214 case SO_RCVLOWAT:
6215 optval = so->so_rcv.sb_lowat;
6216 goto integer;
6217
6218 case SO_SNDTIMEO:
6219 case SO_RCVTIMEO:
6220 tv = (sopt->sopt_name == SO_SNDTIMEO ?
6221 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6222
6223 error = sooptcopyout_timeval(sopt, &tv);
6224 break;
6225
6226 case SO_NOSIGPIPE:
6227 optval = (so->so_flags & SOF_NOSIGPIPE);
6228 goto integer;
6229
6230 case SO_NOADDRERR:
6231 optval = (so->so_flags & SOF_NOADDRAVAIL);
6232 goto integer;
6233
6234 case SO_REUSESHAREUID:
6235 optval = (so->so_flags & SOF_REUSESHAREUID);
6236 goto integer;
6237
6238
6239 case SO_NOTIFYCONFLICT:
6240 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6241 goto integer;
6242
6243 case SO_RESTRICTIONS:
6244 optval = so_get_restrictions(so);
6245 goto integer;
6246
6247 case SO_AWDL_UNRESTRICTED:
6248 if (SOCK_DOM(so) == PF_INET ||
6249 SOCK_DOM(so) == PF_INET6) {
6250 optval = inp_get_awdl_unrestricted(
6251 sotoinpcb(so));
6252 goto integer;
6253 } else {
6254 error = EOPNOTSUPP;
6255 }
6256 break;
6257
6258 case SO_INTCOPROC_ALLOW:
6259 if (SOCK_DOM(so) == PF_INET6) {
6260 optval = inp_get_intcoproc_allowed(
6261 sotoinpcb(so));
6262 goto integer;
6263 } else {
6264 error = EOPNOTSUPP;
6265 }
6266 break;
6267
6268 case SO_LABEL:
6269 error = EOPNOTSUPP;
6270 break;
6271
6272 case SO_PEERLABEL:
6273 error = EOPNOTSUPP;
6274 break;
6275
6276 #ifdef __APPLE_API_PRIVATE
6277 case SO_UPCALLCLOSEWAIT:
6278 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6279 goto integer;
6280 #endif
6281 case SO_RANDOMPORT:
6282 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6283 goto integer;
6284
6285 case SO_NP_EXTENSIONS: {
6286 struct so_np_extensions sonpx = {};
6287
6288 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6289 SONPX_SETOPTSHUT : 0;
6290 sonpx.npx_mask = SONPX_MASK_VALID;
6291
6292 error = sooptcopyout(sopt, &sonpx,
6293 sizeof(struct so_np_extensions));
6294 break;
6295 }
6296
6297 case SO_TRAFFIC_CLASS:
6298 optval = so->so_traffic_class;
6299 goto integer;
6300
6301 case SO_RECV_TRAFFIC_CLASS:
6302 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6303 goto integer;
6304
6305 #if (DEVELOPMENT || DEBUG)
6306 case SO_TRAFFIC_CLASS_DBG:
6307 error = sogetopt_tcdbg(so, sopt);
6308 break;
6309 #endif /* (DEVELOPMENT || DEBUG) */
6310
6311 case SO_PRIVILEGED_TRAFFIC_CLASS:
6312 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6313 goto integer;
6314
6315 case SO_DEFUNCTOK:
6316 optval = !(so->so_flags & SOF_NODEFUNCT);
6317 goto integer;
6318
6319 case SO_ISDEFUNCT:
6320 optval = (so->so_flags & SOF_DEFUNCT);
6321 goto integer;
6322
6323 case SO_OPPORTUNISTIC:
6324 optval = so_get_opportunistic(so);
6325 goto integer;
6326
6327 case SO_FLUSH:
6328 /* This option is not gettable */
6329 error = EINVAL;
6330 break;
6331
6332 case SO_RECV_ANYIF:
6333 optval = so_get_recv_anyif(so);
6334 goto integer;
6335
6336 case SO_TRAFFIC_MGT_BACKGROUND:
6337 /* This option is handled by lower layer(s) */
6338 if (so->so_proto != NULL &&
6339 so->so_proto->pr_ctloutput != NULL) {
6340 (void) so->so_proto->pr_ctloutput(so, sopt);
6341 }
6342 break;
6343
6344 #if FLOW_DIVERT
6345 case SO_FLOW_DIVERT_TOKEN:
6346 error = flow_divert_token_get(so, sopt);
6347 break;
6348 #endif /* FLOW_DIVERT */
6349
6350 #if NECP
6351 case SO_NECP_ATTRIBUTES:
6352 if (SOCK_DOM(so) == PF_MULTIPATH) {
6353 /* Handled by MPTCP itself */
6354 break;
6355 }
6356
6357 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6358 error = EINVAL;
6359 goto out;
6360 }
6361
6362 error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6363 break;
6364
6365 case SO_NECP_CLIENTUUID: {
6366 uuid_t *ncu;
6367
6368 if (SOCK_DOM(so) == PF_MULTIPATH) {
6369 ncu = &mpsotomppcb(so)->necp_client_uuid;
6370 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6371 ncu = &sotoinpcb(so)->necp_client_uuid;
6372 } else {
6373 error = EINVAL;
6374 goto out;
6375 }
6376
6377 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6378 break;
6379 }
6380
6381 case SO_NECP_LISTENUUID: {
6382 uuid_t *nlu;
6383
6384 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6385 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6386 nlu = &sotoinpcb(so)->necp_client_uuid;
6387 } else {
6388 error = ENOENT;
6389 goto out;
6390 }
6391 } else {
6392 error = EINVAL;
6393 goto out;
6394 }
6395
6396 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6397 break;
6398 }
6399
6400 case SO_RESOLVER_SIGNATURE: {
6401 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6402 error = EINVAL;
6403 goto out;
6404 }
6405 error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6406 break;
6407 }
6408
6409 #endif /* NECP */
6410
6411 #if CONTENT_FILTER
6412 case SO_CFIL_SOCK_ID: {
6413 cfil_sock_id_t sock_id;
6414
6415 sock_id = cfil_sock_id_from_socket(so);
6416
6417 error = sooptcopyout(sopt, &sock_id,
6418 sizeof(cfil_sock_id_t));
6419 break;
6420 }
6421 #endif /* CONTENT_FILTER */
6422
6423 case SO_EXTENDED_BK_IDLE:
6424 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6425 goto integer;
6426 case SO_MARK_CELLFALLBACK:
6427 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6428 ? 1 : 0;
6429 goto integer;
6430 case SO_FALLBACK_MODE:
6431 optval = so->so_fallback_mode;
6432 goto integer;
6433 case SO_MARK_KNOWN_TRACKER: {
6434 optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6435 ? 1 : 0;
6436 goto integer;
6437 }
6438 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6439 optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6440 ? 1 : 0;
6441 goto integer;
6442 }
6443 case SO_MARK_APPROVED_APP_DOMAIN: {
6444 optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6445 ? 1 : 0;
6446 goto integer;
6447 }
6448 case SO_NET_SERVICE_TYPE: {
6449 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6450 optval = so->so_netsvctype;
6451 } else {
6452 optval = NET_SERVICE_TYPE_BE;
6453 }
6454 goto integer;
6455 }
6456 case SO_NETSVC_MARKING_LEVEL:
6457 optval = so_get_netsvc_marking_level(so);
6458 goto integer;
6459
6460 case SO_MPKL_SEND_INFO: {
6461 struct so_mpkl_send_info so_mpkl_send_info;
6462
6463 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6464 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6465 error = sooptcopyout(sopt, &so_mpkl_send_info,
6466 sizeof(struct so_mpkl_send_info));
6467 break;
6468 }
6469 case SO_MARK_WAKE_PKT:
6470 optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6471 goto integer;
6472 case SO_RECV_WAKE_PKT:
6473 optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6474 goto integer;
6475 default:
6476 error = ENOPROTOOPT;
6477 break;
6478 }
6479 }
6480 out:
6481 if (dolock) {
6482 socket_unlock(so, 1);
6483 }
6484 return error;
6485 }
6486
6487 /*
6488 * The size limit on our soopt_getm is different from that on FreeBSD.
6489 * We limit the size of options to MCLBYTES. This will have to change
6490 * if we need to define options that need more space than MCLBYTES.
6491 */
6492 int
6493 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6494 {
6495 struct mbuf *m, *m_prev;
6496 int sopt_size = (int)sopt->sopt_valsize;
6497 int how;
6498
6499 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6500 return EMSGSIZE;
6501 }
6502
6503 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6504 MGET(m, how, MT_DATA);
6505 if (m == NULL) {
6506 return ENOBUFS;
6507 }
6508 if (sopt_size > MLEN) {
6509 MCLGET(m, how);
6510 if ((m->m_flags & M_EXT) == 0) {
6511 m_free(m);
6512 return ENOBUFS;
6513 }
6514 m->m_len = min(MCLBYTES, sopt_size);
6515 } else {
6516 m->m_len = min(MLEN, sopt_size);
6517 }
6518 sopt_size -= m->m_len;
6519 *mp = m;
6520 m_prev = m;
6521
6522 while (sopt_size > 0) {
6523 MGET(m, how, MT_DATA);
6524 if (m == NULL) {
6525 m_freem(*mp);
6526 return ENOBUFS;
6527 }
6528 if (sopt_size > MLEN) {
6529 MCLGET(m, how);
6530 if ((m->m_flags & M_EXT) == 0) {
6531 m_freem(*mp);
6532 m_freem(m);
6533 return ENOBUFS;
6534 }
6535 m->m_len = min(MCLBYTES, sopt_size);
6536 } else {
6537 m->m_len = min(MLEN, sopt_size);
6538 }
6539 sopt_size -= m->m_len;
6540 m_prev->m_next = m;
6541 m_prev = m;
6542 }
6543 return 0;
6544 }
6545
6546 /* copyin sopt data into mbuf chain */
6547 int
6548 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6549 {
6550 struct mbuf *m0 = m;
6551
6552 if (sopt->sopt_val == USER_ADDR_NULL) {
6553 return 0;
6554 }
6555 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6556 if (sopt->sopt_p != kernproc) {
6557 int error;
6558
6559 error = copyin(sopt->sopt_val, mtod(m, char *),
6560 m->m_len);
6561 if (error != 0) {
6562 m_freem(m0);
6563 return error;
6564 }
6565 } else {
6566 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6567 mtod(m, char *), m->m_len);
6568 }
6569 sopt->sopt_valsize -= m->m_len;
6570 sopt->sopt_val += m->m_len;
6571 m = m->m_next;
6572 }
6573 /* the chain should have been allocated large enough by ip6_sooptmcopyin() */
6574 if (m != NULL) {
6575 panic("soopt_mcopyin");
6576 /* NOTREACHED */
6577 }
6578 return 0;
6579 }
6580
6581 /* copyout mbuf chain data into soopt */
6582 int
6583 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6584 {
6585 struct mbuf *m0 = m;
6586 size_t valsize = 0;
6587
6588 if (sopt->sopt_val == USER_ADDR_NULL) {
6589 return 0;
6590 }
6591 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6592 if (sopt->sopt_p != kernproc) {
6593 int error;
6594
6595 error = copyout(mtod(m, char *), sopt->sopt_val,
6596 m->m_len);
6597 if (error != 0) {
6598 m_freem(m0);
6599 return error;
6600 }
6601 } else {
6602 bcopy(mtod(m, char *),
6603 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6604 }
6605 sopt->sopt_valsize -= m->m_len;
6606 sopt->sopt_val += m->m_len;
6607 valsize += m->m_len;
6608 m = m->m_next;
6609 }
6610 if (m != NULL) {
6611 /* user land should have provided a large enough soopt buffer */
6612 m_freem(m0);
6613 return EINVAL;
6614 }
6615 sopt->sopt_valsize = valsize;
6616 return 0;
6617 }
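/*
 * Hedged sketch of the intended calling pattern for the three helpers
 * above (roughly how the IPv6 option code drives them; exact call
 * sites may differ):
 *
 *    struct mbuf *m = NULL;
 *    int error = soopt_getm(sopt, &m);      // size chain to sopt_valsize
 *    if (error == 0)
 *        error = soopt_mcopyin(sopt, m);    // user -> mbufs; frees m on error
 *    if (error == 0) {
 *        // ... protocol consumes or rewrites the chain ...
 *        error = soopt_mcopyout(sopt, m);   // mbufs -> user; frees m on error
 *    }
 */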
6618
6619 void
6620 sohasoutofband(struct socket *so)
6621 {
6622 if (so->so_pgid < 0) {
6623 gsignal(-so->so_pgid, SIGURG);
6624 } else if (so->so_pgid > 0) {
6625 proc_signal(so->so_pgid, SIGURG);
6626 }
6627 selwakeup(&so->so_rcv.sb_sel);
6628 if (so->so_rcv.sb_flags & SB_KNOTE) {
6629 KNOTE(&so->so_rcv.sb_sel.si_note,
6630 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6631 }
6632 }
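/*
 * Illustrative userspace sketch: to receive the SIGURG raised above, a
 * process must first claim ownership of the socket (on_urgent is a
 * hypothetical handler):
 *
 *    signal(SIGURG, on_urgent);
 *    fcntl(s, F_SETOWN, getpid());
 *    // out-of-band data arriving on `s' now delivers SIGURG
 */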
6633
6634 int
6635 sopoll(struct socket *so, int events, kauth_cred_t cred, void *wql)
6636 {
6637 #pragma unused(cred)
6638 struct proc *p = current_proc();
6639 int revents = 0;
6640
6641 socket_lock(so, 1);
6642 so_update_last_owner_locked(so, PROC_NULL);
6643 so_update_policy(so);
6644
6645 if (events & (POLLIN | POLLRDNORM)) {
6646 if (soreadable(so)) {
6647 revents |= events & (POLLIN | POLLRDNORM);
6648 }
6649 }
6650
6651 if (events & (POLLOUT | POLLWRNORM)) {
6652 if (sowriteable(so)) {
6653 revents |= events & (POLLOUT | POLLWRNORM);
6654 }
6655 }
6656
6657 if (events & (POLLPRI | POLLRDBAND)) {
6658 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6659 revents |= events & (POLLPRI | POLLRDBAND);
6660 }
6661 }
6662
6663 if (revents == 0) {
6664 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6665 /*
6666 * Darwin sets the flag first,
6667 * BSD calls selrecord first
6668 */
6669 so->so_rcv.sb_flags |= SB_SEL;
6670 selrecord(p, &so->so_rcv.sb_sel, wql);
6671 }
6672
6673 if (events & (POLLOUT | POLLWRNORM)) {
6674 /*
6675 * Darwin sets the flag first,
6676 * BSD calls selrecord first
6677 */
6678 so->so_snd.sb_flags |= SB_SEL;
6679 selrecord(p, &so->so_snd.sb_sel, wql);
6680 }
6681 }
6682
6683 socket_unlock(so, 1);
6684 return revents;
6685 }
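/*
 * Illustrative userspace sketch: POLLPRI/POLLRDBAND surface the
 * out-of-band mark test above.
 *
 *    struct pollfd pfd = { .fd = s, .events = POLLIN | POLLPRI };
 *    if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI)) {
 *        // urgent data pending, or receive stream is at the OOB mark
 *    }
 */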
6686
6687 int
6688 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6689 {
6690 struct socket *so = (struct socket *)fp_get_data(fp);
6691 int result;
6692
6693 socket_lock(so, 1);
6694 so_update_last_owner_locked(so, PROC_NULL);
6695 so_update_policy(so);
6696
6697 switch (kn->kn_filter) {
6698 case EVFILT_READ:
6699 kn->kn_filtid = EVFILTID_SOREAD;
6700 break;
6701 case EVFILT_WRITE:
6702 kn->kn_filtid = EVFILTID_SOWRITE;
6703 break;
6704 case EVFILT_SOCK:
6705 kn->kn_filtid = EVFILTID_SCK;
6706 break;
6707 case EVFILT_EXCEPT:
6708 kn->kn_filtid = EVFILTID_SOEXCEPT;
6709 break;
6710 default:
6711 socket_unlock(so, 1);
6712 knote_set_error(kn, EINVAL);
6713 return 0;
6714 }
6715
6716 /*
6717 * call the appropriate sub-filter attach
6718 * with the socket still locked
6719 */
6720 result = knote_fops(kn)->f_attach(kn, kev);
6721
6722 socket_unlock(so, 1);
6723
6724 return result;
6725 }
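/*
 * Illustrative userspace sketch of the filter dispatch above:
 *
 *    int kq = kqueue();
 *    struct kevent kev;
 *    EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *    kevent(kq, &kev, 1, NULL, 0, NULL);    // register
 *    kevent(kq, NULL, 0, &kev, 1, NULL);    // wait; kev.data = readable bytes
 */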
6726
6727 static int
6728 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6729 {
6730 int retval = 0;
6731 int64_t data = 0;
6732
6733 if (so->so_options & SO_ACCEPTCONN) {
6734 /*
6735 * Radar 6615193: handle the listen case dynamically
6736 * for the kqueue read filter. This allows listen() to be
6737 * called after registering the kqueue EVFILT_READ.
6738 */
6739
6740 retval = !TAILQ_EMPTY(&so->so_comp);
6741 data = so->so_qlen;
6742 goto out;
6743 }
6744
6745 /* socket isn't a listener */
6746 /*
6747 * NOTE_LOWAT specifies new low water mark in data, i.e.
6748 * the bytes of protocol data. We therefore exclude any
6749 * control bytes.
6750 */
6751 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6752
6753 if (kn->kn_sfflags & NOTE_OOB) {
6754 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6755 kn->kn_fflags |= NOTE_OOB;
6756 data -= so->so_oobmark;
6757 retval = 1;
6758 goto out;
6759 }
6760 }
6761
6762 if ((so->so_state & SS_CANTRCVMORE)
6763 #if CONTENT_FILTER
6764 && cfil_sock_data_pending(&so->so_rcv) == 0
6765 #endif /* CONTENT_FILTER */
6766 ) {
6767 kn->kn_flags |= EV_EOF;
6768 kn->kn_fflags = so->so_error;
6769 retval = 1;
6770 goto out;
6771 }
6772
6773 if (so->so_error) { /* temporary udp error */
6774 retval = 1;
6775 goto out;
6776 }
6777
6778 int64_t lowwat = so->so_rcv.sb_lowat;
6779 /*
6780 * Ensure that when NOTE_LOWAT is used, the derived
6781 * low water mark is bounded by the receive buffer's
6782 * high and low water mark values.
6783 */
6784 if (kn->kn_sfflags & NOTE_LOWAT) {
6785 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6786 lowwat = so->so_rcv.sb_hiwat;
6787 } else if (kn->kn_sdata > lowwat) {
6788 lowwat = kn->kn_sdata;
6789 }
6790 }
6791
6792 /*
6793 * While the `data` field is the amount of data to read,
6794 * 0-sized packets need to wake up the kqueue, see 58140856,
6795 * so we need to take control bytes into account too.
6796 */
6797 retval = (so->so_rcv.sb_cc >= lowwat);
6798
6799 out:
6800 if (retval && kev) {
6801 knote_fill_kevent(kn, kev, data);
6802 }
6803 return retval;
6804 }
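/*
 * Illustrative userspace sketch of the NOTE_LOWAT handling above: the
 * knote fires only once at least `data' bytes are buffered, clipped to
 * the receive buffer's high water mark.
 *
 *    struct kevent kev;
 *    EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *    kevent(kq, &kev, 1, NULL, 0, NULL);    // fire at >= 4096 bytes
 */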
6805
6806 static int
6807 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6808 {
6809 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6810
6811 /* socket locked */
6812
6813 /*
6814 * If the caller explicitly asked for OOB results (e.g. poll())
6815 * from EVFILT_READ, then save that off in the hookid field
6816 * and reserve the kn_flags EV_OOBAND bit for output only.
6817 */
6818 if (kn->kn_filter == EVFILT_READ &&
6819 kn->kn_flags & EV_OOBAND) {
6820 kn->kn_flags &= ~EV_OOBAND;
6821 kn->kn_hook32 = EV_OOBAND;
6822 } else {
6823 kn->kn_hook32 = 0;
6824 }
6825 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6826 so->so_rcv.sb_flags |= SB_KNOTE;
6827 }
6828
6829 /* indicate if event is already fired */
6830 return filt_soread_common(kn, NULL, so);
6831 }
6832
6833 static void
6834 filt_sordetach(struct knote *kn)
6835 {
6836 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6837
6838 socket_lock(so, 1);
6839 if (so->so_rcv.sb_flags & SB_KNOTE) {
6840 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6841 so->so_rcv.sb_flags &= ~SB_KNOTE;
6842 }
6843 }
6844 socket_unlock(so, 1);
6845 }
6846
6847 /*ARGSUSED*/
6848 static int
6849 filt_soread(struct knote *kn, long hint)
6850 {
6851 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6852 int retval;
6853
6854 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6855 socket_lock(so, 1);
6856 }
6857
6858 retval = filt_soread_common(kn, NULL, so);
6859
6860 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6861 socket_unlock(so, 1);
6862 }
6863
6864 return retval;
6865 }
6866
6867 static int
6868 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6869 {
6870 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6871 int retval;
6872
6873 socket_lock(so, 1);
6874
6875 /* save off the new input fflags and data */
6876 kn->kn_sfflags = kev->fflags;
6877 kn->kn_sdata = kev->data;
6878
6879 /* determine if changes result in fired events */
6880 retval = filt_soread_common(kn, NULL, so);
6881
6882 socket_unlock(so, 1);
6883
6884 return retval;
6885 }
6886
6887 static int
6888 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6889 {
6890 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6891 int retval;
6892
6893 socket_lock(so, 1);
6894 retval = filt_soread_common(kn, kev, so);
6895 socket_unlock(so, 1);
6896
6897 return retval;
6898 }
6899
6900 int
6901 so_wait_for_if_feedback(struct socket *so)
6902 {
6903 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6904 (so->so_state & SS_ISCONNECTED)) {
6905 struct inpcb *inp = sotoinpcb(so);
6906 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6907 return 1;
6908 }
6909 }
6910 return 0;
6911 }
6912
6913 static int
6914 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6915 {
6916 int ret = 0;
6917 int64_t data = sbspace(&so->so_snd);
6918
6919 if (so->so_state & SS_CANTSENDMORE) {
6920 kn->kn_flags |= EV_EOF;
6921 kn->kn_fflags = so->so_error;
6922 ret = 1;
6923 goto out;
6924 }
6925
6926 if (so->so_error) { /* temporary udp error */
6927 ret = 1;
6928 goto out;
6929 }
6930
6931 if (!socanwrite(so)) {
6932 ret = 0;
6933 goto out;
6934 }
6935
6936 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6937 ret = 1;
6938 goto out;
6939 }
6940
6941 int64_t lowwat = so->so_snd.sb_lowat;
6942 const int64_t hiwat = so->so_snd.sb_hiwat;
6943 /*
6944 * Deal with connected UNIX domain sockets which
6945 * rely on the fact that the sender's socket buffer is
6946 * actually the receiver's socket buffer.
6947 */
6948 if (SOCK_DOM(so) == PF_LOCAL) {
6949 struct unpcb *unp = sotounpcb(so);
6950 if (unp != NULL && unp->unp_conn != NULL &&
6951 unp->unp_conn->unp_socket != NULL) {
6952 struct socket *so2 = unp->unp_conn->unp_socket;
6953 /*
6954 * At this point we know that `so' is locked
6955 * and that `unp_conn` isn't going to change.
6956 * However, we don't lock `so2` because doing so
6957 * may require unlocking `so'
6958 * (see unp_get_locks_in_order()).
6959 *
6960 * Two cases can happen:
6961 *
6962 * 1) we return 1 and tell the application that
6963 * it can write. Meanwhile, another thread
6964 * fills up the socket buffer. This will either
6965 * lead to a blocking send or EWOULDBLOCK
6966 * which the application should deal with.
6967 * 2) we return 0 and tell the application that
6968 * the socket is not writable. Meanwhile,
6969 * another thread depletes the receive socket
6970 * buffer. In this case the application will
6971 * be woken up by sb_notify().
6972 *
6973 * MIN() is required because otherwise sosendcheck()
6974 * may return EWOULDBLOCK since it only considers
6975 * so->so_snd.
6976 */
6977 data = MIN(data, sbspace(&so2->so_rcv));
6978 }
6979 }
6980
6981 if (kn->kn_sfflags & NOTE_LOWAT) {
6982 if (kn->kn_sdata > hiwat) {
6983 lowwat = hiwat;
6984 } else if (kn->kn_sdata > lowwat) {
6985 lowwat = kn->kn_sdata;
6986 }
6987 }
6988
6989 if (data > 0 && data >= lowwat) {
6990 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6991 #if (DEBUG || DEVELOPMENT)
6992 && so_notsent_lowat_check == 1
6993 #endif /* DEBUG || DEVELOPMENT */
6994 ) {
6995 if ((SOCK_DOM(so) == PF_INET ||
6996 SOCK_DOM(so) == PF_INET6) &&
6997 so->so_type == SOCK_STREAM) {
6998 ret = tcp_notsent_lowat_check(so);
6999 }
7000 #if MPTCP
7001 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
7002 (SOCK_PROTO(so) == IPPROTO_TCP)) {
7003 ret = mptcp_notsent_lowat_check(so);
7004 }
7005 #endif
7006 else {
7007 ret = 1;
7008 goto out;
7009 }
7010 } else {
7011 ret = 1;
7012 }
7013 }
7014 if (so_wait_for_if_feedback(so)) {
7015 ret = 0;
7016 }
7017
7018 out:
7019 if (ret && kev) {
7020 knote_fill_kevent(kn, kev, data);
7021 }
7022 return ret;
7023 }
7024
7025 static int
7026 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7027 {
7028 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7029
7030 /* socket locked */
7031 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
7032 so->so_snd.sb_flags |= SB_KNOTE;
7033 }
7034
7035 /* determine if it's already fired */
7036 return filt_sowrite_common(kn, NULL, so);
7037 }
7038
7039 static void
7040 filt_sowdetach(struct knote *kn)
7041 {
7042 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7043 socket_lock(so, 1);
7044
7045 if (so->so_snd.sb_flags & SB_KNOTE) {
7046 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
7047 so->so_snd.sb_flags &= ~SB_KNOTE;
7048 }
7049 }
7050 socket_unlock(so, 1);
7051 }
7052
7053 /*ARGSUSED*/
7054 static int
7055 filt_sowrite(struct knote *kn, long hint)
7056 {
7057 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7058 int ret;
7059
7060 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7061 socket_lock(so, 1);
7062 }
7063
7064 ret = filt_sowrite_common(kn, NULL, so);
7065
7066 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7067 socket_unlock(so, 1);
7068 }
7069
7070 return ret;
7071 }
7072
7073 static int
7074 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
7075 {
7076 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7077 int ret;
7078
7079 socket_lock(so, 1);
7080
7081 /* save off the new input fflags and data */
7082 kn->kn_sfflags = kev->fflags;
7083 kn->kn_sdata = kev->data;
7084
7085 /* determine if these changes result in a triggered event */
7086 ret = filt_sowrite_common(kn, NULL, so);
7087
7088 socket_unlock(so, 1);
7089
7090 return ret;
7091 }
7092
7093 static int
7094 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
7095 {
7096 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7097 int ret;
7098
7099 socket_lock(so, 1);
7100 ret = filt_sowrite_common(kn, kev, so);
7101 socket_unlock(so, 1);
7102
7103 return ret;
7104 }
7105
7106 static int
7107 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
7108 struct socket *so, long ev_hint)
7109 {
7110 int ret = 0;
7111 int64_t data = 0;
7112 uint32_t level_trigger = 0;
7113
7114 if (ev_hint & SO_FILT_HINT_CONNRESET) {
7115 kn->kn_fflags |= NOTE_CONNRESET;
7116 }
7117 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
7118 kn->kn_fflags |= NOTE_TIMEOUT;
7119 }
7120 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
7121 kn->kn_fflags |= NOTE_NOSRCADDR;
7122 }
7123 if (ev_hint & SO_FILT_HINT_IFDENIED) {
7124 kn->kn_fflags |= NOTE_IFDENIED;
7125 }
7126 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
7127 kn->kn_fflags |= NOTE_KEEPALIVE;
7128 }
7129 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7130 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7131 }
7132 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7133 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7134 }
7135 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7136 (so->so_state & SS_ISCONNECTED)) {
7137 kn->kn_fflags |= NOTE_CONNECTED;
7138 level_trigger |= NOTE_CONNECTED;
7139 }
7140 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7141 (so->so_state & SS_ISDISCONNECTED)) {
7142 kn->kn_fflags |= NOTE_DISCONNECTED;
7143 level_trigger |= NOTE_DISCONNECTED;
7144 }
7145 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7146 if (so->so_proto != NULL &&
7147 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7148 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7149 }
7150 }
7151 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7152 tcp_notify_ack_active(so)) {
7153 kn->kn_fflags |= NOTE_NOTIFY_ACK;
7154 }
7155 if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
7156 kn->kn_fflags |= NOTE_WAKE_PKT;
7157 }
7158
7159 if ((so->so_state & SS_CANTRCVMORE)
7160 #if CONTENT_FILTER
7161 && cfil_sock_data_pending(&so->so_rcv) == 0
7162 #endif /* CONTENT_FILTER */
7163 ) {
7164 kn->kn_fflags |= NOTE_READCLOSED;
7165 level_trigger |= NOTE_READCLOSED;
7166 }
7167
7168 if (so->so_state & SS_CANTSENDMORE) {
7169 kn->kn_fflags |= NOTE_WRITECLOSED;
7170 level_trigger |= NOTE_WRITECLOSED;
7171 }
7172
7173 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7174 (so->so_flags & SOF_SUSPENDED)) {
7175 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7176
7177 /* If resume event was delivered before, reset it */
7178 kn->kn_hook32 &= ~NOTE_RESUME;
7179
7180 kn->kn_fflags |= NOTE_SUSPEND;
7181 level_trigger |= NOTE_SUSPEND;
7182 }
7183
7184 if ((ev_hint & SO_FILT_HINT_RESUME) ||
7185 (so->so_flags & SOF_SUSPENDED) == 0) {
7186 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7187
7188 /* If suspend event was delivered before, reset it */
7189 kn->kn_hook32 &= ~NOTE_SUSPEND;
7190
7191 kn->kn_fflags |= NOTE_RESUME;
7192 level_trigger |= NOTE_RESUME;
7193 }
7194
7195 if (so->so_error != 0) {
7196 ret = 1;
7197 data = so->so_error;
7198 kn->kn_flags |= EV_EOF;
7199 } else {
7200 u_int32_t data32 = 0;
7201 get_sockev_state(so, &data32);
7202 data = data32;
7203 }
7204
7205 /* Reset any events that are not requested on this knote */
7206 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7207 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7208
7209 /* Find the level-triggered events that have already been delivered */
7210 level_trigger &= kn->kn_hook32;
7211 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7212
7213 /* Do not deliver level-triggered events more than once */
7214 if ((kn->kn_fflags & ~level_trigger) != 0) {
7215 ret = 1;
7216 }
7217
7218 if (ret && kev) {
7219 /*
7220 * Store the state of the events being delivered. This
7221 * state can be used to deliver level-triggered events
7222 * at least once and still avoid waking up the application
7223 * multiple times as long as the event is active.
7224 */
7225 if (kn->kn_fflags != 0) {
7226 kn->kn_hook32 |= (kn->kn_fflags &
7227 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7228 }
7229
7230 /*
7231 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7232 * only one of them, and remember which one was
7233 * delivered last.
7234 */
7235 if (kn->kn_fflags & NOTE_SUSPEND) {
7236 kn->kn_hook32 &= ~NOTE_RESUME;
7237 }
7238 if (kn->kn_fflags & NOTE_RESUME) {
7239 kn->kn_hook32 &= ~NOTE_SUSPEND;
7240 }
7241
7242 knote_fill_kevent(kn, kev, data);
7243 }
7244 return ret;
7245 }
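/*
 * Illustrative sketch (EVFILT_SOCK and its NOTE_* values are private
 * SPI; shown only to make the level-trigger logic above concrete):
 *
 *    struct kevent kev;
 *    EV_SET(&kev, s, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *        NOTE_CONNECTED | NOTE_DISCONNECTED, 0, NULL);
 *    // connected/disconnected are level-triggered: each is delivered
 *    // once and then suppressed until the state changes again
 */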
7246
7247 static int
7248 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7249 {
7250 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7251
7252 /* socket locked */
7253 kn->kn_hook32 = 0;
7254 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7255 so->so_flags |= SOF_KNOTE;
7256 }
7257
7258 /* determine if event already fired */
7259 return filt_sockev_common(kn, NULL, so, 0);
7260 }
7261
7262 static void
7263 filt_sockdetach(struct knote *kn)
7264 {
7265 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7266 socket_lock(so, 1);
7267
7268 if ((so->so_flags & SOF_KNOTE) != 0) {
7269 if (KNOTE_DETACH(&so->so_klist, kn)) {
7270 so->so_flags &= ~SOF_KNOTE;
7271 }
7272 }
7273 socket_unlock(so, 1);
7274 }
7275
7276 static int
7277 filt_sockev(struct knote *kn, long hint)
7278 {
7279 int ret = 0, locked = 0;
7280 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7281 long ev_hint = (hint & SO_FILT_HINT_EV);
7282
7283 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7284 socket_lock(so, 1);
7285 locked = 1;
7286 }
7287
7288 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7289
7290 if (locked) {
7291 socket_unlock(so, 1);
7292 }
7293
7294 return ret;
7295 }
7296
7297
7298
7299 /*
7300 * filt_socktouch - update event state
7301 */
7302 static int
7303 filt_socktouch(
7304 struct knote *kn,
7305 struct kevent_qos_s *kev)
7306 {
7307 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7308 uint32_t changed_flags;
7309 int ret;
7310
7311 socket_lock(so, 1);
7312
7313 /* capture which bits differ between the old interest set and the delivered state */
7314 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7315
7316 /* save off the new input fflags and data */
7317 kn->kn_sfflags = kev->fflags;
7318 kn->kn_sdata = kev->data;
7319
7320 /* restrict the current results to the (smaller?) set of new interest */
7321 /*
7322 * For compatibility with previous implementations, we leave kn_fflags
7323 * as it was before.
7324 */
7325 //kn->kn_fflags &= kev->fflags;
7326
7327 /*
7328 * Since we keep track of events that are already
7329 * delivered, if any of those events are not requested
7330 * anymore the state related to them can be reset
7331 */
7332 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7333
7334 /* determine if we have events to deliver */
7335 ret = filt_sockev_common(kn, NULL, so, 0);
7336
7337 socket_unlock(so, 1);
7338
7339 return ret;
7340 }
7341
7342 /*
7343 * filt_sockprocess - query event fired state and return data
7344 */
7345 static int
7346 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7347 {
7348 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7349 int ret = 0;
7350
7351 socket_lock(so, 1);
7352
7353 ret = filt_sockev_common(kn, kev, so, 0);
7354
7355 socket_unlock(so, 1);
7356
7357 return ret;
7358 }
7359
7360 void
7361 get_sockev_state(struct socket *so, u_int32_t *statep)
7362 {
7363 u_int32_t state = *(statep);
7364
7365 /*
7366 * If the state variable already holds a value from a previous
7367 * event, leave it untouched.
7368 */
7369 if (state != 0) {
7370 return;
7371 }
7372
7373 if (so->so_state & SS_ISCONNECTED) {
7374 state |= SOCKEV_CONNECTED;
7375 } else {
7376 state &= ~(SOCKEV_CONNECTED);
7377 }
7378 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7379 *(statep) = state;
7380 }
7381
7382 #define SO_LOCK_HISTORY_STR_LEN \
7383 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7384
7385 __private_extern__ const char *
7386 solockhistory_nr(struct socket *so)
7387 {
7388 size_t n = 0;
7389 int i;
7390 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7391
7392 bzero(lock_history_str, sizeof(lock_history_str));
7393 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7394 n += scnprintf(lock_history_str + n,
7395 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7396 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7397 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7398 }
7399 return lock_history_str;
7400 }
7401
7402 lck_mtx_t *
7403 socket_getlock(struct socket *so, int flags)
7404 {
7405 if (so->so_proto->pr_getlock != NULL) {
7406 return (*so->so_proto->pr_getlock)(so, flags);
7407 } else {
7408 return so->so_proto->pr_domain->dom_mtx;
7409 }
7410 }
7411
7412 void
7413 socket_lock(struct socket *so, int refcount)
7414 {
7415 void *lr_saved;
7416
7417 lr_saved = __builtin_return_address(0);
7418
7419 if (so->so_proto->pr_lock) {
7420 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7421 } else {
7422 #ifdef MORE_LOCKING_DEBUG
7423 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7424 LCK_MTX_ASSERT_NOTOWNED);
7425 #endif
7426 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7427 if (refcount) {
7428 so->so_usecount++;
7429 }
7430 so->lock_lr[so->next_lock_lr] = lr_saved;
7431 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7432 }
7433 }
7434
7435 void
7436 socket_lock_assert_owned(struct socket *so)
7437 {
7438 lck_mtx_t *mutex_held;
7439
7440 if (so->so_proto->pr_getlock != NULL) {
7441 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7442 } else {
7443 mutex_held = so->so_proto->pr_domain->dom_mtx;
7444 }
7445
7446 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7447 }
7448
7449 int
7450 socket_try_lock(struct socket *so)
7451 {
7452 lck_mtx_t *mtx;
7453
7454 if (so->so_proto->pr_getlock != NULL) {
7455 mtx = (*so->so_proto->pr_getlock)(so, 0);
7456 } else {
7457 mtx = so->so_proto->pr_domain->dom_mtx;
7458 }
7459
7460 return lck_mtx_try_lock(mtx);
7461 }
7462
7463 void
7464 socket_unlock(struct socket *so, int refcount)
7465 {
7466 void *lr_saved;
7467 lck_mtx_t *mutex_held;
7468
7469 lr_saved = __builtin_return_address(0);
7470
7471 if (so == NULL || so->so_proto == NULL) {
7472 panic("%s: null so_proto so=%p", __func__, so);
7473 /* NOTREACHED */
7474 }
7475
7476 if (so->so_proto->pr_unlock) {
7477 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7478 } else {
7479 mutex_held = so->so_proto->pr_domain->dom_mtx;
7480 #ifdef MORE_LOCKING_DEBUG
7481 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7482 #endif
7483 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7484 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7485
7486 if (refcount) {
7487 if (so->so_usecount <= 0) {
7488 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7489 "lrh=%s", __func__, so->so_usecount, so,
7490 SOCK_DOM(so), so->so_type,
7491 SOCK_PROTO(so), solockhistory_nr(so));
7492 /* NOTREACHED */
7493 }
7494
7495 so->so_usecount--;
7496 if (so->so_usecount == 0) {
7497 sofreelastref(so, 1);
7498 }
7499 }
7500 lck_mtx_unlock(mutex_held);
7501 }
7502 }
7503
7504 /* Called with socket locked, will unlock socket */
7505 void
7506 sofree(struct socket *so)
7507 {
7508 lck_mtx_t *mutex_held;
7509
7510 if (so->so_proto->pr_getlock != NULL) {
7511 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7512 } else {
7513 mutex_held = so->so_proto->pr_domain->dom_mtx;
7514 }
7515 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7516
7517 sofreelastref(so, 0);
7518 }
7519
7520 void
7521 soreference(struct socket *so)
7522 {
7523 socket_lock(so, 1); /* lock and take one reference on the socket */
7524 socket_unlock(so, 0); /* unlock only */
7525 }
7526
7527 void
7528 sodereference(struct socket *so)
7529 {
7530 socket_lock(so, 0);
7531 socket_unlock(so, 1);
7532 }
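/*
 * Hedged sketch of the use-count idiom built on the two wrappers above:
 * take a reference before dropping the lock so the socket cannot be
 * freed underneath the caller.
 *
 *    soreference(so);      // lock, so_usecount++, unlock
 *    // ... work that must not hold the socket lock ...
 *    sodereference(so);    // lock, so_usecount--, may free on last ref
 */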
7533
7534 /*
7535 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7536 * possibility of using jumbo clusters. The caller must hold
7537 * the socket lock.
7538 */
7539 void
7540 somultipages(struct socket *so, boolean_t set)
7541 {
7542 if (set) {
7543 so->so_flags |= SOF_MULTIPAGES;
7544 } else {
7545 so->so_flags &= ~SOF_MULTIPAGES;
7546 }
7547 }
7548
7549 void
7550 soif2kcl(struct socket *so, boolean_t set)
7551 {
7552 if (set) {
7553 so->so_flags1 |= SOF1_IF_2KCL;
7554 } else {
7555 so->so_flags1 &= ~SOF1_IF_2KCL;
7556 }
7557 }
7558
7559 int
7560 so_isdstlocal(struct socket *so)
7561 {
7562 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7563
7564 if (SOCK_DOM(so) == PF_INET) {
7565 return inaddr_local(inp->inp_faddr);
7566 } else if (SOCK_DOM(so) == PF_INET6) {
7567 return in6addr_local(&inp->in6p_faddr);
7568 }
7569
7570 return 0;
7571 }
7572
7573 int
7574 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7575 {
7576 struct sockbuf *rcv, *snd;
7577 int err = 0, defunct;
7578
7579 rcv = &so->so_rcv;
7580 snd = &so->so_snd;
7581
7582 defunct = (so->so_flags & SOF_DEFUNCT);
7583 if (defunct) {
7584 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7585 panic("%s: SB_DROP not set", __func__);
7586 /* NOTREACHED */
7587 }
7588 goto done;
7589 }
7590
7591 if (so->so_flags & SOF_NODEFUNCT) {
7592 if (noforce) {
7593 err = EOPNOTSUPP;
7594 if (p != PROC_NULL) {
7595 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7596 "name %s level %d) so 0x%llu [%d,%d] "
7597 "is not eligible for defunct "
7598 "(%d)\n", __func__, proc_selfpid(),
7599 proc_best_name(current_proc()), proc_pid(p),
7600 proc_best_name(p), level,
7601 so->so_gencnt,
7602 SOCK_DOM(so), SOCK_TYPE(so), err);
7603 }
7604 return err;
7605 }
7606 so->so_flags &= ~SOF_NODEFUNCT;
7607 if (p != PROC_NULL) {
7608 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7609 "name %s level %d) so 0x%llu [%d,%d] "
7610 "defunct by force "
7611 "(%d)\n", __func__, proc_selfpid(),
7612 proc_best_name(current_proc()), proc_pid(p),
7613 proc_best_name(p), level,
7614 so->so_gencnt,
7615 SOCK_DOM(so), SOCK_TYPE(so), err);
7616 }
7617 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7618 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7619 struct ifnet *ifp = inp->inp_last_outifp;
7620
7621 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7622 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7623 } else if (so->so_flags & SOF_DELEGATED) {
7624 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7625 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7626 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7627 } else if (noforce && p != PROC_NULL) {
7628 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7629
7630 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7631 so->so_extended_bk_start = net_uptime();
7632 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7633
7634 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7635
7636 err = EOPNOTSUPP;
7637 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7638 "name %s level %d) so 0x%llu [%d,%d] "
7639 "extend bk idle "
7640 "(%d)\n", __func__, proc_selfpid(),
7641 proc_best_name(current_proc()), proc_pid(p),
7642 proc_best_name(p), level,
7643 so->so_gencnt,
7644 SOCK_DOM(so), SOCK_TYPE(so), err);
7645 return err;
7646 } else {
7647 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7648 }
7649 }
7650
7651 so->so_flags |= SOF_DEFUNCT;
7652
7653 /* Prevent further data from being appended to the socket buffers */
7654 snd->sb_flags |= SB_DROP;
7655 rcv->sb_flags |= SB_DROP;
7656
7657 /* Flush any existing data in the socket buffers */
7658 if (rcv->sb_cc != 0) {
7659 rcv->sb_flags &= ~SB_SEL;
7660 selthreadclear(&rcv->sb_sel);
7661 sbrelease(rcv);
7662 }
7663 if (snd->sb_cc != 0) {
7664 snd->sb_flags &= ~SB_SEL;
7665 selthreadclear(&snd->sb_sel);
7666 sbrelease(snd);
7667 }
7668
7669 done:
7670 if (p != PROC_NULL) {
7671 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7672 "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
7673 proc_selfpid(), proc_best_name(current_proc()),
7674 proc_pid(p), proc_best_name(p), level,
7675 so->so_gencnt, SOCK_DOM(so),
7676 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7677 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7678 " extbkidle" : "");
7679 }
7680 return err;
7681 }
7682
7683 int
7684 sodefunct(struct proc *p, struct socket *so, int level)
7685 {
7686 struct sockbuf *rcv, *snd;
7687
7688 if (!(so->so_flags & SOF_DEFUNCT)) {
7689 panic("%s improperly called", __func__);
7690 /* NOTREACHED */
7691 }
7692 if (so->so_state & SS_DEFUNCT) {
7693 goto done;
7694 }
7695
7696 rcv = &so->so_rcv;
7697 snd = &so->so_snd;
7698
7699 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7700 char s[MAX_IPv6_STR_LEN];
7701 char d[MAX_IPv6_STR_LEN];
7702 struct inpcb *inp = sotoinpcb(so);
7703
7704 if (p != PROC_NULL) {
7705 SODEFUNCTLOG(
7706 "%s[%d, %s]: (target pid %d name %s level %d) "
7707 "so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
7708 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7709 " snd_fl 0x%x]\n", __func__,
7710 proc_selfpid(), proc_best_name(current_proc()),
7711 proc_pid(p), proc_best_name(p), level,
7712 so->so_gencnt,
7713 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7714 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7715 (void *)&inp->inp_laddr.s_addr :
7716 (void *)&inp->in6p_laddr),
7717 s, sizeof(s)), ntohs(inp->in6p_lport),
7718 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7719 (void *)&inp->inp_faddr.s_addr :
7720 (void *)&inp->in6p_faddr,
7721 d, sizeof(d)), ntohs(inp->in6p_fport),
7722 (uint32_t)rcv->sb_sel.si_flags,
7723 (uint32_t)snd->sb_sel.si_flags,
7724 rcv->sb_flags, snd->sb_flags);
7725 }
7726 } else if (p != PROC_NULL) {
7727 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7728 "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
7729 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7730 proc_selfpid(), proc_best_name(current_proc()),
7731 proc_pid(p), proc_best_name(p), level,
7732 so->so_gencnt,
7733 SOCK_DOM(so), SOCK_TYPE(so),
7734 (uint32_t)rcv->sb_sel.si_flags,
7735 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7736 snd->sb_flags);
7737 }
7738
7739 /*
7740 * First tell the protocol the flow is defunct
7741 */
7742 (void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);
7743
7744 /*
7745 * Unwedge threads blocked on sbwait() and sb_lock().
7746 */
7747 sbwakeup(rcv);
7748 sbwakeup(snd);
7749
7750 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7751 if (rcv->sb_flags & SB_LOCK) {
7752 sbunlock(rcv, TRUE); /* keep socket locked */
7753 }
7754 if (snd->sb_flags & SB_LOCK) {
7755 sbunlock(snd, TRUE); /* keep socket locked */
7756 }
7757 /*
7758 * Flush the buffers and disconnect. We explicitly call shutdown
7759 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7760 * states are set for the socket. This would also flush out data
7761 * hanging off the receive list of this socket.
7762 */
7763 (void) soshutdownlock_final(so, SHUT_RD);
7764 (void) soshutdownlock_final(so, SHUT_WR);
7765 (void) sodisconnectlocked(so);
7766
7767 /*
7768 * Explicitly handle connectionless-protocol disconnection
7769 * and release any remaining data in the socket buffers.
7770 */
7771 if (!(so->so_state & SS_ISDISCONNECTED)) {
7772 (void) soisdisconnected(so);
7773 }
7774
7775 if (so->so_error == 0) {
7776 so->so_error = EBADF;
7777 }
7778
7779 if (rcv->sb_cc != 0) {
7780 rcv->sb_flags &= ~SB_SEL;
7781 selthreadclear(&rcv->sb_sel);
7782 sbrelease(rcv);
7783 }
7784 if (snd->sb_cc != 0) {
7785 snd->sb_flags &= ~SB_SEL;
7786 selthreadclear(&snd->sb_sel);
7787 sbrelease(snd);
7788 }
7789 so->so_state |= SS_DEFUNCT;
7790 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7791
7792 done:
7793 return 0;
7794 }
7795
7796 int
7797 soresume(struct proc *p, struct socket *so, int locked)
7798 {
7799 if (locked == 0) {
7800 socket_lock(so, 1);
7801 }
7802
7803 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7804 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
7805 "[%d,%d] resumed from bk idle\n",
7806 __func__, proc_selfpid(), proc_best_name(current_proc()),
7807 proc_pid(p), proc_best_name(p),
7808 so->so_gencnt,
7809 SOCK_DOM(so), SOCK_TYPE(so));
7810
7811 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7812 so->so_extended_bk_start = 0;
7813 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7814
7815 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7816 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7817 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7818 }
7819 if (locked == 0) {
7820 socket_unlock(so, 1);
7821 }
7822
7823 return 0;
7824 }
7825
7826 /*
7827 * Does not attempt to account for sockets that are delegated from
7828 * the current process
7829 */
7830 int
7831 so_set_extended_bk_idle(struct socket *so, int optval)
7832 {
7833 int error = 0;
7834
7835 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7836 SOCK_PROTO(so) != IPPROTO_TCP) {
7837 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7838 error = EOPNOTSUPP;
7839 } else if (optval == 0) {
7840 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7841
7842 soresume(current_proc(), so, 1);
7843 } else {
7844 struct proc *p = current_proc();
7845 struct fileproc *fp;
7846 int count = 0;
7847
7848 /*
7849 * Unlock socket to avoid lock ordering issue with
7850 * the proc fd table lock
7851 */
7852 socket_unlock(so, 0);
7853
7854 proc_fdlock(p);
7855 fdt_foreach(fp, p) {
7856 struct socket *so2;
7857
7858 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7859 continue;
7860 }
7861
7862 so2 = (struct socket *)fp_get_data(fp);
7863 if (so != so2 &&
7864 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7865 count++;
7866 }
7867 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7868 break;
7869 }
7870 }
7871 proc_fdunlock(p);
7872
7873 socket_lock(so, 0);
7874
7875 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7876 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7877 error = EBUSY;
7878 } else if (so->so_flags & SOF_DELEGATED) {
7879 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7880 error = EBUSY;
7881 } else {
7882 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7883 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7884 }
7885 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7886 "%s marked for extended bk idle\n",
7887 __func__, proc_selfpid(), proc_best_name(current_proc()),
7888 so->so_gencnt,
7889 SOCK_DOM(so), SOCK_TYPE(so),
7890 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7891 "is" : "not");
7892 }
7893
7894 return error;
7895 }
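/*
 * Illustrative sketch (SO_EXTENDED_BK_IDLE is private SPI): a process
 * opts a TCP socket into extended background idle time, subject to the
 * per-process cap enforced above.
 *
 *    int one = 1;
 *    setsockopt(s, SOL_SOCKET, SO_EXTENDED_BK_IDLE, &one, sizeof(one));
 */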
7896
7897 static void
7898 so_stop_extended_bk_idle(struct socket *so)
7899 {
7900 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7901 so->so_extended_bk_start = 0;
7902
7903 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7904 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7905 /*
7906 * Force defunct
7907 */
7908 sosetdefunct(current_proc(), so,
7909 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7910 if (so->so_flags & SOF_DEFUNCT) {
7911 sodefunct(current_proc(), so,
7912 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7913 }
7914 }
7915
7916 void
7917 so_drain_extended_bk_idle(struct socket *so)
7918 {
7919 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7920 /*
7921 * Only penalize sockets that have outstanding data
7922 */
7923 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7924 so_stop_extended_bk_idle(so);
7925
7926 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7927 }
7928 }
7929 }
7930
7931 /*
7932 * Return value tells whether the socket is still in extended background idle mode
7933 */
7934 int
7935 so_check_extended_bk_idle_time(struct socket *so)
7936 {
7937 int ret = 1;
7938
7939 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7940 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7941 __func__, proc_selfpid(), proc_best_name(current_proc()),
7942 so->so_gencnt,
7943 SOCK_DOM(so), SOCK_TYPE(so));
7944 if (net_uptime() - so->so_extended_bk_start >
7945 soextbkidlestat.so_xbkidle_time) {
7946 so_stop_extended_bk_idle(so);
7947
7948 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7949
7950 ret = 0;
7951 } else {
7952 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7953
7954 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7955 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7956 }
7957 }
7958
7959 return ret;
7960 }
7961
7962 void
7963 resume_proc_sockets(proc_t p)
7964 {
7965 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7966 struct fileproc *fp;
7967 struct socket *so;
7968
7969 proc_fdlock(p);
7970 fdt_foreach(fp, p) {
7971 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7972 continue;
7973 }
7974
7975 so = (struct socket *)fp_get_data(fp);
7976 (void) soresume(p, so, 0);
7977 }
7978 proc_fdunlock(p);
7979
7980 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7981 }
7982 }
7983
7984 __private_extern__ int
7985 so_set_recv_anyif(struct socket *so, int optval)
7986 {
7987 int ret = 0;
7988
7989 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7990 if (optval) {
7991 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7992 } else {
7993 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7994 }
7995 #if SKYWALK
7996 inp_update_netns_flags(so);
7997 #endif /* SKYWALK */
7998 }
7999
8000
8001 return ret;
8002 }
8003
8004 __private_extern__ int
8005 so_get_recv_anyif(struct socket *so)
8006 {
8007 int ret = 0;
8008
8009 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
8010 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
8011 }
8012
8013 return ret;
8014 }
8015
8016 int
8017 so_set_restrictions(struct socket *so, uint32_t vals)
8018 {
8019 int nocell_old, nocell_new;
8020 int noexpensive_old, noexpensive_new;
8021 int noconstrained_old, noconstrained_new;
8022
	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket. This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 * (An illustrative userspace sketch follows this function.)
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
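
/*
 * Illustrative sketch (an assumption, not part of the original source):
 * so_set_restrictions() is reached through the private SO_RESTRICTIONS
 * socket option. Because the deny bits are trapdoors, a later call
 * attempting to clear them has no effect:
 *
 *	#include <sys/socket.h>
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *
 *	uint32_t clear = 0;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &clear, sizeof(clear));
 *
 * The deny bits set by the first call remain in effect for the lifetime
 * of the socket; the second call does not clear them.
 */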

uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed. Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared. Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc. Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid,
				    sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif /* XNU_TARGET_OS_OSX */
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
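
/*
 * Illustrative sketch (an assumption, not part of the original source):
 * so_set_effective_pid() backs the private SO_DELEGATED socket option,
 * by which an entitled framework marks a socket as created on behalf
 * of another process:
 *
 *	#include <sys/socket.h>
 *
 *	pid_t epid = delegated_app_pid;   // hypothetical pid variable
 *	(void) setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 */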

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed. Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared. Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself. Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known. Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following clears the effective process name, since it is
	 * the same as that of the real process.
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
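
/*
 * Illustrative sketch (an assumption, not part of the original source):
 * so_set_effective_uuid() is the UUID-based counterpart, reached via
 * the private SO_DELEGATED_UUID socket option when the delegate's pid
 * is not known to the caller:
 *
 *	#include <sys/socket.h>
 *	#include <uuid/uuid.h>
 *
 *	uuid_t euuid;                     // hypothetical delegate UUID
 *	uuid_copy(euuid, delegated_app_uuid);
 *	(void) setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    euuid, sizeof(euuid));
 */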

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
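
/*
 * Illustrative in-kernel caller sketch (an assumption based on the
 * comment above, not part of the original source): an event such as
 * KEV_NETPOLICY_IFDENIED embeds a netpolicy_event_data as its first
 * member, so the caller passes the embedded member while sizing the
 * full enclosing structure:
 *
 *	struct kev_netpolicy_ifdenied ev_ifdenied = {};
 *
 *	// ... fill in ev_ifdenied.ev_data and the event-specific tail ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
 *	    sizeof(ev_ifdenied));
 */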

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}

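/*
 * Illustrative sketch (an assumption, not part of the original source):
 * a client that wants the KEV_SOCKET_CLOSED event opts in per socket,
 * e.g. via a private SO_WANT_KEV_SOCKET_CLOSED option that sets
 * SOF1_WANT_KEV_SOCK_CLOSED, and then listens on a kernel-event socket.
 * The option name is an assumption inferred from the flag name above;
 * the kernel-event plumbing follows <sys/kern_event.h>.
 *
 *	#include <sys/socket.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/kern_event.h>
 *
 *	int on = 1;
 *	(void) setsockopt(s, SOL_SOCKET, SO_WANT_KEV_SOCKET_CLOSED,
 *	    &on, sizeof(on));
 *
 *	int kev = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code = KEV_VENDOR_APPLE,
 *		.kev_class = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	(void) ioctl(kev, SIOCSKEVFILT, &req);
 */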