1 /*
2 * Copyright (c) 1998-2022, 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <net/sockaddr_utils.h>
106 #include <netinet/in.h>
107 #include <netinet/in_pcb.h>
108 #include <netinet/in_tclass.h>
109 #include <netinet/in_var.h>
110 #include <netinet/tcp_var.h>
111 #include <netinet/ip6.h>
112 #include <netinet6/ip6_var.h>
113 #include <netinet/flow_divert.h>
114 #include <kern/zalloc.h>
115 #include <kern/locks.h>
116 #include <machine/limits.h>
117 #include <libkern/OSAtomic.h>
118 #include <pexpert/pexpert.h>
119 #include <kern/assert.h>
120 #include <kern/task.h>
121 #include <kern/policy_internal.h>
122
123 #include <sys/kpi_mbuf.h>
124 #include <sys/mcache.h>
125 #include <sys/unpcb.h>
126 #include <libkern/section_keywords.h>
127
128 #include <os/log.h>
129
130 #if CONFIG_MACF
131 #include <security/mac_framework.h>
132 #endif /* MAC */
133
134 #if MULTIPATH
135 #include <netinet/mp_pcb.h>
136 #include <netinet/mptcp_var.h>
137 #endif /* MULTIPATH */
138
/* Round `a' up to the next multiple of `b'; `b' must be a power of two. */
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

/*
 * On DEBUG/DEVELOPMENT kernels, log kernel pointers verbatim; on RELEASE
 * kernels, obfuscate them with VM_KERNEL_ADDRPERM() before they leave
 * the kernel (e.g. via logs or sysctl).
 */
#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);
149
150 static u_int32_t so_cache_hw; /* High water mark for socache */
151 static u_int32_t so_cache_timeouts; /* number of timeouts */
152 static u_int32_t so_cache_max_freed; /* max freed per timeout */
153 static u_int32_t cached_sock_count = 0;
154 STAILQ_HEAD(, socket) so_cache_head;
155 int max_cached_sock_count = MAX_CACHED_SOCKETS;
156 static uint64_t so_cache_time;
157 static int socketinit_done;
158 static struct zone *so_cache_zone;
159 ZONE_DECLARE(so_cache_zone, struct zone *);
160
161 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
162 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
163
#include <machine/limits.h> /* NOTE(review): also included above (harmless, header is guarded) */

/* Knote filter routines for socket read events (soread_filtops/soexcept_filtops). */
static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Knote filter routines for socket write events (sowrite_filtops). */
static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Knote filter routines for generic socket events (sock_filtops). */
static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

/* Copy a timeval-valued socket option in from / out to user space. */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
186
/* Filter operations vector for socket read knotes. */
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

/* Filter operations vector for socket write knotes. */
SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

/* Filter operations vector for generic socket-event knotes. */
SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/*
 * Exceptional-condition knotes share the read-filter routines; the
 * attach routine distinguishes the two via the knote's filter id.
 * NOTE(review): the shared-routine behavior is what this table shows;
 * the discrimination detail lives in filt_sorattach — confirm there.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
222
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

/* Verbose socket-layer debugging; also settable via the "socket_debug" boot-arg. */
int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

/* Also settable via the "sosend_assert_panic" boot-arg (see socketinit). */
int sosend_assert_panic = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

/* Zone for all non-cached sockets (see soalloc). */
ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;             /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* kdebug trace codes for the socket-layer entry points below. */
#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

/* Listen backlog cap (kern.ipc.somaxconn). */
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger that 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

/* Verbose logging toggles for defunct and throttling paths. */
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
/* NOTE(review): "notsnet" in the description string looks like a typo for "notsent". */
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
#endif /* DEBUG || DEVELOPMENT */

/* Counter always maintained; only exported as a sysctl on DEBUG/DEVELOPMENT. */
int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */
342
extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

/* Element size of so_cache_zone; computed at runtime in socketinit(). */
vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

/* Statistics and limits for extended background idle sockets (zeroed in socketinit). */
struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/* presumably caps the per-call message count for the *_x syscalls — confirm at usage */
#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
394
/*
 * One-time initialization of the socket layer: sanity-check structure
 * layouts, parse debug boot-args, set up the cached-socket list and its
 * backing zone, seed the extended-background-idle defaults, and
 * initialize the inpcb module.  Idempotent: repeated calls log a
 * message and return.
 */
void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	/* sa_endpoints must be layout-compatible with its user64 counterpart */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	/* sa_endpoints must be layout-compatible with its user32 counterpart */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	STAILQ_INIT(&so_cache_head);

	/*
	 * Cached socket elements carry the socket plus two saved pcb areas
	 * in one allocation (see cached_sock_alloc); the extra 4 bytes per
	 * structure leave room for alignment padding.
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
444
/*
 * Allocate a socket for the cached (PF_INET/SOCK_STREAM) path: reuse an
 * entry from so_cache_head if available, otherwise carve a fresh element
 * out of so_cache_zone.  Each element contains the socket followed by
 * space for a saved inpcb (so_saved_pcb) and a further saved protocol
 * block (inp_saved_ppcb), each aligned via ALIGN().
 */
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		/* Zero the reused socket but preserve its saved pcb pointer. */
		temp = (*so)->so_saved_pcb;
		bzero(*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		uint8_t *so_mem = zalloc_flags_buf(so_cache_zone, how | Z_ZERO);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
		*so = (struct socket *)so_mem;

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)so_mem;
		offset += sizeof(struct socket);
		offset = ALIGN(offset);
		struct inpcb *pcb = (struct inpcb *)(so_mem + (offset - (uintptr_t)so_mem));
#pragma clang diagnostic pop
		(*so)->so_saved_pcb = (caddr_t)pcb;

		offset += get_inpcb_str_size();
		offset = ALIGN(offset);
		pcb->inp_saved_ppcb = (caddr_t)(so_mem + (offset - (uintptr_t)so_mem));
	}

	/* Mark the socket so sodealloc() returns it to the cache, not the zone. */
	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
495
496 static void
cached_sock_free(struct socket * so)497 cached_sock_free(struct socket *so)
498 {
499 lck_mtx_lock(&so_cache_mtx);
500
501 so_cache_time = net_uptime();
502 if (++cached_sock_count > max_cached_sock_count) {
503 --cached_sock_count;
504 lck_mtx_unlock(&so_cache_mtx);
505 zfree(so_cache_zone, so);
506 } else {
507 if (so_cache_hw < cached_sock_count) {
508 so_cache_hw = cached_sock_count;
509 }
510
511 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
512
513 so->cache_timestamp = so_cache_time;
514 lck_mtx_unlock(&so_cache_mtx);
515 }
516 }
517
518 void
so_update_last_owner_locked(struct socket * so,proc_t self)519 so_update_last_owner_locked(struct socket *so, proc_t self)
520 {
521 if (so->last_pid != 0) {
522 /*
523 * last_pid and last_upid should remain zero for sockets
524 * created using sock_socket. The check above achieves that
525 */
526 if (self == PROC_NULL) {
527 self = current_proc();
528 }
529
530 if (so->last_upid != proc_uniqueid(self) ||
531 so->last_pid != proc_pid(self)) {
532 so->last_upid = proc_uniqueid(self);
533 so->last_pid = proc_pid(self);
534 proc_getexecutableuuid(self, so->last_uuid,
535 sizeof(so->last_uuid));
536 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
537 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
538 }
539 }
540 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
541 }
542 }
543
544 void
so_update_policy(struct socket * so)545 so_update_policy(struct socket *so)
546 {
547 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
548 (void) inp_update_policy(sotoinpcb(so));
549 }
550 }
551
#if NECP
/*
 * Re-evaluate the NECP policy for an Internet-domain socket, optionally
 * overriding the local and/or remote address used for matching.
 * Non-Internet sockets are ignored.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
563
/*
 * Periodic reaper for the socket cache: free cached sockets whose
 * timestamps are older than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation so the mutex is not held too
 * long.  Returns TRUE when entries remain and the caller should
 * reschedule the timer.
 */
boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		/* Entries are appended at the tail, so the head is the oldest. */
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		/* Bound the amount of work done under so_cache_mtx. */
		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}
602
603 /*
604 * Get a socket structure from our zone, and initialize it.
605 * We don't implement `waitok' yet (see comments in uipc_domain.c).
606 * Note that it would probably be better to allocate socket
607 * and PCB at the same time, but I'm not convinced that all
608 * the protocols can be easily modified to do this.
609 */
610 struct socket *
soalloc(int waitok,int dom,int type)611 soalloc(int waitok, int dom, int type)
612 {
613 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
614 struct socket *__single so;
615
616 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
617 cached_sock_alloc(&so, how);
618 } else {
619 so = zalloc_flags(socket_zone, how | Z_ZERO);
620 }
621 if (so != NULL) {
622 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
623
624 /*
625 * Increment the socket allocation statistics
626 */
627 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
628 }
629
630 return so;
631 }
632
/*
 * Common backend for socreate() and socreate_delegate().
 *
 * Looks up the protocol switch entry for (dom, proto, type), allocates a
 * socket, records the creating process identity (and, when `ep' differs
 * from `p', the delegated/effective identity as well), attaches the
 * protocol control block and performs socket-filter and traffic-class
 * setup.
 *
 * Returns 0 with *aso pointing to the new socket, or an errno
 * (EAFNOSUPPORT, EPROTOTYPE, EPROTONOSUPPORT, ENOBUFS, or a <pru_attach>
 * error) with *aso left NULL.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;

	VERIFY(aso != NULL);
	*aso = NULL;

	/* An explicit protocol wins over a lookup by type alone. */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/* Distinguish unknown domain / wrong type / unknown protocol. */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain socket creation statistics. */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	/* Record the creating process as the socket's last owner. */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	/* Delegated socket: also record the effective process identity. */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	/* No responsible pid from the delegate: fall back to the creator's. */
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
838
/*
 * Returns:	0			Success
 *	EAFNOSUPPORT
 *	EPROTOTYPE
 *	EPROTONOSUPPORT
 *	ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
/* Create a non-delegated socket on behalf of the current process. */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}
856
857 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)858 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
859 {
860 int error = 0;
861 struct proc *ep = PROC_NULL;
862
863 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
864 error = ESRCH;
865 goto done;
866 }
867
868 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
869
870 /*
871 * It might not be wise to hold the proc reference when calling
872 * socreate_internal since it calls soalloc with M_WAITOK
873 */
874 done:
875 if (ep != PROC_NULL) {
876 proc_rele(ep);
877 }
878
879 return error;
880 }
881
882 /*
883 * Returns: 0 Success
884 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
885 * <pru_bind>:EAFNOSUPPORT Address family not supported
886 * <pru_bind>:EADDRNOTAVAIL Address not available.
887 * <pru_bind>:EINVAL Invalid argument
888 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
889 * <pru_bind>:EACCES Permission denied
890 * <pru_bind>:EADDRINUSE Address in use
891 * <pru_bind>:EAGAIN Resource unavailable, try again
892 * <pru_bind>:EPERM Operation not permitted
893 * <pru_bind>:???
894 * <sf_bind>:???
895 *
896 * Notes: It's not possible to fully enumerate the return codes above,
897 * since socket filter authors and protocol family authors may
898 * not choose to limit their error returns to those listed, even
899 * though this may result in some software operating incorrectly.
900 *
901 * The error codes which are enumerated above are those known to
902 * be returned by the tcp_usr_bind function supplied.
903 */
/*
 * Bind the socket to the given address.  When 'dolock' is set the
 * socket lock is taken for the duration of the call.  Bind socket
 * filters run before the protocol's pru_bind; a filter returning
 * EJUSTRETURN suppresses the protocol call and reports success.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy bookkeeping while locked */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means "handled"; report success */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
951
952 void
sodealloc(struct socket * so)953 sodealloc(struct socket *so)
954 {
955 kauth_cred_unref(&so->so_cred);
956
957 /* Remove any filters */
958 sflt_termsock(so);
959
960 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
961
962 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
963 cached_sock_free(so);
964 } else {
965 zfree(socket_zone, so);
966 }
967 }
968
969 /*
970 * Returns: 0 Success
971 * EINVAL
972 * EOPNOTSUPP
973 * <pru_listen>:EINVAL[AF_UNIX]
974 * <pru_listen>:EINVAL[TCP]
975 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
976 * <pru_listen>:EINVAL[TCP] Invalid argument
977 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
978 * <pru_listen>:EACCES[TCP] Permission denied
979 * <pru_listen>:EADDRINUSE[TCP] Address in use
980 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
981 * <pru_listen>:EPERM[TCP] Operation not permitted
982 * <sf_listen>:???
983 *
984 * Notes: Other <pru_listen> returns depend on the protocol family; all
985 * <sf_listen> returns depend on what the filter author causes
986 * their filter to return.
987 */
988 int
solisten(struct socket * so,int backlog)989 solisten(struct socket *so, int backlog)
990 {
991 struct proc *p = current_proc();
992 int error = 0;
993
994 socket_lock(so, 1);
995
996 so_update_last_owner_locked(so, p);
997 so_update_policy(so);
998
999 if (TAILQ_EMPTY(&so->so_comp)) {
1000 so->so_options |= SO_ACCEPTCONN;
1001 }
1002
1003 #if NECP
1004 so_update_necp_policy(so, NULL, NULL);
1005 #endif /* NECP */
1006
1007 if (so->so_proto == NULL) {
1008 error = EINVAL;
1009 so->so_options &= ~SO_ACCEPTCONN;
1010 goto out;
1011 }
1012 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1013 error = EOPNOTSUPP;
1014 so->so_options &= ~SO_ACCEPTCONN;
1015 goto out;
1016 }
1017
1018 /*
1019 * If the listen request is made on a socket that is not fully
1020 * disconnected, or on a socket that has been marked as inactive,
1021 * reject the request now.
1022 */
1023 if ((so->so_state &
1024 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1025 (so->so_flags & SOF_DEFUNCT)) {
1026 error = EINVAL;
1027 if (so->so_flags & SOF_DEFUNCT) {
1028 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1029 "(%d)\n", __func__, proc_pid(p),
1030 proc_best_name(p),
1031 so->so_gencnt,
1032 SOCK_DOM(so), SOCK_TYPE(so), error);
1033 }
1034 so->so_options &= ~SO_ACCEPTCONN;
1035 goto out;
1036 }
1037
1038 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1039 error = EPERM;
1040 so->so_options &= ~SO_ACCEPTCONN;
1041 goto out;
1042 }
1043
1044 error = sflt_listen(so);
1045 if (error == 0) {
1046 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1047 }
1048
1049 if (error) {
1050 if (error == EJUSTRETURN) {
1051 error = 0;
1052 }
1053 so->so_options &= ~SO_ACCEPTCONN;
1054 goto out;
1055 }
1056
1057 /*
1058 * POSIX: The implementation may have an upper limit on the length of
1059 * the listen queue-either global or per accepting socket. If backlog
1060 * exceeds this limit, the length of the listen queue is set to the
1061 * limit.
1062 *
1063 * If listen() is called with a backlog argument value that is less
1064 * than 0, the function behaves as if it had been called with a backlog
1065 * argument value of 0.
1066 *
1067 * A backlog argument of 0 may allow the socket to accept connections,
1068 * in which case the length of the listen queue may be set to an
1069 * implementation-defined minimum value.
1070 */
1071 if (backlog <= 0 || backlog > somaxconn) {
1072 backlog = somaxconn;
1073 }
1074
1075 so->so_qlimit = (short)backlog;
1076 out:
1077 socket_unlock(so, 1);
1078 return error;
1079 }
1080
1081 /*
1082 * The "accept list lock" protects the fields related to the listener queues
1083 * because we can unlock a socket to respect the lock ordering between
1084 * the listener socket and its clients sockets. The lock ordering is first to
1085 * acquire the client socket before the listener socket.
1086 *
1087 * The accept list lock serializes access to the following fields:
1088 * - of the listener socket:
1089 * - so_comp
1090 * - so_incomp
1091 * - so_qlen
1092 * - so_inqlen
1093 * - of client sockets that are in so_comp or so_incomp:
1094 * - so_head
1095 * - so_list
1096 *
1097 * As one can see the accept list lock protects the consistent of the
1098 * linkage of the client sockets.
1099 *
1100 * Note that those fields may be read without holding the accept list lock
1101 * for a preflight provided the accept list lock is taken when committing
1102 * to take an action based on the result of the preflight. The preflight
1103 * saves the cost of doing the unlock/lock dance.
1104 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* Protocols with a shared domain lock need no extra serialization */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: the accept list is free, claim it and return */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/*
	 * Another thread holds the accept list: drop the client socket's
	 * lock so we do not hold it while sleeping on the listener.
	 */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/*
	 * Re-take both locks in the documented order (client socket
	 * before listener, see the block comment above this function).
	 */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1135
1136 void
so_release_accept_list(struct socket * head)1137 so_release_accept_list(struct socket *head)
1138 {
1139 if (head->so_proto->pr_getlock != NULL) {
1140 lck_mtx_t *mutex_held;
1141
1142 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1143 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1144
1145 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1146 wakeup((caddr_t)&head->so_incomp);
1147 }
1148 }
1149
/*
 * Release the last reference on a socket.  Detaches flow divert,
 * content filter and datagram flow-tracking state; if the socket is
 * not yet ready to be freed it is only quiesced, otherwise it is
 * unlinked from its listener's queues (if queued), its buffers are
 * flushed, and the memory is freed when 'dealloc' is set.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready to be freed: the PCB has not been cleared or a file
	 * descriptor still references the socket.  Just clear the
	 * select/upcall state and leave the socket alone.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1238
/*
 * Wait for pending socket upcalls to drain before close proceeds.
 * No-op unless there is an outstanding upcall and the socket opted in
 * via SOF_UPCALLCLOSEWAIT.  Caller must hold the socket mutex; it is
 * dropped and re-taken across msleep().
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Suppress new upcalls and flag that a closer is waiting */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1267
1268 /*
1269 * Close a socket on last file table reference removal.
1270 * Initiate disconnect if connected.
1271 * Free socket when disconnect complete.
1272 */
1273 int
soclose_locked(struct socket * so)1274 soclose_locked(struct socket *so)
1275 {
1276 int error = 0;
1277 struct timespec ts;
1278
1279 if (so->so_usecount == 0) {
1280 panic("soclose: so=%p refcount=0", so);
1281 /* NOTREACHED */
1282 }
1283
1284 sflt_notify(so, sock_evt_closing, NULL);
1285
1286 if (so->so_upcallusecount) {
1287 soclose_wait_locked(so);
1288 }
1289
1290 #if CONTENT_FILTER
1291 /*
1292 * We have to wait until the content filters are done
1293 */
1294 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1295 cfil_sock_close_wait(so);
1296 cfil_sock_is_closed(so);
1297 cfil_sock_detach(so);
1298 }
1299 #endif /* CONTENT_FILTER */
1300
1301 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1302 soflow_detach(so);
1303 }
1304
1305 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1306 soresume(current_proc(), so, 1);
1307 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1308 }
1309
1310 if ((so->so_options & SO_ACCEPTCONN)) {
1311 struct socket *sp, *sonext;
1312 int persocklock = 0;
1313 int incomp_overflow_only;
1314
1315 /*
1316 * We do not want new connection to be added
1317 * to the connection queues
1318 */
1319 so->so_options &= ~SO_ACCEPTCONN;
1320
1321 /*
1322 * We can drop the lock on the listener once
1323 * we've acquired the incoming list
1324 */
1325 if (so->so_proto->pr_getlock != NULL) {
1326 persocklock = 1;
1327 so_acquire_accept_list(so, NULL);
1328 socket_unlock(so, 0);
1329 }
1330 again:
1331 incomp_overflow_only = 1;
1332
1333 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1334 /*
1335 * Radar 5350314
1336 * skip sockets thrown away by tcpdropdropblreq
1337 * they will get cleanup by the garbage collection.
1338 * otherwise, remove the incomp socket from the queue
1339 * and let soabort trigger the appropriate cleanup.
1340 */
1341 if (sp->so_flags & SOF_OVERFLOW) {
1342 continue;
1343 }
1344
1345 if (persocklock != 0) {
1346 socket_lock(sp, 1);
1347 }
1348
1349 /*
1350 * Radar 27945981
1351 * The extra reference for the list insure the
1352 * validity of the socket pointer when we perform the
1353 * unlock of the head above
1354 */
1355 if (sp->so_state & SS_INCOMP) {
1356 sp->so_state &= ~SS_INCOMP;
1357 sp->so_head = NULL;
1358 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1359 so->so_incqlen--;
1360 so->so_qlen--;
1361
1362 (void) soabort(sp);
1363 } else {
1364 panic("%s sp %p in so_incomp but !SS_INCOMP",
1365 __func__, sp);
1366 }
1367
1368 if (persocklock != 0) {
1369 socket_unlock(sp, 1);
1370 }
1371 }
1372
1373 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1374 /* Dequeue from so_comp since sofree() won't do it */
1375 if (persocklock != 0) {
1376 socket_lock(sp, 1);
1377 }
1378
1379 if (sp->so_state & SS_COMP) {
1380 sp->so_state &= ~SS_COMP;
1381 sp->so_head = NULL;
1382 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1383 so->so_qlen--;
1384
1385 (void) soabort(sp);
1386 } else {
1387 panic("%s sp %p in so_comp but !SS_COMP",
1388 __func__, sp);
1389 }
1390
1391 if (persocklock) {
1392 socket_unlock(sp, 1);
1393 }
1394 }
1395
1396 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1397 #if (DEBUG | DEVELOPMENT)
1398 panic("%s head %p so_comp not empty", __func__, so);
1399 #endif /* (DEVELOPMENT || DEBUG) */
1400
1401 goto again;
1402 }
1403
1404 if (!TAILQ_EMPTY(&so->so_comp)) {
1405 #if (DEBUG | DEVELOPMENT)
1406 panic("%s head %p so_comp not empty", __func__, so);
1407 #endif /* (DEVELOPMENT || DEBUG) */
1408
1409 goto again;
1410 }
1411
1412 if (persocklock) {
1413 socket_lock(so, 0);
1414 so_release_accept_list(so);
1415 }
1416 }
1417 if (so->so_pcb == NULL) {
1418 /* 3915887: mark the socket as ready for dealloc */
1419 so->so_flags |= SOF_PCBCLEARING;
1420 goto discard;
1421 }
1422
1423 if (so->so_state & SS_ISCONNECTED) {
1424 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1425 error = sodisconnectlocked(so);
1426 if (error) {
1427 goto drop;
1428 }
1429 }
1430 if (so->so_options & SO_LINGER) {
1431 if ((so->so_state & SS_ISDISCONNECTING) &&
1432 (so->so_state & SS_NBIO)) {
1433 goto drop;
1434 }
1435 while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1436 lck_mtx_t *mutex_held;
1437
1438 if (so->so_proto->pr_getlock != NULL) {
1439 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1440 } else {
1441 mutex_held = so->so_proto->pr_domain->dom_mtx;
1442 }
1443 ts.tv_sec = (so->so_linger / 100);
1444 ts.tv_nsec = (so->so_linger % 100) *
1445 NSEC_PER_USEC * 1000 * 10;
1446 error = msleep((caddr_t)&so->so_timeo,
1447 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1448 if (error) {
1449 /*
1450 * It's OK when the time fires,
1451 * don't report an error
1452 */
1453 if (error == EWOULDBLOCK) {
1454 error = 0;
1455 }
1456 break;
1457 }
1458 }
1459 }
1460 }
1461 drop:
1462 if (so->so_usecount == 0) {
1463 panic("soclose: usecount is zero so=%p", so);
1464 /* NOTREACHED */
1465 }
1466 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1467 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1468 if (error == 0) {
1469 error = error2;
1470 }
1471 }
1472 if (so->so_usecount <= 0) {
1473 panic("soclose: usecount is zero so=%p", so);
1474 /* NOTREACHED */
1475 }
1476 discard:
1477 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1478 (so->so_state & SS_NOFDREF)) {
1479 panic("soclose: NOFDREF");
1480 /* NOTREACHED */
1481 }
1482 so->so_state |= SS_NOFDREF;
1483
1484 if ((so->so_flags & SOF_KNOTE) != 0) {
1485 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1486 }
1487
1488 os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1489
1490 VERIFY(so->so_usecount > 0);
1491 so->so_usecount--;
1492 sofree(so);
1493 return error;
1494 }
1495
1496 int
soclose(struct socket * so)1497 soclose(struct socket *so)
1498 {
1499 int error = 0;
1500 socket_lock(so, 1);
1501
1502 if (so->so_retaincnt == 0) {
1503 error = soclose_locked(so);
1504 } else {
1505 /*
1506 * if the FD is going away, but socket is
1507 * retained in kernel remove its reference
1508 */
1509 so->so_usecount--;
1510 if (so->so_usecount < 2) {
1511 panic("soclose: retaincnt non null and so=%p "
1512 "usecount=%d\n", so, so->so_usecount);
1513 }
1514 }
1515 socket_unlock(so, 1);
1516 return error;
1517 }
1518
1519 /*
1520 * Must be called at splnet...
1521 */
1522 /* Should already be locked */
1523 int
soabort(struct socket * so)1524 soabort(struct socket *so)
1525 {
1526 int error;
1527
1528 #ifdef MORE_LOCKING_DEBUG
1529 lck_mtx_t *mutex_held;
1530
1531 if (so->so_proto->pr_getlock != NULL) {
1532 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1533 } else {
1534 mutex_held = so->so_proto->pr_domain->dom_mtx;
1535 }
1536 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1537 #endif
1538
1539 if ((so->so_flags & SOF_ABORTED) == 0) {
1540 so->so_flags |= SOF_ABORTED;
1541 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1542 if (error) {
1543 sofree(so);
1544 return error;
1545 }
1546 }
1547 return 0;
1548 }
1549
1550 int
soacceptlock(struct socket * so,struct sockaddr ** nam,int dolock)1551 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1552 {
1553 int error;
1554
1555 if (dolock) {
1556 socket_lock(so, 1);
1557 }
1558
1559 so_update_last_owner_locked(so, PROC_NULL);
1560 so_update_policy(so);
1561 #if NECP
1562 so_update_necp_policy(so, NULL, NULL);
1563 #endif /* NECP */
1564
1565 if ((so->so_state & SS_NOFDREF) == 0) {
1566 panic("soaccept: !NOFDREF");
1567 }
1568 so->so_state &= ~SS_NOFDREF;
1569 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1570
1571 if (dolock) {
1572 socket_unlock(so, 1);
1573 }
1574 return error;
1575 }
1576
/* Convenience wrapper: accept with the socket lock taken */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}
1582
/*
 * Run the accept socket filters on a newly accepted socket 'so' from
 * listener 'head'.  On filter rejection the socket is closed and the
 * filter's error is propagated; EJUSTRETURN marks the socket defunct
 * but still hands it to the caller.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *__single local = NULL, *__single remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Clear SS_NOFDREF so soclose() fully tears the socket down */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1638
1639 /*
1640 * Returns: 0 Success
1641 * EOPNOTSUPP Operation not supported on socket
1642 * EISCONN Socket is connected
1643 * <pru_connect>:EADDRNOTAVAIL Address not available.
1644 * <pru_connect>:EINVAL Invalid argument
1645 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1646 * <pru_connect>:EACCES Permission denied
1647 * <pru_connect>:EADDRINUSE Address in use
1648 * <pru_connect>:EAGAIN Resource unavailable, try again
1649 * <pru_connect>:EPERM Operation not permitted
1650 * <sf_connect_out>:??? [anything a filter writer might set]
1651 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy bookkeeping while locked */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic is administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				necp_set_socket_domain_attributes(so,
				    __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: a filter took over the connect */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1748
/* Convenience wrapper: connect with the socket lock taken */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return soconnectlock(so, nam, 1);
}
1754
1755 /*
1756 * Returns: 0 Success
1757 * <pru_connect2>:EINVAL[AF_UNIX]
1758 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1759 * <pru_connect2>:??? [other protocol families]
1760 *
1761 * Notes: <pru_connect2> is not supported by [TCP].
1762 */
1763 int
soconnect2(struct socket * so1,struct socket * so2)1764 soconnect2(struct socket *so1, struct socket *so2)
1765 {
1766 int error;
1767
1768 socket_lock(so1, 1);
1769 if (so2->so_proto->pr_lock) {
1770 socket_lock(so2, 1);
1771 }
1772
1773 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1774
1775 socket_unlock(so1, 1);
1776 if (so2->so_proto->pr_lock) {
1777 socket_unlock(so2, 1);
1778 }
1779 return error;
1780 }
1781
/*
 * connectx(2) backend, called with the socket already locked.
 * Performs the same listener/defunct/restriction checks as
 * soconnectlock(), applies tracker marking for TCP destinations, sets
 * the TFO-related SOF1_* flags from 'flags', runs the connect filters,
 * and finally dispatches to the protocol's pru_connectx.
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic is administratively denied on this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set. Otherwise, if connected,
	 * try to disconnect first. This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
				    __unsafe_null_terminated_from_indexable(metadata.domain_owner));
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data pending */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1903
1904 int
sodisconnectlocked(struct socket * so)1905 sodisconnectlocked(struct socket *so)
1906 {
1907 int error;
1908
1909 if ((so->so_state & SS_ISCONNECTED) == 0) {
1910 error = ENOTCONN;
1911 goto bad;
1912 }
1913 if (so->so_state & SS_ISDISCONNECTING) {
1914 error = EALREADY;
1915 goto bad;
1916 }
1917
1918 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1919 if (error == 0) {
1920 sflt_notify(so, sock_evt_disconnected, NULL);
1921 }
1922
1923 bad:
1924 return error;
1925 }
1926
1927 /* Locking version */
int
sodisconnect(struct socket *so)
{
	int rc;

	socket_lock(so, 1);
	rc = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return rc;
}
1938
1939 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1940 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1941 {
1942 int error;
1943
1944 /*
1945 * Call the protocol disconnectx handler; let it handle all
1946 * matters related to the connection state of this session.
1947 */
1948 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1949 if (error == 0) {
1950 /*
1951 * The event applies only for the session, not for
1952 * the disconnection of individual subflows.
1953 */
1954 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1955 sflt_notify(so, sock_evt_disconnected, NULL);
1956 }
1957 }
1958 return error;
1959 }
1960
1961 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1962 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1963 {
1964 int error;
1965
1966 socket_lock(so, 1);
1967 error = sodisconnectxlocked(so, aid, cid);
1968 socket_unlock(so, 1);
1969 return error;
1970 }
1971
/* Sockbuf lock flags for sblock(): don't wait when MSG_DONTWAIT is set */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1973
1974 /*
1975 * sosendcheck will lock the socket buffer if it isn't locked and
1976 * verify that there is space for the data being inserted.
1977 *
1978 * Returns: 0 Success
1979 * EPIPE
1980 * sblock:EWOULDBLOCK
1981 * sblock:EINTR
1982 * sbwait:EBADF
1983 * sbwait:EINTR
1984 * [so_error]:???
1985 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int error = 0;
	int32_t space;
	/* set when we piggyback on a send-buffer lock held by a filter */
	int assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	/* Report and clear a pending asynchronous error, if any */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Connection-oriented protocol: allow only when
			 * confirming, sending pure control data, or when
			 * TFO-style preconnect data is permitted.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/*
			 * PR_CONNREQUIRED is known clear in this branch, so
			 * this effectively always returns EDESTADDRREQ.
			 */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* OOB data gets a small allowance beyond the buffer limit */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		/* Drop the sb lock (not the socket lock) and wait for space */
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		/* Re-validate everything; state may have changed while asleep */
		goto restart;
	}
	return 0;
}
2109
2110 /*
2111 * Send on a socket.
2112 * If send must go all at once and message is larger than
2113 * send buffering, then hard error.
2114 * Lock against other senders.
2115 * If must go all at once and not enough room now, then
2116 * inform user that this would block and do nothing.
2117 * Otherwise, if nonblocking, send as much as possible.
2118 * The data to be sent is described by "uio" if nonzero,
2119 * otherwise by the mbuf chain "top" (which must be null
2120 * if uio is not). Data provided in mbuf chain must be small
2121 * enough to send all at once.
2122 *
2123 * Returns nonzero on error, timeout or signal; callers
2124 * must check for short counts if EINTR/ERESTART are returned.
2125 * Data and control buffers are freed on return.
2126 *
2127 * Returns: 0 Success
2128 * EOPNOTSUPP
2129 * EINVAL
2130 * ENOBUFS
2131 * uiomove:EFAULT
2132 * sosendcheck:EPIPE
2133 * sosendcheck:EWOULDBLOCK
2134 * sosendcheck:EINTR
2135 * sosendcheck:EBADF
2136 * sosendcheck:EINTR
2137 * sosendcheck:??? [value from so_error]
2138 * <pru_send>:ECONNRESET[TCP]
2139 * <pru_send>:EINVAL[TCP]
2140 * <pru_send>:ENOBUFS[TCP]
2141 * <pru_send>:EADDRINUSE[TCP]
2142 * <pru_send>:EADDRNOTAVAIL[TCP]
2143 * <pru_send>:EAFNOSUPPORT[TCP]
2144 * <pru_send>:EACCES[TCP]
2145 * <pru_send>:EAGAIN[TCP]
2146 * <pru_send>:EPERM[TCP]
2147 * <pru_send>:EMSGSIZE[TCP]
2148 * <pru_send>:EHOSTUNREACH[TCP]
2149 * <pru_send>:ENETUNREACH[TCP]
2150 * <pru_send>:ENETDOWN[TCP]
2151 * <pru_send>:ENOMEM[TCP]
2152 * <pru_send>:ENOBUFS[TCP]
2153 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2154 * <pru_send>:EINVAL[AF_UNIX]
2155 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2156 * <pru_send>:EPIPE[AF_UNIX]
2157 * <pru_send>:ENOTCONN[AF_UNIX]
2158 * <pru_send>:EISCONN[AF_UNIX]
2159 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2160 * <sf_data_out>:??? [whatever a filter author chooses]
2161 *
2162 * Notes: Other <pru_send> returns depend on the protocol family; all
2163 * <sf_data_out> returns depend on what the filter author causes
2164 * their filter to return.
2165 */
2166 int
sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)2167 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2168 struct mbuf *top, struct mbuf *control, int flags)
2169 {
2170 mbuf_ref_ref_t mp;
2171 mbuf_ref_t m, freelist = NULL;
2172 struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2173 user_ssize_t space, len, resid, orig_resid;
2174 int clen = 0, error, dontroute, sendflags;
2175 int atomic = sosendallatonce(so) || top;
2176 int sblocked = 0;
2177 struct proc *p = current_proc();
2178 uint16_t headroom = 0;
2179 ssize_t mlen;
2180 boolean_t en_tracing = FALSE;
2181
2182 if (uio != NULL) {
2183 resid = uio_resid(uio);
2184 } else {
2185 resid = top->m_pkthdr.len;
2186 }
2187 orig_resid = resid;
2188
2189 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2190 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2191
2192 socket_lock(so, 1);
2193
2194 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2195 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, SOFLOW_DIRECTION_OUTBOUND, 0);
2196 }
2197
2198 /*
2199 * trace if tracing & network (vs. unix) sockets & and
2200 * non-loopback
2201 */
2202 if (ENTR_SHOULDTRACE &&
2203 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2204 struct inpcb *inp = sotoinpcb(so);
2205 if (inp->inp_last_outifp != NULL &&
2206 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2207 en_tracing = TRUE;
2208 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2209 VM_KERNEL_ADDRPERM(so),
2210 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2211 (int64_t)resid);
2212 }
2213 }
2214
2215 /*
2216 * Re-injection should not affect process accounting
2217 */
2218 if ((flags & MSG_SKIPCFIL) == 0) {
2219 so_update_last_owner_locked(so, p);
2220 so_update_policy(so);
2221
2222 #if NECP
2223 so_update_necp_policy(so, NULL, addr);
2224 #endif /* NECP */
2225 }
2226
2227 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2228 error = EOPNOTSUPP;
2229 goto out_locked;
2230 }
2231
2232 /*
2233 * In theory resid should be unsigned.
2234 * However, space must be signed, as it might be less than 0
2235 * if we over-committed, and we must use a signed comparison
2236 * of space and resid. On the other hand, a negative resid
2237 * causes us to loop sending 0-length segments to the protocol.
2238 *
2239 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2240 *
2241 * Note: We limit resid to be a positive int value as we use
2242 * imin() to set bytes_to_copy -- radr://14558484
2243 */
2244 if (resid < 0 || resid > INT_MAX ||
2245 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2246 error = EINVAL;
2247 goto out_locked;
2248 }
2249
2250 dontroute = (flags & MSG_DONTROUTE) &&
2251 (so->so_options & SO_DONTROUTE) == 0 &&
2252 (so->so_proto->pr_flags & PR_ATOMIC);
2253 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2254
2255 if (control != NULL) {
2256 clen = control->m_len;
2257 }
2258
2259 if (soreserveheadroom != 0) {
2260 headroom = so->so_pktheadroom;
2261 }
2262
2263 do {
2264 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2265 &sblocked);
2266 if (error) {
2267 goto out_locked;
2268 }
2269
2270 mp = ⊤
2271 space = sbspace(&so->so_snd) - clen;
2272 space += ((flags & MSG_OOB) ? 1024 : 0);
2273
2274 do {
2275 if (uio == NULL) {
2276 /*
2277 * Data is prepackaged in "top".
2278 */
2279 resid = 0;
2280 if (flags & MSG_EOR) {
2281 top->m_flags |= M_EOR;
2282 }
2283 } else {
2284 int chainlength;
2285 int bytes_to_copy;
2286 boolean_t jumbocl;
2287 boolean_t bigcl;
2288 int bytes_to_alloc;
2289
2290 bytes_to_copy = imin((int)resid, (int)space);
2291
2292 bytes_to_alloc = bytes_to_copy;
2293 if (top == NULL) {
2294 bytes_to_alloc += headroom;
2295 }
2296
2297 if (sosendminchain > 0) {
2298 chainlength = 0;
2299 } else {
2300 chainlength = sosendmaxchain;
2301 }
2302
2303 /*
2304 * Use big 4 KB cluster when the outgoing interface
2305 * does not prefer 2 KB clusters
2306 */
2307 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2308 sosendbigcl_ignore_capab;
2309
2310 /*
2311 * Attempt to use larger than system page-size
2312 * clusters for large writes only if there is
2313 * a jumbo cluster pool and if the socket is
2314 * marked accordingly.
2315 */
2316 jumbocl = sosendjcl && njcl > 0 &&
2317 ((so->so_flags & SOF_MULTIPAGES) ||
2318 sosendjcl_ignore_capab) &&
2319 bigcl;
2320
2321 socket_unlock(so, 0);
2322
2323 do {
2324 int num_needed;
2325 int hdrs_needed = (top == NULL) ? 1 : 0;
2326
2327 /*
2328 * try to maintain a local cache of mbuf
2329 * clusters needed to complete this
2330 * write the list is further limited to
2331 * the number that are currently needed
2332 * to fill the socket this mechanism
2333 * allows a large number of mbufs/
2334 * clusters to be grabbed under a single
2335 * mbuf lock... if we can't get any
2336 * clusters, than fall back to trying
2337 * for mbufs if we fail early (or
2338 * miscalcluate the number needed) make
2339 * sure to release any clusters we
2340 * haven't yet consumed.
2341 */
2342 if (freelist == NULL &&
2343 bytes_to_alloc > MBIGCLBYTES &&
2344 jumbocl) {
2345 num_needed =
2346 bytes_to_alloc / M16KCLBYTES;
2347
2348 if ((bytes_to_alloc -
2349 (num_needed * M16KCLBYTES))
2350 >= MINCLSIZE) {
2351 num_needed++;
2352 }
2353
2354 freelist =
2355 m_getpackets_internal(
2356 (unsigned int *)&num_needed,
2357 hdrs_needed, M_WAIT, 0,
2358 M16KCLBYTES);
2359 /*
2360 * Fall back to 4K cluster size
2361 * if allocation failed
2362 */
2363 }
2364
2365 if (freelist == NULL &&
2366 bytes_to_alloc > MCLBYTES &&
2367 bigcl) {
2368 num_needed =
2369 bytes_to_alloc / MBIGCLBYTES;
2370
2371 if ((bytes_to_alloc -
2372 (num_needed * MBIGCLBYTES)) >=
2373 MINCLSIZE) {
2374 num_needed++;
2375 }
2376
2377 freelist =
2378 m_getpackets_internal(
2379 (unsigned int *)&num_needed,
2380 hdrs_needed, M_WAIT, 0,
2381 MBIGCLBYTES);
2382 /*
2383 * Fall back to cluster size
2384 * if allocation failed
2385 */
2386 }
2387
2388 /*
2389 * Allocate a cluster as we want to
2390 * avoid to split the data in more
2391 * that one segment and using MINCLSIZE
2392 * would lead us to allocate two mbufs
2393 */
2394 if (soreserveheadroom != 0 &&
2395 freelist == NULL &&
2396 ((top == NULL &&
2397 bytes_to_alloc > _MHLEN) ||
2398 bytes_to_alloc > _MLEN)) {
2399 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2400 MCLBYTES;
2401 freelist =
2402 m_getpackets_internal(
2403 (unsigned int *)&num_needed,
2404 hdrs_needed, M_WAIT, 0,
2405 MCLBYTES);
2406 /*
2407 * Fall back to a single mbuf
2408 * if allocation failed
2409 */
2410 } else if (freelist == NULL &&
2411 bytes_to_alloc > MINCLSIZE) {
2412 num_needed =
2413 bytes_to_alloc / MCLBYTES;
2414
2415 if ((bytes_to_alloc -
2416 (num_needed * MCLBYTES)) >=
2417 MINCLSIZE) {
2418 num_needed++;
2419 }
2420
2421 freelist =
2422 m_getpackets_internal(
2423 (unsigned int *)&num_needed,
2424 hdrs_needed, M_WAIT, 0,
2425 MCLBYTES);
2426 /*
2427 * Fall back to a single mbuf
2428 * if allocation failed
2429 */
2430 }
2431 /*
2432 * For datagram protocols, leave
2433 * headroom for protocol headers
2434 * in the first cluster of the chain
2435 */
2436 if (freelist != NULL && atomic &&
2437 top == NULL && headroom > 0) {
2438 freelist->m_data += headroom;
2439 }
2440
2441 /*
2442 * Fall back to regular mbufs without
2443 * reserving the socket headroom
2444 */
2445 if (freelist == NULL) {
2446 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2447 if (top == NULL) {
2448 MGETHDR(freelist,
2449 M_WAIT, MT_DATA);
2450 } else {
2451 MGET(freelist,
2452 M_WAIT, MT_DATA);
2453 }
2454 }
2455
2456 if (freelist == NULL) {
2457 error = ENOBUFS;
2458 socket_lock(so, 0);
2459 goto out_locked;
2460 }
2461 /*
2462 * For datagram protocols,
2463 * leave room for protocol
2464 * headers in first mbuf.
2465 */
2466 if (atomic && top == NULL &&
2467 bytes_to_copy > 0 &&
2468 bytes_to_copy < MHLEN) {
2469 MH_ALIGN(freelist,
2470 bytes_to_copy);
2471 }
2472 }
2473 m = freelist;
2474 freelist = m->m_next;
2475 m->m_next = NULL;
2476
2477 if ((m->m_flags & M_EXT)) {
2478 mlen = m->m_ext.ext_size -
2479 M_LEADINGSPACE(m);
2480 } else if ((m->m_flags & M_PKTHDR)) {
2481 mlen = MHLEN - M_LEADINGSPACE(m);
2482 m_add_crumb(m, PKT_CRUMB_SOSEND);
2483 } else {
2484 mlen = MLEN - M_LEADINGSPACE(m);
2485 }
2486 len = imin((int)mlen, bytes_to_copy);
2487
2488 chainlength += len;
2489
2490 space -= len;
2491
2492 error = uiomove(mtod(m, caddr_t),
2493 (int)len, uio);
2494
2495 resid = uio_resid(uio);
2496
2497 m->m_len = (int32_t)len;
2498 *mp = m;
2499 top->m_pkthdr.len += len;
2500 if (error) {
2501 break;
2502 }
2503 mp = &m->m_next;
2504 if (resid <= 0) {
2505 if (flags & MSG_EOR) {
2506 top->m_flags |= M_EOR;
2507 }
2508 break;
2509 }
2510 bytes_to_copy = imin((int)resid, (int)space);
2511 } while (space > 0 &&
2512 (chainlength < sosendmaxchain || atomic ||
2513 resid < MINCLSIZE));
2514
2515 socket_lock(so, 0);
2516
2517 if (error) {
2518 goto out_locked;
2519 }
2520 }
2521
2522 if (dontroute) {
2523 so->so_options |= SO_DONTROUTE;
2524 }
2525
2526 /*
2527 * Compute flags here, for pru_send and NKEs
2528 *
2529 * If the user set MSG_EOF, the protocol
2530 * understands this flag and nothing left to
2531 * send then use PRU_SEND_EOF instead of PRU_SEND.
2532 */
2533 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2534 ((flags & MSG_EOF) &&
2535 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2536 (resid <= 0)) ? PRUS_EOF :
2537 /* If there is more to send set PRUS_MORETOCOME */
2538 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2539
2540 if ((flags & MSG_SKIPCFIL) == 0) {
2541 /*
2542 * Socket filter processing
2543 */
2544 error = sflt_data_out(so, addr, &top,
2545 &control, (sendflags & MSG_OOB) ?
2546 sock_data_filt_flag_oob : 0);
2547 if (error) {
2548 if (error == EJUSTRETURN) {
2549 error = 0;
2550 goto packet_consumed;
2551 }
2552 goto out_locked;
2553 }
2554 #if CONTENT_FILTER
2555 /*
2556 * Content filter processing
2557 */
2558 error = cfil_sock_data_out(so, addr, top,
2559 control, sendflags, dgram_flow_entry);
2560 if (error) {
2561 if (error == EJUSTRETURN) {
2562 error = 0;
2563 goto packet_consumed;
2564 }
2565 goto out_locked;
2566 }
2567 #endif /* CONTENT_FILTER */
2568 }
2569 error = (*so->so_proto->pr_usrreqs->pru_send)
2570 (so, sendflags, top, addr, control, p);
2571
2572 packet_consumed:
2573 if (dontroute) {
2574 so->so_options &= ~SO_DONTROUTE;
2575 }
2576
2577 clen = 0;
2578 control = NULL;
2579 top = NULL;
2580 mp = ⊤
2581 if (error) {
2582 goto out_locked;
2583 }
2584 } while (resid && space > 0);
2585 } while (resid);
2586
2587
2588 out_locked:
2589 if (resid > orig_resid) {
2590 char pname[MAXCOMLEN] = {};
2591 pid_t current_pid = proc_pid(current_proc());
2592 proc_name(current_pid, pname, sizeof(pname));
2593
2594 if (sosend_assert_panic != 0) {
2595 panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2596 so, resid, orig_resid, pname, current_pid);
2597 } else {
2598 os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2599 so->so_gencnt, resid, orig_resid, pname, current_pid);
2600 }
2601 }
2602
2603 if (sblocked) {
2604 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2605 } else {
2606 socket_unlock(so, 1);
2607 }
2608 if (top != NULL) {
2609 m_freem(top);
2610 }
2611 if (control != NULL) {
2612 m_freem(control);
2613 }
2614 if (freelist != NULL) {
2615 m_freem_list(freelist);
2616 }
2617
2618 if (dgram_flow_entry != NULL) {
2619 soflow_free_flow(dgram_flow_entry);
2620 }
2621
2622 soclearfastopen(so);
2623
2624 if (en_tracing) {
2625 /* resid passed here is the bytes left in uio */
2626 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2627 VM_KERNEL_ADDRPERM(so),
2628 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2629 (int64_t)(orig_resid - resid));
2630 }
2631 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2632 so->so_snd.sb_cc, space, error);
2633
2634 return error;
2635 }
2636
2637 int
sosend_reinject(struct socket * so,struct sockaddr * addr,struct mbuf * top,struct mbuf * control,uint32_t sendflags)2638 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2639 {
2640 struct mbuf *m0 = NULL, *control_end = NULL;
2641
2642 socket_lock_assert_owned(so);
2643
2644 /*
2645 * top must points to mbuf chain to be sent.
2646 * If control is not NULL, top must be packet header
2647 */
2648 VERIFY(top != NULL &&
2649 (control == NULL || top->m_flags & M_PKTHDR));
2650
2651 /*
2652 * If control is not passed in, see if we can get it
2653 * from top.
2654 */
2655 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2656 // Locate start of control if present and start of data
2657 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2658 if (m0->m_flags & M_PKTHDR) {
2659 top = m0;
2660 break;
2661 } else if (m0->m_type == MT_CONTROL) {
2662 if (control == NULL) {
2663 // Found start of control
2664 control = m0;
2665 }
2666 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2667 // Found end of control
2668 control_end = m0;
2669 }
2670 }
2671 }
2672 if (control_end != NULL) {
2673 control_end->m_next = NULL;
2674 }
2675 }
2676
2677 int error = (*so->so_proto->pr_usrreqs->pru_send)
2678 (so, sendflags, top, addr, control, current_proc());
2679
2680 return error;
2681 }
2682
2683 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp,struct mbuf ** last_control)2684 mbuf_detach_control_from_list(struct mbuf **mp, struct mbuf **last_control)
2685 {
2686 struct mbuf *control = NULL;
2687 struct mbuf *m = *mp;
2688
2689 if (m->m_type == MT_CONTROL) {
2690 struct mbuf *control_end;
2691 struct mbuf *n;
2692
2693 n = control_end = control = m;
2694
2695 /*
2696 * Break the chain per mbuf type
2697 */
2698 while (n != NULL && n->m_type == MT_CONTROL) {
2699 control_end = n;
2700 n = n->m_next;
2701 }
2702 control_end->m_next = NULL;
2703 *mp = n;
2704 if (last_control != NULL) {
2705 *last_control = control_end;
2706 }
2707 }
2708 VERIFY(*mp != NULL);
2709
2710 return control;
2711 }
2712
2713 /*
2714 * Supported only connected sockets (no address) without ancillary data
2715 * (control mbuf) for atomic protocols
2716 */
2717 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2718 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2719 {
2720 mbuf_ref_t m, control = NULL;
2721 struct soflow_hash_entry *__single dgram_flow_entry = NULL;
2722 int error, dontroute;
2723 int atomic = sosendallatonce(so);
2724 int sblocked = 0;
2725 struct proc *p = current_proc();
2726 struct mbuf *top = pktlist;
2727 bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2728
2729 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2730 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2731
2732 if (so->so_type != SOCK_DGRAM) {
2733 error = EINVAL;
2734 os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2735 error);
2736 goto out;
2737 }
2738 if (atomic == 0) {
2739 error = EINVAL;
2740 os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2741 error);
2742 goto out;
2743 }
2744 if ((so->so_state & SS_ISCONNECTED) == 0) {
2745 error = ENOTCONN;
2746 os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2747 error);
2748 goto out;
2749 }
2750 if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2751 error = EINVAL;
2752 os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2753 flags, error);
2754 goto out;
2755 }
2756
2757 socket_lock(so, 1);
2758 so_update_last_owner_locked(so, p);
2759 so_update_policy(so);
2760
2761 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2762 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, SOFLOW_DIRECTION_OUTBOUND, 0);
2763 }
2764
2765 #if NECP
2766 so_update_necp_policy(so, NULL, NULL);
2767 #endif /* NECP */
2768
2769 dontroute = (flags & MSG_DONTROUTE) &&
2770 (so->so_options & SO_DONTROUTE) == 0 &&
2771 (so->so_proto->pr_flags & PR_ATOMIC);
2772 if (dontroute) {
2773 so->so_options |= SO_DONTROUTE;
2774 }
2775
2776 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2777
2778 error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2779 if (error) {
2780 os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2781 error);
2782 goto release;
2783 }
2784
2785 if (!skip_filt) {
2786 mbuf_ref_ref_t prevnextp = NULL;
2787
2788 for (m = top; m != NULL; m = m->m_nextpkt) {
2789 mbuf_ref_t nextpkt, last_control;
2790
2791 /*
2792 * Remove packet from the list of packets
2793 */
2794 nextpkt = m->m_nextpkt;
2795 if (prevnextp != NULL) {
2796 *prevnextp = nextpkt;
2797 } else {
2798 top = nextpkt;
2799 }
2800 m->m_nextpkt = NULL;
2801
2802 /*
2803 * Break the chain per mbuf type
2804 */
2805 if (m->m_type == MT_CONTROL) {
2806 control = mbuf_detach_control_from_list(&m, &last_control);
2807 }
2808 /*
2809 * Socket filter processing
2810 */
2811 error = sflt_data_out(so, NULL, &m,
2812 &control, 0);
2813 if (error != 0 && error != EJUSTRETURN) {
2814 os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2815 error);
2816 m_freem(m);
2817 goto release;
2818 }
2819
2820 #if CONTENT_FILTER
2821 if (error == 0) {
2822 /*
2823 * Content filter processing
2824 */
2825 error = cfil_sock_data_out(so, NULL, m,
2826 control, 0, dgram_flow_entry);
2827 if (error != 0 && error != EJUSTRETURN) {
2828 os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2829 error);
2830 m_freem(m);
2831 goto release;
2832 }
2833 }
2834 #endif /* CONTENT_FILTER */
2835 if (error == EJUSTRETURN) {
2836 /*
2837 * When swallowed by a filter, the packet is not
2838 * in the list anymore
2839 */
2840 error = 0;
2841 } else {
2842 /*
2843 * Rebuild the mbuf chain of the packet
2844 */
2845 if (control != NULL) {
2846 last_control->m_next = m;
2847 m = control;
2848 }
2849 /*
2850 * Reinsert the packet in the list of packets
2851 */
2852 m->m_nextpkt = nextpkt;
2853 if (prevnextp != NULL) {
2854 *prevnextp = m;
2855 } else {
2856 top = m;
2857 }
2858 prevnextp = &m->m_nextpkt;
2859 }
2860 control = NULL;
2861 }
2862 }
2863
2864 if (top != NULL) {
2865 if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2866 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2867 (so, top, pktcnt, flags);
2868 if (error != 0 && error != ENOBUFS) {
2869 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2870 error);
2871 }
2872 top = NULL;
2873 } else {
2874 *pktcnt = 0;
2875 control = NULL;
2876 for (m = top; m != NULL; m = top) {
2877 top = m->m_nextpkt;
2878 m->m_nextpkt = NULL;
2879
2880 /*
2881 * Break the chain per mbuf type
2882 */
2883 if (m->m_type == MT_CONTROL) {
2884 control = mbuf_detach_control_from_list(&m, NULL);
2885 }
2886
2887 error = (*so->so_proto->pr_usrreqs->pru_send)
2888 (so, 0, m, NULL, control, current_proc());
2889 if (error != 0) {
2890 if (error != ENOBUFS) {
2891 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2892 error);
2893 }
2894 control = NULL;
2895 goto release;
2896 }
2897 *pktcnt += 1;
2898 control = NULL;
2899 }
2900 }
2901 }
2902
2903 release:
2904 if (dontroute) {
2905 so->so_options &= ~SO_DONTROUTE;
2906 }
2907 if (sblocked) {
2908 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2909 } else {
2910 socket_unlock(so, 1);
2911 }
2912 out:
2913 if (control != NULL) {
2914 m_freem(control);
2915 }
2916 if (top != NULL) {
2917 if (error != ENOBUFS) {
2918 os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2919 error);
2920 }
2921 m_freem_list(top);
2922 }
2923
2924 if (dgram_flow_entry != NULL) {
2925 soflow_free_flow(dgram_flow_entry);
2926 }
2927
2928 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2929 so->so_snd.sb_cc, 0, error);
2930
2931 return error;
2932 }
2933
2934 /*
2935 * May return ERESTART when packet is dropped by MAC policy check
2936 */
2937 static int
soreceive_addr(struct proc * p,struct socket * so,struct sockaddr ** psa,struct mbuf ** maddrp,int flags,struct mbuf ** mp,struct mbuf ** nextrecordp,int canwait)2938 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2939 struct mbuf **maddrp,
2940 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2941 {
2942 int error = 0;
2943 struct mbuf *m = *mp;
2944 struct mbuf *nextrecord = *nextrecordp;
2945
2946 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2947 #if CONFIG_MACF_SOCKET_SUBSET
2948 /*
2949 * Call the MAC framework for policy checking if we're in
2950 * the user process context and the socket isn't connected.
2951 */
2952 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2953 struct mbuf *m0 = m;
2954 /*
2955 * Dequeue this record (temporarily) from the receive
2956 * list since we're about to drop the socket's lock
2957 * where a new record may arrive and be appended to
2958 * the list. Upon MAC policy failure, the record
2959 * will be freed. Otherwise, we'll add it back to
2960 * the head of the list. We cannot rely on SB_LOCK
2961 * because append operation uses the socket's lock.
2962 */
2963 do {
2964 m->m_nextpkt = NULL;
2965 sbfree(&so->so_rcv, m);
2966 m = m->m_next;
2967 } while (m != NULL);
2968 m = m0;
2969 so->so_rcv.sb_mb = nextrecord;
2970 SB_EMPTY_FIXUP(&so->so_rcv);
2971 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2972 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2973 socket_unlock(so, 0);
2974
2975 error = mac_socket_check_received(kauth_cred_get(), so,
2976 mtod(m, struct sockaddr *));
2977
2978 if (error != 0) {
2979 /*
2980 * MAC policy failure; free this record and
2981 * process the next record (or block until
2982 * one is available). We have adjusted sb_cc
2983 * and sb_mbcnt above so there is no need to
2984 * call sbfree() again.
2985 */
2986 m_freem(m);
2987 /*
2988 * Clear SB_LOCK but don't unlock the socket.
2989 * Process the next record or wait for one.
2990 */
2991 socket_lock(so, 0);
2992 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2993 error = ERESTART;
2994 goto done;
2995 }
2996 socket_lock(so, 0);
2997 /*
2998 * If the socket has been defunct'd, drop it.
2999 */
3000 if (so->so_flags & SOF_DEFUNCT) {
3001 m_freem(m);
3002 error = ENOTCONN;
3003 goto done;
3004 }
3005 /*
3006 * Re-adjust the socket receive list and re-enqueue
3007 * the record in front of any packets which may have
3008 * been appended while we dropped the lock.
3009 */
3010 for (m = m0; m->m_next != NULL; m = m->m_next) {
3011 sballoc(&so->so_rcv, m);
3012 }
3013 sballoc(&so->so_rcv, m);
3014 if (so->so_rcv.sb_mb == NULL) {
3015 so->so_rcv.sb_lastrecord = m0;
3016 so->so_rcv.sb_mbtail = m;
3017 }
3018 m = m0;
3019 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3020 so->so_rcv.sb_mb = m;
3021 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3022 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3023 }
3024 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3025 if (psa != NULL) {
3026 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3027 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3028 error = EWOULDBLOCK;
3029 goto done;
3030 }
3031 } else if (maddrp != NULL) {
3032 *maddrp = m;
3033 }
3034 if (flags & MSG_PEEK) {
3035 m = m->m_next;
3036 } else {
3037 sbfree(&so->so_rcv, m);
3038 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3039 panic("%s: about to create invalid socketbuf",
3040 __func__);
3041 /* NOTREACHED */
3042 }
3043 if (maddrp == NULL) {
3044 MFREE(m, so->so_rcv.sb_mb);
3045 } else {
3046 so->so_rcv.sb_mb = m->m_next;
3047 m->m_next = NULL;
3048 }
3049 m = so->so_rcv.sb_mb;
3050 if (m != NULL) {
3051 m->m_nextpkt = nextrecord;
3052 } else {
3053 so->so_rcv.sb_mb = nextrecord;
3054 SB_EMPTY_FIXUP(&so->so_rcv);
3055 }
3056 }
3057 done:
3058 *mp = m;
3059 *nextrecordp = nextrecord;
3060
3061 return error;
3062 }
3063
3064 /*
3065 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3066 * so clear the data portion in order not to leak the file pointers
3067 */
3068 static void
sopeek_scm_rights(struct mbuf * rights)3069 sopeek_scm_rights(struct mbuf *rights)
3070 {
3071 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3072
3073 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3074 VERIFY(cm->cmsg_len <= rights->m_len);
3075 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3076 }
3077 }
3078
3079 /*
3080 * Process one or more MT_CONTROL mbufs present before any data mbufs
3081 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3082 * just copy the data; if !MSG_PEEK, we call into the protocol to
3083 * perform externalization.
3084 */
3085 static int
soreceive_ctl(struct socket * so,struct mbuf ** controlp,int flags,struct mbuf ** mp,struct mbuf ** nextrecordp)3086 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3087 struct mbuf **mp, struct mbuf **nextrecordp)
3088 {
3089 int error = 0;
3090 mbuf_ref_t cm = NULL, cmn;
3091 mbuf_ref_ref_t cme = &cm;
3092 struct sockbuf *sb_rcv = &so->so_rcv;
3093 mbuf_ref_ref_t msgpcm = NULL;
3094 mbuf_ref_t m = *mp;
3095 mbuf_ref_t nextrecord = *nextrecordp;
3096 struct protosw *pr = so->so_proto;
3097
3098 /*
3099 * Externalizing the control messages would require us to
3100 * drop the socket's lock below. Once we re-acquire the
3101 * lock, the mbuf chain might change. In order to preserve
3102 * consistency, we unlink all control messages from the
3103 * first mbuf chain in one shot and link them separately
3104 * onto a different chain.
3105 */
3106 do {
3107 if (flags & MSG_PEEK) {
3108 if (controlp != NULL) {
3109 if (*controlp == NULL) {
3110 msgpcm = controlp;
3111 }
3112 *controlp = m_copy(m, 0, m->m_len);
3113
3114 /*
3115 * If we failed to allocate an mbuf,
3116 * release any previously allocated
3117 * mbufs for control data. Return
3118 * an error. Keep the mbufs in the
3119 * socket as this is using
3120 * MSG_PEEK flag.
3121 */
3122 if (*controlp == NULL) {
3123 m_freem(*msgpcm);
3124 error = ENOBUFS;
3125 goto done;
3126 }
3127
3128 if (pr->pr_domain->dom_externalize != NULL) {
3129 sopeek_scm_rights(*controlp);
3130 }
3131
3132 controlp = &(*controlp)->m_next;
3133 }
3134 m = m->m_next;
3135 } else {
3136 m->m_nextpkt = NULL;
3137 sbfree(sb_rcv, m);
3138 sb_rcv->sb_mb = m->m_next;
3139 m->m_next = NULL;
3140 *cme = m;
3141 cme = &(*cme)->m_next;
3142 m = sb_rcv->sb_mb;
3143 }
3144 } while (m != NULL && m->m_type == MT_CONTROL);
3145
3146 if (!(flags & MSG_PEEK)) {
3147 if (sb_rcv->sb_mb != NULL) {
3148 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3149 } else {
3150 sb_rcv->sb_mb = nextrecord;
3151 SB_EMPTY_FIXUP(sb_rcv);
3152 }
3153 if (nextrecord == NULL) {
3154 sb_rcv->sb_lastrecord = m;
3155 }
3156 }
3157
3158 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3159 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3160
3161 while (cm != NULL) {
3162 int cmsg_level;
3163 int cmsg_type;
3164
3165 cmn = cm->m_next;
3166 cm->m_next = NULL;
3167 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3168 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3169
3170 /*
3171 * Call the protocol to externalize SCM_RIGHTS message
3172 * and return the modified message to the caller upon
3173 * success. Otherwise, all other control messages are
3174 * returned unmodified to the caller. Note that we
3175 * only get into this loop if MSG_PEEK is not set.
3176 */
3177 if (pr->pr_domain->dom_externalize != NULL &&
3178 cmsg_level == SOL_SOCKET &&
3179 cmsg_type == SCM_RIGHTS) {
3180 /*
3181 * Release socket lock: see 3903171. This
3182 * would also allow more records to be appended
3183 * to the socket buffer. We still have SB_LOCK
3184 * set on it, so we can be sure that the head
3185 * of the mbuf chain won't change.
3186 */
3187 socket_unlock(so, 0);
3188 error = (*pr->pr_domain->dom_externalize)(cm);
3189 socket_lock(so, 0);
3190 } else {
3191 error = 0;
3192 }
3193
3194 if (controlp != NULL && error == 0) {
3195 *controlp = cm;
3196 controlp = &(*controlp)->m_next;
3197 } else {
3198 (void) m_free(cm);
3199 }
3200 cm = cmn;
3201 }
3202 /*
3203 * Update the value of nextrecord in case we received new
3204 * records when the socket was unlocked above for
3205 * externalizing SCM_RIGHTS.
3206 */
3207 if (m != NULL) {
3208 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3209 } else {
3210 nextrecord = sb_rcv->sb_mb;
3211 }
3212
3213 done:
3214 *mp = m;
3215 *nextrecordp = nextrecord;
3216
3217 return error;
3218 }
3219
3220 /*
3221 * If we have less data than requested, block awaiting more
3222 * (subject to any timeout) if:
3223 * 1. the current count is less than the low water mark, or
3224 * 2. MSG_WAITALL is set, and it is possible to do the entire
3225 * receive operation at once if we block (resid <= hiwat).
3226 * 3. MSG_DONTWAIT is not set
3227 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3228 * we have to do the receive in sections, and thus risk returning
3229 * a short count if a timeout or signal occurs after we start.
3230 */
3231 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3232 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3233 {
3234 struct protosw *pr = so->so_proto;
3235
3236 /* No mbufs in the receive-queue? Wait! */
3237 if (m == NULL) {
3238 return true;
3239 }
3240
3241 /* Not enough data in the receive socket-buffer - we may have to wait */
3242 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3243 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3244 /*
3245 * Application did set the lowater-mark, so we should wait for
3246 * this data to be present.
3247 */
3248 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3249 return true;
3250 }
3251
3252 /*
3253 * Application wants all the data - so let's try to do the
3254 * receive-operation at once by waiting for everything to
3255 * be there.
3256 */
3257 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3258 return true;
3259 }
3260 }
3261
3262 return false;
3263 }
3264
3265 /*
3266 * Implement receive operations on a socket.
3267 * We depend on the way that records are added to the sockbuf
3268 * by sbappend*. In particular, each record (mbufs linked through m_next)
3269 * must begin with an address if the protocol so specifies,
3270 * followed by an optional mbuf or mbufs containing ancillary data,
3271 * and then zero or more mbufs of data.
3272 * In order to avoid blocking network interrupts for the entire time here,
3273 * we splx() while doing the actual copy to user space.
3274 * Although the sockbuf is locked, new data may still be appended,
3275 * and thus we must maintain consistency of the sockbuf during that time.
3276 *
3277 * The caller may receive the data as a single mbuf chain by supplying
3278 * an mbuf **mp0 for use in returning the chain. The uio is then used
3279 * only for the count in uio_resid.
3280 *
3281 * Returns: 0 Success
3282 * ENOBUFS
3283 * ENOTCONN
3284 * EWOULDBLOCK
3285 * uiomove:EFAULT
3286 * sblock:EWOULDBLOCK
3287 * sblock:EINTR
3288 * sbwait:EBADF
3289 * sbwait:EINTR
3290 * sodelayed_copy:EFAULT
3291 * <pru_rcvoob>:EINVAL[TCP]
3292 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3293 * <pru_rcvoob>:???
3294 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3295 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3296 * <pr_domain->dom_externalize>:???
3297 *
3298 * Notes: Additional return values from calls through <pru_rcvoob> and
3299 * <pr_domain->dom_externalize> depend on protocols other than
3300 * TCP or AF_UNIX, which are documented above.
3301 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	mbuf_ref_t m;
	mbuf_ref_ref_t mp;
	mbuf_ref_t ml = NULL;
	mbuf_ref_t nextrecord, free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinetely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case were
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/*
		 * Copy the OOB data to user space with the socket unlocked;
		 * uiomove may fault and sleep on user memory.
		 */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * receive yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/*
		 * OOB data or a record boundary on the queue can be
		 * delivered immediately even if the wait-condition holds.
		 */
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns. Therefore, we only sleep in
		 * sbwait() below if and only if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		/*
		 * An address mbuf was consumed; clear orig_resid so the
		 * "received nothing" restart check near the end is skipped.
		 */
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		/* Same as above: control data counts as progress. */
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above.  In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/*
	 * can_delay enables the delayed-copy optimization: consumed mbufs
	 * are batched on free_list and copied to user space in one shot
	 * (see sodelayed_copy), but only when not peeking and the request
	 * is large enough (> sorecvmincopy) to be worth it.
	 */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		/* Never mix OOB and normal data within one receive call. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			break;
		}
		/*
		 * Make sure to allways set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		/* Stop at the OOB mark so it is not read past. */
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* Whole mbuf consumed. */
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed. */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			/*
			 * A signal/timeout here deliberately returns a short
			 * count without error (error reset to 0), per the
			 * MSG_WAITALL contract described above.
			 */
			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	/*
	 * For atomic protocols (e.g. datagrams), leftover data in the
	 * record is either flagged (SO_DONTTRUNC) or dropped (MSG_TRUNC).
	 */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	/* Flush any remaining delayed-copy data out to user space. */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/*
	 * Nothing was received (and nothing was consumed earlier: see the
	 * orig_resid = 0 assignments above) and the stream is still open:
	 * go back and wait for data rather than returning an empty read.
	 */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
4057
4058 /*
4059 * Returns: 0 Success
4060 * uiomove:EFAULT
4061 */
4062 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4063 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4064 user_ssize_t *resid)
4065 {
4066 int error = 0;
4067 struct mbuf *m;
4068
4069 m = *free_list;
4070
4071 socket_unlock(so, 0);
4072
4073 while (m != NULL && error == 0) {
4074 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4075 m = m->m_next;
4076 }
4077 m_freem_list(*free_list);
4078
4079 *free_list = NULL;
4080 *resid = 0;
4081
4082 socket_lock(so, 0);
4083
4084 return error;
4085 }
4086
4087 int
soreceive_m_list(struct socket * so,u_int * pktcntp,struct mbuf ** maddrp,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)4088 soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
4089 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
4090 {
4091 mbuf_ref_t m;
4092 mbuf_ref_ref_t mp;
4093 mbuf_ref_t nextrecord;
4094 int flags, error;
4095 struct protosw *pr = so->so_proto;
4096 struct proc *p = current_proc();
4097 u_int npkts = 0;
4098 mbuf_ref_t free_list = NULL;
4099 int sblocked = 0;
4100
4101 /*
4102 * Sanity check on the parameters passed by caller
4103 */
4104 if (mp0 == NULL || pktcntp == NULL) {
4105 return EINVAL;
4106 }
4107 if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
4108 return EINVAL;
4109 }
4110
4111 mp = mp0;
4112 *mp0 = NULL;
4113 if (controlp != NULL) {
4114 *controlp = NULL;
4115 }
4116 if (maddrp != NULL) {
4117 *maddrp = NULL;
4118 }
4119 if (flagsp != NULL) {
4120 flags = *flagsp;
4121 } else {
4122 flags = 0;
4123 }
4124
4125 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
4126 *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
4127 so->so_rcv.sb_hiwat);
4128
4129 socket_lock(so, 1);
4130 so_update_last_owner_locked(so, p);
4131 so_update_policy(so);
4132
4133 #if NECP
4134 so_update_necp_policy(so, NULL, NULL);
4135 #endif /* NECP */
4136
4137 /*
4138 * If a recv attempt is made on a previously-accepted socket
4139 * that has been marked as inactive (disconnected), reject
4140 * the request.
4141 */
4142 if (so->so_flags & SOF_DEFUNCT) {
4143 struct sockbuf *sb = &so->so_rcv;
4144
4145 error = ENOTCONN;
4146 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
4147 __func__, proc_pid(p), proc_best_name(p),
4148 so->so_gencnt,
4149 SOCK_DOM(so), SOCK_TYPE(so), error);
4150 /*
4151 * This socket should have been disconnected and flushed
4152 * prior to being returned from sodefunct(); there should
4153 * be no data on its receive list, so panic otherwise.
4154 */
4155 if (so->so_state & SS_DEFUNCT) {
4156 sb_empty_assert(sb, __func__);
4157 }
4158 goto release;
4159 }
4160
4161 *mp = NULL;
4162
4163 restart:
4164 /*
4165 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4166 * and if so just return to the caller. This could happen when
4167 * soreceive() is called by a socket upcall function during the
4168 * time the socket is freed. The socket buffer would have been
4169 * locked across the upcall, therefore we cannot put this thread
4170 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4171 * we may livelock), because the lock on the socket buffer will
4172 * only be released when the upcall routine returns to its caller.
4173 * Because the socket has been officially closed, there can be
4174 * no further read on it.
4175 */
4176 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4177 (SS_NOFDREF | SS_CANTRCVMORE)) {
4178 error = 0;
4179 goto out;
4180 }
4181
4182 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4183 if (error) {
4184 goto out;
4185 }
4186 sblocked = 1;
4187
4188 m = so->so_rcv.sb_mb;
4189 /*
4190 * Block awaiting more datagram if needed
4191 */
4192 if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
4193 so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
4194 /*
4195 * Panic if we notice inconsistencies in the socket's
4196 * receive list; both sb_mb and sb_cc should correctly
4197 * reflect the contents of the list, otherwise we may
4198 * end up with false positives during select() or poll()
4199 * which could put the application in a bad state.
4200 */
4201 SB_MB_CHECK(&so->so_rcv);
4202
4203 if (so->so_error) {
4204 if (m != NULL) {
4205 goto dontblock;
4206 }
4207 error = so->so_error;
4208 if ((flags & MSG_PEEK) == 0) {
4209 so->so_error = 0;
4210 }
4211 goto release;
4212 }
4213 if (so->so_state & SS_CANTRCVMORE) {
4214 if (m != NULL) {
4215 goto dontblock;
4216 } else {
4217 goto release;
4218 }
4219 }
4220 for (; m != NULL; m = m->m_next) {
4221 if (m->m_flags & M_EOR) {
4222 m = so->so_rcv.sb_mb;
4223 goto dontblock;
4224 }
4225 }
4226 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4227 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4228 error = ENOTCONN;
4229 goto release;
4230 }
4231 if ((so->so_state & SS_NBIO) ||
4232 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4233 error = EWOULDBLOCK;
4234 goto release;
4235 }
4236 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4237 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4238
4239 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4240 sblocked = 0;
4241
4242 error = sbwait(&so->so_rcv);
4243 if (error != 0) {
4244 goto release;
4245 }
4246 goto restart;
4247 }
4248 dontblock:
4249 m = so->so_rcv.sb_mb;
4250 if (m == NULL) {
4251 goto release;
4252 }
4253
4254 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4255 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4256 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4257 nextrecord = m->m_nextpkt;
4258
4259 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4260 mbuf_ref_t maddr = NULL;
4261
4262 error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
4263 &nextrecord, 1);
4264 if (error == ERESTART) {
4265 goto restart;
4266 } else if (error != 0) {
4267 goto release;
4268 }
4269
4270 if (maddr != NULL) {
4271 maddr->m_nextpkt = NULL;
4272 maddr->m_next = NULL;
4273 if (maddrp != NULL) {
4274 *maddrp = maddr;
4275 maddrp = &maddr->m_nextpkt;
4276 } else {
4277 maddr->m_next = free_list;
4278 free_list = maddr;
4279 }
4280 }
4281 }
4282
4283 /*
4284 * Process one or more MT_CONTROL mbufs present before any data mbufs
4285 * in the first mbuf chain on the socket buffer.
4286 * We call into the protocol to perform externalization.
4287 */
4288 if (m != NULL && m->m_type == MT_CONTROL) {
4289 mbuf_ref_t control = NULL;
4290
4291 error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
4292 if (error != 0) {
4293 goto release;
4294 }
4295 if (control != NULL) {
4296 control->m_nextpkt = NULL;
4297 control->m_next = NULL;
4298 if (controlp != NULL) {
4299 *controlp = control;
4300 controlp = &control->m_nextpkt;
4301 } else {
4302 control->m_next = free_list;
4303 free_list = control;
4304 }
4305 }
4306 }
4307
4308 /*
4309 * Link the packet to the list
4310 */
4311 if (m != NULL) {
4312 if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
4313 panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
4314 }
4315 m->m_nextpkt = NULL;
4316 *mp = m;
4317 mp = &m->m_nextpkt;
4318 }
4319 while (m != NULL) {
4320 sbfree(&so->so_rcv, m);
4321
4322 m = m->m_next;
4323 }
4324
4325 so->so_rcv.sb_mb = nextrecord;
4326 /*
4327 * First part is an inline SB_EMPTY_FIXUP(). Second
4328 * part makes sure sb_lastrecord is up-to-date if
4329 * there is still data in the socket buffer.
4330 */
4331 if (so->so_rcv.sb_mb == NULL) {
4332 so->so_rcv.sb_mbtail = NULL;
4333 so->so_rcv.sb_lastrecord = NULL;
4334 } else if (nextrecord->m_nextpkt == NULL) {
4335 so->so_rcv.sb_lastrecord = nextrecord;
4336 }
4337 SB_MB_CHECK(&so->so_rcv);
4338
4339 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4340 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4341
4342 npkts += 1;
4343
4344 /*
4345 * We continue as long as all those conditions as we have less packets
4346 * than requested and the socket buffer is not empty
4347 */
4348 if (npkts < *pktcntp) {
4349 if (so->so_rcv.sb_mb != NULL) {
4350 goto dontblock;
4351 }
4352 if ((flags & MSG_WAITALL) != 0) {
4353 goto restart;
4354 }
4355 }
4356
4357 if (flagsp != NULL) {
4358 *flagsp |= flags;
4359 }
4360
4361 release:
4362 /*
4363 * pru_rcvd may cause more data to be received if the socket lock
4364 * is dropped so we set MSG_HAVEMORE now based on what we know.
4365 * That way the caller won't be surprised if it receives less data
4366 * than requested.
4367 */
4368 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4369 flags |= MSG_HAVEMORE;
4370 }
4371
4372 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
4373 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4374 }
4375
4376 if (sblocked) {
4377 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4378 } else {
4379 socket_unlock(so, 1);
4380 }
4381
4382 out:
4383 *pktcntp = npkts;
4384 /*
4385 * Amortize the cost of freeing the mbufs
4386 */
4387 if (free_list != NULL) {
4388 m_freem_list(free_list);
4389 }
4390
4391 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4392 0, 0, 0, 0);
4393 return error;
4394 }
4395
4396 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4397 so_statistics_event_to_nstat_event(int64_t *input_options,
4398 uint64_t *nstat_event)
4399 {
4400 int error = 0;
4401 switch (*input_options) {
4402 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4403 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4404 break;
4405 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4406 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4407 break;
4408 #if (DEBUG || DEVELOPMENT)
4409 case SO_STATISTICS_EVENT_RESERVED_1:
4410 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4411 break;
4412 case SO_STATISTICS_EVENT_RESERVED_2:
4413 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4414 break;
4415 #endif /* (DEBUG || DEVELOPMENT) */
4416 default:
4417 error = EINVAL;
4418 break;
4419 }
4420 return error;
4421 }
4422
4423 /*
4424 * Returns: 0 Success
4425 * EINVAL
4426 * ENOTCONN
4427 * <pru_shutdown>:EINVAL
4428 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4429 * <pru_shutdown>:ENOBUFS[TCP]
4430 * <pru_shutdown>:EMSGSIZE[TCP]
4431 * <pru_shutdown>:EHOSTUNREACH[TCP]
4432 * <pru_shutdown>:ENETUNREACH[TCP]
4433 * <pru_shutdown>:ENETDOWN[TCP]
4434 * <pru_shutdown>:ENOMEM[TCP]
4435 * <pru_shutdown>:EACCES[TCP]
4436 * <pru_shutdown>:EMSGSIZE[TCP]
4437 * <pru_shutdown>:ENOBUFS[TCP]
4438 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4439 * <pru_shutdown>:??? [other protocol families]
4440 */
4441 int
soshutdown(struct socket * so,int how)4442 soshutdown(struct socket *so, int how)
4443 {
4444 int error;
4445
4446 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4447
4448 switch (how) {
4449 case SHUT_RD:
4450 case SHUT_WR:
4451 case SHUT_RDWR:
4452 socket_lock(so, 1);
4453 if ((so->so_state &
4454 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4455 error = ENOTCONN;
4456 } else {
4457 error = soshutdownlock(so, how);
4458 }
4459 socket_unlock(so, 1);
4460 break;
4461 default:
4462 error = EINVAL;
4463 break;
4464 }
4465
4466 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4467
4468 return error;
4469 }
4470
4471 int
soshutdownlock_final(struct socket * so,int how)4472 soshutdownlock_final(struct socket *so, int how)
4473 {
4474 struct protosw *pr = so->so_proto;
4475 int error = 0;
4476
4477 sflt_notify(so, sock_evt_shutdown, &how);
4478
4479 if (how != SHUT_WR) {
4480 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4481 /* read already shut down */
4482 error = ENOTCONN;
4483 goto done;
4484 }
4485 sorflush(so);
4486 }
4487 if (how != SHUT_RD) {
4488 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4489 /* write already shut down */
4490 error = ENOTCONN;
4491 goto done;
4492 }
4493 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4494 }
4495 done:
4496 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4497 return error;
4498 }
4499
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int error = cfil_sock_shutdown(so, &how);

		if (error == EJUSTRETURN) {
			/* Filter will finish the shutdown on its own later. */
			return 0;
		}
		if (error != 0) {
			return error;
		}
	}
#endif /* CONTENT_FILTER */

	return soshutdownlock_final(so, how);
}
4526
4527 void
sowflush(struct socket * so)4528 sowflush(struct socket *so)
4529 {
4530 struct sockbuf *sb = &so->so_snd;
4531
4532 /*
4533 * Obtain lock on the socket buffer (SB_LOCK). This is required
4534 * to prevent the socket buffer from being unexpectedly altered
4535 * while it is used by another thread in socket send/receive.
4536 *
4537 * sblock() must not fail here, hence the assertion.
4538 */
4539 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4540 VERIFY(sb->sb_flags & SB_LOCK);
4541
4542 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4543 sb->sb_flags |= SB_DROP;
4544 sb->sb_upcall = NULL;
4545 sb->sb_upcallarg = NULL;
4546
4547 sbunlock(sb, TRUE); /* keep socket locked */
4548
4549 selthreadclear(&sb->sb_sel);
4550 sbrelease(sb);
4551 }
4552
/*
 * Flush the receive side of a socket: notify filters, mark the socket
 * unable to receive more, then detach the receive buffer's contents into
 * a local snapshot ("asb") which is disposed of after the buffer lock is
 * dropped.  The ordering of the steps below is deliberate; see the
 * inline comments before rearranging anything.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;	/* local snapshot of sb; released at the end */
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/* Let attached socket filters observe the read flush. */
	sflt_notify(so, sock_evt_flush_read, NULL);

	/* Mark the socket as unable to receive any more data. */
	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function. In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE); /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented. Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let the protocol dispose of any rights (e.g. passed fds) first. */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4653
4654 /*
4655 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4656 * an additional variant to handle the case where the option value needs
4657 * to be some kind of integer, but not a specific size.
4658 * In addition to their use here, these functions are also called by the
4659 * protocol-level pr_ctloutput() routines.
4660 *
4661 * Returns: 0 Success
4662 * EINVAL
4663 * copyin:EFAULT
4664 */
4665 int
sooptcopyin(struct sockopt * sopt,void * __sized_by (len)buf,size_t len,size_t minlen)4666 sooptcopyin(struct sockopt *sopt, void *__sized_by(len) buf, size_t len, size_t minlen)
4667 {
4668 size_t valsize;
4669
4670 /*
4671 * If the user gives us more than we wanted, we ignore it,
4672 * but if we don't get the minimum length the caller
4673 * wants, we return EINVAL. On success, sopt->sopt_valsize
4674 * is set to however much we actually retrieved.
4675 */
4676 if ((valsize = sopt->sopt_valsize) < minlen) {
4677 return EINVAL;
4678 }
4679 if (valsize > len) {
4680 sopt->sopt_valsize = valsize = len;
4681 }
4682
4683 if (sopt->sopt_p != kernproc) {
4684 return copyin(sopt->sopt_val, buf, valsize);
4685 }
4686
4687 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4688 CAST_DOWN(caddr_t, sopt->sopt_val),
4689 valsize);
4690 bcopy(tmp, buf, valsize);
4691
4692 return 0;
4693 }
4694
4695 /*
4696 * sooptcopyin_timeval
4697 * Copy in a timeval value into tv_p, and take into account whether the
4698 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4699 * code here so that we can verify the 64-bit tv_sec value before we lose
4700 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4701 */
4702 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4703 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4704 {
4705 int error;
4706
4707 if (proc_is64bit(sopt->sopt_p)) {
4708 struct user64_timeval tv64;
4709
4710 if (sopt->sopt_valsize < sizeof(tv64)) {
4711 return EINVAL;
4712 }
4713
4714 sopt->sopt_valsize = sizeof(tv64);
4715 if (sopt->sopt_p != kernproc) {
4716 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4717 if (error != 0) {
4718 return error;
4719 }
4720 } else {
4721 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4722 CAST_DOWN(caddr_t, sopt->sopt_val),
4723 sizeof(tv64));
4724 bcopy(tmp, &tv64, sizeof(tv64));
4725 }
4726 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4727 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4728 return EDOM;
4729 }
4730
4731 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4732 tv_p->tv_usec = tv64.tv_usec;
4733 } else {
4734 struct user32_timeval tv32;
4735
4736 if (sopt->sopt_valsize < sizeof(tv32)) {
4737 return EINVAL;
4738 }
4739
4740 sopt->sopt_valsize = sizeof(tv32);
4741 if (sopt->sopt_p != kernproc) {
4742 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4743 if (error != 0) {
4744 return error;
4745 }
4746 } else {
4747 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4748 CAST_DOWN(caddr_t, sopt->sopt_val),
4749 sizeof(tv32));
4750 bcopy(tmp, &tv32, sizeof(tv32));
4751 }
4752 #ifndef __LP64__
4753 /*
4754 * K64todo "comparison is always false due to
4755 * limited range of data type"
4756 */
4757 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4758 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4759 return EDOM;
4760 }
4761 #endif
4762 tv_p->tv_sec = tv32.tv_sec;
4763 tv_p->tv_usec = tv32.tv_usec;
4764 }
4765 return 0;
4766 }
4767
4768 int
sooptcopyin_bindtodevice(struct sockopt * sopt,char * __sized_by (bufsize)buf,size_t bufsize)4769 sooptcopyin_bindtodevice(struct sockopt *sopt, char * __sized_by(bufsize) buf, size_t bufsize)
4770 {
4771 #define MIN_BINDTODEVICE_NAME_SIZE 2
4772 size_t maxlen = bufsize - 1; /* the max string length that fits in the buffer */
4773
4774 if (bufsize < MIN_BINDTODEVICE_NAME_SIZE) {
4775 #if DEBUG || DEVELOPMENT
4776 os_log(OS_LOG_DEFAULT, "%s: bufsize %lu < MIN_BINDTODEVICE_NAME_SIZE %d",
4777 __func__, bufsize, MIN_BINDTODEVICE_NAME_SIZE);
4778 #endif /* DEBUG || DEVELOPMENT */
4779 return EINVAL;
4780 }
4781
4782 memset(buf, 0, bufsize);
4783
4784 /*
4785 * bufsize includes the end-of-string because of the uncertainty wether
4786 * interface names are passed as strings or byte buffers.
4787 * If the user gives us more than the max string length return EINVAL.
4788 * On success, sopt->sopt_valsize is not modified
4789 */
4790 maxlen = bufsize - 1;
4791 if (sopt->sopt_valsize > maxlen) {
4792 os_log(OS_LOG_DEFAULT, "%s: sopt_valsize %lu > maxlen %lu",
4793 __func__, sopt->sopt_valsize, maxlen);
4794 return EINVAL;
4795 }
4796
4797 if (sopt->sopt_p != kernproc) {
4798 return copyin(sopt->sopt_val, buf, sopt->sopt_valsize);
4799 } else {
4800 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
4801 CAST_DOWN(caddr_t, sopt->sopt_val),
4802 sopt->sopt_valsize);
4803 bcopy(tmp, buf, sopt->sopt_valsize);
4804 }
4805
4806 return 0;
4807 #undef MIN_BINDTODEVICE_NAME_SIZE
4808 }
4809
4810 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4811 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4812 boolean_t ignore_delegate)
4813 {
4814 kauth_cred_t cred = NULL;
4815 proc_t ep = PROC_NULL;
4816 uid_t uid;
4817 int error = 0;
4818
4819 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4820 ep = proc_find(so->e_pid);
4821 if (ep) {
4822 cred = kauth_cred_proc_ref(ep);
4823 }
4824 }
4825
4826 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4827
4828 /* uid is 0 for root */
4829 if (uid != 0 || !allow_root) {
4830 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4831 }
4832 if (cred) {
4833 kauth_cred_unref(&cred);
4834 }
4835 if (ep != PROC_NULL) {
4836 proc_rele(ep);
4837 }
4838
4839 return error;
4840 }
4841
4842 /*
4843 * Returns: 0 Success
4844 * EINVAL
4845 * ENOPROTOOPT
4846 * ENOBUFS
4847 * EDOM
4848 * sooptcopyin:EINVAL
4849 * sooptcopyin:EFAULT
4850 * sooptcopyin_timeval:EINVAL
4851 * sooptcopyin_timeval:EFAULT
4852 * sooptcopyin_timeval:EDOM
4853 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 * <pr_ctloutput>:???
4855 * sflt_attach_private:??? [whatever a filter author chooses]
4856 * <sf_setoption>:??? [whatever a filter author chooses]
4857 *
4858 * Notes: Other <pru_listen> returns depend on the protocol family; all
4859 * <sf_listen> returns depend on what the filter author causes
4860 * their filter to return.
4861 */
4862 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4863 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4864 {
4865 int error, optval;
4866 int64_t long_optval;
4867 struct linger l;
4868 struct timeval tv;
4869
4870 if (sopt->sopt_dir != SOPT_SET) {
4871 sopt->sopt_dir = SOPT_SET;
4872 }
4873
4874 if (dolock) {
4875 socket_lock(so, 1);
4876 }
4877
4878 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4879 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4880 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4881 /* the socket has been shutdown, no more sockopt's */
4882 error = EINVAL;
4883 goto out;
4884 }
4885
4886 error = sflt_setsockopt(so, sopt);
4887 if (error != 0) {
4888 if (error == EJUSTRETURN) {
4889 error = 0;
4890 }
4891 goto out;
4892 }
4893
4894 if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
4895 if (so->so_proto != NULL &&
4896 so->so_proto->pr_ctloutput != NULL) {
4897 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4898 goto out;
4899 }
4900 error = ENOPROTOOPT;
4901 } else {
4902 /*
4903 * Allow socket-level (SOL_SOCKET) options to be filtered by
4904 * the protocol layer, if needed. A zero value returned from
4905 * the handler means use default socket-level processing as
4906 * done by the rest of this routine. Otherwise, any other
4907 * return value indicates that the option is unsupported.
4908 */
4909 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4910 pru_socheckopt(so, sopt)) != 0) {
4911 goto out;
4912 }
4913
4914 error = 0;
4915 switch (sopt->sopt_name) {
4916 case SO_LINGER:
4917 case SO_LINGER_SEC: {
4918 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4919 if (error != 0) {
4920 goto out;
4921 }
4922 /* Make sure to use sane values */
4923 if (sopt->sopt_name == SO_LINGER) {
4924 so->so_linger = (short)l.l_linger;
4925 } else {
4926 so->so_linger = (short)((long)l.l_linger * hz);
4927 }
4928 if (l.l_onoff != 0) {
4929 so->so_options |= SO_LINGER;
4930 } else {
4931 so->so_options &= ~SO_LINGER;
4932 }
4933 break;
4934 }
4935 case SO_DEBUG:
4936 case SO_KEEPALIVE:
4937 case SO_DONTROUTE:
4938 case SO_USELOOPBACK:
4939 case SO_BROADCAST:
4940 case SO_REUSEADDR:
4941 case SO_REUSEPORT:
4942 case SO_OOBINLINE:
4943 case SO_TIMESTAMP:
4944 case SO_TIMESTAMP_MONOTONIC:
4945 case SO_TIMESTAMP_CONTINUOUS:
4946 case SO_DONTTRUNC:
4947 case SO_WANTMORE:
4948 case SO_WANTOOBFLAG:
4949 case SO_NOWAKEFROMSLEEP:
4950 case SO_NOAPNFALLBK:
4951 error = sooptcopyin(sopt, &optval, sizeof(optval),
4952 sizeof(optval));
4953 if (error != 0) {
4954 goto out;
4955 }
4956 if (optval) {
4957 so->so_options |= sopt->sopt_name;
4958 } else {
4959 so->so_options &= ~sopt->sopt_name;
4960 }
4961 #if SKYWALK
4962 inp_update_netns_flags(so);
4963 #endif /* SKYWALK */
4964 break;
4965
4966 case SO_SNDBUF:
4967 case SO_RCVBUF:
4968 case SO_SNDLOWAT:
4969 case SO_RCVLOWAT:
4970 error = sooptcopyin(sopt, &optval, sizeof(optval),
4971 sizeof(optval));
4972 if (error != 0) {
4973 goto out;
4974 }
4975
4976 /*
4977 * Values < 1 make no sense for any of these
4978 * options, so disallow them.
4979 */
4980 if (optval < 1) {
4981 error = EINVAL;
4982 goto out;
4983 }
4984
4985 switch (sopt->sopt_name) {
4986 case SO_SNDBUF:
4987 case SO_RCVBUF: {
4988 struct sockbuf *sb =
4989 (sopt->sopt_name == SO_SNDBUF) ?
4990 &so->so_snd : &so->so_rcv;
4991 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4992 error = ENOBUFS;
4993 goto out;
4994 }
4995 sb->sb_flags |= SB_USRSIZE;
4996 sb->sb_flags &= ~SB_AUTOSIZE;
4997 sb->sb_idealsize = (u_int32_t)optval;
4998 break;
4999 }
5000 /*
5001 * Make sure the low-water is never greater than
5002 * the high-water.
5003 */
5004 case SO_SNDLOWAT: {
5005 int space = sbspace(&so->so_snd);
5006 uint32_t hiwat = so->so_snd.sb_hiwat;
5007
5008 if (so->so_snd.sb_flags & SB_UNIX) {
5009 struct unpcb *unp =
5010 (struct unpcb *)(so->so_pcb);
5011 if (unp != NULL &&
5012 unp->unp_conn != NULL) {
5013 struct socket *so2 = unp->unp_conn->unp_socket;
5014 hiwat += unp->unp_conn->unp_cc;
5015 space = sbspace(&so2->so_rcv);
5016 }
5017 }
5018
5019 so->so_snd.sb_lowat =
5020 (optval > hiwat) ?
5021 hiwat : optval;
5022
5023 if (space >= so->so_snd.sb_lowat) {
5024 sowwakeup(so);
5025 }
5026 break;
5027 }
5028 case SO_RCVLOWAT: {
5029 int64_t data_len;
5030 so->so_rcv.sb_lowat =
5031 (optval > so->so_rcv.sb_hiwat) ?
5032 so->so_rcv.sb_hiwat : optval;
5033 if (so->so_rcv.sb_flags & SB_UNIX) {
5034 struct unpcb *unp =
5035 (struct unpcb *)(so->so_pcb);
5036 if (unp != NULL &&
5037 unp->unp_conn != NULL) {
5038 struct socket *so2 = unp->unp_conn->unp_socket;
5039 data_len = so2->so_snd.sb_cc
5040 - so2->so_snd.sb_ctl;
5041 } else {
5042 data_len = so->so_rcv.sb_cc
5043 - so->so_rcv.sb_ctl;
5044 }
5045 } else {
5046 data_len = so->so_rcv.sb_cc
5047 - so->so_rcv.sb_ctl;
5048 }
5049
5050 if (data_len >= so->so_rcv.sb_lowat) {
5051 sorwakeup(so);
5052 }
5053 break;
5054 }
5055 }
5056 break;
5057
5058 case SO_SNDTIMEO:
5059 case SO_RCVTIMEO:
5060 error = sooptcopyin_timeval(sopt, &tv);
5061 if (error != 0) {
5062 goto out;
5063 }
5064
5065 switch (sopt->sopt_name) {
5066 case SO_SNDTIMEO:
5067 so->so_snd.sb_timeo = tv;
5068 break;
5069 case SO_RCVTIMEO:
5070 so->so_rcv.sb_timeo = tv;
5071 break;
5072 }
5073 break;
5074
5075 case SO_NKE: {
5076 struct so_nke nke;
5077
5078 error = sooptcopyin(sopt, &nke, sizeof(nke),
5079 sizeof(nke));
5080 if (error != 0) {
5081 goto out;
5082 }
5083
5084 error = sflt_attach_internal(so, nke.nke_handle);
5085 break;
5086 }
5087
5088 case SO_NOSIGPIPE:
5089 error = sooptcopyin(sopt, &optval, sizeof(optval),
5090 sizeof(optval));
5091 if (error != 0) {
5092 goto out;
5093 }
5094 if (optval != 0) {
5095 so->so_flags |= SOF_NOSIGPIPE;
5096 } else {
5097 so->so_flags &= ~SOF_NOSIGPIPE;
5098 }
5099 break;
5100
5101 case SO_NOADDRERR:
5102 error = sooptcopyin(sopt, &optval, sizeof(optval),
5103 sizeof(optval));
5104 if (error != 0) {
5105 goto out;
5106 }
5107 if (optval != 0) {
5108 so->so_flags |= SOF_NOADDRAVAIL;
5109 } else {
5110 so->so_flags &= ~SOF_NOADDRAVAIL;
5111 }
5112 break;
5113
5114 case SO_REUSESHAREUID:
5115 error = sooptcopyin(sopt, &optval, sizeof(optval),
5116 sizeof(optval));
5117 if (error != 0) {
5118 goto out;
5119 }
5120 if (optval != 0) {
5121 so->so_flags |= SOF_REUSESHAREUID;
5122 } else {
5123 so->so_flags &= ~SOF_REUSESHAREUID;
5124 }
5125 break;
5126
5127 case SO_NOTIFYCONFLICT:
5128 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5129 error = EPERM;
5130 goto out;
5131 }
5132 error = sooptcopyin(sopt, &optval, sizeof(optval),
5133 sizeof(optval));
5134 if (error != 0) {
5135 goto out;
5136 }
5137 if (optval != 0) {
5138 so->so_flags |= SOF_NOTIFYCONFLICT;
5139 } else {
5140 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5141 }
5142 break;
5143
5144 case SO_RESTRICTIONS:
5145 error = sooptcopyin(sopt, &optval, sizeof(optval),
5146 sizeof(optval));
5147 if (error != 0) {
5148 goto out;
5149 }
5150
5151 error = so_set_restrictions(so, optval);
5152 break;
5153
5154 case SO_AWDL_UNRESTRICTED:
5155 if (SOCK_DOM(so) != PF_INET &&
5156 SOCK_DOM(so) != PF_INET6) {
5157 error = EOPNOTSUPP;
5158 goto out;
5159 }
5160 error = sooptcopyin(sopt, &optval, sizeof(optval),
5161 sizeof(optval));
5162 if (error != 0) {
5163 goto out;
5164 }
5165 if (optval != 0) {
5166 error = soopt_cred_check(so,
5167 PRIV_NET_RESTRICTED_AWDL, false, false);
5168 if (error == 0) {
5169 inp_set_awdl_unrestricted(
5170 sotoinpcb(so));
5171 }
5172 } else {
5173 inp_clear_awdl_unrestricted(sotoinpcb(so));
5174 }
5175 break;
5176 case SO_INTCOPROC_ALLOW:
5177 if (SOCK_DOM(so) != PF_INET6) {
5178 error = EOPNOTSUPP;
5179 goto out;
5180 }
5181 error = sooptcopyin(sopt, &optval, sizeof(optval),
5182 sizeof(optval));
5183 if (error != 0) {
5184 goto out;
5185 }
5186 if (optval != 0 &&
5187 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5188 error = soopt_cred_check(so,
5189 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5190 if (error == 0) {
5191 inp_set_intcoproc_allowed(
5192 sotoinpcb(so));
5193 }
5194 } else if (optval == 0) {
5195 inp_clear_intcoproc_allowed(sotoinpcb(so));
5196 }
5197 break;
5198
5199 case SO_LABEL:
5200 error = EOPNOTSUPP;
5201 break;
5202
5203 case SO_UPCALLCLOSEWAIT:
5204 error = sooptcopyin(sopt, &optval, sizeof(optval),
5205 sizeof(optval));
5206 if (error != 0) {
5207 goto out;
5208 }
5209 if (optval != 0) {
5210 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5211 } else {
5212 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5213 }
5214 break;
5215
5216 case SO_RANDOMPORT:
5217 error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 sizeof(optval));
5219 if (error != 0) {
5220 goto out;
5221 }
5222 if (optval != 0) {
5223 so->so_flags |= SOF_BINDRANDOMPORT;
5224 } else {
5225 so->so_flags &= ~SOF_BINDRANDOMPORT;
5226 }
5227 break;
5228
5229 case SO_NP_EXTENSIONS: {
5230 struct so_np_extensions sonpx;
5231
5232 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5233 sizeof(sonpx));
5234 if (error != 0) {
5235 goto out;
5236 }
5237 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5238 error = EINVAL;
5239 goto out;
5240 }
5241 /*
5242 * Only one bit defined for now
5243 */
5244 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5245 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5246 so->so_flags |= SOF_NPX_SETOPTSHUT;
5247 } else {
5248 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5249 }
5250 }
5251 break;
5252 }
5253
5254 case SO_TRAFFIC_CLASS: {
5255 error = sooptcopyin(sopt, &optval, sizeof(optval),
5256 sizeof(optval));
5257 if (error != 0) {
5258 goto out;
5259 }
5260 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5261 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5262 error = so_set_net_service_type(so, netsvc);
5263 goto out;
5264 }
5265 error = so_set_traffic_class(so, optval);
5266 if (error != 0) {
5267 goto out;
5268 }
5269 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5270 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5271 break;
5272 }
5273
5274 case SO_RECV_TRAFFIC_CLASS: {
5275 error = sooptcopyin(sopt, &optval, sizeof(optval),
5276 sizeof(optval));
5277 if (error != 0) {
5278 goto out;
5279 }
5280 if (optval == 0) {
5281 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5282 } else {
5283 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5284 }
5285 break;
5286 }
5287
5288 #if (DEVELOPMENT || DEBUG)
5289 case SO_TRAFFIC_CLASS_DBG: {
5290 struct so_tcdbg so_tcdbg;
5291
5292 error = sooptcopyin(sopt, &so_tcdbg,
5293 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5294 if (error != 0) {
5295 goto out;
5296 }
5297 error = so_set_tcdbg(so, &so_tcdbg);
5298 if (error != 0) {
5299 goto out;
5300 }
5301 break;
5302 }
5303 #endif /* (DEVELOPMENT || DEBUG) */
5304
5305 case SO_PRIVILEGED_TRAFFIC_CLASS:
5306 error = priv_check_cred(kauth_cred_get(),
5307 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5308 if (error != 0) {
5309 goto out;
5310 }
5311 error = sooptcopyin(sopt, &optval, sizeof(optval),
5312 sizeof(optval));
5313 if (error != 0) {
5314 goto out;
5315 }
5316 if (optval == 0) {
5317 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5318 } else {
5319 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5320 }
5321 break;
5322
5323 #if (DEVELOPMENT || DEBUG)
5324 case SO_DEFUNCTIT:
5325 error = sosetdefunct(current_proc(), so, 0, FALSE);
5326 if (error == 0) {
5327 error = sodefunct(current_proc(), so, 0);
5328 }
5329
5330 break;
5331 #endif /* (DEVELOPMENT || DEBUG) */
5332
5333 case SO_DEFUNCTOK:
5334 error = sooptcopyin(sopt, &optval, sizeof(optval),
5335 sizeof(optval));
5336 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5337 if (error == 0) {
5338 error = EBADF;
5339 }
5340 goto out;
5341 }
5342 /*
5343 * Any process can set SO_DEFUNCTOK (clear
5344 * SOF_NODEFUNCT), but only root can clear
5345 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5346 */
5347 if (optval == 0 &&
5348 kauth_cred_issuser(kauth_cred_get()) == 0) {
5349 error = EPERM;
5350 goto out;
5351 }
5352 if (optval) {
5353 so->so_flags &= ~SOF_NODEFUNCT;
5354 } else {
5355 so->so_flags |= SOF_NODEFUNCT;
5356 }
5357
5358 if (SOCK_DOM(so) == PF_INET ||
5359 SOCK_DOM(so) == PF_INET6) {
5360 char s[MAX_IPv6_STR_LEN];
5361 char d[MAX_IPv6_STR_LEN];
5362 struct inpcb *inp = sotoinpcb(so);
5363
5364 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5365 "[%s %s:%d -> %s:%d] is now marked "
5366 "as %seligible for "
5367 "defunct\n", __func__, proc_selfpid(),
5368 proc_best_name(current_proc()),
5369 so->so_gencnt,
5370 (SOCK_TYPE(so) == SOCK_STREAM) ?
5371 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5372 ((SOCK_DOM(so) == PF_INET) ?
5373 (void *)&inp->inp_laddr.s_addr :
5374 (void *)&inp->in6p_laddr), s, sizeof(s)),
5375 ntohs(inp->in6p_lport),
5376 inet_ntop(SOCK_DOM(so),
5377 (SOCK_DOM(so) == PF_INET) ?
5378 (void *)&inp->inp_faddr.s_addr :
5379 (void *)&inp->in6p_faddr, d, sizeof(d)),
5380 ntohs(inp->in6p_fport),
5381 (so->so_flags & SOF_NODEFUNCT) ?
5382 "not " : "");
5383 } else {
5384 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5385 "is now marked as %seligible for "
5386 "defunct\n",
5387 __func__, proc_selfpid(),
5388 proc_best_name(current_proc()),
5389 so->so_gencnt,
5390 SOCK_DOM(so), SOCK_TYPE(so),
5391 (so->so_flags & SOF_NODEFUNCT) ?
5392 "not " : "");
5393 }
5394 break;
5395
5396 case SO_ISDEFUNCT:
5397 /* This option is not settable */
5398 error = EINVAL;
5399 break;
5400
5401 case SO_OPPORTUNISTIC:
5402 error = sooptcopyin(sopt, &optval, sizeof(optval),
5403 sizeof(optval));
5404 if (error == 0) {
5405 error = so_set_opportunistic(so, optval);
5406 }
5407 break;
5408
5409 case SO_FLUSH:
5410 /* This option is handled by lower layer(s) */
5411 error = 0;
5412 break;
5413
5414 case SO_RECV_ANYIF:
5415 error = sooptcopyin(sopt, &optval, sizeof(optval),
5416 sizeof(optval));
5417 if (error == 0) {
5418 error = so_set_recv_anyif(so, optval);
5419 }
5420 break;
5421
5422 case SO_TRAFFIC_MGT_BACKGROUND: {
5423 /* This option is handled by lower layer(s) */
5424 error = 0;
5425 break;
5426 }
5427
5428 #if FLOW_DIVERT
5429 case SO_FLOW_DIVERT_TOKEN:
5430 error = flow_divert_token_set(so, sopt);
5431 break;
5432 #endif /* FLOW_DIVERT */
5433
5434
5435 case SO_DELEGATED:
5436 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5437 sizeof(optval))) != 0) {
5438 break;
5439 }
5440
5441 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5442 break;
5443
5444 case SO_DELEGATED_UUID: {
5445 uuid_t euuid;
5446
5447 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5448 sizeof(euuid))) != 0) {
5449 break;
5450 }
5451
5452 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5453 break;
5454 }
5455
5456 #if NECP
5457 case SO_NECP_ATTRIBUTES:
5458 if (SOCK_DOM(so) == PF_MULTIPATH) {
5459 /* Handled by MPTCP itself */
5460 break;
5461 }
5462
5463 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5464 error = EINVAL;
5465 goto out;
5466 }
5467
5468 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5469 break;
5470
5471 case SO_NECP_CLIENTUUID: {
5472 if (SOCK_DOM(so) == PF_MULTIPATH) {
5473 /* Handled by MPTCP itself */
5474 break;
5475 }
5476
5477 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5478 error = EINVAL;
5479 goto out;
5480 }
5481
5482 struct inpcb *inp = sotoinpcb(so);
5483 if (!uuid_is_null(inp->necp_client_uuid)) {
5484 // Clear out the old client UUID if present
5485 necp_inpcb_remove_cb(inp);
5486 }
5487
5488 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5489 sizeof(uuid_t), sizeof(uuid_t));
5490 if (error != 0) {
5491 goto out;
5492 }
5493
5494 if (uuid_is_null(inp->necp_client_uuid)) {
5495 error = EINVAL;
5496 goto out;
5497 }
5498
5499 pid_t current_pid = proc_pid(current_proc());
5500 error = necp_client_register_socket_flow(current_pid,
5501 inp->necp_client_uuid, inp);
5502 if (error != 0) {
5503 uuid_clear(inp->necp_client_uuid);
5504 goto out;
5505 }
5506
5507 if (inp->inp_lport != 0) {
5508 // There is a bound local port, so this is not
5509 // a fresh socket. Assign to the client.
5510 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5511 }
5512
5513 break;
5514 }
5515 case SO_NECP_LISTENUUID: {
5516 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5517 error = EINVAL;
5518 goto out;
5519 }
5520
5521 struct inpcb *inp = sotoinpcb(so);
5522 if (!uuid_is_null(inp->necp_client_uuid)) {
5523 error = EINVAL;
5524 goto out;
5525 }
5526
5527 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5528 sizeof(uuid_t), sizeof(uuid_t));
5529 if (error != 0) {
5530 goto out;
5531 }
5532
5533 if (uuid_is_null(inp->necp_client_uuid)) {
5534 error = EINVAL;
5535 goto out;
5536 }
5537
5538 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5539 inp->necp_client_uuid, inp);
5540 if (error != 0) {
5541 uuid_clear(inp->necp_client_uuid);
5542 goto out;
5543 }
5544
5545 // Mark that the port registration is held by NECP
5546 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5547
5548 break;
5549 }
5550
5551 case SO_RESOLVER_SIGNATURE: {
5552 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5553 error = EINVAL;
5554 goto out;
5555 }
5556 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5557 break;
5558 }
5559 #endif /* NECP */
5560
5561 case SO_EXTENDED_BK_IDLE:
5562 error = sooptcopyin(sopt, &optval, sizeof(optval),
5563 sizeof(optval));
5564 if (error == 0) {
5565 error = so_set_extended_bk_idle(so, optval);
5566 }
5567 break;
5568
5569 case SO_MARK_CELLFALLBACK:
5570 error = sooptcopyin(sopt, &optval, sizeof(optval),
5571 sizeof(optval));
5572 if (error != 0) {
5573 goto out;
5574 }
5575 if (optval < 0) {
5576 error = EINVAL;
5577 goto out;
5578 }
5579 if (optval == 0) {
5580 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5581 } else {
5582 so->so_flags1 |= SOF1_CELLFALLBACK;
5583 }
5584 break;
5585
5586 case SO_MARK_CELLFALLBACK_UUID:
5587 {
5588 struct so_mark_cellfallback_uuid_args args;
5589
5590 error = sooptcopyin(sopt, &args, sizeof(args),
5591 sizeof(args));
5592 if (error != 0) {
5593 goto out;
5594 }
5595 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5596 args.flow_cellfallback);
5597 break;
5598 }
5599
5600 case SO_FALLBACK_MODE:
5601 error = sooptcopyin(sopt, &optval, sizeof(optval),
5602 sizeof(optval));
5603 if (error != 0) {
5604 goto out;
5605 }
5606 if (optval < SO_FALLBACK_MODE_NONE ||
5607 optval > SO_FALLBACK_MODE_PREFER) {
5608 error = EINVAL;
5609 goto out;
5610 }
5611 so->so_fallback_mode = (u_int8_t)optval;
5612 break;
5613
5614 case SO_MARK_KNOWN_TRACKER: {
5615 error = sooptcopyin(sopt, &optval, sizeof(optval),
5616 sizeof(optval));
5617 if (error != 0) {
5618 goto out;
5619 }
5620 if (optval < 0) {
5621 error = EINVAL;
5622 goto out;
5623 }
5624 if (optval == 0) {
5625 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5626 } else {
5627 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5628 }
5629 break;
5630 }
5631
5632 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5633 error = sooptcopyin(sopt, &optval, sizeof(optval),
5634 sizeof(optval));
5635 if (error != 0) {
5636 goto out;
5637 }
5638 if (optval < 0) {
5639 error = EINVAL;
5640 goto out;
5641 }
5642 if (optval == 0) {
5643 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5644 } else {
5645 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5646 }
5647 break;
5648 }
5649
5650 case SO_MARK_APPROVED_APP_DOMAIN: {
5651 error = sooptcopyin(sopt, &optval, sizeof(optval),
5652 sizeof(optval));
5653 if (error != 0) {
5654 goto out;
5655 }
5656 if (optval < 0) {
5657 error = EINVAL;
5658 goto out;
5659 }
5660 if (optval == 0) {
5661 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5662 } else {
5663 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5664 }
5665 break;
5666 }
5667
5668 case SO_STATISTICS_EVENT:
5669 error = sooptcopyin(sopt, &long_optval,
5670 sizeof(long_optval), sizeof(long_optval));
5671 if (error != 0) {
5672 goto out;
5673 }
5674 u_int64_t nstat_event = 0;
5675 error = so_statistics_event_to_nstat_event(
5676 &long_optval, &nstat_event);
5677 if (error != 0) {
5678 goto out;
5679 }
5680 nstat_pcb_event(sotoinpcb(so), nstat_event);
5681 break;
5682
5683 case SO_NET_SERVICE_TYPE: {
5684 error = sooptcopyin(sopt, &optval, sizeof(optval),
5685 sizeof(optval));
5686 if (error != 0) {
5687 goto out;
5688 }
5689 error = so_set_net_service_type(so, optval);
5690 break;
5691 }
5692
5693 case SO_QOSMARKING_POLICY_OVERRIDE:
5694 error = priv_check_cred(kauth_cred_get(),
5695 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5696 if (error != 0) {
5697 goto out;
5698 }
5699 error = sooptcopyin(sopt, &optval, sizeof(optval),
5700 sizeof(optval));
5701 if (error != 0) {
5702 goto out;
5703 }
5704 if (optval == 0) {
5705 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5706 } else {
5707 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5708 }
5709 break;
5710
5711 case SO_MPKL_SEND_INFO: {
5712 struct so_mpkl_send_info so_mpkl_send_info;
5713
5714 error = sooptcopyin(sopt, &so_mpkl_send_info,
5715 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5716 if (error != 0) {
5717 goto out;
5718 }
5719 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5720 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5721
5722 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5723 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5724 } else {
5725 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5726 }
5727 break;
5728 }
5729 case SO_WANT_KEV_SOCKET_CLOSED: {
5730 error = sooptcopyin(sopt, &optval, sizeof(optval),
5731 sizeof(optval));
5732 if (error != 0) {
5733 goto out;
5734 }
5735 if (optval == 0) {
5736 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5737 } else {
5738 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5739 }
5740 break;
5741 }
5742 case SO_MARK_WAKE_PKT: {
5743 error = sooptcopyin(sopt, &optval, sizeof(optval),
5744 sizeof(optval));
5745 if (error != 0) {
5746 goto out;
5747 }
5748 if (optval == 0) {
5749 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5750 } else {
5751 so->so_flags |= SOF_MARK_WAKE_PKT;
5752 }
5753 break;
5754 }
5755 case SO_RECV_WAKE_PKT: {
5756 error = sooptcopyin(sopt, &optval, sizeof(optval),
5757 sizeof(optval));
5758 if (error != 0) {
5759 goto out;
5760 }
5761 if (optval == 0) {
5762 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5763 } else {
5764 so->so_flags |= SOF_RECV_WAKE_PKT;
5765 }
5766 break;
5767 }
5768 case SO_APPLICATION_ID: {
5769 so_application_id_t application_id = { 0 };
5770
5771 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5772 error = EINVAL;
5773 goto out;
5774 }
5775 error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5776 sizeof(application_id));
5777 if (error != 0) {
5778 goto out;
5779 }
5780
5781 // The user needs to match
5782 if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5783 error = EINVAL;
5784 printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5785 goto out;
5786 }
5787 error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5788 if (error != 0) {
5789 printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5790 goto out;
5791 }
5792 if (application_id.persona_id != PERSONA_ID_NONE) {
5793 so->so_persona_id = application_id.persona_id;
5794 }
5795 break;
5796 }
5797 case SO_MARK_DOMAIN_INFO_SILENT:
5798 error = sooptcopyin(sopt, &optval, sizeof(optval),
5799 sizeof(optval));
5800 if (error != 0) {
5801 goto out;
5802 }
5803 if (optval < 0) {
5804 error = EINVAL;
5805 goto out;
5806 }
5807 if (optval == 0) {
5808 so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5809 } else {
5810 so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5811 }
5812 break;
5813
5814 default:
5815 error = ENOPROTOOPT;
5816 break;
5817 }
5818 if (error == 0 && so->so_proto != NULL &&
5819 so->so_proto->pr_ctloutput != NULL) {
5820 (void) so->so_proto->pr_ctloutput(so, sopt);
5821 }
5822 }
5823 out:
5824 if (dolock) {
5825 socket_unlock(so, 1);
5826 }
5827 return error;
5828 }
5829
5830 /* Helper routines for getsockopt */
5831 int
sooptcopyout(struct sockopt * sopt,void * __sized_by (len)buf,size_t len)5832 sooptcopyout(struct sockopt *sopt, void *__sized_by(len) buf, size_t len)
5833 {
5834 int error;
5835 size_t valsize;
5836
5837 error = 0;
5838
5839 /*
5840 * Documented get behavior is that we always return a value,
5841 * possibly truncated to fit in the user's buffer.
5842 * Traditional behavior is that we always tell the user
5843 * precisely how much we copied, rather than something useful
5844 * like the total amount we had available for her.
5845 * Note that this interface is not idempotent; the entire answer must
5846 * generated ahead of time.
5847 */
5848 valsize = MIN(len, sopt->sopt_valsize);
5849 sopt->sopt_valsize = valsize;
5850 if (sopt->sopt_valsize != 0 && sopt->sopt_val != USER_ADDR_NULL) {
5851 if (sopt->sopt_p != kernproc) {
5852 error = copyout(buf, sopt->sopt_val, valsize);
5853 } else {
5854 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5855 CAST_DOWN(caddr_t, sopt->sopt_val),
5856 valsize);
5857 bcopy(buf, tmp, valsize);
5858 }
5859 }
5860 return error;
5861 }
5862
5863 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5864 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5865 {
5866 int error;
5867 size_t len;
5868 struct user64_timeval tv64 = {};
5869 struct user32_timeval tv32 = {};
5870 const void * val;
5871 size_t valsize;
5872
5873 error = 0;
5874 if (proc_is64bit(sopt->sopt_p)) {
5875 len = sizeof(tv64);
5876 tv64.tv_sec = tv_p->tv_sec;
5877 tv64.tv_usec = tv_p->tv_usec;
5878 val = &tv64;
5879 } else {
5880 len = sizeof(tv32);
5881 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5882 tv32.tv_usec = tv_p->tv_usec;
5883 val = &tv32;
5884 }
5885 valsize = MIN(len, sopt->sopt_valsize);
5886 sopt->sopt_valsize = valsize;
5887 if (sopt->sopt_val != USER_ADDR_NULL) {
5888 if (sopt->sopt_p != kernproc) {
5889 error = copyout(val, sopt->sopt_val, valsize);
5890 } else {
5891 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
5892 CAST_DOWN(caddr_t, sopt->sopt_val),
5893 valsize);
5894 bcopy(val, tmp, valsize);
5895 }
5896 }
5897 return error;
5898 }
5899
5900 /*
5901 * Return: 0 Success
5902 * ENOPROTOOPT
5903 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5904 * <pr_ctloutput>:???
5905 * <sf_getoption>:???
5906 */
/*
 * Get a socket option.  Socket-level (SOL_SOCKET) options are handled
 * here; anything else (or SO_BINDTODEVICE) is forwarded to the
 * protocol's ctloutput handler.  When 'dolock' is non-zero the socket
 * lock is taken (and released) around the whole operation.
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	/* Normalize the direction in case the caller left it unset. */
	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Give socket filters first crack; EJUSTRETURN means "handled". */
	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	/*
	 * Non-socket-level options (and SO_BINDTODEVICE) belong to the
	 * protocol layer.
	 */
	if (sopt->sopt_level != SOL_SOCKET || sopt->sopt_name == SO_BINDTODEVICE) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			/* SO_LINGER reports ticks; SO_LINGER_SEC seconds. */
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			/* These option names double as so_options bit masks. */
			optval = so->so_options & sopt->sopt_name;
integer:
			/* common exit: copy a single int out to the caller */
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			/* Bytes of readable data (first record for atomic protos). */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				/* byte count excludes control data */
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			/* Only meaningful for record-oriented (atomic) protocols. */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			/* Reading the error clears it, per BSD convention. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			/*
			 * For connected AF_UNIX sockets, include the peer's
			 * buffered byte count in the reported send space.
			 */
			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;


		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			/* Only defined for inet/inet6 sockets. */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			/* Only defined for inet6 sockets. */
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_PEERLABEL:
			error = EOPNOTSUPP;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			/* inverted sense: "OK to defunct" == NODEFUNCT clear */
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif  /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			/* UUID lives in the mppcb for MPTCP, inpcb otherwise. */
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			/* Only valid if NECP holds the port registration. */
			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}

		case SO_RESOLVER_SIGNATURE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
			break;
		}

#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif  /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;
		case SO_MARK_CELLFALLBACK:
			/* normalize the flag bit to 0/1 */
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_FALLBACK_MODE:
			optval = so->so_fallback_mode;
			goto integer;
		case SO_MARK_KNOWN_TRACKER: {
			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_APPROVED_APP_DOMAIN: {
			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_NET_SERVICE_TYPE: {
			/* default to best-effort unless explicitly set */
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}
		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}
		case SO_MARK_WAKE_PKT:
			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
			goto integer;
		case SO_RECV_WAKE_PKT:
			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
			goto integer;
		case SO_APPLICATION_ID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			so_application_id_t application_id = { 0 };
			application_id.uid = kauth_cred_getuid(so->so_cred);
			/* prefer the effective UUID; fall back to last UUID */
			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
			application_id.persona_id = so->so_persona_id;
			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
			break;
		}
		case SO_MARK_DOMAIN_INFO_SILENT:
			optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
			    ? 1 : 0;
			goto integer;
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
6340
6341 /*
6342 * The size limits on our soopt_getm is different from that on FreeBSD.
6343 * We limit the size of options to MCLBYTES. This will have to change
6344 * if we need to define options that need more space than MCLBYTES.
6345 */
6346 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6347 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6348 {
6349 struct mbuf *m, *m_prev;
6350 int sopt_size = (int)sopt->sopt_valsize;
6351 int how;
6352
6353 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6354 return EMSGSIZE;
6355 }
6356
6357 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6358 MGET(m, how, MT_DATA);
6359 if (m == NULL) {
6360 return ENOBUFS;
6361 }
6362 if (sopt_size > MLEN) {
6363 MCLGET(m, how);
6364 if ((m->m_flags & M_EXT) == 0) {
6365 m_free(m);
6366 return ENOBUFS;
6367 }
6368 m->m_len = min(MCLBYTES, sopt_size);
6369 } else {
6370 m->m_len = min(MLEN, sopt_size);
6371 }
6372 sopt_size -= m->m_len;
6373 *mp = m;
6374 m_prev = m;
6375
6376 while (sopt_size > 0) {
6377 MGET(m, how, MT_DATA);
6378 if (m == NULL) {
6379 m_freem(*mp);
6380 return ENOBUFS;
6381 }
6382 if (sopt_size > MLEN) {
6383 MCLGET(m, how);
6384 if ((m->m_flags & M_EXT) == 0) {
6385 m_freem(*mp);
6386 m_freem(m);
6387 return ENOBUFS;
6388 }
6389 m->m_len = min(MCLBYTES, sopt_size);
6390 } else {
6391 m->m_len = min(MLEN, sopt_size);
6392 }
6393 sopt_size -= m->m_len;
6394 m_prev->m_next = m;
6395 m_prev = m;
6396 }
6397 return 0;
6398 }
6399
6400 /* copyin sopt data into mbuf chain */
6401 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6402 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6403 {
6404 struct mbuf *m0 = m;
6405
6406 if (sopt->sopt_val == USER_ADDR_NULL) {
6407 return 0;
6408 }
6409 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6410 if (sopt->sopt_p != kernproc) {
6411 int error;
6412
6413 error = copyin(sopt->sopt_val, mtod(m, char *),
6414 m->m_len);
6415 if (error != 0) {
6416 m_freem(m0);
6417 return error;
6418 }
6419 } else {
6420 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6421 CAST_DOWN(caddr_t, sopt->sopt_val),
6422 m->m_len);
6423 bcopy(tmp, mtod(m, char *), m->m_len);
6424 }
6425 sopt->sopt_valsize -= m->m_len;
6426 sopt->sopt_val += m->m_len;
6427 m = m->m_next;
6428 }
6429 /* should be allocated enoughly at ip6_sooptmcopyin() */
6430 if (m != NULL) {
6431 panic("soopt_mcopyin");
6432 /* NOTREACHED */
6433 }
6434 return 0;
6435 }
6436
6437 /* copyout mbuf chain data into soopt */
6438 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6439 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6440 {
6441 struct mbuf *m0 = m;
6442 size_t valsize = 0;
6443
6444 if (sopt->sopt_val == USER_ADDR_NULL) {
6445 return 0;
6446 }
6447 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6448 if (sopt->sopt_p != kernproc) {
6449 int error;
6450
6451 error = copyout(mtod(m, char *), sopt->sopt_val,
6452 m->m_len);
6453 if (error != 0) {
6454 m_freem(m0);
6455 return error;
6456 }
6457 } else {
6458 caddr_t tmp = __unsafe_forge_bidi_indexable(caddr_t,
6459 CAST_DOWN(caddr_t, sopt->sopt_val),
6460 m->m_len);
6461
6462 bcopy(mtod(m, char *), tmp, m->m_len);
6463 }
6464 sopt->sopt_valsize -= m->m_len;
6465 sopt->sopt_val += m->m_len;
6466 valsize += m->m_len;
6467 m = m->m_next;
6468 }
6469 if (m != NULL) {
6470 /* enough soopt buffer should be given from user-land */
6471 m_freem(m0);
6472 return EINVAL;
6473 }
6474 sopt->sopt_valsize = valsize;
6475 return 0;
6476 }
6477
6478 void
sohasoutofband(struct socket * so)6479 sohasoutofband(struct socket *so)
6480 {
6481 if (so->so_pgid < 0) {
6482 gsignal(-so->so_pgid, SIGURG);
6483 } else if (so->so_pgid > 0) {
6484 proc_signal(so->so_pgid, SIGURG);
6485 }
6486 selwakeup(&so->so_rcv.sb_sel);
6487 if (so->so_rcv.sb_flags & SB_KNOTE) {
6488 KNOTE(&so->so_rcv.sb_sel.si_note,
6489 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6490 }
6491 }
6492
6493 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6494 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6495 {
6496 #pragma unused(cred)
6497 struct proc *p = current_proc();
6498 int revents = 0;
6499
6500 socket_lock(so, 1);
6501 so_update_last_owner_locked(so, PROC_NULL);
6502 so_update_policy(so);
6503
6504 if (events & (POLLIN | POLLRDNORM)) {
6505 if (soreadable(so)) {
6506 revents |= events & (POLLIN | POLLRDNORM);
6507 }
6508 }
6509
6510 if (events & (POLLOUT | POLLWRNORM)) {
6511 if (sowriteable(so)) {
6512 revents |= events & (POLLOUT | POLLWRNORM);
6513 }
6514 }
6515
6516 if (events & (POLLPRI | POLLRDBAND)) {
6517 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6518 revents |= events & (POLLPRI | POLLRDBAND);
6519 }
6520 }
6521
6522 if (revents == 0) {
6523 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6524 /*
6525 * Darwin sets the flag first,
6526 * BSD calls selrecord first
6527 */
6528 so->so_rcv.sb_flags |= SB_SEL;
6529 selrecord(p, &so->so_rcv.sb_sel, wql);
6530 }
6531
6532 if (events & (POLLOUT | POLLWRNORM)) {
6533 /*
6534 * Darwin sets the flag first,
6535 * BSD calls selrecord first
6536 */
6537 so->so_snd.sb_flags |= SB_SEL;
6538 selrecord(p, &so->so_snd.sb_sel, wql);
6539 }
6540 }
6541
6542 socket_unlock(so, 1);
6543 return revents;
6544 }
6545
6546 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6547 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6548 {
6549 struct socket *so = (struct socket *)fp_get_data(fp);
6550 int result;
6551
6552 socket_lock(so, 1);
6553 so_update_last_owner_locked(so, PROC_NULL);
6554 so_update_policy(so);
6555
6556 switch (kn->kn_filter) {
6557 case EVFILT_READ:
6558 kn->kn_filtid = EVFILTID_SOREAD;
6559 break;
6560 case EVFILT_WRITE:
6561 kn->kn_filtid = EVFILTID_SOWRITE;
6562 break;
6563 case EVFILT_SOCK:
6564 kn->kn_filtid = EVFILTID_SCK;
6565 break;
6566 case EVFILT_EXCEPT:
6567 kn->kn_filtid = EVFILTID_SOEXCEPT;
6568 break;
6569 default:
6570 socket_unlock(so, 1);
6571 knote_set_error(kn, EINVAL);
6572 return 0;
6573 }
6574
6575 /*
6576 * call the appropriate sub-filter attach
6577 * with the socket still locked
6578 */
6579 result = knote_fops(kn)->f_attach(kn, kev);
6580
6581 socket_unlock(so, 1);
6582
6583 return result;
6584 }
6585
/*
 * Shared evaluation logic for the socket read filter.  Returns non-zero
 * if the knote should fire.  When 'kev' is non-NULL and the event fires,
 * the event data (bytes readable, or listen backlog length) is filled
 * into it.  Called with the socket locked.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* listener: ready when the completed-connection queue is non-empty */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		/* OOB pending fires immediately; data excludes bytes up to the mark */
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	/* EOF: no more data will arrive (content filter may still hold some) */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6664
6665 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6666 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6667 {
6668 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6669
6670 /* socket locked */
6671
6672 /*
6673 * If the caller explicitly asked for OOB results (e.g. poll())
6674 * from EVFILT_READ, then save that off in the hookid field
6675 * and reserve the kn_flags EV_OOBAND bit for output only.
6676 */
6677 if (kn->kn_filter == EVFILT_READ &&
6678 kn->kn_flags & EV_OOBAND) {
6679 kn->kn_flags &= ~EV_OOBAND;
6680 kn->kn_hook32 = EV_OOBAND;
6681 } else {
6682 kn->kn_hook32 = 0;
6683 }
6684 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6685 so->so_rcv.sb_flags |= SB_KNOTE;
6686 }
6687
6688 /* indicate if event is already fired */
6689 return filt_soread_common(kn, NULL, so);
6690 }
6691
6692 static void
filt_sordetach(struct knote * kn)6693 filt_sordetach(struct knote *kn)
6694 {
6695 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6696
6697 socket_lock(so, 1);
6698 if (so->so_rcv.sb_flags & SB_KNOTE) {
6699 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6700 so->so_rcv.sb_flags &= ~SB_KNOTE;
6701 }
6702 }
6703 socket_unlock(so, 1);
6704 }
6705
6706 /*ARGSUSED*/
6707 static int
filt_soread(struct knote * kn,long hint)6708 filt_soread(struct knote *kn, long hint)
6709 {
6710 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6711 int retval;
6712
6713 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6714 socket_lock(so, 1);
6715 }
6716
6717 retval = filt_soread_common(kn, NULL, so);
6718
6719 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6720 socket_unlock(so, 1);
6721 }
6722
6723 return retval;
6724 }
6725
6726 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6727 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6728 {
6729 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6730 int retval;
6731
6732 socket_lock(so, 1);
6733
6734 /* save off the new input fflags and data */
6735 kn->kn_sfflags = kev->fflags;
6736 kn->kn_sdata = kev->data;
6737
6738 /* determine if changes result in fired events */
6739 retval = filt_soread_common(kn, NULL, so);
6740
6741 socket_unlock(so, 1);
6742
6743 return retval;
6744 }
6745
6746 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6747 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6748 {
6749 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6750 int retval;
6751
6752 socket_lock(so, 1);
6753 retval = filt_soread_common(kn, kev, so);
6754 socket_unlock(so, 1);
6755
6756 return retval;
6757 }
6758
6759 int
so_wait_for_if_feedback(struct socket * so)6760 so_wait_for_if_feedback(struct socket *so)
6761 {
6762 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6763 (so->so_state & SS_ISCONNECTED)) {
6764 struct inpcb *inp = sotoinpcb(so);
6765 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6766 return 1;
6767 }
6768 }
6769 return 0;
6770 }
6771
/*
 * Shared EVFILT_WRITE predicate for sockets: returns non-zero when the
 * socket is writable or has an EOF/error to report.  Called with the
 * socket locked; when `kev` is non-NULL and the event fired, the kevent
 * is filled in with `data` (available send-buffer space).
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);	/* free space in the send buffer */

	/* Half-closed for writing: report EOF plus any pending error. */
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	/* Not currently able to send (e.g. not yet connected). */
	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	/* Pre-connect data is always accepted, so report writable. */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer.  In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* Clamp a NOTE_LOWAT request to the send buffer's high-water mark. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/*
			 * With the not-sent low-water option set, defer to
			 * the transport's own unsent-data threshold check.
			 */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress writability while waiting for interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6883
6884 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6885 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6886 {
6887 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6888
6889 /* socket locked */
6890 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6891 so->so_snd.sb_flags |= SB_KNOTE;
6892 }
6893
6894 /* determine if its already fired */
6895 return filt_sowrite_common(kn, NULL, so);
6896 }
6897
6898 static void
filt_sowdetach(struct knote * kn)6899 filt_sowdetach(struct knote *kn)
6900 {
6901 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6902 socket_lock(so, 1);
6903
6904 if (so->so_snd.sb_flags & SB_KNOTE) {
6905 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6906 so->so_snd.sb_flags &= ~SB_KNOTE;
6907 }
6908 }
6909 socket_unlock(so, 1);
6910 }
6911
6912 /*ARGSUSED*/
6913 static int
filt_sowrite(struct knote * kn,long hint)6914 filt_sowrite(struct knote *kn, long hint)
6915 {
6916 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6917 int ret;
6918
6919 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6920 socket_lock(so, 1);
6921 }
6922
6923 ret = filt_sowrite_common(kn, NULL, so);
6924
6925 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6926 socket_unlock(so, 1);
6927 }
6928
6929 return ret;
6930 }
6931
6932 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6933 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6934 {
6935 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6936 int ret;
6937
6938 socket_lock(so, 1);
6939
6940 /*save off the new input fflags and data */
6941 kn->kn_sfflags = kev->fflags;
6942 kn->kn_sdata = kev->data;
6943
6944 /* determine if these changes result in a triggered event */
6945 ret = filt_sowrite_common(kn, NULL, so);
6946
6947 socket_unlock(so, 1);
6948
6949 return ret;
6950 }
6951
6952 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6953 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6954 {
6955 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6956 int ret;
6957
6958 socket_lock(so, 1);
6959 ret = filt_sowrite_common(kn, kev, so);
6960 socket_unlock(so, 1);
6961
6962 return ret;
6963 }
6964
/*
 * Shared EVFILT_SOCK predicate: folds the `ev_hint` bits and current
 * socket state into kn_fflags, tracks level-triggered events that were
 * already delivered (in kn_hook32) so they are not re-delivered, and
 * returns non-zero when there is something new to report.  Called with
 * the socket locked; fills in `kev` (when non-NULL) on delivery.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;	/* events that stay true while the state holds */

	/* Edge-triggered hints: latch each one into kn_fflags. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* Level-triggered: hint OR current socket state can assert these. */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	/* Only protocols that opt in (PR_EVCONNINFO) report conninfo updates. */
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	/*
	 * Read side is closed only when no content-filter data is still
	 * pending delivery to the application.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* SUSPEND and RESUME are mutually exclusive; latest one wins. */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* A socket error always fires; deliver the errno as `data`. */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7105
7106 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7107 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7108 {
7109 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7110
7111 /* socket locked */
7112 kn->kn_hook32 = 0;
7113 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7114 so->so_flags |= SOF_KNOTE;
7115 }
7116
7117 /* determine if event already fired */
7118 return filt_sockev_common(kn, NULL, so, 0);
7119 }
7120
7121 static void
filt_sockdetach(struct knote * kn)7122 filt_sockdetach(struct knote *kn)
7123 {
7124 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7125 socket_lock(so, 1);
7126
7127 if ((so->so_flags & SOF_KNOTE) != 0) {
7128 if (KNOTE_DETACH(&so->so_klist, kn)) {
7129 so->so_flags &= ~SOF_KNOTE;
7130 }
7131 }
7132 socket_unlock(so, 1);
7133 }
7134
7135 static int
filt_sockev(struct knote * kn,long hint)7136 filt_sockev(struct knote *kn, long hint)
7137 {
7138 int ret = 0, locked = 0;
7139 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7140 long ev_hint = (hint & SO_FILT_HINT_EV);
7141
7142 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7143 socket_lock(so, 1);
7144 locked = 1;
7145 }
7146
7147 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7148
7149 if (locked) {
7150 socket_unlock(so, 1);
7151 }
7152
7153 return ret;
7154 }
7155
7156
7157
7158 /*
7159 * filt_socktouch - update event state
7160 */
7161 static int
filt_socktouch(struct knote * kn,struct kevent_qos_s * kev)7162 filt_socktouch(
7163 struct knote *kn,
7164 struct kevent_qos_s *kev)
7165 {
7166 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7167 uint32_t changed_flags;
7168 int ret;
7169
7170 socket_lock(so, 1);
7171
7172 /* save off the [result] data and fflags */
7173 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7174
7175 /* save off the new input fflags and data */
7176 kn->kn_sfflags = kev->fflags;
7177 kn->kn_sdata = kev->data;
7178
7179 /* restrict the current results to the (smaller?) set of new interest */
7180 /*
7181 * For compatibility with previous implementations, we leave kn_fflags
7182 * as they were before.
7183 */
7184 //kn->kn_fflags &= kev->fflags;
7185
7186 /*
7187 * Since we keep track of events that are already
7188 * delivered, if any of those events are not requested
7189 * anymore the state related to them can be reset
7190 */
7191 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7192
7193 /* determine if we have events to deliver */
7194 ret = filt_sockev_common(kn, NULL, so, 0);
7195
7196 socket_unlock(so, 1);
7197
7198 return ret;
7199 }
7200
7201 /*
7202 * filt_sockprocess - query event fired state and return data
7203 */
7204 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7205 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7206 {
7207 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7208 int ret = 0;
7209
7210 socket_lock(so, 1);
7211
7212 ret = filt_sockev_common(kn, kev, so, 0);
7213
7214 socket_unlock(so, 1);
7215
7216 return ret;
7217 }
7218
7219 void
get_sockev_state(struct socket * so,u_int32_t * statep)7220 get_sockev_state(struct socket *so, u_int32_t *statep)
7221 {
7222 u_int32_t state = *(statep);
7223
7224 /*
7225 * If the state variable is already used by a previous event,
7226 * reset it.
7227 */
7228 if (state != 0) {
7229 return;
7230 }
7231
7232 if (so->so_state & SS_ISCONNECTED) {
7233 state |= SOCKEV_CONNECTED;
7234 } else {
7235 state &= ~(SOCKEV_CONNECTED);
7236 }
7237 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7238 *(statep) = state;
7239 }
7240
7241 #define SO_LOCK_HISTORY_STR_LEN \
7242 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7243
/*
 * Render the socket's circular lock/unlock return-address history as a
 * "lock:unlock " pair per slot, most recent first.
 *
 * NOTE(review): returns a pointer to a static buffer — not safe for
 * concurrent callers; intended for panic/debug paths.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		/* scnprintf returns chars written, so `n` tracks the tail. */
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return __unsafe_null_terminated_from_indexable(lock_history_str);
}
7260
7261 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7262 socket_getlock(struct socket *so, int flags)
7263 {
7264 if (so->so_proto->pr_getlock != NULL) {
7265 return (*so->so_proto->pr_getlock)(so, flags);
7266 } else {
7267 return so->so_proto->pr_domain->dom_mtx;
7268 }
7269 }
7270
/*
 * Lock a socket, optionally taking a use-count reference.  Protocols
 * with their own pr_lock handler get full control; otherwise the
 * domain-wide mutex is used and the caller's return address is logged
 * in the socket's lock history for debugging.
 */
void
socket_lock(struct socket *so, int refcount)
{
	/* Capture the caller's return address for lock-history debugging. */
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		/* Record this acquisition in the circular lock history. */
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7291
7292 void
socket_lock_assert_owned(struct socket * so)7293 socket_lock_assert_owned(struct socket *so)
7294 {
7295 lck_mtx_t *mutex_held;
7296
7297 if (so->so_proto->pr_getlock != NULL) {
7298 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7299 } else {
7300 mutex_held = so->so_proto->pr_domain->dom_mtx;
7301 }
7302
7303 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7304 }
7305
7306 int
socket_try_lock(struct socket * so)7307 socket_try_lock(struct socket *so)
7308 {
7309 lck_mtx_t *mtx;
7310
7311 if (so->so_proto->pr_getlock != NULL) {
7312 mtx = (*so->so_proto->pr_getlock)(so, 0);
7313 } else {
7314 mtx = so->so_proto->pr_domain->dom_mtx;
7315 }
7316
7317 return lck_mtx_try_lock(mtx);
7318 }
7319
/*
 * Unlock a socket, optionally dropping a use-count reference.  When the
 * last reference is dropped on the domain-mutex path, the socket's final
 * cleanup (sofreelastref) runs before the mutex is released.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	lck_mtx_t *mutex_held;
	/* Capture the caller's return address for lock-history debugging. */
	void *__single lr_saved = __unsafe_forge_single(void *, __builtin_return_address(0));

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* Record this release in the circular unlock history. */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* Underflow means a lock/unlock imbalance somewhere. */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				/* Last reference: tear the socket down. */
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7358
7359 /* Called with socket locked, will unlock socket */
7360 void
sofree(struct socket * so)7361 sofree(struct socket *so)
7362 {
7363 lck_mtx_t *mutex_held;
7364
7365 if (so->so_proto->pr_getlock != NULL) {
7366 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7367 } else {
7368 mutex_held = so->so_proto->pr_domain->dom_mtx;
7369 }
7370 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7371
7372 sofreelastref(so, 0);
7373 }
7374
/* Take one use-count reference on the socket (lock is not kept held). */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7381
/*
 * Drop one use-count reference on the socket; may free it if this was
 * the last reference (see socket_unlock).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);     /* lock only */
	socket_unlock(so, 1);   /* unlock & drop one reference */
}
7388
7389 /*
7390 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7391 * possibility of using jumbo clusters. Caller must ensure to hold
7392 * the socket lock.
7393 */
7394 void
somultipages(struct socket * so,boolean_t set)7395 somultipages(struct socket *so, boolean_t set)
7396 {
7397 if (set) {
7398 so->so_flags |= SOF_MULTIPAGES;
7399 } else {
7400 so->so_flags &= ~SOF_MULTIPAGES;
7401 }
7402 }
7403
7404 void
soif2kcl(struct socket * so,boolean_t set)7405 soif2kcl(struct socket *so, boolean_t set)
7406 {
7407 if (set) {
7408 so->so_flags1 |= SOF1_IF_2KCL;
7409 } else {
7410 so->so_flags1 &= ~SOF1_IF_2KCL;
7411 }
7412 }
7413
7414 int
so_isdstlocal(struct socket * so)7415 so_isdstlocal(struct socket *so)
7416 {
7417 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7418
7419 if (SOCK_DOM(so) == PF_INET) {
7420 return inaddr_local(inp->inp_faddr);
7421 } else if (SOCK_DOM(so) == PF_INET6) {
7422 return in6addr_local(&inp->in6p_faddr);
7423 }
7424
7425 return 0;
7426 }
7427
/*
 * Mark a socket as defunct-eligible: set SOF_DEFUNCT, put both socket
 * buffers into SB_DROP mode and flush any queued data.  The actual
 * protocol-level teardown happens later in sodefunct().  Returns 0 on
 * success, or EOPNOTSUPP when the socket is protected (SOF_NODEFUNCT
 * with noforce, or an extended-background-idle deferral was arranged).
 * Called with the socket locked.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both buffers must already be in drop mode. */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		/* Protected socket: honor it unless the caller forces. */
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * Socket opted into extended background idle: it may be
		 * allowed to linger instead of being defuncted now.
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* Defer: start the extended-idle window instead. */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7537
/*
 * Perform the actual defunct teardown of a socket previously marked by
 * sosetdefunct(): notify the protocol, unwedge blocked threads, shut
 * down both directions, disconnect, and release all buffered data.
 * Idempotent once SS_DEFUNCT is set.  Called with the socket locked.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	/* sosetdefunct() must have run first. */
	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* Release any data still queued in either buffer. */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7650
/*
 * Take a socket out of the extended-background-idle in-progress state
 * (if it is in it) and update the per-process flag and global stats.
 * `locked` indicates whether the caller already holds the socket lock.
 * Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		/* Clear the in-progress state and its start timestamp. */
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7680
7681 /*
7682 * Does not attempt to account for sockets that are delegated from
7683 * the current process
7684 */
7685 int
so_set_extended_bk_idle(struct socket * so,int optval)7686 so_set_extended_bk_idle(struct socket *so, int optval)
7687 {
7688 int error = 0;
7689
7690 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7691 SOCK_PROTO(so) != IPPROTO_TCP) {
7692 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7693 error = EOPNOTSUPP;
7694 } else if (optval == 0) {
7695 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7696
7697 soresume(current_proc(), so, 1);
7698 } else {
7699 struct proc *p = current_proc();
7700 struct fileproc *fp;
7701 int count = 0;
7702
7703 /*
7704 * Unlock socket to avoid lock ordering issue with
7705 * the proc fd table lock
7706 */
7707 socket_unlock(so, 0);
7708
7709 proc_fdlock(p);
7710 fdt_foreach(fp, p) {
7711 struct socket *so2;
7712
7713 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7714 continue;
7715 }
7716
7717 so2 = (struct socket *)fp_get_data(fp);
7718 if (so != so2 &&
7719 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7720 count++;
7721 }
7722 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7723 break;
7724 }
7725 }
7726 proc_fdunlock(p);
7727
7728 socket_lock(so, 0);
7729
7730 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7731 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7732 error = EBUSY;
7733 } else if (so->so_flags & SOF_DELEGATED) {
7734 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7735 error = EBUSY;
7736 } else {
7737 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7738 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7739 }
7740 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7741 "%s marked for extended bk idle\n",
7742 __func__, proc_selfpid(), proc_best_name(current_proc()),
7743 so->so_gencnt,
7744 SOCK_DOM(so), SOCK_TYPE(so),
7745 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7746 "is" : "not");
7747 }
7748
7749 return error;
7750 }
7751
/*
 * Terminate a socket's extended-background-idle grace period and force
 * it defunct.  Called with the socket locked.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	/* Clear the in-progress state and its start timestamp. */
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7770
7771 void
so_drain_extended_bk_idle(struct socket * so)7772 so_drain_extended_bk_idle(struct socket *so)
7773 {
7774 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7775 /*
7776 * Only penalize sockets that have outstanding data
7777 */
7778 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7779 so_stop_extended_bk_idle(so);
7780
7781 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7782 }
7783 }
7784 }
7785
7786 /*
7787 * Return values tells if socket is still in extended background idle
7788 */
7789 int
so_check_extended_bk_idle_time(struct socket * so)7790 so_check_extended_bk_idle_time(struct socket *so)
7791 {
7792 int ret = 1;
7793
7794 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7795 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7796 __func__, proc_selfpid(), proc_best_name(current_proc()),
7797 so->so_gencnt,
7798 SOCK_DOM(so), SOCK_TYPE(so));
7799 if (net_uptime() - so->so_extended_bk_start >
7800 soextbkidlestat.so_xbkidle_time) {
7801 so_stop_extended_bk_idle(so);
7802
7803 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7804
7805 ret = 0;
7806 } else {
7807 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7808
7809 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7810 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7811 }
7812 }
7813
7814 return ret;
7815 }
7816
7817 void
resume_proc_sockets(proc_t p)7818 resume_proc_sockets(proc_t p)
7819 {
7820 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7821 struct fileproc *fp;
7822 struct socket *so;
7823
7824 proc_fdlock(p);
7825 fdt_foreach(fp, p) {
7826 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7827 continue;
7828 }
7829
7830 so = (struct socket *)fp_get_data(fp);
7831 (void) soresume(p, so, 0);
7832 }
7833 proc_fdunlock(p);
7834
7835 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7836 }
7837 }
7838
7839 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7840 so_set_recv_anyif(struct socket *so, int optval)
7841 {
7842 int ret = 0;
7843
7844 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7845 if (optval) {
7846 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7847 } else {
7848 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7849 }
7850 #if SKYWALK
7851 inp_update_netns_flags(so);
7852 #endif /* SKYWALK */
7853 }
7854
7855
7856 return ret;
7857 }
7858
7859 __private_extern__ int
so_get_recv_anyif(struct socket * so)7860 so_get_recv_anyif(struct socket *so)
7861 {
7862 int ret = 0;
7863
7864 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7865 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7866 }
7867
7868 return ret;
7869 }
7870
7871 int
so_set_restrictions(struct socket * so,uint32_t vals)7872 so_set_restrictions(struct socket *so, uint32_t vals)
7873 {
7874 int nocell_old, nocell_new;
7875 int noexpensive_old, noexpensive_new;
7876 int noconstrained_old, noconstrained_new;
7877
7878 /*
7879 * Deny-type restrictions are trapdoors; once set they cannot be
7880 * unset for the lifetime of the socket. This allows them to be
7881 * issued by a framework on behalf of the application without
7882 * having to worry that they can be undone.
7883 *
7884 * Note here that socket-level restrictions overrides any protocol
7885 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
7886 * socket restriction issued on the socket has a higher precendence
7887 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7888 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7889 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7890 */
7891 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7892 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7893 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7894 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7895 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7896 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7897 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7898 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7899 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7900
7901 /* we can only set, not clear restrictions */
7902 if ((nocell_new - nocell_old) == 0 &&
7903 (noexpensive_new - noexpensive_old) == 0 &&
7904 (noconstrained_new - noconstrained_old) == 0) {
7905 return 0;
7906 }
7907 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7908 if (nocell_new - nocell_old != 0) {
7909 /*
7910 * if deny cellular is now set, do what's needed
7911 * for INPCB
7912 */
7913 inp_set_nocellular(sotoinpcb(so));
7914 }
7915 if (noexpensive_new - noexpensive_old != 0) {
7916 inp_set_noexpensive(sotoinpcb(so));
7917 }
7918 if (noconstrained_new - noconstrained_old != 0) {
7919 inp_set_noconstrained(sotoinpcb(so));
7920 }
7921 }
7922
7923 if (SOCK_DOM(so) == PF_MULTIPATH) {
7924 mptcp_set_restrictions(so);
7925 }
7926
7927 return 0;
7928 }
7929
7930 uint32_t
so_get_restrictions(struct socket * so)7931 so_get_restrictions(struct socket *so)
7932 {
7933 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7934 SO_RESTRICT_DENY_OUT |
7935 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7936 }
7937
/*
 * Associate an "effective" (delegated) pid with the socket for network
 * policy accounting.  On success the socket carries the delegate's
 * {upid, pid, uuid} and SOF_DELEGATED, and its NECP/policy state is
 * refreshed.  Returns 0 or EINVAL/EACCES/ESRCH.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with `||` the privilege check runs unless epid
	 * matches BOTH last_pid and proc_pid(p), which is stricter than
	 * the "either" reading of the comment above — confirm intended.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		/* Self-delegation clears any existing delegate state */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		/* Record the delegate's identity on the socket */
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* Track the delegate's responsible process, if distinct */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				/* Responsible proc gone: mark as unknown */
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	/* Let the protocol refresh its cached owner bookkeeping */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Sign-flip presumably invalidates the cached policy
		 * generation so so_update_policy() re-evaluates — TODO
		 * confirm against so_update_policy()'s comparison. */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	/* Drop the proc_find() reference taken above, if any */
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
8053
/*
 * Associate an "effective" (delegated) executable UUID with the socket.
 * Unlike so_set_effective_pid(), only the UUID is known here, so on
 * delegation the socket inherits its real {pid, upid}.  Returns 0 or
 * EINVAL/EACCES.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with `||` the privilege check runs unless euuid
	 * matches BOTH last_uuid and the issuer's uuid — stricter than
	 * the "either" reading of the comment; mirrors the pid variant.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		/* Self-delegation clears any existing delegate state */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Sign-flip presumably forces a policy re-evaluation —
		 * TODO confirm, same pattern as so_set_effective_pid(). */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8157
8158 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8159 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8160 uint32_t ev_datalen)
8161 {
8162 struct kev_msg ev_msg;
8163
8164 /*
8165 * A netpolicy event always starts with a netpolicy_event_data
8166 * structure, but the caller can provide for a longer event
8167 * structure to post, depending on the event code.
8168 */
8169 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8170
8171 bzero(&ev_msg, sizeof(ev_msg));
8172 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8173 ev_msg.kev_class = KEV_NETWORK_CLASS;
8174 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8175 ev_msg.event_code = ev_code;
8176
8177 ev_msg.dv[0].data_ptr = ev_data;
8178 ev_msg.dv[0].data_length = ev_datalen;
8179
8180 kev_post_msg(&ev_msg);
8181 }
8182
8183 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8184 socket_post_kev_msg(uint32_t ev_code,
8185 struct kev_socket_event_data *ev_data,
8186 uint32_t ev_datalen)
8187 {
8188 struct kev_msg ev_msg;
8189
8190 bzero(&ev_msg, sizeof(ev_msg));
8191 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8192 ev_msg.kev_class = KEV_NETWORK_CLASS;
8193 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8194 ev_msg.event_code = ev_code;
8195
8196 ev_msg.dv[0].data_ptr = ev_data;
8197 ev_msg.dv[0].data_length = ev_datalen;
8198
8199 kev_post_msg(&ev_msg);
8200 }
8201
void
socket_post_kev_msg_closed(struct socket *so)
{
	/*
	 * Post a KEV_SOCKET_CLOSED event carrying the socket's local and
	 * peer addresses, for sockets that opted in via
	 * SOF1_WANT_KEV_SOCK_CLOSED.  The event is only posted when both
	 * addresses can be obtained; errors are silently dropped
	 * (best-effort notification).
	 */
	struct kev_socket_closed ev = {};
	struct sockaddr *__single socksa = NULL, *__single peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* Copy at most the event field size; longer
			 * addresses are truncated. */
			SOCKADDR_COPY(socksa, &ev.ev_data.kev_sockname,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			SOCKADDR_COPY(peersa, &ev.ev_data.kev_peername,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* pru_sockaddr/pru_peeraddr allocate; release in all cases */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8230
/*
 * Assertion-failure sink: panic with the failed expression text, file
 * and line.  Declared noreturn; the int return type exists only so the
 * function can be used in callers' expression contexts.
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8239