/*
 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <sys/persona.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <os/log.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
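/*
 * ROUNDUP(a, b) rounds 'a' up to the next multiple of 'b'; 'b' must be a
 * power of two for the mask arithmetic to work. For example,
 * ROUNDUP(29, 16) == 32 and ROUNDUP(32, 16) == 32.
 */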

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

#if (DEBUG || DEVELOPMENT)
#define DEFAULT_SOSEND_ASSERT_PANIC 1
#else
#define DEFAULT_SOSEND_ASSERT_PANIC 0
#endif /* (DEBUG || DEVELOPMENT) */

int sosend_assert_panic = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;             /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
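/*
 * This knob surfaces as the "kern.ipc.somaxconn" sysctl. For example, a
 * server expecting heavy accept traffic might raise it at runtime with:
 *
 *	sysctl -w kern.ipc.somaxconn=2048
 *
 * solisten() below clamps the listen(2) backlog to this value.
 */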

/* Should we also enforce a maximum? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending packets that cross physical pages to broken
 * drivers (those that falsely assume that the physical pages are
 * contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption. When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in a header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum number of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
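	/*
	 * The cached element is carved up by cached_sock_alloc() below;
	 * each 4-byte pad leaves room for ALIGN()ing the next structure
	 * to a longword boundary:
	 *
	 *	[ struct socket | pad | inpcb block | pad | tcpcb block ]
	 */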

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}

static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that.
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to clean up */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate the socket
 * and the PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, how);
	} else {
		so = zalloc_flags(socket_zone, how | Z_ZERO);
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

		/*
		 * Increment the socket allocation statistics.
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
	pid_t rpid = -1;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->so_family = prp->pr_domain->dom_family;
	so->so_protocol = prp->pr_protocol;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
			so->so_rpid = rpid;
			proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
		}
	}

	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
		so->so_rpid = rpid;
		proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per-pcb lock if necessary and
	 * increase the refcount for creation; make sure it's done before
	 * the socket is inserted in the lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free
		 * the socket.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach.
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * sockets eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (cf. socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorization error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}
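
/*
 * A minimal in-kernel usage sketch (assuming the caller runs in process
 * context and disposes of the socket with soclose() when done):
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket ...
 *		soclose(so);
 *	}
 */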

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK.
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}
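	/*
	 * Example: with the default somaxconn of 128 (SOMAXCONN), both
	 * listen(fd, -1) and listen(fd, 4096) end up with so_qlimit == 128.
	 */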

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
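/*
 * A sketch of the preflight pattern described above (a hypothetical caller
 * that holds the listener's socket lock and wants to drain so_comp):
 *
 *	if (!TAILQ_EMPTY(&head->so_comp)) {          preflight, no list lock
 *		so_acquire_accept_list(head, NULL);
 *		if (!TAILQ_EMPTY(&head->so_comp)) {  re-check before acting
 *			... dequeue from head->so_comp ...
 *		}
 *		so_release_accept_list(head);
 *	}
 */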
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per-socket locks.
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done.
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues.
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list.
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * Skip sockets thrown away by tcp_dropdropablreq();
			 * they will be cleaned up by garbage collection.
			 * Otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above.
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG || DEVELOPMENT)
			panic("%s head %p so_incomp not empty", __func__, so);
#endif /* (DEBUG || DEVELOPMENT) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG || DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEBUG || DEVELOPMENT) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}

	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
				lck_mtx_t *mutex_held;

				if (so->so_proto->pr_getlock != NULL) {
					mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
				} else {
					mutex_held = so->so_proto->pr_domain->dom_mtx;
				}
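				/*
				 * so_linger is expressed in hundredths of a
				 * second here (clock ticks at hz == 100), so
				 * the split below converts it to a timespec:
				 * e.g. so_linger == 250 yields 2 seconds plus
				 * 50 * 10 ms == 500 ms.
				 */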
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK if the timer fires;
					 * don't report an error.
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away, but the socket is
		 * retained in the kernel, remove its reference.
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away.
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication of the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if the destination
		 * address is associated with a domain name and if it is a
		 * tracker domain. Mark the socket accordingly. Skip the
		 * lookup if the socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes\n");
				}
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return soconnectlock(so, nam, 1);
}

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1787
1788 int
soconnectxlocked(struct socket * so,struct sockaddr * src,struct sockaddr * dst,struct proc * p,uint32_t ifscope,sae_associd_t aid,sae_connid_t * pcid,uint32_t flags,void * arg,uint32_t arglen,uio_t auio,user_ssize_t * bytes_written)1789 soconnectxlocked(struct socket *so, struct sockaddr *src,
1790 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1791 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1792 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1793 {
1794 int error;
1795 tracker_metadata_t metadata = { };
1796
1797 so_update_last_owner_locked(so, p);
1798 so_update_policy(so);
1799
1800 /*
1801 * If this is a listening socket or if this is a previously-accepted
1802 * socket that has been marked as inactive, reject the connect request.
1803 */
1804 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1805 error = EOPNOTSUPP;
1806 if (so->so_flags & SOF_DEFUNCT) {
1807 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1808 "(%d)\n", __func__, proc_pid(p),
1809 proc_best_name(p),
1810 so->so_gencnt,
1811 SOCK_DOM(so), SOCK_TYPE(so), error);
1812 }
1813 return error;
1814 }
1815
1816 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1817 return EPERM;
1818 }
1819
1820 /*
1821 * If protocol is connection-based, can only connect once
1822 * unless PR_MULTICONN is set. Otherwise, if connected,
1823 * try to disconnect first. This allows user to disconnect
1824 * by connecting to, e.g., a null address.
1825 */
1826 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1827 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1828 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1829 (error = sodisconnectlocked(so)) != 0)) {
1830 error = EISCONN;
1831 } else {
1832 /*
1833 * For TCP, check if destination address is a tracker and mark the socket accordingly
1834 * (only if it hasn't been marked yet).
1835 */
1836 if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
1837 !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1838 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1839 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1840 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1841 }
1842 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1843 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1844 }
1845 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1846 printf("connectx() - failed necp_set_socket_domain_attributes");
1847 }
1848 }
1849 }
1850
1851 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1852 (flags & CONNECT_DATA_IDEMPOTENT)) {
1853 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1854
1855 if (flags & CONNECT_DATA_AUTHENTICATED) {
1856 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1857 }
1858 }
1859
1860 /*
1861 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1862 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1863 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1864 * Case 3 allows a user to combine a write with connect even if they have
1865 * no use for TFO (e.g., regular TCP or UDP).
1866 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1867 */
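/*
 * Illustrative userland sketch of Case 1 (assuming the connectx(2)
 * prototype and flags from <sys/socket.h>; sa/salen are the
 * destination sockaddr, assumed set up by the caller):
 *
 *	sa_endpoints_t eps = { .sae_dstaddr = sa, .sae_dstaddrlen = salen };
 *	connectx(s, &eps, SAE_ASSOCID_ANY, CONNECT_RESUME_ON_READ_WRITE,
 *	    NULL, 0, NULL, NULL);
 *
 * Here the connection attempt (e.g. a TFO SYN) is deferred until the
 * first read or write on the socket.
 */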
1868 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1869 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1870 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1871 }
1872
1873 /*
1874 * If a user sets CONNECT_DATA_IDEMPOTENT but we did not end up
1875 * with SOF1_PRECONNECT_DATA (no uio passed and
1876 * CONNECT_RESUME_ON_READ_WRITE not set), this is an error; reset SOF1_DATA_IDEMPOTENT.
1877 */
1878 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1879 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1880 /* We should return EINVAL instead perhaps. */
1881 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1882 }
1883
1884 /*
1885 * Run connect filters before calling the protocol;
1886 * note that a non-blocking connect may return before completion.
1887 */
1888 error = sflt_connectout(so, dst);
1889 if (error != 0) {
1890 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1891 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1892 if (error == EJUSTRETURN) {
1893 error = 0;
1894 }
1895 } else {
1896 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1897 (so, src, dst, p, ifscope, aid, pcid,
1898 flags, arg, arglen, auio, bytes_written);
1899 if (error != 0) {
1900 so->so_state &= ~SS_ISCONNECTING;
1901 if (error != EINPROGRESS) {
1902 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1903 }
1904 }
1905 }
1906 }
1907
1908 return error;
1909 }
1910
1911 int
1912 sodisconnectlocked(struct socket *so)
1913 {
1914 int error;
1915
1916 if ((so->so_state & SS_ISCONNECTED) == 0) {
1917 error = ENOTCONN;
1918 goto bad;
1919 }
1920 if (so->so_state & SS_ISDISCONNECTING) {
1921 error = EALREADY;
1922 goto bad;
1923 }
1924
1925 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1926 if (error == 0) {
1927 sflt_notify(so, sock_evt_disconnected, NULL);
1928 }
1929
1930 bad:
1931 return error;
1932 }
1933
1934 /* Locking version */
1935 int
1936 sodisconnect(struct socket *so)
1937 {
1938 int error;
1939
1940 socket_lock(so, 1);
1941 error = sodisconnectlocked(so);
1942 socket_unlock(so, 1);
1943 return error;
1944 }
1945
1946 int
1947 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1948 {
1949 int error;
1950
1951 /*
1952 * Call the protocol disconnectx handler; let it handle all
1953 * matters related to the connection state of this session.
1954 */
1955 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1956 if (error == 0) {
1957 /*
1958 * The event applies only for the session, not for
1959 * the disconnection of individual subflows.
1960 */
1961 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1962 sflt_notify(so, sock_evt_disconnected, NULL);
1963 }
1964 }
1965 return error;
1966 }
1967
1968 int
1969 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1970 {
1971 int error;
1972
1973 socket_lock(so, 1);
1974 error = sodisconnectxlocked(so, aid, cid);
1975 socket_unlock(so, 1);
1976 return error;
1977 }
1978
1979 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1980
1981 /*
1982 * sosendcheck will lock the socket buffer if it isn't locked and
1983 * verify that there is space for the data being inserted.
1984 *
1985 * Returns: 0 Success
1986 * EPIPE
1987 * sblock:EWOULDBLOCK
1988 * sblock:EINTR
1989 * sbwait:EBADF
1990 * sbwait:EINTR
1991 * [so_error]:???
1992 */
1993 int
1994 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1995 int32_t clen, int32_t atomic, int flags, int *sblocked)
1996 {
1997 int error = 0;
1998 int32_t space;
1999 int assumelock = 0;
2000
2001 restart:
2002 if (*sblocked == 0) {
2003 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2004 so->so_send_filt_thread != 0 &&
2005 so->so_send_filt_thread == current_thread()) {
2006 /*
2007 * We're being called recursively from a filter,
2008 * allow this to continue. Radar 4150520.
2009 * Don't set sblocked because we don't want
2010 * to perform an unlock later.
2011 */
2012 assumelock = 1;
2013 } else {
2014 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2015 if (error) {
2016 if (so->so_flags & SOF_DEFUNCT) {
2017 goto defunct;
2018 }
2019 return error;
2020 }
2021 *sblocked = 1;
2022 }
2023 }
2024
2025 /*
2026 * If a send attempt is made on a socket that has been marked
2027 * as inactive (disconnected), reject the request.
2028 */
2029 if (so->so_flags & SOF_DEFUNCT) {
2030 defunct:
2031 error = EPIPE;
2032 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
2033 __func__, proc_selfpid(), proc_best_name(current_proc()),
2034 so->so_gencnt,
2035 SOCK_DOM(so), SOCK_TYPE(so), error);
2036 return error;
2037 }
2038
2039 if (so->so_state & SS_CANTSENDMORE) {
2040 #if CONTENT_FILTER
2041 /*
2042 * Data can still be re-injected into half-closed connections
2043 */
2044 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2045 so->so_snd.sb_cfil_thread == current_thread() &&
2046 cfil_sock_data_pending(&so->so_snd) != 0) {
2047 CFIL_LOG(LOG_INFO,
2048 "so %llx ignore SS_CANTSENDMORE",
2049 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2050 } else
2051 #endif /* CONTENT_FILTER */
2052 return EPIPE;
2053 }
2054 if (so->so_error) {
2055 error = so->so_error;
2056 so->so_error = 0;
2057 return error;
2058 }
2059
2060 if ((so->so_state & SS_ISCONNECTED) == 0) {
2061 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2062 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2063 (resid != 0 || clen == 0) &&
2064 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2065 return ENOTCONN;
2066 }
2067 } else if (addr == 0) {
2068 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2069 ENOTCONN : EDESTADDRREQ;
2070 }
2071 }
2072
2073 space = sbspace(&so->so_snd);
2074
2075 if (flags & MSG_OOB) {
2076 space += 1024;
2077 }
2078 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2079 clen > so->so_snd.sb_hiwat) {
2080 return EMSGSIZE;
2081 }
2082
2083 if ((space < resid + clen &&
2084 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2085 space < clen)) ||
2086 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2087 /*
2088 * don't block the connectx call when there's more data
2089 * than can be copied.
2090 */
2091 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2092 if (space == 0) {
2093 return EWOULDBLOCK;
2094 }
2095 if (space < (int32_t)so->so_snd.sb_lowat) {
2096 return 0;
2097 }
2098 }
2099 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2100 assumelock) {
2101 return EWOULDBLOCK;
2102 }
2103 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2104 *sblocked = 0;
2105 error = sbwait(&so->so_snd);
2106 if (error) {
2107 if (so->so_flags & SOF_DEFUNCT) {
2108 goto defunct;
2109 }
2110 return error;
2111 }
2112 goto restart;
2113 }
2114 return 0;
2115 }
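/*
 * Worked example of the space checks above (illustrative numbers):
 * on an atomic (datagram) socket with sb_hiwat == 9216, a 10000-byte
 * datagram fails immediately with EMSGSIZE; a 4000-byte datagram with
 * only 2000 bytes of space either sleeps in sbwait() or, with
 * SS_NBIO/MSG_NBIO set, returns EWOULDBLOCK. A stream socket may
 * instead proceed with a partial copy once sbspace() reaches sb_lowat.
 */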
2116
2117 /*
2118 * Send on a socket.
2119 * If send must go all at once and message is larger than
2120 * send buffering, then hard error.
2121 * Lock against other senders.
2122 * If must go all at once and not enough room now, then
2123 * inform user that this would block and do nothing.
2124 * Otherwise, if nonblocking, send as much as possible.
2125 * The data to be sent is described by "uio" if nonzero,
2126 * otherwise by the mbuf chain "top" (which must be null
2127 * if uio is not). Data provided in mbuf chain must be small
2128 * enough to send all at once.
2129 *
2130 * Returns nonzero on error, timeout or signal; callers
2131 * must check for short counts if EINTR/ERESTART are returned.
2132 * Data and control buffers are freed on return.
2133 *
2134 * Returns: 0 Success
2135 * EOPNOTSUPP
2136 * EINVAL
2137 * ENOBUFS
2138 * uiomove:EFAULT
2139 * sosendcheck:EPIPE
2140 * sosendcheck:EWOULDBLOCK
2141 * sosendcheck:EINTR
2142 * sosendcheck:EBADF
2143 * sosendcheck:EINTR
2144 * sosendcheck:??? [value from so_error]
2145 * <pru_send>:ECONNRESET[TCP]
2146 * <pru_send>:EINVAL[TCP]
2147 * <pru_send>:ENOBUFS[TCP]
2148 * <pru_send>:EADDRINUSE[TCP]
2149 * <pru_send>:EADDRNOTAVAIL[TCP]
2150 * <pru_send>:EAFNOSUPPORT[TCP]
2151 * <pru_send>:EACCES[TCP]
2152 * <pru_send>:EAGAIN[TCP]
2153 * <pru_send>:EPERM[TCP]
2154 * <pru_send>:EMSGSIZE[TCP]
2155 * <pru_send>:EHOSTUNREACH[TCP]
2156 * <pru_send>:ENETUNREACH[TCP]
2157 * <pru_send>:ENETDOWN[TCP]
2158 * <pru_send>:ENOMEM[TCP]
2159 * <pru_send>:ENOBUFS[TCP]
2160 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2161 * <pru_send>:EINVAL[AF_UNIX]
2162 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2163 * <pru_send>:EPIPE[AF_UNIX]
2164 * <pru_send>:ENOTCONN[AF_UNIX]
2165 * <pru_send>:EISCONN[AF_UNIX]
2166 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2167 * <sf_data_out>:??? [whatever a filter author chooses]
2168 *
2169 * Notes: Other <pru_send> returns depend on the protocol family; all
2170 * <sf_data_out> returns depend on what the filter author causes
2171 * their filter to return.
2172 */
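/*
 * Illustrative in-kernel usage sketch (via the socket KPI in
 * kpi_socket.h, which funnels into sosend()): sending a prepackaged
 * mbuf chain, i.e. the "top" path described above with uio == NULL:
 *
 *	mbuf_t data;		// an M_PKTHDR chain, small enough to send at once
 *	size_t sentlen = 0;
 *	errno_t err = sock_sendmbuf(so, NULL, data, 0, &sentlen);
 *	// "data" is consumed (sent or freed) regardless of the outcome
 */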
2173 int
2174 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2175 struct mbuf *top, struct mbuf *control, int flags)
2176 {
2177 struct mbuf **mp;
2178 struct mbuf *m, *freelist = NULL;
2179 struct soflow_hash_entry *dgram_flow_entry = NULL;
2180 user_ssize_t space, len, resid, orig_resid;
2181 int clen = 0, error, dontroute, sendflags;
2182 int atomic = sosendallatonce(so) || top;
2183 int sblocked = 0;
2184 struct proc *p = current_proc();
2185 uint16_t headroom = 0;
2186 ssize_t mlen;
2187 boolean_t en_tracing = FALSE;
2188
2189 if (uio != NULL) {
2190 resid = uio_resid(uio);
2191 } else {
2192 resid = top->m_pkthdr.len;
2193 }
2194 orig_resid = resid;
2195
2196 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2197 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2198
2199 socket_lock(so, 1);
2200
2201 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2202 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2203 }
2204
2205 /*
2206 * trace if tracing & network (vs. unix) sockets & and
2207 * non-loopback
2208 */
2209 if (ENTR_SHOULDTRACE &&
2210 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2211 struct inpcb *inp = sotoinpcb(so);
2212 if (inp->inp_last_outifp != NULL &&
2213 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2214 en_tracing = TRUE;
2215 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2216 VM_KERNEL_ADDRPERM(so),
2217 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2218 (int64_t)resid);
2219 }
2220 }
2221
2222 /*
2223 * Re-injection should not affect process accounting
2224 */
2225 if ((flags & MSG_SKIPCFIL) == 0) {
2226 so_update_last_owner_locked(so, p);
2227 so_update_policy(so);
2228
2229 #if NECP
2230 so_update_necp_policy(so, NULL, addr);
2231 #endif /* NECP */
2232 }
2233
2234 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2235 error = EOPNOTSUPP;
2236 goto out_locked;
2237 }
2238
2239 /*
2240 * In theory resid should be unsigned.
2241 * However, space must be signed, as it might be less than 0
2242 * if we over-committed, and we must use a signed comparison
2243 * of space and resid. On the other hand, a negative resid
2244 * causes us to loop sending 0-length segments to the protocol.
2245 *
2246 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2247 *
2248 * Note: We limit resid to be a positive int value as we use
2249 * imin() to set bytes_to_copy -- radr://14558484
2250 */
2251 if (resid < 0 || resid > INT_MAX ||
2252 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2253 error = EINVAL;
2254 goto out_locked;
2255 }
2256
2257 dontroute = (flags & MSG_DONTROUTE) &&
2258 (so->so_options & SO_DONTROUTE) == 0 &&
2259 (so->so_proto->pr_flags & PR_ATOMIC);
2260 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2261
2262 if (control != NULL) {
2263 clen = control->m_len;
2264 }
2265
2266 if (soreserveheadroom != 0) {
2267 headroom = so->so_pktheadroom;
2268 }
2269
2270 do {
2271 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2272 &sblocked);
2273 if (error) {
2274 goto out_locked;
2275 }
2276
2277 mp = &top;
2278 space = sbspace(&so->so_snd) - clen;
2279 space += ((flags & MSG_OOB) ? 1024 : 0);
2280
2281 do {
2282 if (uio == NULL) {
2283 /*
2284 * Data is prepackaged in "top".
2285 */
2286 resid = 0;
2287 if (flags & MSG_EOR) {
2288 top->m_flags |= M_EOR;
2289 }
2290 } else {
2291 int chainlength;
2292 int bytes_to_copy;
2293 boolean_t jumbocl;
2294 boolean_t bigcl;
2295 int bytes_to_alloc;
2296
2297 bytes_to_copy = imin((int)resid, (int)space);
2298
2299 bytes_to_alloc = bytes_to_copy;
2300 if (top == NULL) {
2301 bytes_to_alloc += headroom;
2302 }
2303
2304 if (sosendminchain > 0) {
2305 chainlength = 0;
2306 } else {
2307 chainlength = sosendmaxchain;
2308 }
2309
2310 /*
2311 * Use big 4 KB cluster when the outgoing interface
2312 * does not prefer 2 KB clusters
2313 */
2314 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2315 sosendbigcl_ignore_capab;
2316
2317 /*
2318 * Attempt to use larger than system page-size
2319 * clusters for large writes only if there is
2320 * a jumbo cluster pool and if the socket is
2321 * marked accordingly.
2322 */
2323 jumbocl = sosendjcl && njcl > 0 &&
2324 ((so->so_flags & SOF_MULTIPAGES) ||
2325 sosendjcl_ignore_capab) &&
2326 bigcl;
2327
2328 socket_unlock(so, 0);
2329
2330 do {
2331 int num_needed;
2332 int hdrs_needed = (top == NULL) ? 1 : 0;
2333
2334 /*
2335 * Try to maintain a local cache of mbuf
2336 * clusters needed to complete this
2337 * write. The list is further limited to
2338 * the number that are currently needed
2339 * to fill the socket. This mechanism
2340 * allows a large number of mbufs/
2341 * clusters to be grabbed under a single
2342 * mbuf lock. If we can't get any
2343 * clusters, then fall back to trying
2344 * for mbufs. If we fail early (or
2345 * miscalculate the number needed), make
2346 * sure to release any clusters we
2347 * haven't yet consumed.
2348 */
2349 if (freelist == NULL &&
2350 bytes_to_alloc > MBIGCLBYTES &&
2351 jumbocl) {
2352 num_needed =
2353 bytes_to_alloc / M16KCLBYTES;
2354
2355 if ((bytes_to_alloc -
2356 (num_needed * M16KCLBYTES))
2357 >= MINCLSIZE) {
2358 num_needed++;
2359 }
2360
2361 freelist =
2362 m_getpackets_internal(
2363 (unsigned int *)&num_needed,
2364 hdrs_needed, M_WAIT, 0,
2365 M16KCLBYTES);
2366 /*
2367 * Fall back to 4K cluster size
2368 * if allocation failed
2369 */
2370 }
2371
2372 if (freelist == NULL &&
2373 bytes_to_alloc > MCLBYTES &&
2374 bigcl) {
2375 num_needed =
2376 bytes_to_alloc / MBIGCLBYTES;
2377
2378 if ((bytes_to_alloc -
2379 (num_needed * MBIGCLBYTES)) >=
2380 MINCLSIZE) {
2381 num_needed++;
2382 }
2383
2384 freelist =
2385 m_getpackets_internal(
2386 (unsigned int *)&num_needed,
2387 hdrs_needed, M_WAIT, 0,
2388 MBIGCLBYTES);
2389 /*
2390 * Fall back to cluster size
2391 * if allocation failed
2392 */
2393 }
2394
2395 /*
2396 * Allocate a cluster, as we want to
2397 * avoid splitting the data into more
2398 * than one segment; using MINCLSIZE
2399 * would lead us to allocate two mbufs.
2400 */
2401 if (soreserveheadroom != 0 &&
2402 freelist == NULL &&
2403 ((top == NULL &&
2404 bytes_to_alloc > _MHLEN) ||
2405 bytes_to_alloc > _MLEN)) {
2406 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2407 MCLBYTES;
2408 freelist =
2409 m_getpackets_internal(
2410 (unsigned int *)&num_needed,
2411 hdrs_needed, M_WAIT, 0,
2412 MCLBYTES);
2413 /*
2414 * Fall back to a single mbuf
2415 * if allocation failed
2416 */
2417 } else if (freelist == NULL &&
2418 bytes_to_alloc > MINCLSIZE) {
2419 num_needed =
2420 bytes_to_alloc / MCLBYTES;
2421
2422 if ((bytes_to_alloc -
2423 (num_needed * MCLBYTES)) >=
2424 MINCLSIZE) {
2425 num_needed++;
2426 }
2427
2428 freelist =
2429 m_getpackets_internal(
2430 (unsigned int *)&num_needed,
2431 hdrs_needed, M_WAIT, 0,
2432 MCLBYTES);
2433 /*
2434 * Fall back to a single mbuf
2435 * if allocation failed
2436 */
2437 }
2438 /*
2439 * For datagram protocols, leave
2440 * headroom for protocol headers
2441 * in the first cluster of the chain
2442 */
2443 if (freelist != NULL && atomic &&
2444 top == NULL && headroom > 0) {
2445 freelist->m_data += headroom;
2446 }
2447
2448 /*
2449 * Fall back to regular mbufs without
2450 * reserving the socket headroom
2451 */
2452 if (freelist == NULL) {
2453 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2454 if (top == NULL) {
2455 MGETHDR(freelist,
2456 M_WAIT, MT_DATA);
2457 } else {
2458 MGET(freelist,
2459 M_WAIT, MT_DATA);
2460 }
2461 }
2462
2463 if (freelist == NULL) {
2464 error = ENOBUFS;
2465 socket_lock(so, 0);
2466 goto out_locked;
2467 }
2468 /*
2469 * For datagram protocols,
2470 * leave room for protocol
2471 * headers in first mbuf.
2472 */
2473 if (atomic && top == NULL &&
2474 bytes_to_copy > 0 &&
2475 bytes_to_copy < MHLEN) {
2476 MH_ALIGN(freelist,
2477 bytes_to_copy);
2478 }
2479 }
2480 m = freelist;
2481 freelist = m->m_next;
2482 m->m_next = NULL;
2483
2484 if ((m->m_flags & M_EXT)) {
2485 mlen = m->m_ext.ext_size -
2486 M_LEADINGSPACE(m);
2487 } else if ((m->m_flags & M_PKTHDR)) {
2488 mlen = MHLEN - M_LEADINGSPACE(m);
2489 m_add_crumb(m, PKT_CRUMB_SOSEND);
2490 } else {
2491 mlen = MLEN - M_LEADINGSPACE(m);
2492 }
2493 len = imin((int)mlen, bytes_to_copy);
2494
2495 chainlength += len;
2496
2497 space -= len;
2498
2499 error = uiomove(mtod(m, caddr_t),
2500 (int)len, uio);
2501
2502 resid = uio_resid(uio);
2503
2504 m->m_len = (int32_t)len;
2505 *mp = m;
2506 top->m_pkthdr.len += len;
2507 if (error) {
2508 break;
2509 }
2510 mp = &m->m_next;
2511 if (resid <= 0) {
2512 if (flags & MSG_EOR) {
2513 top->m_flags |= M_EOR;
2514 }
2515 break;
2516 }
2517 bytes_to_copy = imin((int)resid, (int)space);
2518 } while (space > 0 &&
2519 (chainlength < sosendmaxchain || atomic ||
2520 resid < MINCLSIZE));
2521
2522 socket_lock(so, 0);
2523
2524 if (error) {
2525 goto out_locked;
2526 }
2527 }
2528
2529 if (dontroute) {
2530 so->so_options |= SO_DONTROUTE;
2531 }
2532
2533 /*
2534 * Compute flags here, for pru_send and NKEs
2535 *
2536 * If the user set MSG_EOF, the protocol
2537 * understands this flag, and there is nothing left to
2538 * send, then use PRU_SEND_EOF instead of PRU_SEND.
2539 */
2540 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2541 ((flags & MSG_EOF) &&
2542 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2543 (resid <= 0)) ? PRUS_EOF :
2544 /* If there is more to send set PRUS_MORETOCOME */
2545 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2546
2547 if ((flags & MSG_SKIPCFIL) == 0) {
2548 /*
2549 * Socket filter processing
2550 */
2551 error = sflt_data_out(so, addr, &top,
2552 &control, (sendflags & MSG_OOB) ?
2553 sock_data_filt_flag_oob : 0);
2554 if (error) {
2555 if (error == EJUSTRETURN) {
2556 error = 0;
2557 goto packet_consumed;
2558 }
2559 goto out_locked;
2560 }
2561 #if CONTENT_FILTER
2562 /*
2563 * Content filter processing
2564 */
2565 error = cfil_sock_data_out(so, addr, top,
2566 control, sendflags, dgram_flow_entry);
2567 if (error) {
2568 if (error == EJUSTRETURN) {
2569 error = 0;
2570 goto packet_consumed;
2571 }
2572 goto out_locked;
2573 }
2574 #endif /* CONTENT_FILTER */
2575 }
2576 error = (*so->so_proto->pr_usrreqs->pru_send)
2577 (so, sendflags, top, addr, control, p);
2578
2579 packet_consumed:
2580 if (dontroute) {
2581 so->so_options &= ~SO_DONTROUTE;
2582 }
2583
2584 clen = 0;
2585 control = NULL;
2586 top = NULL;
2587 mp = &top;
2588 if (error) {
2589 goto out_locked;
2590 }
2591 } while (resid && space > 0);
2592 } while (resid);
2593
2594
2595 out_locked:
2596 if (resid > orig_resid) {
2597 char pname[MAXCOMLEN] = {};
2598 pid_t current_pid = proc_pid(current_proc());
2599 proc_name(current_pid, pname, sizeof(pname));
2600
2601 if (sosend_assert_panic != 0) {
2602 panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2603 so, resid, orig_resid, pname, current_pid);
2604 } else {
2605 os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2606 so->so_gencnt, resid, orig_resid, pname, current_pid);
2607 }
2608 }
2609
2610 if (sblocked) {
2611 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2612 } else {
2613 socket_unlock(so, 1);
2614 }
2615 if (top != NULL) {
2616 m_freem(top);
2617 }
2618 if (control != NULL) {
2619 m_freem(control);
2620 }
2621 if (freelist != NULL) {
2622 m_freem_list(freelist);
2623 }
2624
2625 if (dgram_flow_entry != NULL) {
2626 soflow_free_flow(dgram_flow_entry);
2627 }
2628
2629 soclearfastopen(so);
2630
2631 if (en_tracing) {
2632 /* resid passed here is the bytes left in uio */
2633 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2634 VM_KERNEL_ADDRPERM(so),
2635 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2636 (int64_t)(orig_resid - resid));
2637 }
2638 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2639 so->so_snd.sb_cc, space, error);
2640
2641 return error;
2642 }
2643
2644 int
2645 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2646 {
2647 struct mbuf *m0 = NULL, *control_end = NULL;
2648
2649 socket_lock_assert_owned(so);
2650
2651 /*
2652 * top must point to the mbuf chain to be sent.
2653 * If control is not NULL, top must be a packet header.
2654 */
2655 VERIFY(top != NULL &&
2656 (control == NULL || top->m_flags & M_PKTHDR));
2657
2658 /*
2659 * If control is not passed in, see if we can get it
2660 * from top.
2661 */
2662 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2663 // Locate start of control if present and start of data
2664 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2665 if (m0->m_flags & M_PKTHDR) {
2666 top = m0;
2667 break;
2668 } else if (m0->m_type == MT_CONTROL) {
2669 if (control == NULL) {
2670 // Found start of control
2671 control = m0;
2672 }
2673 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2674 // Found end of control
2675 control_end = m0;
2676 }
2677 }
2678 }
2679 if (control_end != NULL) {
2680 control_end->m_next = NULL;
2681 }
2682 }
2683
2684 int error = (*so->so_proto->pr_usrreqs->pru_send)
2685 (so, sendflags, top, addr, control, current_proc());
2686
2687 return error;
2688 }
2689
2690 static struct mbuf *
2691 mbuf_detach_control_from_list(struct mbuf **mp)
2692 {
2693 struct mbuf *control = NULL;
2694 struct mbuf *m = *mp;
2695
2696 if (m->m_type == MT_CONTROL) {
2697 struct mbuf *control_end;
2698 struct mbuf *n;
2699
2700 n = control_end = control = m;
2701
2702 /*
2703 * Break the chain per mbuf type
2704 */
2705 while (n != NULL && n->m_type == MT_CONTROL) {
2706 control_end = n;
2707 n = n->m_next;
2708 }
2709 control_end->m_next = NULL;
2710 *mp = n;
2711 }
2712 VERIFY(*mp != NULL);
2713
2714 return control;
2715 }
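/*
 * Illustrative example: given a packet laid out as
 *
 *	m0(MT_CONTROL) -> m1(MT_CONTROL) -> m2(MT_DATA) -> m3(MT_DATA)
 *
 * calling mbuf_detach_control_from_list(&m) with m == m0 returns the
 * control chain m0 -> m1 (now NULL-terminated) and leaves *mp
 * pointing at the data chain m2 -> m3.
 */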
2716
2717 /*
2718 * Supports only connected sockets (no address) without ancillary data
2719 * (control mbuf), for atomic protocols
2720 */
2721 int
2722 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2723 {
2724 struct mbuf *m;
2725 struct soflow_hash_entry *dgram_flow_entry = NULL;
2726 int error, dontroute;
2727 int atomic = sosendallatonce(so);
2728 int sblocked = 0;
2729 struct proc *p = current_proc();
2730 struct mbuf *top = pktlist;
2731 bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2732
2733 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, total_len,
2734 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2735
2736 if (so->so_type != SOCK_DGRAM) {
2737 error = EINVAL;
2738 os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2739 error);
2740 goto out;
2741 }
2742 if (atomic == 0) {
2743 error = EINVAL;
2744 os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2745 error);
2746 goto out;
2747 }
2748 if ((so->so_state & SS_ISCONNECTED) == 0) {
2749 error = ENOTCONN;
2750 os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2751 error);
2752 goto out;
2753 }
2754 if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2755 error = EINVAL;
2756 os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2757 flags, error);
2758 goto out;
2759 }
2760
2761 socket_lock(so, 1);
2762 so_update_last_owner_locked(so, p);
2763 so_update_policy(so);
2764
2765 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2766 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0);
2767 }
2768
2769 #if NECP
2770 so_update_necp_policy(so, NULL, NULL);
2771 #endif /* NECP */
2772
2773 dontroute = (flags & MSG_DONTROUTE) &&
2774 (so->so_options & SO_DONTROUTE) == 0 &&
2775 (so->so_proto->pr_flags & PR_ATOMIC);
2776 if (dontroute) {
2777 so->so_options |= SO_DONTROUTE;
2778 }
2779
2780 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2781
2782 error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2783 if (error) {
2784 os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2785 error);
2786 goto release;
2787 }
2788
2789 if (!skip_filt) {
2790 struct mbuf **prevnextp = NULL;
2791
2792 for (m = top; m != NULL; m = m->m_nextpkt) {
2793 struct mbuf *control = NULL;
2795 struct mbuf *nextpkt;
2796
2797 /*
2798 * Remove packet from the list of packets
2799 */
2800 nextpkt = m->m_nextpkt;
2801 if (prevnextp != NULL) {
2802 *prevnextp = nextpkt;
2803 } else {
2804 top = nextpkt;
2805 }
2806 m->m_nextpkt = NULL;
2807
2808 /*
2809 * Break the chain per mbuf type
2810 */
2811 if (m->m_type == MT_CONTROL) {
2812 control = mbuf_detach_control_from_list(&m);
2813 }
2814 /*
2815 * Socket filter processing
2816 */
2817 error = sflt_data_out(so, NULL, &m,
2818 &control, 0);
2819 if (error != 0 && error != EJUSTRETURN) {
2820 os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2821 error);
2822 goto release;
2823 }
2824
2825 #if CONTENT_FILTER
2826 if (error == 0) {
2827 /*
2828 * Content filter processing
2829 */
2830 error = cfil_sock_data_out(so, NULL, m,
2831 control, 0, dgram_flow_entry);
2832 if (error != 0 && error != EJUSTRETURN) {
2833 os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2834 error);
2835 goto release;
2836 }
2837 }
2838 #endif /* CONTENT_FILTER */
2839 if (error == EJUSTRETURN) {
2840 /*
2841 * When swallowed by a filter, the packet is not
2842 * in the list anymore
2843 */
2844 error = 0;
2845 } else {
2846 /*
2847 * Rebuild the packet: re-attach the data after the control mbufs
2848 */
2849 if (control != NULL) {
2850 m_last(control)->m_next = m;
2851 m = control;
2852 }
2853 /*
2854 * Reinsert the packet in the list of packets
2855 */
2856 m->m_nextpkt = nextpkt;
2857 if (prevnextp != NULL) {
2858 *prevnextp = m;
2859 } else {
2860 top = m;
2861 }
2862 prevnextp = &m->m_nextpkt;
2863 }
2864 }
2865 }
2866
2867 if (top != NULL) {
2868 if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2869 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2870 (so, top, pktcnt, flags);
2871 if (error != 0) {
2872 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2873 error);
2874 }
2875 top = NULL;
2876 } else {
2877 *pktcnt = 0;
2878 for (m = top; m != NULL; m = top) {
2879 struct mbuf *control = NULL;
2880
2881 top = m->m_nextpkt;
2882 m->m_nextpkt = NULL;
2883
2884 /*
2885 * Break the chain per mbuf type
2886 */
2887 if (m->m_type == MT_CONTROL) {
2888 control = mbuf_detach_control_from_list(&m);
2889 }
2890
2891 error = (*so->so_proto->pr_usrreqs->pru_send)
2892 (so, 0, m, NULL, control, current_proc());
2893 if (error != 0) {
2894 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2895 error);
2896 goto release;
2897 }
2898 *pktcnt += 1;
2899 }
2900 }
2901 }
2902
2903 release:
2904 if (dontroute) {
2905 so->so_options &= ~SO_DONTROUTE;
2906 }
2907 if (sblocked) {
2908 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2909 } else {
2910 socket_unlock(so, 1);
2911 }
2912 out:
2913 if (top != NULL) {
2914 os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2915 error);
2916 m_freem_list(top);
2917 }
2918
2919 if (dgram_flow_entry != NULL) {
2920 soflow_free_flow(dgram_flow_entry);
2921 }
2922
2923 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, total_len,
2924 so->so_snd.sb_cc, 0, error);
2925
2926 return error;
2927 }
2928
2929 /*
2930 * May return ERESTART when packet is dropped by MAC policy check
2931 */
2932 static int
2933 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2934 struct mbuf **maddrp,
2935 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2936 {
2937 int error = 0;
2938 struct mbuf *m = *mp;
2939 struct mbuf *nextrecord = *nextrecordp;
2940
2941 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2942 #if CONFIG_MACF_SOCKET_SUBSET
2943 /*
2944 * Call the MAC framework for policy checking if we're in
2945 * the user process context and the socket isn't connected.
2946 */
2947 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2948 struct mbuf *m0 = m;
2949 /*
2950 * Dequeue this record (temporarily) from the receive
2951 * list since we're about to drop the socket's lock
2952 * where a new record may arrive and be appended to
2953 * the list. Upon MAC policy failure, the record
2954 * will be freed. Otherwise, we'll add it back to
2955 * the head of the list. We cannot rely on SB_LOCK
2956 * because append operation uses the socket's lock.
2957 */
2958 do {
2959 m->m_nextpkt = NULL;
2960 sbfree(&so->so_rcv, m);
2961 m = m->m_next;
2962 } while (m != NULL);
2963 m = m0;
2964 so->so_rcv.sb_mb = nextrecord;
2965 SB_EMPTY_FIXUP(&so->so_rcv);
2966 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2967 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2968 socket_unlock(so, 0);
2969
2970 error = mac_socket_check_received(kauth_cred_get(), so,
2971 mtod(m, struct sockaddr *));
2972
2973 if (error != 0) {
2974 /*
2975 * MAC policy failure; free this record and
2976 * process the next record (or block until
2977 * one is available). We have adjusted sb_cc
2978 * and sb_mbcnt above so there is no need to
2979 * call sbfree() again.
2980 */
2981 m_freem(m);
2982 /*
2983 * Clear SB_LOCK but don't unlock the socket.
2984 * Process the next record or wait for one.
2985 */
2986 socket_lock(so, 0);
2987 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2988 error = ERESTART;
2989 goto done;
2990 }
2991 socket_lock(so, 0);
2992 /*
2993 * If the socket has been defunct'd, drop it.
2994 */
2995 if (so->so_flags & SOF_DEFUNCT) {
2996 m_freem(m);
2997 error = ENOTCONN;
2998 goto done;
2999 }
3000 /*
3001 * Re-adjust the socket receive list and re-enqueue
3002 * the record in front of any packets which may have
3003 * been appended while we dropped the lock.
3004 */
3005 for (m = m0; m->m_next != NULL; m = m->m_next) {
3006 sballoc(&so->so_rcv, m);
3007 }
3008 sballoc(&so->so_rcv, m);
3009 if (so->so_rcv.sb_mb == NULL) {
3010 so->so_rcv.sb_lastrecord = m0;
3011 so->so_rcv.sb_mbtail = m;
3012 }
3013 m = m0;
3014 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3015 so->so_rcv.sb_mb = m;
3016 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3017 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3018 }
3019 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3020 if (psa != NULL) {
3021 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3022 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3023 error = EWOULDBLOCK;
3024 goto done;
3025 }
3026 } else if (maddrp != NULL) {
3027 *maddrp = m;
3028 }
3029 if (flags & MSG_PEEK) {
3030 m = m->m_next;
3031 } else {
3032 sbfree(&so->so_rcv, m);
3033 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3034 panic("%s: about to create invalid socketbuf",
3035 __func__);
3036 /* NOTREACHED */
3037 }
3038 if (maddrp == NULL) {
3039 MFREE(m, so->so_rcv.sb_mb);
3040 } else {
3041 so->so_rcv.sb_mb = m->m_next;
3042 m->m_next = NULL;
3043 }
3044 m = so->so_rcv.sb_mb;
3045 if (m != NULL) {
3046 m->m_nextpkt = nextrecord;
3047 } else {
3048 so->so_rcv.sb_mb = nextrecord;
3049 SB_EMPTY_FIXUP(&so->so_rcv);
3050 }
3051 }
3052 done:
3053 *mp = m;
3054 *nextrecordp = nextrecord;
3055
3056 return error;
3057 }
3058
3059 /*
3060 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3061 * so clear the data portion in order not to leak the kernel file pointers
3062 */
3063 static void
3064 sopeek_scm_rights(struct mbuf *rights)
3065 {
3066 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3067
3068 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3069 VERIFY(cm->cmsg_len <= rights->m_len);
3070 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3071 }
3072 }
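/*
 * Illustrative consequence for userland: peeking at an SCM_RIGHTS
 * message yields a control message of the correct length whose
 * payload reads as zeros; the descriptors only materialize when the
 * message is received for real:
 *
 *	recvmsg(s, &msg, MSG_PEEK);	// cmsg payload is zero-filled
 *	recvmsg(s, &msg, 0);		// cmsg payload holds the new fds
 */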
3073
3074 /*
3075 * Process one or more MT_CONTROL mbufs present before any data mbufs
3076 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3077 * just copy the data; if !MSG_PEEK, we call into the protocol to
3078 * perform externalization.
3079 */
3080 static int
3081 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3082 struct mbuf **mp, struct mbuf **nextrecordp)
3083 {
3084 int error = 0;
3085 struct mbuf *cm = NULL, *cmn;
3086 struct mbuf **cme = &cm;
3087 struct sockbuf *sb_rcv = &so->so_rcv;
3088 struct mbuf **msgpcm = NULL;
3089 struct mbuf *m = *mp;
3090 struct mbuf *nextrecord = *nextrecordp;
3091 struct protosw *pr = so->so_proto;
3092
3093 /*
3094 * Externalizing the control messages would require us to
3095 * drop the socket's lock below. Once we re-acquire the
3096 * lock, the mbuf chain might change. In order to preserve
3097 * consistency, we unlink all control messages from the
3098 * first mbuf chain in one shot and link them separately
3099 * onto a different chain.
3100 */
3101 do {
3102 if (flags & MSG_PEEK) {
3103 if (controlp != NULL) {
3104 if (*controlp == NULL) {
3105 msgpcm = controlp;
3106 }
3107 *controlp = m_copy(m, 0, m->m_len);
3108
3109 /*
3110 * If we failed to allocate an mbuf,
3111 * release any previously allocated
3112 * mbufs for control data. Return
3113 * an error. Keep the mbufs in the
3114 * socket as this is using
3115 * MSG_PEEK flag.
3116 */
3117 if (*controlp == NULL) {
3118 m_freem(*msgpcm);
3119 error = ENOBUFS;
3120 goto done;
3121 }
3122
3123 if (pr->pr_domain->dom_externalize != NULL) {
3124 sopeek_scm_rights(*controlp);
3125 }
3126
3127 controlp = &(*controlp)->m_next;
3128 }
3129 m = m->m_next;
3130 } else {
3131 m->m_nextpkt = NULL;
3132 sbfree(sb_rcv, m);
3133 sb_rcv->sb_mb = m->m_next;
3134 m->m_next = NULL;
3135 *cme = m;
3136 cme = &(*cme)->m_next;
3137 m = sb_rcv->sb_mb;
3138 }
3139 } while (m != NULL && m->m_type == MT_CONTROL);
3140
3141 if (!(flags & MSG_PEEK)) {
3142 if (sb_rcv->sb_mb != NULL) {
3143 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3144 } else {
3145 sb_rcv->sb_mb = nextrecord;
3146 SB_EMPTY_FIXUP(sb_rcv);
3147 }
3148 if (nextrecord == NULL) {
3149 sb_rcv->sb_lastrecord = m;
3150 }
3151 }
3152
3153 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3154 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3155
3156 while (cm != NULL) {
3157 int cmsg_level;
3158 int cmsg_type;
3159
3160 cmn = cm->m_next;
3161 cm->m_next = NULL;
3162 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3163 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3164
3165 /*
3166 * Call the protocol to externalize SCM_RIGHTS message
3167 * and return the modified message to the caller upon
3168 * success. Otherwise, all other control messages are
3169 * returned unmodified to the caller. Note that we
3170 * only get into this loop if MSG_PEEK is not set.
3171 */
3172 if (pr->pr_domain->dom_externalize != NULL &&
3173 cmsg_level == SOL_SOCKET &&
3174 cmsg_type == SCM_RIGHTS) {
3175 /*
3176 * Release socket lock: see 3903171. This
3177 * would also allow more records to be appended
3178 * to the socket buffer. We still have SB_LOCK
3179 * set on it, so we can be sure that the head
3180 * of the mbuf chain won't change.
3181 */
3182 socket_unlock(so, 0);
3183 error = (*pr->pr_domain->dom_externalize)(cm);
3184 socket_lock(so, 0);
3185 } else {
3186 error = 0;
3187 }
3188
3189 if (controlp != NULL && error == 0) {
3190 *controlp = cm;
3191 controlp = &(*controlp)->m_next;
3192 } else {
3193 (void) m_free(cm);
3194 }
3195 cm = cmn;
3196 }
3197 /*
3198 * Update the value of nextrecord in case we received new
3199 * records when the socket was unlocked above for
3200 * externalizing SCM_RIGHTS.
3201 */
3202 if (m != NULL) {
3203 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3204 } else {
3205 nextrecord = sb_rcv->sb_mb;
3206 }
3207
3208 done:
3209 *mp = m;
3210 *nextrecordp = nextrecord;
3211
3212 return error;
3213 }
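/*
 * Illustrative userland sketch of consuming the control messages
 * externalized above (e.g. SCM_RIGHTS over AF_UNIX):
 *
 *	struct cmsghdr *cm;
 *	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
 *	    cm = CMSG_NXTHDR(&msg, cm)) {
 *		if (cm->cmsg_level == SOL_SOCKET &&
 *		    cm->cmsg_type == SCM_RIGHTS) {
 *			// CMSG_DATA(cm) is an array of newly created fds
 *		}
 *	}
 */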
3214
3215 /*
3216 * If we have less data than requested, block awaiting more
3217 * (subject to any timeout) if:
3218 * 1. the current count is less than the low water mark, or
3219 * 2. MSG_WAITALL is set, and it is possible to do the entire
3220 * receive operation at once if we block (resid <= hiwat), and
3221 * 3. MSG_DONTWAIT is not set.
3222 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3223 * we have to do the receive in sections, and thus risk returning
3224 * a short count if a timeout or signal occurs after we start.
3225 */
3226 static boolean_t
3227 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3228 {
3229 struct protosw *pr = so->so_proto;
3230
3231 /* No mbufs in the receive-queue? Wait! */
3232 if (m == NULL) {
3233 return true;
3234 }
3235
3236 /* Not enough data in the receive socket-buffer - we may have to wait */
3237 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3238 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3239 /*
3240 * The application did set the low-water mark, so we should wait for
3241 * this data to be present.
3242 */
3243 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3244 return true;
3245 }
3246
3247 /*
3248 * Application wants all the data - so let's try to do the
3249 * receive-operation at once by waiting for everything to
3250 * be there.
3251 */
3252 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3253 return true;
3254 }
3255 }
3256
3257 return false;
3258 }
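/*
 * Illustrative example: with the default sb_lowat of 1, a blocking
 * recv(2) on a TCP socket whose receive buffer is empty waits for the
 * first byte and may then return a short count; adding MSG_WAITALL
 * keeps it waiting (case 2 above) as long as the full request could
 * fit in the buffer (resid <= sb_hiwat).
 */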
3259
3260 /*
3261 * Implement receive operations on a socket.
3262 * We depend on the way that records are added to the sockbuf
3263 * by sbappend*. In particular, each record (mbufs linked through m_next)
3264 * must begin with an address if the protocol so specifies,
3265 * followed by an optional mbuf or mbufs containing ancillary data,
3266 * and then zero or more mbufs of data.
3267 * In order to avoid blocking network interrupts for the entire time here,
3268 * we splx() while doing the actual copy to user space.
3269 * Although the sockbuf is locked, new data may still be appended,
3270 * and thus we must maintain consistency of the sockbuf during that time.
3271 *
3272 * The caller may receive the data as a single mbuf chain by supplying
3273 * an mbuf **mp0 for use in returning the chain. The uio is then used
3274 * only for the count in uio_resid.
3275 *
3276 * Returns: 0 Success
3277 * ENOBUFS
3278 * ENOTCONN
3279 * EWOULDBLOCK
3280 * uiomove:EFAULT
3281 * sblock:EWOULDBLOCK
3282 * sblock:EINTR
3283 * sbwait:EBADF
3284 * sbwait:EINTR
3285 * sodelayed_copy:EFAULT
3286 * <pru_rcvoob>:EINVAL[TCP]
3287 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3288 * <pru_rcvoob>:???
3289 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3290 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3291 * <pr_domain->dom_externalize>:???
3292 *
3293 * Notes: Additional return values from calls through <pru_rcvoob> and
3294 * <pr_domain->dom_externalize> depend on protocols other than
3295 * TCP or AF_UNIX, which are documented above.
3296 */
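/*
 * Illustrative in-kernel usage sketch (via the socket KPI in
 * kpi_socket.h, which funnels into soreceive()): receiving into an
 * mbuf chain, i.e. the mp0 path described above, where the uio
 * supplies only the length:
 *
 *	mbuf_t m = NULL;
 *	size_t recvdlen = 0;
 *	struct msghdr msg = {};
 *	errno_t err = sock_receivembuf(so, &msg, &m, MSG_DONTWAIT, &recvdlen);
 */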
3297 int
3298 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3299 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3300 {
3301 struct mbuf *m, **mp, *ml = NULL;
3302 struct mbuf *nextrecord, *free_list;
3303 int flags, error, offset;
3304 user_ssize_t len;
3305 struct protosw *pr = so->so_proto;
3306 int moff, type = 0;
3307 user_ssize_t orig_resid = uio_resid(uio);
3308 user_ssize_t delayed_copy_len;
3309 int can_delay;
3310 struct proc *p = current_proc();
3311 boolean_t en_tracing = FALSE;
3312
3313 /*
3314 * Sanity check on the length passed by caller as we are making 'int'
3315 * comparisons
3316 */
3317 if (orig_resid < 0 || orig_resid > INT_MAX) {
3318 return EINVAL;
3319 }
3320
3321 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3322 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3323 so->so_rcv.sb_hiwat);
3324
3325 socket_lock(so, 1);
3326 so_update_last_owner_locked(so, p);
3327 so_update_policy(so);
3328
3329 #ifdef MORE_LOCKING_DEBUG
3330 if (so->so_usecount == 1) {
3331 panic("%s: so=%x no other reference on socket", __func__, so);
3332 /* NOTREACHED */
3333 }
3334 #endif
3335 mp = mp0;
3336 if (psa != NULL) {
3337 *psa = NULL;
3338 }
3339 if (controlp != NULL) {
3340 *controlp = NULL;
3341 }
3342 if (flagsp != NULL) {
3343 flags = *flagsp & ~MSG_EOR;
3344 } else {
3345 flags = 0;
3346 }
3347
3348 /*
3349 * If a recv attempt is made on a previously-accepted socket
3350 * that has been marked as inactive (disconnected), reject
3351 * the request.
3352 */
3353 if (so->so_flags & SOF_DEFUNCT) {
3354 struct sockbuf *sb = &so->so_rcv;
3355
3356 error = ENOTCONN;
3357 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
3358 __func__, proc_pid(p), proc_best_name(p),
3359 so->so_gencnt,
3360 SOCK_DOM(so), SOCK_TYPE(so), error);
3361 /*
3362 * This socket should have been disconnected and flushed
3363 * prior to being returned from sodefunct(); there should
3364 * be no data on its receive list, so panic otherwise.
3365 */
3366 if (so->so_state & SS_DEFUNCT) {
3367 sb_empty_assert(sb, __func__);
3368 }
3369 socket_unlock(so, 1);
3370 return error;
3371 }
3372
3373 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3374 pr->pr_usrreqs->pru_preconnect) {
3375 /*
3376 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3377 * call write() right after this. *If* the app calls a read,
3378 * we do not want to block this read indefinitely. Thus,
3379 * we trigger a connect so that the session gets initiated.
3380 */
3381 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3382
3383 if (error) {
3384 socket_unlock(so, 1);
3385 return error;
3386 }
3387 }
3388
3389 if (ENTR_SHOULDTRACE &&
3390 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3391 /*
3392 * enable energy tracing for inet sockets that go over
3393 * non-loopback interfaces only.
3394 */
3395 struct inpcb *inp = sotoinpcb(so);
3396 if (inp->inp_last_outifp != NULL &&
3397 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3398 en_tracing = TRUE;
3399 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3400 VM_KERNEL_ADDRPERM(so),
3401 ((so->so_state & SS_NBIO) ?
3402 kEnTrFlagNonBlocking : 0),
3403 (int64_t)orig_resid);
3404 }
3405 }
3406
3407 /*
3408 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3409 * regardless of the flags argument. Here is the case where
3410 * out-of-band data is not inline.
3411 */
3412 if ((flags & MSG_OOB) ||
3413 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3414 (so->so_options & SO_OOBINLINE) == 0 &&
3415 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3416 m = m_get(M_WAIT, MT_DATA);
3417 if (m == NULL) {
3418 socket_unlock(so, 1);
3419 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3420 ENOBUFS, 0, 0, 0, 0);
3421 return ENOBUFS;
3422 }
3423 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3424 if (error) {
3425 goto bad;
3426 }
3427 socket_unlock(so, 0);
3428 do {
3429 error = uiomove(mtod(m, caddr_t),
3430 imin((int)uio_resid(uio), m->m_len), uio);
3431 m = m_free(m);
3432 } while (uio_resid(uio) && error == 0 && m != NULL);
3433 socket_lock(so, 0);
3434 bad:
3435 if (m != NULL) {
3436 m_freem(m);
3437 }
3438
3439 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3440 if (error == EWOULDBLOCK || error == EINVAL) {
3441 /*
3442 * Let's try to get normal data:
3443 * EWOULDBLOCK: out-of-band data not
3444 * received yet. EINVAL: out-of-band data
3445 * already read.
3446 */
3447 error = 0;
3448 goto nooob;
3449 } else if (error == 0 && flagsp != NULL) {
3450 *flagsp |= MSG_OOB;
3451 }
3452 }
3453 socket_unlock(so, 1);
3454 if (en_tracing) {
3455 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3456 VM_KERNEL_ADDRPERM(so), 0,
3457 (int64_t)(orig_resid - uio_resid(uio)));
3458 }
3459 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3460 0, 0, 0, 0);
3461
3462 return error;
3463 }
3464 nooob:
3465 if (mp != NULL) {
3466 *mp = NULL;
3467 }
3468
3469 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3470 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3471 }
3472
3473 free_list = NULL;
3474 delayed_copy_len = 0;
3475 restart:
3476 #ifdef MORE_LOCKING_DEBUG
3477 if (so->so_usecount <= 1) {
3478 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3479 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3480 }
3481 #endif
3482 /*
3483 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3484 * and if so just return to the caller. This could happen when
3485 * soreceive() is called by a socket upcall function during the
3486 * time the socket is freed. The socket buffer would have been
3487 * locked across the upcall, therefore we cannot put this thread
3488 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3489 * we may livelock), because the lock on the socket buffer will
3490 * only be released when the upcall routine returns to its caller.
3491 * Because the socket has been officially closed, there can be
3492 * no further read on it.
3493 *
3494 * A multipath subflow socket would have its SS_NOFDREF set by
3495 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3496 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3497 */
3498 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3499 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3500 socket_unlock(so, 1);
3501 return 0;
3502 }
3503
3504 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3505 if (error) {
3506 socket_unlock(so, 1);
3507 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3508 0, 0, 0, 0);
3509 if (en_tracing) {
3510 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3511 VM_KERNEL_ADDRPERM(so), 0,
3512 (int64_t)(orig_resid - uio_resid(uio)));
3513 }
3514 return error;
3515 }
3516
3517 m = so->so_rcv.sb_mb;
3518 if (so_should_wait(so, uio, m, flags)) {
3519 /*
3520 * Panic if we notice inconsistencies in the socket's
3521 * receive list; both sb_mb and sb_cc should correctly
3522 * reflect the contents of the list, otherwise we may
3523 * end up with false positives during select() or poll()
3524 * which could put the application in a bad state.
3525 */
3526 SB_MB_CHECK(&so->so_rcv);
3527
3528 if (so->so_error) {
3529 if (m != NULL) {
3530 goto dontblock;
3531 }
3532 error = so->so_error;
3533 if ((flags & MSG_PEEK) == 0) {
3534 so->so_error = 0;
3535 }
3536 goto release;
3537 }
3538 if (so->so_state & SS_CANTRCVMORE) {
3539 #if CONTENT_FILTER
3540 /*
3541 * Deal with half closed connections
3542 */
3543 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3544 cfil_sock_data_pending(&so->so_rcv) != 0) {
3545 CFIL_LOG(LOG_INFO,
3546 "so %llx ignore SS_CANTRCVMORE",
3547 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3548 } else
3549 #endif /* CONTENT_FILTER */
3550 if (m != NULL) {
3551 goto dontblock;
3552 } else {
3553 goto release;
3554 }
3555 }
3556 for (; m != NULL; m = m->m_next) {
3557 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3558 m = so->so_rcv.sb_mb;
3559 goto dontblock;
3560 }
3561 }
3562 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3563 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3564 error = ENOTCONN;
3565 goto release;
3566 }
3567 if (uio_resid(uio) == 0) {
3568 goto release;
3569 }
3570
3571 if ((so->so_state & SS_NBIO) ||
3572 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3573 error = EWOULDBLOCK;
3574 goto release;
3575 }
3576 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3577 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3578 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3579 #if EVEN_MORE_LOCKING_DEBUG
3580 if (socket_debug) {
3581 printf("Waiting for socket data\n");
3582 }
3583 #endif
3584
3585 /*
3586 * Depending on the protocol (e.g. TCP), the following
3587 * might cause the socket lock to be dropped and later
3588 * be reacquired, and more data could have arrived and
3589 * have been appended to the receive socket buffer by
3590 * the time it returns. Therefore, we sleep in
3591 * sbwait() below only if the wait condition is still
3592 * true.
3593 */
3594 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3595 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3596 }
3597
3598 error = 0;
3599 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3600 error = sbwait(&so->so_rcv);
3601 }
3602
3603 #if EVEN_MORE_LOCKING_DEBUG
3604 if (socket_debug) {
3605 printf("SORECEIVE - sbwait returned %d\n", error);
3606 }
3607 #endif
3608 if (so->so_usecount < 1) {
3609 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3610 __func__, so, so->so_usecount);
3611 /* NOTREACHED */
3612 }
3613 if (error) {
3614 socket_unlock(so, 1);
3615 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3616 0, 0, 0, 0);
3617 if (en_tracing) {
3618 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3619 VM_KERNEL_ADDRPERM(so), 0,
3620 (int64_t)(orig_resid - uio_resid(uio)));
3621 }
3622 return error;
3623 }
3624 goto restart;
3625 }
3626 dontblock:
3627 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3628 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3629 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3630 nextrecord = m->m_nextpkt;
3631
3632 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3633 error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
3634 mp0 == NULL);
3635 if (error == ERESTART) {
3636 goto restart;
3637 } else if (error != 0) {
3638 goto release;
3639 }
3640 orig_resid = 0;
3641 }
3642
3643 /*
3644 * Process one or more MT_CONTROL mbufs present before any data mbufs
3645 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3646 * just copy the data; if !MSG_PEEK, we call into the protocol to
3647 * perform externalization.
3648 */
3649 if (m != NULL && m->m_type == MT_CONTROL) {
3650 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3651 if (error != 0) {
3652 goto release;
3653 }
3654 orig_resid = 0;
3655 }
3656
3657 if (m != NULL) {
3658 if (!(flags & MSG_PEEK)) {
3659 /*
3660 * We get here because m points to an mbuf following
3661 * any MT_SONAME or MT_CONTROL mbufs which have been
3662 * processed above. In any case, m should be pointing
3663 * to the head of the mbuf chain, and the nextrecord
3664 * should be either NULL or equal to m->m_nextpkt.
3665 * See comments above about SB_LOCK.
3666 */
3667 if (m != so->so_rcv.sb_mb ||
3668 m->m_nextpkt != nextrecord) {
3669 panic("%s: post-control !sync so=%p m=%p "
3670 "nextrecord=%p\n", __func__, so, m,
3671 nextrecord);
3672 /* NOTREACHED */
3673 }
3674 if (nextrecord == NULL) {
3675 so->so_rcv.sb_lastrecord = m;
3676 }
3677 }
3678 type = m->m_type;
3679 if (type == MT_OOBDATA) {
3680 flags |= MSG_OOB;
3681 }
3682 } else {
3683 if (!(flags & MSG_PEEK)) {
3684 SB_EMPTY_FIXUP(&so->so_rcv);
3685 }
3686 }
3687 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3688 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3689
3690 moff = 0;
3691 offset = 0;
3692
3693 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3694 can_delay = 1;
3695 } else {
3696 can_delay = 0;
3697 }
3698
3699 while (m != NULL &&
3700 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3701 if (m->m_type == MT_OOBDATA) {
3702 if (type != MT_OOBDATA) {
3703 break;
3704 }
3705 } else if (type == MT_OOBDATA) {
3706 break;
3707 }
3708
3709 if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
3710 break;
3711 }
3712 /*
3713 * Make sure to always set the MSG_OOB flag when getting
3714 * out-of-band data inline.
3715 */
3716 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3717 (so->so_options & SO_OOBINLINE) != 0 &&
3718 (so->so_state & SS_RCVATMARK) != 0) {
3719 flags |= MSG_OOB;
3720 }
3721 so->so_state &= ~SS_RCVATMARK;
3722 len = uio_resid(uio) - delayed_copy_len;
3723 if (so->so_oobmark && len > so->so_oobmark - offset) {
3724 len = so->so_oobmark - offset;
3725 }
3726 if (len > m->m_len - moff) {
3727 len = m->m_len - moff;
3728 }
3729 /*
3730 * If mp is set, just pass back the mbufs.
3731 * Otherwise copy them out via the uio, then free.
3732 * Sockbuf must be consistent here (points to current mbuf,
3733 * it points to next record) when we drop priority;
3734 * we must note any additions to the sockbuf when we
3735 * block interrupts again.
3736 */
3737 if (mp == NULL) {
3738 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3739 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3740 if (can_delay && len == m->m_len) {
3741 /*
3742 * Only delay the copy if we're consuming the
3743 * mbuf and we're NOT in MSG_PEEK mode
3744 * and we have enough data to make it worthwhile
3745 * to drop and retake the lock... can_delay
3746 * reflects the state of the two latter
3747 * constraints; moff should always be zero
3748 * in these cases.
3749 */
3750 delayed_copy_len += len;
3751 } else {
3752 if (delayed_copy_len) {
3753 error = sodelayed_copy(so, uio,
3754 &free_list, &delayed_copy_len);
3755
3756 if (error) {
3757 goto release;
3758 }
3759 /*
3760 * We can only get here if MSG_PEEK is not
3761 * set; therefore, m should point at the
3762 * head of the rcv queue. If it doesn't,
3763 * it means something drastically
3764 * changed while we were out from behind
3765 * the lock in sodelayed_copy, perhaps
3766 * a RST on the stream. In any event,
3767 * the stream has been interrupted. It's
3768 * probably best just to return whatever
3769 * data we've moved and let the caller
3770 * sort it out...
3771 */
3772 if (m != so->so_rcv.sb_mb) {
3773 break;
3774 }
3775 }
3776 socket_unlock(so, 0);
3777 error = uiomove(mtod(m, caddr_t) + moff,
3778 (int)len, uio);
3779 socket_lock(so, 0);
3780
3781 if (error) {
3782 goto release;
3783 }
3784 }
3785 } else {
3786 uio_setresid(uio, (uio_resid(uio) - len));
3787 }
3788 if (len == m->m_len - moff) {
3789 if (m->m_flags & M_EOR) {
3790 flags |= MSG_EOR;
3791 }
3792 if (flags & MSG_PEEK) {
3793 m = m->m_next;
3794 moff = 0;
3795 } else {
3796 nextrecord = m->m_nextpkt;
3797 sbfree(&so->so_rcv, m);
3798 m->m_nextpkt = NULL;
3799
3800 if (mp != NULL) {
3801 *mp = m;
3802 mp = &m->m_next;
3803 so->so_rcv.sb_mb = m = m->m_next;
3804 *mp = NULL;
3805 } else {
3806 if (free_list == NULL) {
3807 free_list = m;
3808 } else {
3809 ml->m_next = m;
3810 }
3811 ml = m;
3812 so->so_rcv.sb_mb = m = m->m_next;
3813 ml->m_next = NULL;
3814 }
3815 if (m != NULL) {
3816 m->m_nextpkt = nextrecord;
3817 if (nextrecord == NULL) {
3818 so->so_rcv.sb_lastrecord = m;
3819 }
3820 } else {
3821 so->so_rcv.sb_mb = nextrecord;
3822 SB_EMPTY_FIXUP(&so->so_rcv);
3823 }
3824 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3825 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3826 }
3827 } else {
3828 if (flags & MSG_PEEK) {
3829 moff += len;
3830 } else {
3831 if (mp != NULL) {
3832 int copy_flag;
3833
3834 if (flags & MSG_DONTWAIT) {
3835 copy_flag = M_DONTWAIT;
3836 } else {
3837 copy_flag = M_WAIT;
3838 }
3839 *mp = m_copym(m, 0, (int)len, copy_flag);
3840 /*
3841 * Failed to allocate an mbuf?
3842 * Adjust uio_resid back, it was
3843 * adjusted down by len bytes which
3844 * we didn't copy over.
3845 */
3846 if (*mp == NULL) {
3847 uio_setresid(uio,
3848 (uio_resid(uio) + len));
3849 break;
3850 }
3851 }
3852 m->m_data += len;
3853 m->m_len -= len;
3854 so->so_rcv.sb_cc -= len;
3855 }
3856 }
3857 if (so->so_oobmark) {
3858 if ((flags & MSG_PEEK) == 0) {
3859 so->so_oobmark -= len;
3860 if (so->so_oobmark == 0) {
3861 so->so_state |= SS_RCVATMARK;
3862 break;
3863 }
3864 } else {
3865 offset += len;
3866 if (offset == so->so_oobmark) {
3867 break;
3868 }
3869 }
3870 }
3871 if (flags & MSG_EOR) {
3872 break;
3873 }
3874 /*
3875 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3876 * (for a non-atomic socket), we must not quit until
3877 * "uio->uio_resid == 0" or an error termination.
3878 * If a signal/timeout occurs, return with a short
3879 * count but without error. Keep sockbuf locked
3880 * against other readers.
3881 */
3882 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3883 (uio_resid(uio) - delayed_copy_len) > 0 &&
3884 !sosendallatonce(so) && !nextrecord) {
3885 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3886 #if CONTENT_FILTER
3887 && cfil_sock_data_pending(&so->so_rcv) == 0
3888 #endif /* CONTENT_FILTER */
3889 )) {
3890 goto release;
3891 }
3892
3893 /*
3894 * Depending on the protocol (e.g. TCP), the following
3895 * might cause the socket lock to be dropped and later
3896 * be reacquired, and more data could have arrived and
3897 * have been appended to the receive socket buffer by
3898 * the time it returns. Therefore, we sleep in
3899 * sbwait() below if and only if the socket buffer is
3900 * empty, in order to avoid a false sleep.
3901 */
3902 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3903 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3904 }
3905
3906 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3907 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3908
3909 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3910 error = 0;
3911 goto release;
3912 }
3913 /*
3914 * We have to wait until after we get back from the sbwait
3915 * to do the copy, because we will drop the lock if we
3916 * have enough data that has been delayed. By dropping
3917 * the lock we open up a window allowing the netisr
3918 * thread to process the incoming packets and to change
3919 * the state of this socket. We're issuing the sbwait
3920 * because the socket is empty and we're expecting the
3921 * netisr thread to wake us up when more packets arrive;
3922 * if we allowed that processing to happen first and then
3923 * called sbwait, we could stall forever with packets
3924 * sitting in the socket if no further packets arrive
3925 * from the remote side.
3926 *
3927 * We want to copy before we've collected all the data
3928 * to satisfy this request, to allow the copy to overlap
3929 * the incoming packet processing on an MP system.
3930 */
3931 if (delayed_copy_len > sorecvmincopy &&
3932 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3933 error = sodelayed_copy(so, uio,
3934 &free_list, &delayed_copy_len);
3935
3936 if (error) {
3937 goto release;
3938 }
3939 }
3940 m = so->so_rcv.sb_mb;
3941 if (m != NULL) {
3942 nextrecord = m->m_nextpkt;
3943 }
3944 SB_MB_CHECK(&so->so_rcv);
3945 }
3946 }
3947 #ifdef MORE_LOCKING_DEBUG
3948 if (so->so_usecount <= 1) {
3949 panic("%s: after big while so=%p ref=%d on socket",
3950 __func__, so, so->so_usecount);
3951 /* NOTREACHED */
3952 }
3953 #endif
3954
3955 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3956 if (so->so_options & SO_DONTTRUNC) {
3957 flags |= MSG_RCVMORE;
3958 } else {
3959 flags |= MSG_TRUNC;
3960 if ((flags & MSG_PEEK) == 0) {
3961 (void) sbdroprecord(&so->so_rcv);
3962 }
3963 }
3964 }
3965
3966 /*
3967 * pru_rcvd below (for TCP) may cause more data to be received
3968 * if the socket lock is dropped prior to sending the ACK; some
3969 * legacy OpenTransport applications don't handle this well
3970 * (if they receive less data than requested while MSG_HAVEMORE
3971 * is set), and so we set the flag now based on what we know
3972 * prior to calling pru_rcvd.
3973 */
3974 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3975 flags |= MSG_HAVEMORE;
3976 }
3977
3978 if ((flags & MSG_PEEK) == 0) {
3979 if (m == NULL) {
3980 so->so_rcv.sb_mb = nextrecord;
3981 /*
3982 * First part is an inline SB_EMPTY_FIXUP(). Second
3983 * part makes sure sb_lastrecord is up-to-date if
3984 * there is still data in the socket buffer.
3985 */
3986 if (so->so_rcv.sb_mb == NULL) {
3987 so->so_rcv.sb_mbtail = NULL;
3988 so->so_rcv.sb_lastrecord = NULL;
3989 } else if (nextrecord->m_nextpkt == NULL) {
3990 so->so_rcv.sb_lastrecord = nextrecord;
3991 }
3992 SB_MB_CHECK(&so->so_rcv);
3993 }
3994 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3995 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3996 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3997 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3998 }
3999 }
4000
4001 if (delayed_copy_len) {
4002 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4003 if (error) {
4004 goto release;
4005 }
4006 }
4007 if (free_list != NULL) {
4008 m_freem_list(free_list);
4009 free_list = NULL;
4010 }
4011
4012 if (orig_resid == uio_resid(uio) && orig_resid &&
4013 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4014 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4015 goto restart;
4016 }
4017
4018 if (flagsp != NULL) {
4019 *flagsp |= flags;
4020 }
4021 release:
4022 #ifdef MORE_LOCKING_DEBUG
4023 if (so->so_usecount <= 1) {
4024 panic("%s: release so=%p ref=%d on socket", __func__,
4025 so, so->so_usecount);
4026 /* NOTREACHED */
4027 }
4028 #endif
4029 if (delayed_copy_len) {
4030 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4031 }
4032
4033 if (free_list != NULL) {
4034 m_freem_list(free_list);
4035 }
4036
4037 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4038
4039 if (en_tracing) {
4040 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4041 VM_KERNEL_ADDRPERM(so),
4042 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4043 (int64_t)(orig_resid - uio_resid(uio)));
4044 }
4045 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4046 so->so_rcv.sb_cc, 0, error);
4047
4048 return error;
4049 }
4050
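/*
 * sodelayed_copy() copies out mbufs that soreceive() above has already
 * unlinked from the receive buffer and accumulated on *free_list. The
 * socket lock is dropped around the uiomove() calls so that input
 * processing can proceed concurrently; the whole list is then freed
 * and *resid is reset to 0.
 */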
4051 /*
4052 * Returns: 0 Success
4053 * uiomove:EFAULT
4054 */
4055 static int
4056 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4057 user_ssize_t *resid)
4058 {
4059 int error = 0;
4060 struct mbuf *m;
4061
4062 m = *free_list;
4063
4064 socket_unlock(so, 0);
4065
4066 while (m != NULL && error == 0) {
4067 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4068 m = m->m_next;
4069 }
4070 m_freem_list(*free_list);
4071
4072 *free_list = NULL;
4073 *resid = 0;
4074
4075 socket_lock(so, 0);
4076
4077 return error;
4078 }
4079
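/*
 * soreceive_m_list() delivers whole received packets to the caller as a
 * list of mbuf chains linked through m_nextpkt. On input, *pktcntp caps
 * the number of packets to return (it must be between 1 and
 * SO_MAX_MSG_X); on return it holds the number of packets actually
 * delivered. Any MT_SONAME or MT_CONTROL mbufs are handed back through
 * *maddrp and *controlp when those pointers are non-NULL, and are freed
 * otherwise.
 */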
4080 int
4081 soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
4082 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
4083 {
4084 struct mbuf *m, **mp;
4085 struct mbuf *nextrecord;
4086 int flags, error;
4087 struct protosw *pr = so->so_proto;
4088 struct proc *p = current_proc();
4089 u_int npkts = 0;
4090 struct mbuf *free_list = NULL;
4091 int sblocked = 0;
4092
4093 /*
4094 * Sanity check on the parameters passed by the caller
4095 */
4096 if (mp0 == NULL || pktcntp == NULL) {
4097 return EINVAL;
4098 }
4099 if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
4100 return EINVAL;
4101 }
4102
4103 mp = mp0;
4104 *mp0 = NULL;
4105 if (controlp != NULL) {
4106 *controlp = NULL;
4107 }
4108 if (maddrp != NULL) {
4109 *maddrp = NULL;
4110 }
4111 if (flagsp != NULL) {
4112 flags = *flagsp;
4113 } else {
4114 flags = 0;
4115 }
4116
4117 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
4118 *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
4119 so->so_rcv.sb_hiwat);
4120
4121 socket_lock(so, 1);
4122 so_update_last_owner_locked(so, p);
4123 so_update_policy(so);
4124
4125 #if NECP
4126 so_update_necp_policy(so, NULL, NULL);
4127 #endif /* NECP */
4128
4129 /*
4130 * If a recv attempt is made on a previously-accepted socket
4131 * that has been marked as inactive (disconnected), reject
4132 * the request.
4133 */
4134 if (so->so_flags & SOF_DEFUNCT) {
4135 struct sockbuf *sb = &so->so_rcv;
4136
4137 error = ENOTCONN;
4138 SODEFUNCTLOG("%s[%d, %s]: defunct so %llu [%d,%d] (%d)\n",
4139 __func__, proc_pid(p), proc_best_name(p),
4140 so->so_gencnt,
4141 SOCK_DOM(so), SOCK_TYPE(so), error);
4142 /*
4143 * This socket should have been disconnected and flushed
4144 * prior to being returned from sodefunct(); there should
4145 * be no data on its receive list, so panic otherwise.
4146 */
4147 if (so->so_state & SS_DEFUNCT) {
4148 sb_empty_assert(sb, __func__);
4149 }
4150 goto release;
4151 }
4152
4153 *mp = NULL;
4154
4155 restart:
4156 /*
4157 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4158 * and if so just return to the caller. This could happen when
4159 * soreceive() is called by a socket upcall function during the
4160 * time the socket is freed. The socket buffer would have been
4161 * locked across the upcall, therefore we cannot put this thread
4162 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4163 * we may livelock), because the lock on the socket buffer will
4164 * only be released when the upcall routine returns to its caller.
4165 * Because the socket has been officially closed, there can be
4166 * no further read on it.
4167 */
4168 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4169 (SS_NOFDREF | SS_CANTRCVMORE)) {
4170 error = 0;
4171 goto out;
4172 }
4173
4174 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4175 if (error) {
4176 goto out;
4177 }
4178 sblocked = 1;
4179
4180 m = so->so_rcv.sb_mb;
4181 /*
4182 * Block awaiting more datagrams if needed
4183 */
4184 if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
4185 so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
4186 /*
4187 * Panic if we notice inconsistencies in the socket's
4188 * receive list; both sb_mb and sb_cc should correctly
4189 * reflect the contents of the list, otherwise we may
4190 * end up with false positives during select() or poll()
4191 * which could put the application in a bad state.
4192 */
4193 SB_MB_CHECK(&so->so_rcv);
4194
4195 if (so->so_error) {
4196 if (m != NULL) {
4197 goto dontblock;
4198 }
4199 error = so->so_error;
4200 if ((flags & MSG_PEEK) == 0) {
4201 so->so_error = 0;
4202 }
4203 goto release;
4204 }
4205 if (so->so_state & SS_CANTRCVMORE) {
4206 if (m != NULL) {
4207 goto dontblock;
4208 } else {
4209 goto release;
4210 }
4211 }
4212 for (; m != NULL; m = m->m_next) {
4213 if (m->m_flags & M_EOR) {
4214 m = so->so_rcv.sb_mb;
4215 goto dontblock;
4216 }
4217 }
4218 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4219 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4220 error = ENOTCONN;
4221 goto release;
4222 }
4223 if ((so->so_state & SS_NBIO) ||
4224 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4225 error = EWOULDBLOCK;
4226 goto release;
4227 }
4228 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4229 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4230
4231 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4232 sblocked = 0;
4233
4234 error = sbwait(&so->so_rcv);
4235 if (error != 0) {
4236 goto release;
4237 }
4238 goto restart;
4239 }
4240 dontblock:
4241 m = so->so_rcv.sb_mb;
4242 if (m == NULL) {
4243 goto release;
4244 }
4245
4246 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4247 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4248 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4249 nextrecord = m->m_nextpkt;
4250
4251 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4252 struct mbuf *maddr = NULL;
4253
4254 error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
4255 &nextrecord, 1);
4256 if (error == ERESTART) {
4257 goto restart;
4258 } else if (error != 0) {
4259 goto release;
4260 }
4261
4262 if (maddr != NULL) {
4263 maddr->m_nextpkt = NULL;
4264 maddr->m_next = NULL;
4265 if (maddrp != NULL) {
4266 *maddrp = maddr;
4267 maddrp = &maddr->m_nextpkt;
4268 } else {
4269 maddr->m_next = free_list;
4270 free_list = maddr;
4271 }
4272 }
4273 }
4274
4275 /*
4276 * Process one or more MT_CONTROL mbufs present before any data mbufs
4277 * in the first mbuf chain on the socket buffer.
4278 * We call into the protocol to perform externalization.
4279 */
4280 if (m != NULL && m->m_type == MT_CONTROL) {
4281 struct mbuf *control = NULL;
4282
4283 error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
4284 if (error != 0) {
4285 goto release;
4286 }
4287 if (control != NULL) {
4288 control->m_nextpkt = NULL;
4289 control->m_next = NULL;
4290 if (controlp != NULL) {
4291 *controlp = control;
4292 controlp = &control->m_nextpkt;
4293 } else {
4294 control->m_next = free_list;
4295 free_list = control;
4296 }
4297 }
4298 }
4299
4300 /*
4301 * Link the packet to the list
4302 */
4303 if (m != NULL) {
4304 if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
4305 panic("%s: m %p unexpected m_type %d", __func__, m, m->m_type);
4306 }
4307 m->m_nextpkt = NULL;
4308 *mp = m;
4309 mp = &m->m_nextpkt;
4310 }
4311 while (m != NULL) {
4312 sbfree(&so->so_rcv, m);
4313
4314 m = m->m_next;
4315 }
4316
4317 so->so_rcv.sb_mb = nextrecord;
4318 /*
4319 * First part is an inline SB_EMPTY_FIXUP(). Second
4320 * part makes sure sb_lastrecord is up-to-date if
4321 * there is still data in the socket buffer.
4322 */
4323 if (so->so_rcv.sb_mb == NULL) {
4324 so->so_rcv.sb_mbtail = NULL;
4325 so->so_rcv.sb_lastrecord = NULL;
4326 } else if (nextrecord->m_nextpkt == NULL) {
4327 so->so_rcv.sb_lastrecord = nextrecord;
4328 }
4329 SB_MB_CHECK(&so->so_rcv);
4330
4331 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4332 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4333
4334 npkts += 1;
4335
4336 /*
4337 * We continue as long as we have fewer packets
4338 * than requested and the socket buffer is not empty.
4339 */
4340 if (npkts < *pktcntp) {
4341 if (so->so_rcv.sb_mb != NULL) {
4342 goto dontblock;
4343 }
4344 if ((flags & MSG_WAITALL) != 0) {
4345 goto restart;
4346 }
4347 }
4348
4349 if (flagsp != NULL) {
4350 *flagsp |= flags;
4351 }
4352
4353 release:
4354 /*
4355 * pru_rcvd may cause more data to be received if the socket lock
4356 * is dropped so we set MSG_HAVEMORE now based on what we know.
4357 * That way the caller won't be surprised if it receives less data
4358 * than requested.
4359 */
4360 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4361 flags |= MSG_HAVEMORE;
4362 }
4363
4364 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
4365 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4366 }
4367
4368 if (sblocked) {
4369 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4370 } else {
4371 socket_unlock(so, 1);
4372 }
4373
4374 out:
4375 *pktcntp = npkts;
4376 /*
4377 * Amortize the cost of freeing the mbufs
4378 */
4379 if (free_list != NULL) {
4380 m_freem_list(free_list);
4381 }
4382
4383 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4384 0, 0, 0, 0);
4385 return error;
4386 }
4387
4388 static int
4389 so_statistics_event_to_nstat_event(int64_t *input_options,
4390 uint64_t *nstat_event)
4391 {
4392 int error = 0;
4393 switch (*input_options) {
4394 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4395 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4396 break;
4397 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4398 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4399 break;
4400 case SO_STATISTICS_EVENT_ATTRIBUTION_CHANGE:
4401 *nstat_event = NSTAT_EVENT_SRC_ATTRIBUTION_CHANGE;
4402 break;
4403 #if (DEBUG || DEVELOPMENT)
4404 case SO_STATISTICS_EVENT_RESERVED_2:
4405 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4406 break;
4407 #endif /* (DEBUG || DEVELOPMENT) */
4408 default:
4409 error = EINVAL;
4410 break;
4411 }
4412 return error;
4413 }
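/*
 * The translation above is consumed by the SO_STATISTICS_EVENT handler
 * in sosetoptlock() below, which forwards the resulting event to
 * nstat_pcb_event().
 */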
4414
4415 /*
4416 * Returns: 0 Success
4417 * EINVAL
4418 * ENOTCONN
4419 * <pru_shutdown>:EINVAL
4420 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4421 * <pru_shutdown>:ENOBUFS[TCP]
4422 * <pru_shutdown>:EMSGSIZE[TCP]
4423 * <pru_shutdown>:EHOSTUNREACH[TCP]
4424 * <pru_shutdown>:ENETUNREACH[TCP]
4425 * <pru_shutdown>:ENETDOWN[TCP]
4426 * <pru_shutdown>:ENOMEM[TCP]
4427 * <pru_shutdown>:EACCES[TCP]
4428 * <pru_shutdown>:EMSGSIZE[TCP]
4429 * <pru_shutdown>:ENOBUFS[TCP]
4430 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4431 * <pru_shutdown>:??? [other protocol families]
4432 */
4433 int
4434 soshutdown(struct socket *so, int how)
4435 {
4436 int error;
4437
4438 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4439
4440 switch (how) {
4441 case SHUT_RD:
4442 case SHUT_WR:
4443 case SHUT_RDWR:
4444 socket_lock(so, 1);
4445 if ((so->so_state &
4446 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4447 error = ENOTCONN;
4448 } else {
4449 error = soshutdownlock(so, how);
4450 }
4451 socket_unlock(so, 1);
4452 break;
4453 default:
4454 error = EINVAL;
4455 break;
4456 }
4457
4458 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4459
4460 return error;
4461 }
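/*
 * soshutdown() is the backend of shutdown(2). For example (sketch,
 * where "s" is an arbitrary connected socket descriptor):
 *
 *	shutdown(s, SHUT_WR);
 *
 * disallows further sends on "s" while still permitting receives.
 */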
4462
4463 int
4464 soshutdownlock_final(struct socket *so, int how)
4465 {
4466 struct protosw *pr = so->so_proto;
4467 int error = 0;
4468
4469 sflt_notify(so, sock_evt_shutdown, &how);
4470
4471 if (how != SHUT_WR) {
4472 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4473 /* read already shut down */
4474 error = ENOTCONN;
4475 goto done;
4476 }
4477 sorflush(so);
4478 }
4479 if (how != SHUT_RD) {
4480 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4481 /* write already shut down */
4482 error = ENOTCONN;
4483 goto done;
4484 }
4485 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4486 }
4487 done:
4488 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4489 return error;
4490 }
4491
4492 int
4493 soshutdownlock(struct socket *so, int how)
4494 {
4495 int error = 0;
4496
4497 #if CONTENT_FILTER
4498 /*
4499 * A content filter may delay the actual shutdown until it
4500 * has processed the pending data
4501 */
4502 if (so->so_flags & SOF_CONTENT_FILTER) {
4503 error = cfil_sock_shutdown(so, &how);
4504 if (error == EJUSTRETURN) {
4505 error = 0;
4506 goto done;
4507 } else if (error != 0) {
4508 goto done;
4509 }
4510 }
4511 #endif /* CONTENT_FILTER */
4512
4513 error = soshutdownlock_final(so, how);
4514
4515 done:
4516 return error;
4517 }
4518
4519 void
4520 sowflush(struct socket *so)
4521 {
4522 struct sockbuf *sb = &so->so_snd;
4523
4524 /*
4525 * Obtain lock on the socket buffer (SB_LOCK). This is required
4526 * to prevent the socket buffer from being unexpectedly altered
4527 * while it is used by another thread in socket send/receive.
4528 *
4529 * sblock() must not fail here, hence the assertion.
4530 */
4531 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4532 VERIFY(sb->sb_flags & SB_LOCK);
4533
4534 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4535 sb->sb_flags |= SB_DROP;
4536 sb->sb_upcall = NULL;
4537 sb->sb_upcallarg = NULL;
4538
4539 sbunlock(sb, TRUE); /* keep socket locked */
4540
4541 selthreadclear(&sb->sb_sel);
4542 sbrelease(sb);
4543 }
4544
4545 void
4546 sorflush(struct socket *so)
4547 {
4548 struct sockbuf *sb = &so->so_rcv;
4549 struct protosw *pr = so->so_proto;
4550 struct sockbuf asb;
4551 #ifdef notyet
4552 lck_mtx_t *mutex_held;
4553 /*
4554 * XXX: This code is currently commented out, because we may get here
4555 * as part of sofreelastref(), and at that time, pr_getlock() may no
4556 * longer be able to return us the lock; this will be fixed in the future.
4557 */
4558 if (so->so_proto->pr_getlock != NULL) {
4559 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4560 } else {
4561 mutex_held = so->so_proto->pr_domain->dom_mtx;
4562 }
4563
4564 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4565 #endif /* notyet */
4566
4567 sflt_notify(so, sock_evt_flush_read, NULL);
4568
4569 socantrcvmore(so);
4570
4571 /*
4572 * Obtain lock on the socket buffer (SB_LOCK). This is required
4573 * to prevent the socket buffer from being unexpectedly altered
4574 * while it is used by another thread in socket send/receive.
4575 *
4576 * sblock() must not fail here, hence the assertion.
4577 */
4578 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4579 VERIFY(sb->sb_flags & SB_LOCK);
4580
4581 /*
4582 * Copy only the relevant fields from "sb" to "asb" which we
4583 * need for sbrelease() to function. In particular, skip
4584 * sb_sel as it contains the wait queue linkage, which would
4585 * wreak havoc if we were to issue selthreadclear() on "asb".
4586 * Make sure to not carry over SB_LOCK in "asb", as we need
4587 * to acquire it later as part of sbrelease().
4588 */
4589 bzero(&asb, sizeof(asb));
4590 asb.sb_cc = sb->sb_cc;
4591 asb.sb_hiwat = sb->sb_hiwat;
4592 asb.sb_mbcnt = sb->sb_mbcnt;
4593 asb.sb_mbmax = sb->sb_mbmax;
4594 asb.sb_ctl = sb->sb_ctl;
4595 asb.sb_lowat = sb->sb_lowat;
4596 asb.sb_mb = sb->sb_mb;
4597 asb.sb_mbtail = sb->sb_mbtail;
4598 asb.sb_lastrecord = sb->sb_lastrecord;
4599 asb.sb_so = sb->sb_so;
4600 asb.sb_flags = sb->sb_flags;
4601 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4602 asb.sb_flags |= SB_DROP;
4603
4604 /*
4605 * Ideally we'd bzero() these and preserve the ones we need;
4606 * but to do that we'd need to shuffle things around in the
4607 * sockbuf, and we can't do it now because there are KEXTS
4608 * that are directly referring to the socket structure.
4609 *
4610 * Setting SB_DROP acts as a barrier to prevent further appends.
4611 * Clearing SB_SEL is done for selthreadclear() below.
4612 */
4613 sb->sb_cc = 0;
4614 sb->sb_hiwat = 0;
4615 sb->sb_mbcnt = 0;
4616 sb->sb_mbmax = 0;
4617 sb->sb_ctl = 0;
4618 sb->sb_lowat = 0;
4619 sb->sb_mb = NULL;
4620 sb->sb_mbtail = NULL;
4621 sb->sb_lastrecord = NULL;
4622 sb->sb_timeo.tv_sec = 0;
4623 sb->sb_timeo.tv_usec = 0;
4624 sb->sb_upcall = NULL;
4625 sb->sb_upcallarg = NULL;
4626 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4627 sb->sb_flags |= SB_DROP;
4628
4629 sbunlock(sb, TRUE); /* keep socket locked */
4630
4631 /*
4632 * Note that selthreadclear() is called on the original "sb" and
4633 * not the local "asb" because of the way wait queue linkage is
4634 * implemented. Given that selwakeup() may be triggered, SB_SEL
4635 * should no longer be set (cleared above.)
4636 */
4637 selthreadclear(&sb->sb_sel);
4638
4639 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4640 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4641 }
4642
4643 sbrelease(&asb);
4644 }
4645
4646 /*
4647 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4648 * an additional variant to handle the case where the option value needs
4649 * to be some kind of integer, but not a specific size.
4650 * In addition to their use here, these functions are also called by the
4651 * protocol-level pr_ctloutput() routines.
4652 *
4653 * Returns: 0 Success
4654 * EINVAL
4655 * copyin:EFAULT
4656 */
4657 int
4658 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4659 {
4660 size_t valsize;
4661
4662 /*
4663 * If the user gives us more than we wanted, we ignore it,
4664 * but if we don't get the minimum length the caller
4665 * wants, we return EINVAL. On success, sopt->sopt_valsize
4666 * is set to however much we actually retrieved.
4667 */
4668 if ((valsize = sopt->sopt_valsize) < minlen) {
4669 return EINVAL;
4670 }
4671 if (valsize > len) {
4672 sopt->sopt_valsize = valsize = len;
4673 }
4674
4675 if (sopt->sopt_p != kernproc) {
4676 return copyin(sopt->sopt_val, buf, valsize);
4677 }
4678
4679 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4680 return 0;
4681 }
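/*
 * Typical use by a setsockopt handler, mirroring the integer-valued
 * SOL_SOCKET cases in sosetoptlock() below (sketch):
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error != 0)
 *		goto out;
 */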
4682
4683 /*
4684 * sooptcopyin_timeval
4685 * Copy in a timeval value to tv_p, taking into account whether
4686 * the calling process is 64-bit or 32-bit. The sanity-checking
4687 * code lives here so that we can verify the 64-bit tv_sec value before we lose
4688 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4689 */
4690 static int
4691 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4692 {
4693 int error;
4694
4695 if (proc_is64bit(sopt->sopt_p)) {
4696 struct user64_timeval tv64;
4697
4698 if (sopt->sopt_valsize < sizeof(tv64)) {
4699 return EINVAL;
4700 }
4701
4702 sopt->sopt_valsize = sizeof(tv64);
4703 if (sopt->sopt_p != kernproc) {
4704 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4705 if (error != 0) {
4706 return error;
4707 }
4708 } else {
4709 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4710 sizeof(tv64));
4711 }
4712 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4713 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4714 return EDOM;
4715 }
4716
4717 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4718 tv_p->tv_usec = tv64.tv_usec;
4719 } else {
4720 struct user32_timeval tv32;
4721
4722 if (sopt->sopt_valsize < sizeof(tv32)) {
4723 return EINVAL;
4724 }
4725
4726 sopt->sopt_valsize = sizeof(tv32);
4727 if (sopt->sopt_p != kernproc) {
4728 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4729 if (error != 0) {
4730 return error;
4731 }
4732 } else {
4733 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4734 sizeof(tv32));
4735 }
4736 #ifndef __LP64__
4737 /*
4738 * K64todo "comparison is always false due to
4739 * limited range of data type"
4740 */
4741 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4742 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4743 return EDOM;
4744 }
4745 #endif
4746 tv_p->tv_sec = tv32.tv_sec;
4747 tv_p->tv_usec = tv32.tv_usec;
4748 }
4749 return 0;
4750 }
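/*
 * sooptcopyin_timeval() is used by the SO_SNDTIMEO/SO_RCVTIMEO handlers
 * in sosetoptlock() below, roughly as follows (sketch):
 *
 *	struct timeval tv;
 *
 *	error = sooptcopyin_timeval(sopt, &tv);
 *	if (error == 0)
 *		so->so_rcv.sb_timeo = tv;
 */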
4751
4752 int
4753 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4754 boolean_t ignore_delegate)
4755 {
4756 kauth_cred_t cred = NULL;
4757 proc_t ep = PROC_NULL;
4758 uid_t uid;
4759 int error = 0;
4760
4761 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4762 ep = proc_find(so->e_pid);
4763 if (ep) {
4764 cred = kauth_cred_proc_ref(ep);
4765 }
4766 }
4767
4768 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4769
4770 /* uid is 0 for root */
4771 if (uid != 0 || !allow_root) {
4772 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4773 }
4774 if (cred) {
4775 kauth_cred_unref(&cred);
4776 }
4777 if (ep != PROC_NULL) {
4778 proc_rele(ep);
4779 }
4780
4781 return error;
4782 }
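/*
 * For example, the SO_AWDL_UNRESTRICTED handler in sosetoptlock() below
 * calls
 *
 *	soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL, false, false);
 *
 * so that, for a delegated socket (SOF_DELEGATED), the privilege check
 * is performed against the effective process's credential rather than
 * the opener's.
 */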
4783
4784 /*
4785 * Returns: 0 Success
4786 * EINVAL
4787 * ENOPROTOOPT
4788 * ENOBUFS
4789 * EDOM
4790 * sooptcopyin:EINVAL
4791 * sooptcopyin:EFAULT
4792 * sooptcopyin_timeval:EINVAL
4793 * sooptcopyin_timeval:EFAULT
4794 * sooptcopyin_timeval:EDOM
4795 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4796 * <pr_ctloutput>:???
4797 * sflt_attach_private:??? [whatever a filter author chooses]
4798 * <sf_setoption>:??? [whatever a filter author chooses]
4799 *
4800 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4801 * <sf_setoption> returns depend on what the filter author causes
4802 * their filter to return.
4803 */
4804 int
4805 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4806 {
4807 int error, optval;
4808 int64_t long_optval;
4809 struct linger l;
4810 struct timeval tv;
4811
4812 if (sopt->sopt_dir != SOPT_SET) {
4813 sopt->sopt_dir = SOPT_SET;
4814 }
4815
4816 if (dolock) {
4817 socket_lock(so, 1);
4818 }
4819
4820 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4821 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4822 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4823 /* the socket has been shut down, no more sockopts */
4824 error = EINVAL;
4825 goto out;
4826 }
4827
4828 error = sflt_setsockopt(so, sopt);
4829 if (error != 0) {
4830 if (error == EJUSTRETURN) {
4831 error = 0;
4832 }
4833 goto out;
4834 }
4835
4836 if (sopt->sopt_level != SOL_SOCKET) {
4837 if (so->so_proto != NULL &&
4838 so->so_proto->pr_ctloutput != NULL) {
4839 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4840 goto out;
4841 }
4842 error = ENOPROTOOPT;
4843 } else {
4844 /*
4845 * Allow socket-level (SOL_SOCKET) options to be filtered by
4846 * the protocol layer, if needed. A zero value returned from
4847 * the handler means use default socket-level processing as
4848 * done by the rest of this routine. Otherwise, any other
4849 * return value indicates that the option is unsupported.
4850 */
4851 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4852 pru_socheckopt(so, sopt)) != 0) {
4853 goto out;
4854 }
4855
4856 error = 0;
4857 switch (sopt->sopt_name) {
4858 case SO_LINGER:
4859 case SO_LINGER_SEC: {
4860 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4861 if (error != 0) {
4862 goto out;
4863 }
4864 /* Make sure to use sane values */
4865 if (sopt->sopt_name == SO_LINGER) {
4866 so->so_linger = (short)l.l_linger;
4867 } else {
4868 so->so_linger = (short)((long)l.l_linger * hz);
4869 }
4870 if (l.l_onoff != 0) {
4871 so->so_options |= SO_LINGER;
4872 } else {
4873 so->so_options &= ~SO_LINGER;
4874 }
4875 break;
4876 }
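/*
 * For example, from user space (sketch, "s" being a socket descriptor):
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
 *
 * lingers for 5 seconds (stored internally as 5 * hz), whereas plain
 * SO_LINGER stores l_linger unscaled.
 */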
4877 case SO_DEBUG:
4878 case SO_KEEPALIVE:
4879 case SO_DONTROUTE:
4880 case SO_USELOOPBACK:
4881 case SO_BROADCAST:
4882 case SO_REUSEADDR:
4883 case SO_REUSEPORT:
4884 case SO_OOBINLINE:
4885 case SO_TIMESTAMP:
4886 case SO_TIMESTAMP_MONOTONIC:
4887 case SO_TIMESTAMP_CONTINUOUS:
4888 case SO_DONTTRUNC:
4889 case SO_WANTMORE:
4890 case SO_WANTOOBFLAG:
4891 case SO_NOWAKEFROMSLEEP:
4892 case SO_NOAPNFALLBK:
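/*
 * Each of the above option names is itself a bit in so_options,
 * which is what allows the generic set/clear of the option below.
 */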
4893 error = sooptcopyin(sopt, &optval, sizeof(optval),
4894 sizeof(optval));
4895 if (error != 0) {
4896 goto out;
4897 }
4898 if (optval) {
4899 so->so_options |= sopt->sopt_name;
4900 } else {
4901 so->so_options &= ~sopt->sopt_name;
4902 }
4903 #if SKYWALK
4904 inp_update_netns_flags(so);
4905 #endif /* SKYWALK */
4906 break;
4907
4908 case SO_SNDBUF:
4909 case SO_RCVBUF:
4910 case SO_SNDLOWAT:
4911 case SO_RCVLOWAT:
4912 error = sooptcopyin(sopt, &optval, sizeof(optval),
4913 sizeof(optval));
4914 if (error != 0) {
4915 goto out;
4916 }
4917
4918 /*
4919 * Values < 1 make no sense for any of these
4920 * options, so disallow them.
4921 */
4922 if (optval < 1) {
4923 error = EINVAL;
4924 goto out;
4925 }
4926
4927 switch (sopt->sopt_name) {
4928 case SO_SNDBUF:
4929 case SO_RCVBUF: {
4930 struct sockbuf *sb =
4931 (sopt->sopt_name == SO_SNDBUF) ?
4932 &so->so_snd : &so->so_rcv;
4933 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4934 error = ENOBUFS;
4935 goto out;
4936 }
4937 sb->sb_flags |= SB_USRSIZE;
4938 sb->sb_flags &= ~SB_AUTOSIZE;
4939 sb->sb_idealsize = (u_int32_t)optval;
4940 break;
4941 }
4942 /*
4943 * Make sure the low-water is never greater than
4944 * the high-water.
4945 */
4946 case SO_SNDLOWAT: {
4947 int space = sbspace(&so->so_snd);
4948 uint32_t hiwat = so->so_snd.sb_hiwat;
4949
4950 if (so->so_snd.sb_flags & SB_UNIX) {
4951 struct unpcb *unp =
4952 (struct unpcb *)(so->so_pcb);
4953 if (unp != NULL &&
4954 unp->unp_conn != NULL) {
4955 struct socket *so2 = unp->unp_conn->unp_socket;
4956 hiwat += unp->unp_conn->unp_cc;
4957 space = sbspace(&so2->so_rcv);
4958 }
4959 }
4960
4961 so->so_snd.sb_lowat =
4962 (optval > hiwat) ?
4963 hiwat : optval;
4964
4965 if (space >= so->so_snd.sb_lowat) {
4966 sowwakeup(so);
4967 }
4968 break;
4969 }
4970 case SO_RCVLOWAT: {
4971 int64_t data_len;
4972 so->so_rcv.sb_lowat =
4973 (optval > so->so_rcv.sb_hiwat) ?
4974 so->so_rcv.sb_hiwat : optval;
4975 if (so->so_rcv.sb_flags & SB_UNIX) {
4976 struct unpcb *unp =
4977 (struct unpcb *)(so->so_pcb);
4978 if (unp != NULL &&
4979 unp->unp_conn != NULL) {
4980 struct socket *so2 = unp->unp_conn->unp_socket;
4981 data_len = so2->so_snd.sb_cc
4982 - so2->so_snd.sb_ctl;
4983 } else {
4984 data_len = so->so_rcv.sb_cc
4985 - so->so_rcv.sb_ctl;
4986 }
4987 } else {
4988 data_len = so->so_rcv.sb_cc
4989 - so->so_rcv.sb_ctl;
4990 }
4991
4992 if (data_len >= so->so_rcv.sb_lowat) {
4993 sorwakeup(so);
4994 }
4995 break;
4996 }
4997 }
4998 break;
4999
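/*
 * For example, from user space (sketch, "s" being a socket descriptor):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * makes blocking receives on "s" time out after 2 seconds.
 */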
5000 case SO_SNDTIMEO:
5001 case SO_RCVTIMEO:
5002 error = sooptcopyin_timeval(sopt, &tv);
5003 if (error != 0) {
5004 goto out;
5005 }
5006
5007 switch (sopt->sopt_name) {
5008 case SO_SNDTIMEO:
5009 so->so_snd.sb_timeo = tv;
5010 break;
5011 case SO_RCVTIMEO:
5012 so->so_rcv.sb_timeo = tv;
5013 break;
5014 }
5015 break;
5016
5017 case SO_NKE: {
5018 struct so_nke nke;
5019
5020 error = sooptcopyin(sopt, &nke, sizeof(nke),
5021 sizeof(nke));
5022 if (error != 0) {
5023 goto out;
5024 }
5025
5026 error = sflt_attach_internal(so, nke.nke_handle);
5027 break;
5028 }
5029
5030 case SO_NOSIGPIPE:
5031 error = sooptcopyin(sopt, &optval, sizeof(optval),
5032 sizeof(optval));
5033 if (error != 0) {
5034 goto out;
5035 }
5036 if (optval != 0) {
5037 so->so_flags |= SOF_NOSIGPIPE;
5038 } else {
5039 so->so_flags &= ~SOF_NOSIGPIPE;
5040 }
5041 break;
5042
5043 case SO_NOADDRERR:
5044 error = sooptcopyin(sopt, &optval, sizeof(optval),
5045 sizeof(optval));
5046 if (error != 0) {
5047 goto out;
5048 }
5049 if (optval != 0) {
5050 so->so_flags |= SOF_NOADDRAVAIL;
5051 } else {
5052 so->so_flags &= ~SOF_NOADDRAVAIL;
5053 }
5054 break;
5055
5056 case SO_REUSESHAREUID:
5057 error = sooptcopyin(sopt, &optval, sizeof(optval),
5058 sizeof(optval));
5059 if (error != 0) {
5060 goto out;
5061 }
5062 if (optval != 0) {
5063 so->so_flags |= SOF_REUSESHAREUID;
5064 } else {
5065 so->so_flags &= ~SOF_REUSESHAREUID;
5066 }
5067 break;
5068
5069 case SO_NOTIFYCONFLICT:
5070 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5071 error = EPERM;
5072 goto out;
5073 }
5074 error = sooptcopyin(sopt, &optval, sizeof(optval),
5075 sizeof(optval));
5076 if (error != 0) {
5077 goto out;
5078 }
5079 if (optval != 0) {
5080 so->so_flags |= SOF_NOTIFYCONFLICT;
5081 } else {
5082 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5083 }
5084 break;
5085
5086 case SO_RESTRICTIONS:
5087 error = sooptcopyin(sopt, &optval, sizeof(optval),
5088 sizeof(optval));
5089 if (error != 0) {
5090 goto out;
5091 }
5092
5093 error = so_set_restrictions(so, optval);
5094 break;
5095
5096 case SO_AWDL_UNRESTRICTED:
5097 if (SOCK_DOM(so) != PF_INET &&
5098 SOCK_DOM(so) != PF_INET6) {
5099 error = EOPNOTSUPP;
5100 goto out;
5101 }
5102 error = sooptcopyin(sopt, &optval, sizeof(optval),
5103 sizeof(optval));
5104 if (error != 0) {
5105 goto out;
5106 }
5107 if (optval != 0) {
5108 error = soopt_cred_check(so,
5109 PRIV_NET_RESTRICTED_AWDL, false, false);
5110 if (error == 0) {
5111 inp_set_awdl_unrestricted(
5112 sotoinpcb(so));
5113 }
5114 } else {
5115 inp_clear_awdl_unrestricted(sotoinpcb(so));
5116 }
5117 break;
5118 case SO_INTCOPROC_ALLOW:
5119 if (SOCK_DOM(so) != PF_INET6) {
5120 error = EOPNOTSUPP;
5121 goto out;
5122 }
5123 error = sooptcopyin(sopt, &optval, sizeof(optval),
5124 sizeof(optval));
5125 if (error != 0) {
5126 goto out;
5127 }
5128 if (optval != 0 &&
5129 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5130 error = soopt_cred_check(so,
5131 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5132 if (error == 0) {
5133 inp_set_intcoproc_allowed(
5134 sotoinpcb(so));
5135 }
5136 } else if (optval == 0) {
5137 inp_clear_intcoproc_allowed(sotoinpcb(so));
5138 }
5139 break;
5140
5141 case SO_LABEL:
5142 error = EOPNOTSUPP;
5143 break;
5144
5145 case SO_UPCALLCLOSEWAIT:
5146 error = sooptcopyin(sopt, &optval, sizeof(optval),
5147 sizeof(optval));
5148 if (error != 0) {
5149 goto out;
5150 }
5151 if (optval != 0) {
5152 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5153 } else {
5154 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5155 }
5156 break;
5157
5158 case SO_RANDOMPORT:
5159 error = sooptcopyin(sopt, &optval, sizeof(optval),
5160 sizeof(optval));
5161 if (error != 0) {
5162 goto out;
5163 }
5164 if (optval != 0) {
5165 so->so_flags |= SOF_BINDRANDOMPORT;
5166 } else {
5167 so->so_flags &= ~SOF_BINDRANDOMPORT;
5168 }
5169 break;
5170
5171 case SO_NP_EXTENSIONS: {
5172 struct so_np_extensions sonpx;
5173
5174 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5175 sizeof(sonpx));
5176 if (error != 0) {
5177 goto out;
5178 }
5179 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5180 error = EINVAL;
5181 goto out;
5182 }
5183 /*
5184 * Only one bit defined for now
5185 */
5186 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5187 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5188 so->so_flags |= SOF_NPX_SETOPTSHUT;
5189 } else {
5190 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5191 }
5192 }
5193 break;
5194 }
5195
5196 case SO_TRAFFIC_CLASS: {
5197 error = sooptcopyin(sopt, &optval, sizeof(optval),
5198 sizeof(optval));
5199 if (error != 0) {
5200 goto out;
5201 }
5202 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5203 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5204 error = so_set_net_service_type(so, netsvc);
5205 goto out;
5206 }
5207 error = so_set_traffic_class(so, optval);
5208 if (error != 0) {
5209 goto out;
5210 }
5211 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5212 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5213 break;
5214 }
5215
5216 case SO_RECV_TRAFFIC_CLASS: {
5217 error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 sizeof(optval));
5219 if (error != 0) {
5220 goto out;
5221 }
5222 if (optval == 0) {
5223 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5224 } else {
5225 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5226 }
5227 break;
5228 }
5229
5230 #if (DEVELOPMENT || DEBUG)
5231 case SO_TRAFFIC_CLASS_DBG: {
5232 struct so_tcdbg so_tcdbg;
5233
5234 error = sooptcopyin(sopt, &so_tcdbg,
5235 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5236 if (error != 0) {
5237 goto out;
5238 }
5239 error = so_set_tcdbg(so, &so_tcdbg);
5240 if (error != 0) {
5241 goto out;
5242 }
5243 break;
5244 }
5245 #endif /* (DEVELOPMENT || DEBUG) */
5246
5247 case SO_PRIVILEGED_TRAFFIC_CLASS:
5248 error = priv_check_cred(kauth_cred_get(),
5249 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5250 if (error != 0) {
5251 goto out;
5252 }
5253 error = sooptcopyin(sopt, &optval, sizeof(optval),
5254 sizeof(optval));
5255 if (error != 0) {
5256 goto out;
5257 }
5258 if (optval == 0) {
5259 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5260 } else {
5261 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5262 }
5263 break;
5264
5265 #if (DEVELOPMENT || DEBUG)
5266 case SO_DEFUNCTIT:
5267 error = sosetdefunct(current_proc(), so, 0, FALSE);
5268 if (error == 0) {
5269 error = sodefunct(current_proc(), so, 0);
5270 }
5271
5272 break;
5273 #endif /* (DEVELOPMENT || DEBUG) */
5274
5275 case SO_DEFUNCTOK:
5276 error = sooptcopyin(sopt, &optval, sizeof(optval),
5277 sizeof(optval));
5278 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5279 if (error == 0) {
5280 error = EBADF;
5281 }
5282 goto out;
5283 }
5284 /*
5285 * Any process can set SO_DEFUNCTOK (clear
5286 * SOF_NODEFUNCT), but only root can clear
5287 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5288 */
5289 if (optval == 0 &&
5290 kauth_cred_issuser(kauth_cred_get()) == 0) {
5291 error = EPERM;
5292 goto out;
5293 }
5294 if (optval) {
5295 so->so_flags &= ~SOF_NODEFUNCT;
5296 } else {
5297 so->so_flags |= SOF_NODEFUNCT;
5298 }
5299
5300 if (SOCK_DOM(so) == PF_INET ||
5301 SOCK_DOM(so) == PF_INET6) {
5302 char s[MAX_IPv6_STR_LEN];
5303 char d[MAX_IPv6_STR_LEN];
5304 struct inpcb *inp = sotoinpcb(so);
5305
5306 SODEFUNCTLOG("%s[%d, %s]: so %llu "
5307 "[%s %s:%d -> %s:%d] is now marked "
5308 "as %seligible for "
5309 "defunct\n", __func__, proc_selfpid(),
5310 proc_best_name(current_proc()),
5311 so->so_gencnt,
5312 (SOCK_TYPE(so) == SOCK_STREAM) ?
5313 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5314 ((SOCK_DOM(so) == PF_INET) ?
5315 (void *)&inp->inp_laddr.s_addr :
5316 (void *)&inp->in6p_laddr), s, sizeof(s)),
5317 ntohs(inp->in6p_lport),
5318 inet_ntop(SOCK_DOM(so),
5319 (SOCK_DOM(so) == PF_INET) ?
5320 (void *)&inp->inp_faddr.s_addr :
5321 (void *)&inp->in6p_faddr, d, sizeof(d)),
5322 ntohs(inp->in6p_fport),
5323 (so->so_flags & SOF_NODEFUNCT) ?
5324 "not " : "");
5325 } else {
5326 SODEFUNCTLOG("%s[%d, %s]: so %llu [%d,%d] "
5327 "is now marked as %seligible for "
5328 "defunct\n",
5329 __func__, proc_selfpid(),
5330 proc_best_name(current_proc()),
5331 so->so_gencnt,
5332 SOCK_DOM(so), SOCK_TYPE(so),
5333 (so->so_flags & SOF_NODEFUNCT) ?
5334 "not " : "");
5335 }
5336 break;
5337
5338 case SO_ISDEFUNCT:
5339 /* This option is not settable */
5340 error = EINVAL;
5341 break;
5342
5343 case SO_OPPORTUNISTIC:
5344 error = sooptcopyin(sopt, &optval, sizeof(optval),
5345 sizeof(optval));
5346 if (error == 0) {
5347 error = so_set_opportunistic(so, optval);
5348 }
5349 break;
5350
5351 case SO_FLUSH:
5352 /* This option is handled by lower layer(s) */
5353 error = 0;
5354 break;
5355
5356 case SO_RECV_ANYIF:
5357 error = sooptcopyin(sopt, &optval, sizeof(optval),
5358 sizeof(optval));
5359 if (error == 0) {
5360 error = so_set_recv_anyif(so, optval);
5361 }
5362 break;
5363
5364 case SO_TRAFFIC_MGT_BACKGROUND: {
5365 /* This option is handled by lower layer(s) */
5366 error = 0;
5367 break;
5368 }
5369
5370 #if FLOW_DIVERT
5371 case SO_FLOW_DIVERT_TOKEN:
5372 error = flow_divert_token_set(so, sopt);
5373 break;
5374 #endif /* FLOW_DIVERT */
5375
5376
5377 case SO_DELEGATED:
5378 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5379 sizeof(optval))) != 0) {
5380 break;
5381 }
5382
5383 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5384 break;
5385
5386 case SO_DELEGATED_UUID: {
5387 uuid_t euuid;
5388
5389 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5390 sizeof(euuid))) != 0) {
5391 break;
5392 }
5393
5394 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5395 break;
5396 }
5397
5398 #if NECP
5399 case SO_NECP_ATTRIBUTES:
5400 if (SOCK_DOM(so) == PF_MULTIPATH) {
5401 /* Handled by MPTCP itself */
5402 break;
5403 }
5404
5405 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5406 error = EINVAL;
5407 goto out;
5408 }
5409
5410 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5411 break;
5412
5413 case SO_NECP_CLIENTUUID: {
5414 if (SOCK_DOM(so) == PF_MULTIPATH) {
5415 /* Handled by MPTCP itself */
5416 break;
5417 }
5418
5419 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5420 error = EINVAL;
5421 goto out;
5422 }
5423
5424 struct inpcb *inp = sotoinpcb(so);
5425 if (!uuid_is_null(inp->necp_client_uuid)) {
5426 // Clear out the old client UUID if present
5427 necp_inpcb_remove_cb(inp);
5428 }
5429
5430 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5431 sizeof(uuid_t), sizeof(uuid_t));
5432 if (error != 0) {
5433 goto out;
5434 }
5435
5436 if (uuid_is_null(inp->necp_client_uuid)) {
5437 error = EINVAL;
5438 goto out;
5439 }
5440
5441 pid_t current_pid = proc_pid(current_proc());
5442 error = necp_client_register_socket_flow(current_pid,
5443 inp->necp_client_uuid, inp);
5444 if (error != 0) {
5445 uuid_clear(inp->necp_client_uuid);
5446 goto out;
5447 }
5448
5449 if (inp->inp_lport != 0) {
5450 // There is a bound local port, so this is not
5451 // a fresh socket. Assign to the client.
5452 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5453 }
5454
5455 break;
5456 }
5457 case SO_NECP_LISTENUUID: {
5458 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5459 error = EINVAL;
5460 goto out;
5461 }
5462
5463 struct inpcb *inp = sotoinpcb(so);
5464 if (!uuid_is_null(inp->necp_client_uuid)) {
5465 error = EINVAL;
5466 goto out;
5467 }
5468
5469 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5470 sizeof(uuid_t), sizeof(uuid_t));
5471 if (error != 0) {
5472 goto out;
5473 }
5474
5475 if (uuid_is_null(inp->necp_client_uuid)) {
5476 error = EINVAL;
5477 goto out;
5478 }
5479
5480 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5481 inp->necp_client_uuid, inp);
5482 if (error != 0) {
5483 uuid_clear(inp->necp_client_uuid);
5484 goto out;
5485 }
5486
5487 // Mark that the port registration is held by NECP
5488 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5489
5490 break;
5491 }
5492
5493 case SO_RESOLVER_SIGNATURE: {
5494 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5495 error = EINVAL;
5496 goto out;
5497 }
5498 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5499 break;
5500 }
5501 #endif /* NECP */
5502
5503 case SO_EXTENDED_BK_IDLE:
5504 error = sooptcopyin(sopt, &optval, sizeof(optval),
5505 sizeof(optval));
5506 if (error == 0) {
5507 error = so_set_extended_bk_idle(so, optval);
5508 }
5509 break;
5510
5511 case SO_MARK_CELLFALLBACK:
5512 error = sooptcopyin(sopt, &optval, sizeof(optval),
5513 sizeof(optval));
5514 if (error != 0) {
5515 goto out;
5516 }
5517 if (optval < 0) {
5518 error = EINVAL;
5519 goto out;
5520 }
5521 if (optval == 0) {
5522 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5523 } else {
5524 so->so_flags1 |= SOF1_CELLFALLBACK;
5525 }
5526 break;
5527
5528 case SO_MARK_CELLFALLBACK_UUID:
5529 {
5530 struct so_mark_cellfallback_uuid_args args;
5531
5532 error = sooptcopyin(sopt, &args, sizeof(args),
5533 sizeof(args));
5534 if (error != 0) {
5535 goto out;
5536 }
5537 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5538 args.flow_cellfallback);
5539 break;
5540 }
5541
5542 case SO_FALLBACK_MODE:
5543 error = sooptcopyin(sopt, &optval, sizeof(optval),
5544 sizeof(optval));
5545 if (error != 0) {
5546 goto out;
5547 }
5548 if (optval < SO_FALLBACK_MODE_NONE ||
5549 optval > SO_FALLBACK_MODE_PREFER) {
5550 error = EINVAL;
5551 goto out;
5552 }
5553 so->so_fallback_mode = (u_int8_t)optval;
5554 break;
5555
5556 case SO_MARK_KNOWN_TRACKER: {
5557 error = sooptcopyin(sopt, &optval, sizeof(optval),
5558 sizeof(optval));
5559 if (error != 0) {
5560 goto out;
5561 }
5562 if (optval < 0) {
5563 error = EINVAL;
5564 goto out;
5565 }
5566 if (optval == 0) {
5567 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5568 } else {
5569 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5570 }
5571 break;
5572 }
5573
5574 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5575 error = sooptcopyin(sopt, &optval, sizeof(optval),
5576 sizeof(optval));
5577 if (error != 0) {
5578 goto out;
5579 }
5580 if (optval < 0) {
5581 error = EINVAL;
5582 goto out;
5583 }
5584 if (optval == 0) {
5585 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5586 } else {
5587 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5588 }
5589 break;
5590 }
5591
5592 case SO_MARK_APPROVED_APP_DOMAIN: {
5593 error = sooptcopyin(sopt, &optval, sizeof(optval),
5594 sizeof(optval));
5595 if (error != 0) {
5596 goto out;
5597 }
5598 if (optval < 0) {
5599 error = EINVAL;
5600 goto out;
5601 }
5602 if (optval == 0) {
5603 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5604 } else {
5605 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5606 }
5607 break;
5608 }
5609
5610 case SO_STATISTICS_EVENT:
5611 error = sooptcopyin(sopt, &long_optval,
5612 sizeof(long_optval), sizeof(long_optval));
5613 if (error != 0) {
5614 goto out;
5615 }
5616 u_int64_t nstat_event = 0;
5617 error = so_statistics_event_to_nstat_event(
5618 &long_optval, &nstat_event);
5619 if (error != 0) {
5620 goto out;
5621 }
5622 nstat_pcb_event(sotoinpcb(so), nstat_event);
5623 break;
5624
5625 case SO_NET_SERVICE_TYPE: {
5626 error = sooptcopyin(sopt, &optval, sizeof(optval),
5627 sizeof(optval));
5628 if (error != 0) {
5629 goto out;
5630 }
5631 error = so_set_net_service_type(so, optval);
5632 break;
5633 }
5634
5635 case SO_QOSMARKING_POLICY_OVERRIDE:
5636 error = priv_check_cred(kauth_cred_get(),
5637 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5638 if (error != 0) {
5639 goto out;
5640 }
5641 error = sooptcopyin(sopt, &optval, sizeof(optval),
5642 sizeof(optval));
5643 if (error != 0) {
5644 goto out;
5645 }
5646 if (optval == 0) {
5647 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5648 } else {
5649 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5650 }
5651 break;
5652
5653 case SO_MPKL_SEND_INFO: {
5654 struct so_mpkl_send_info so_mpkl_send_info;
5655
5656 error = sooptcopyin(sopt, &so_mpkl_send_info,
5657 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5658 if (error != 0) {
5659 goto out;
5660 }
5661 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5662 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5663
5664 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5665 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5666 } else {
5667 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5668 }
5669 break;
5670 }
5671 case SO_WANT_KEV_SOCKET_CLOSED: {
5672 error = sooptcopyin(sopt, &optval, sizeof(optval),
5673 sizeof(optval));
5674 if (error != 0) {
5675 goto out;
5676 }
5677 if (optval == 0) {
5678 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5679 } else {
5680 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5681 }
5682 break;
5683 }
5684 case SO_MARK_WAKE_PKT: {
5685 error = sooptcopyin(sopt, &optval, sizeof(optval),
5686 sizeof(optval));
5687 if (error != 0) {
5688 goto out;
5689 }
5690 if (optval == 0) {
5691 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5692 } else {
5693 so->so_flags |= SOF_MARK_WAKE_PKT;
5694 }
5695 break;
5696 }
5697 case SO_RECV_WAKE_PKT: {
5698 error = sooptcopyin(sopt, &optval, sizeof(optval),
5699 sizeof(optval));
5700 if (error != 0) {
5701 goto out;
5702 }
5703 if (optval == 0) {
5704 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5705 } else {
5706 so->so_flags |= SOF_RECV_WAKE_PKT;
5707 }
5708 break;
5709 }
5710 case SO_APPLICATION_ID: {
5711 so_application_id_t application_id = { 0 };
5712
5713 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5714 error = EINVAL;
5715 goto out;
5716 }
5717 error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5718 sizeof(application_id));
5719 if (error != 0) {
5720 goto out;
5721 }
5722
5723 // The calling user's uid needs to match
5724 if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5725 error = EINVAL;
5726 printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5727 goto out;
5728 }
5729 error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5730 if (error != 0) {
5731 printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5732 goto out;
5733 }
5734 if (application_id.persona_id != PERSONA_ID_NONE) {
5735 so->so_persona_id = application_id.persona_id;
5736 }
5737 break;
5738 }
5739 case SO_MARK_DOMAIN_INFO_SILENT:
5740 error = sooptcopyin(sopt, &optval, sizeof(optval),
5741 sizeof(optval));
5742 if (error != 0) {
5743 goto out;
5744 }
5745 if (optval < 0) {
5746 error = EINVAL;
5747 goto out;
5748 }
5749 if (optval == 0) {
5750 so->so_flags1 &= ~SOF1_DOMAIN_INFO_SILENT;
5751 } else {
5752 so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
5753 }
5754 break;
5755
5756 default:
5757 error = ENOPROTOOPT;
5758 break;
5759 }
5760 if (error == 0 && so->so_proto != NULL &&
5761 so->so_proto->pr_ctloutput != NULL) {
5762 (void) so->so_proto->pr_ctloutput(so, sopt);
5763 }
5764 }
5765 out:
5766 if (dolock) {
5767 socket_unlock(so, 1);
5768 }
5769 return error;
5770 }
5771
5772 /* Helper routines for getsockopt */
5773 int
5774 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5775 {
5776 int error;
5777 size_t valsize;
5778
5779 error = 0;
5780
5781 /*
5782 * Documented get behavior is that we always return a value,
5783 * possibly truncated to fit in the user's buffer.
5784 * Traditional behavior is that we always tell the user
5785 * precisely how much we copied, rather than something useful
5786 * like the total amount we had available for her.
5787 * Note that this interface is not idempotent; the entire answer must
5788 * be generated ahead of time.
5789 */
5790 valsize = MIN(len, sopt->sopt_valsize);
5791 sopt->sopt_valsize = valsize;
5792 if (sopt->sopt_val != USER_ADDR_NULL) {
5793 if (sopt->sopt_p != kernproc) {
5794 error = copyout(buf, sopt->sopt_val, valsize);
5795 } else {
5796 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5797 }
5798 }
5799 return error;
5800 }
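/*
 * Typical use by a getsockopt handler, mirroring the integer-valued
 * cases in sogetoptlock() below (sketch):
 *
 *	optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */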
5801
5802 static int
5803 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5804 {
5805 int error;
5806 size_t len;
5807 struct user64_timeval tv64 = {};
5808 struct user32_timeval tv32 = {};
5809 const void * val;
5810 size_t valsize;
5811
5812 error = 0;
5813 if (proc_is64bit(sopt->sopt_p)) {
5814 len = sizeof(tv64);
5815 tv64.tv_sec = tv_p->tv_sec;
5816 tv64.tv_usec = tv_p->tv_usec;
5817 val = &tv64;
5818 } else {
5819 len = sizeof(tv32);
5820 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5821 tv32.tv_usec = tv_p->tv_usec;
5822 val = &tv32;
5823 }
5824 valsize = MIN(len, sopt->sopt_valsize);
5825 sopt->sopt_valsize = valsize;
5826 if (sopt->sopt_val != USER_ADDR_NULL) {
5827 if (sopt->sopt_p != kernproc) {
5828 error = copyout(val, sopt->sopt_val, valsize);
5829 } else {
5830 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5831 }
5832 }
5833 return error;
5834 }
5835
5836 /*
5837 * Return: 0 Success
5838 * ENOPROTOOPT
5839 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5840 * <pr_ctloutput>:???
5841 * <sf_getoption>:???
5842 */
5843 int
5844 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5845 {
5846 int error, optval;
5847 struct linger l;
5848 struct timeval tv;
5849
5850 if (sopt->sopt_dir != SOPT_GET) {
5851 sopt->sopt_dir = SOPT_GET;
5852 }
5853
5854 if (dolock) {
5855 socket_lock(so, 1);
5856 }
5857
5858 error = sflt_getsockopt(so, sopt);
5859 if (error != 0) {
5860 if (error == EJUSTRETURN) {
5861 error = 0;
5862 }
5863 goto out;
5864 }
5865
5866 if (sopt->sopt_level != SOL_SOCKET) {
5867 if (so->so_proto != NULL &&
5868 so->so_proto->pr_ctloutput != NULL) {
5869 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5870 goto out;
5871 }
5872 error = ENOPROTOOPT;
5873 } else {
5874 /*
5875 * Allow socket-level (SOL_SOCKET) options to be filtered by
5876 * the protocol layer, if needed. A zero value returned from
5877 * the handler means use default socket-level processing as
5878 * done by the rest of this routine. Otherwise, any other
5879 * return value indicates that the option is unsupported.
5880 */
5881 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5882 pru_socheckopt(so, sopt)) != 0) {
5883 goto out;
5884 }
5885
5886 error = 0;
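/*
 * The integer-valued cases below load "optval" and jump to the shared
 * "integer:" label, which copies the value out via sooptcopyout().
 */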
5887 switch (sopt->sopt_name) {
5888 case SO_LINGER:
5889 case SO_LINGER_SEC:
5890 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5891 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5892 so->so_linger : so->so_linger / hz;
5893 error = sooptcopyout(sopt, &l, sizeof(l));
5894 break;
5895
5896 case SO_USELOOPBACK:
5897 case SO_DONTROUTE:
5898 case SO_DEBUG:
5899 case SO_KEEPALIVE:
5900 case SO_REUSEADDR:
5901 case SO_REUSEPORT:
5902 case SO_BROADCAST:
5903 case SO_OOBINLINE:
5904 case SO_TIMESTAMP:
5905 case SO_TIMESTAMP_MONOTONIC:
5906 case SO_TIMESTAMP_CONTINUOUS:
5907 case SO_DONTTRUNC:
5908 case SO_WANTMORE:
5909 case SO_WANTOOBFLAG:
5910 case SO_NOWAKEFROMSLEEP:
5911 case SO_NOAPNFALLBK:
5912 optval = so->so_options & sopt->sopt_name;
5913 integer:
5914 error = sooptcopyout(sopt, &optval, sizeof(optval));
5915 break;
5916
5917 case SO_TYPE:
5918 optval = so->so_type;
5919 goto integer;
5920
5921 case SO_NREAD:
5922 if (so->so_proto->pr_flags & PR_ATOMIC) {
5923 int pkt_total;
5924 struct mbuf *m1;
5925
5926 pkt_total = 0;
5927 m1 = so->so_rcv.sb_mb;
5928 while (m1 != NULL) {
5929 if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
5930 pkt_total += m1->m_len;
5931 }
5932 m1 = m1->m_next;
5933 }
5934 optval = pkt_total;
5935 } else {
5936 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5937 }
5938 goto integer;
5939
5940 case SO_NUMRCVPKT:
5941 if (so->so_proto->pr_flags & PR_ATOMIC) {
5942 int cnt = 0;
5943 struct mbuf *m1;
5944
5945 m1 = so->so_rcv.sb_mb;
5946 while (m1 != NULL) {
5947 cnt += 1;
5948 m1 = m1->m_nextpkt;
5949 }
5950 optval = cnt;
5951 goto integer;
5952 } else {
5953 error = ENOPROTOOPT;
5954 break;
5955 }
5956
5957 case SO_NWRITE:
5958 optval = so->so_snd.sb_cc;
5959 goto integer;
5960
5961 case SO_ERROR:
5962 optval = so->so_error;
5963 so->so_error = 0;
5964 goto integer;
5965
5966 case SO_SNDBUF: {
5967 u_int32_t hiwat = so->so_snd.sb_hiwat;
5968
5969 if (so->so_snd.sb_flags & SB_UNIX) {
5970 struct unpcb *unp =
5971 (struct unpcb *)(so->so_pcb);
5972 if (unp != NULL && unp->unp_conn != NULL) {
5973 hiwat += unp->unp_conn->unp_cc;
5974 }
5975 }
5976
5977 optval = hiwat;
5978 goto integer;
5979 }
5980 case SO_RCVBUF:
5981 optval = so->so_rcv.sb_hiwat;
5982 goto integer;
5983
5984 case SO_SNDLOWAT:
5985 optval = so->so_snd.sb_lowat;
5986 goto integer;
5987
5988 case SO_RCVLOWAT:
5989 optval = so->so_rcv.sb_lowat;
5990 goto integer;
5991
5992 case SO_SNDTIMEO:
5993 case SO_RCVTIMEO:
5994 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5995 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5996
5997 error = sooptcopyout_timeval(sopt, &tv);
5998 break;
5999
6000 case SO_NOSIGPIPE:
6001 optval = (so->so_flags & SOF_NOSIGPIPE);
6002 goto integer;
6003
6004 case SO_NOADDRERR:
6005 optval = (so->so_flags & SOF_NOADDRAVAIL);
6006 goto integer;
6007
6008 case SO_REUSESHAREUID:
6009 optval = (so->so_flags & SOF_REUSESHAREUID);
6010 goto integer;
6011
6012
6013 case SO_NOTIFYCONFLICT:
6014 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6015 goto integer;
6016
6017 case SO_RESTRICTIONS:
6018 optval = so_get_restrictions(so);
6019 goto integer;
6020
6021 case SO_AWDL_UNRESTRICTED:
6022 if (SOCK_DOM(so) == PF_INET ||
6023 SOCK_DOM(so) == PF_INET6) {
6024 optval = inp_get_awdl_unrestricted(
6025 sotoinpcb(so));
6026 goto integer;
6027 } else {
6028 error = EOPNOTSUPP;
6029 }
6030 break;
6031
6032 case SO_INTCOPROC_ALLOW:
6033 if (SOCK_DOM(so) == PF_INET6) {
6034 optval = inp_get_intcoproc_allowed(
6035 sotoinpcb(so));
6036 goto integer;
6037 } else {
6038 error = EOPNOTSUPP;
6039 }
6040 break;
6041
6042 case SO_LABEL:
6043 error = EOPNOTSUPP;
6044 break;
6045
6046 case SO_PEERLABEL:
6047 error = EOPNOTSUPP;
6048 break;
6049
6050 #ifdef __APPLE_API_PRIVATE
6051 case SO_UPCALLCLOSEWAIT:
6052 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6053 goto integer;
6054 #endif
6055 case SO_RANDOMPORT:
6056 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6057 goto integer;
6058
6059 case SO_NP_EXTENSIONS: {
6060 struct so_np_extensions sonpx = {};
6061
6062 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6063 SONPX_SETOPTSHUT : 0;
6064 sonpx.npx_mask = SONPX_MASK_VALID;
6065
6066 error = sooptcopyout(sopt, &sonpx,
6067 sizeof(struct so_np_extensions));
6068 break;
6069 }
6070
6071 case SO_TRAFFIC_CLASS:
6072 optval = so->so_traffic_class;
6073 goto integer;
6074
6075 case SO_RECV_TRAFFIC_CLASS:
6076 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6077 goto integer;
6078
6079 #if (DEVELOPMENT || DEBUG)
6080 case SO_TRAFFIC_CLASS_DBG:
6081 error = sogetopt_tcdbg(so, sopt);
6082 break;
6083 #endif /* (DEVELOPMENT || DEBUG) */
6084
6085 case SO_PRIVILEGED_TRAFFIC_CLASS:
6086 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6087 goto integer;
6088
6089 case SO_DEFUNCTOK:
6090 optval = !(so->so_flags & SOF_NODEFUNCT);
6091 goto integer;
6092
6093 case SO_ISDEFUNCT:
6094 optval = (so->so_flags & SOF_DEFUNCT);
6095 goto integer;
6096
6097 case SO_OPPORTUNISTIC:
6098 optval = so_get_opportunistic(so);
6099 goto integer;
6100
6101 case SO_FLUSH:
6102 /* This option is not gettable */
6103 error = EINVAL;
6104 break;
6105
6106 case SO_RECV_ANYIF:
6107 optval = so_get_recv_anyif(so);
6108 goto integer;
6109
6110 case SO_TRAFFIC_MGT_BACKGROUND:
6111 /* This option is handled by lower layer(s) */
6112 if (so->so_proto != NULL &&
6113 so->so_proto->pr_ctloutput != NULL) {
6114 (void) so->so_proto->pr_ctloutput(so, sopt);
6115 }
6116 break;
6117
6118 #if FLOW_DIVERT
6119 case SO_FLOW_DIVERT_TOKEN:
6120 error = flow_divert_token_get(so, sopt);
6121 break;
6122 #endif /* FLOW_DIVERT */
6123
6124 #if NECP
6125 case SO_NECP_ATTRIBUTES:
6126 if (SOCK_DOM(so) == PF_MULTIPATH) {
6127 /* Handled by MPTCP itself */
6128 break;
6129 }
6130
6131 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6132 error = EINVAL;
6133 goto out;
6134 }
6135
6136 error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6137 break;
6138
6139 case SO_NECP_CLIENTUUID: {
6140 uuid_t *ncu;
6141
6142 if (SOCK_DOM(so) == PF_MULTIPATH) {
6143 ncu = &mpsotomppcb(so)->necp_client_uuid;
6144 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6145 ncu = &sotoinpcb(so)->necp_client_uuid;
6146 } else {
6147 error = EINVAL;
6148 goto out;
6149 }
6150
6151 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6152 break;
6153 }
6154
6155 case SO_NECP_LISTENUUID: {
6156 uuid_t *nlu;
6157
6158 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6159 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6160 nlu = &sotoinpcb(so)->necp_client_uuid;
6161 } else {
6162 error = ENOENT;
6163 goto out;
6164 }
6165 } else {
6166 error = EINVAL;
6167 goto out;
6168 }
6169
6170 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6171 break;
6172 }
6173
6174 case SO_RESOLVER_SIGNATURE: {
6175 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6176 error = EINVAL;
6177 goto out;
6178 }
6179 error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6180 break;
6181 }
6182
6183 #endif /* NECP */
6184
6185 #if CONTENT_FILTER
6186 case SO_CFIL_SOCK_ID: {
6187 cfil_sock_id_t sock_id;
6188
6189 sock_id = cfil_sock_id_from_socket(so);
6190
6191 error = sooptcopyout(sopt, &sock_id,
6192 sizeof(cfil_sock_id_t));
6193 break;
6194 }
6195 #endif /* CONTENT_FILTER */
6196
6197 case SO_EXTENDED_BK_IDLE:
6198 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6199 goto integer;
6200 case SO_MARK_CELLFALLBACK:
6201 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6202 ? 1 : 0;
6203 goto integer;
6204 case SO_FALLBACK_MODE:
6205 optval = so->so_fallback_mode;
6206 goto integer;
6207 case SO_MARK_KNOWN_TRACKER: {
6208 optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6209 ? 1 : 0;
6210 goto integer;
6211 }
6212 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6213 optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6214 ? 1 : 0;
6215 goto integer;
6216 }
6217 case SO_MARK_APPROVED_APP_DOMAIN: {
6218 optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6219 ? 1 : 0;
6220 goto integer;
6221 }
6222 case SO_NET_SERVICE_TYPE: {
6223 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6224 optval = so->so_netsvctype;
6225 } else {
6226 optval = NET_SERVICE_TYPE_BE;
6227 }
6228 goto integer;
6229 }
6230 case SO_NETSVC_MARKING_LEVEL:
6231 optval = so_get_netsvc_marking_level(so);
6232 goto integer;
6233
6234 case SO_MPKL_SEND_INFO: {
6235 struct so_mpkl_send_info so_mpkl_send_info;
6236
6237 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6238 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6239 error = sooptcopyout(sopt, &so_mpkl_send_info,
6240 sizeof(struct so_mpkl_send_info));
6241 break;
6242 }
6243 case SO_MARK_WAKE_PKT:
6244 optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6245 goto integer;
6246 case SO_RECV_WAKE_PKT:
6247 optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6248 goto integer;
6249 case SO_APPLICATION_ID: {
6250 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6251 error = EINVAL;
6252 goto out;
6253 }
6254 so_application_id_t application_id = { 0 };
6255 application_id.uid = kauth_cred_getuid(so->so_cred);
6256 uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
6257 application_id.persona_id = so->so_persona_id;
6258 error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
6259 break;
6260 }
6261 case SO_MARK_DOMAIN_INFO_SILENT:
6262 optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
6263 ? 1 : 0;
6264 goto integer;
6265 default:
6266 error = ENOPROTOOPT;
6267 break;
6268 }
6269 }
6270 out:
6271 if (dolock) {
6272 socket_unlock(so, 1);
6273 }
6274 return error;
6275 }
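/*
 * Illustrative sketch (not part of the original source): two of the
 * cases above as seen from user space. SO_NREAD reports the bytes
 * available to read, while SO_ERROR returns the pending error and
 * clears it (note the `so->so_error = 0` above), so a second query
 * yields 0. Assumes an existing socket descriptor `s`.
 */
#if 0	/* user-space usage sketch only */
	int nread = 0, err = 0;
	socklen_t len = sizeof(int);

	(void) getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len);
	len = sizeof(int);
	(void) getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);	/* clears it */
#endif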
6276
6277 /*
6278 * The size limit on our soopt_getm() is different from that on FreeBSD.
6279 * We limit the size of options to MCLBYTES. This will have to change
6280 * if we need to define options that need more space than MCLBYTES.
6281 */
6282 int
6283 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6284 {
6285 struct mbuf *m, *m_prev;
6286 int sopt_size = (int)sopt->sopt_valsize;
6287 int how;
6288
6289 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6290 return EMSGSIZE;
6291 }
6292
6293 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6294 MGET(m, how, MT_DATA);
6295 if (m == NULL) {
6296 return ENOBUFS;
6297 }
6298 if (sopt_size > MLEN) {
6299 MCLGET(m, how);
6300 if ((m->m_flags & M_EXT) == 0) {
6301 m_free(m);
6302 return ENOBUFS;
6303 }
6304 m->m_len = min(MCLBYTES, sopt_size);
6305 } else {
6306 m->m_len = min(MLEN, sopt_size);
6307 }
6308 sopt_size -= m->m_len;
6309 *mp = m;
6310 m_prev = m;
6311
6312 while (sopt_size > 0) {
6313 MGET(m, how, MT_DATA);
6314 if (m == NULL) {
6315 m_freem(*mp);
6316 return ENOBUFS;
6317 }
6318 if (sopt_size > MLEN) {
6319 MCLGET(m, how);
6320 if ((m->m_flags & M_EXT) == 0) {
6321 m_freem(*mp);
6322 m_freem(m);
6323 return ENOBUFS;
6324 }
6325 m->m_len = min(MCLBYTES, sopt_size);
6326 } else {
6327 m->m_len = min(MLEN, sopt_size);
6328 }
6329 sopt_size -= m->m_len;
6330 m_prev->m_next = m;
6331 m_prev = m;
6332 }
6333 return 0;
6334 }
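/*
 * Illustrative sketch (not part of the original source): soopt_getm()
 * is normally paired with soopt_mcopyin()/soopt_mcopyout() below by
 * protocol code that stages option data in an mbuf chain (e.g. the
 * IPv6 option paths). A minimal sketch of the set-side pairing,
 * assuming a populated `sopt`:
 */
#if 0	/* kernel-side usage sketch only */
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* allocate the chain */
	if (error == 0) {
		error = soopt_mcopyin(sopt, m);	/* fill it; frees m on error */
	}
	/* on success, hand `m` to the protocol, which consumes it */
#endif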
6335
6336 /* copyin sopt data into mbuf chain */
6337 int
6338 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6339 {
6340 struct mbuf *m0 = m;
6341
6342 if (sopt->sopt_val == USER_ADDR_NULL) {
6343 return 0;
6344 }
6345 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6346 if (sopt->sopt_p != kernproc) {
6347 int error;
6348
6349 error = copyin(sopt->sopt_val, mtod(m, char *),
6350 m->m_len);
6351 if (error != 0) {
6352 m_freem(m0);
6353 return error;
6354 }
6355 } else {
6356 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6357 mtod(m, char *), m->m_len);
6358 }
6359 sopt->sopt_valsize -= m->m_len;
6360 sopt->sopt_val += m->m_len;
6361 m = m->m_next;
6362 }
6363 /* the mbuf chain should have been allocated large enough by ip6_sooptmcopyin() */
6364 if (m != NULL) {
6365 panic("soopt_mcopyin");
6366 /* NOTREACHED */
6367 }
6368 return 0;
6369 }
6370
6371 /* copyout mbuf chain data into soopt */
6372 int
6373 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6374 {
6375 struct mbuf *m0 = m;
6376 size_t valsize = 0;
6377
6378 if (sopt->sopt_val == USER_ADDR_NULL) {
6379 return 0;
6380 }
6381 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6382 if (sopt->sopt_p != kernproc) {
6383 int error;
6384
6385 error = copyout(mtod(m, char *), sopt->sopt_val,
6386 m->m_len);
6387 if (error != 0) {
6388 m_freem(m0);
6389 return error;
6390 }
6391 } else {
6392 bcopy(mtod(m, char *),
6393 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6394 }
6395 sopt->sopt_valsize -= m->m_len;
6396 sopt->sopt_val += m->m_len;
6397 valsize += m->m_len;
6398 m = m->m_next;
6399 }
6400 if (m != NULL) {
6401 /* user-land should have supplied a large enough soopt buffer */
6402 m_freem(m0);
6403 return EINVAL;
6404 }
6405 sopt->sopt_valsize = valsize;
6406 return 0;
6407 }
6408
6409 void
6410 sohasoutofband(struct socket *so)
6411 {
6412 if (so->so_pgid < 0) {
6413 gsignal(-so->so_pgid, SIGURG);
6414 } else if (so->so_pgid > 0) {
6415 proc_signal(so->so_pgid, SIGURG);
6416 }
6417 selwakeup(&so->so_rcv.sb_sel);
6418 if (so->so_rcv.sb_flags & SB_KNOTE) {
6419 KNOTE(&so->so_rcv.sb_sel.si_note,
6420 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6421 }
6422 }
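/*
 * Illustrative sketch (not part of the original source): the sign
 * convention on so_pgid above mirrors fcntl(F_SETOWN): a positive
 * value names a process, a negative value a process group. From user
 * space, arranging for SIGURG on out-of-band data might look like
 * this (socket descriptor `s`, hypothetical handler `handle_urg`):
 */
#if 0	/* user-space usage sketch only */
	signal(SIGURG, handle_urg);
	fcntl(s, F_SETOWN, getpid());	/* deliver SIGURG to this process */
#endif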
6423
6424 int
6425 sopoll(struct socket *so, int events, kauth_cred_t cred, void *wql)
6426 {
6427 #pragma unused(cred)
6428 struct proc *p = current_proc();
6429 int revents = 0;
6430
6431 socket_lock(so, 1);
6432 so_update_last_owner_locked(so, PROC_NULL);
6433 so_update_policy(so);
6434
6435 if (events & (POLLIN | POLLRDNORM)) {
6436 if (soreadable(so)) {
6437 revents |= events & (POLLIN | POLLRDNORM);
6438 }
6439 }
6440
6441 if (events & (POLLOUT | POLLWRNORM)) {
6442 if (sowriteable(so)) {
6443 revents |= events & (POLLOUT | POLLWRNORM);
6444 }
6445 }
6446
6447 if (events & (POLLPRI | POLLRDBAND)) {
6448 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6449 revents |= events & (POLLPRI | POLLRDBAND);
6450 }
6451 }
6452
6453 if (revents == 0) {
6454 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6455 /*
6456 * Darwin sets the flag first,
6457 * BSD calls selrecord first
6458 */
6459 so->so_rcv.sb_flags |= SB_SEL;
6460 selrecord(p, &so->so_rcv.sb_sel, wql);
6461 }
6462
6463 if (events & (POLLOUT | POLLWRNORM)) {
6464 /*
6465 * Darwin sets the flag first,
6466 * BSD calls selrecord first
6467 */
6468 so->so_snd.sb_flags |= SB_SEL;
6469 selrecord(p, &so->so_snd.sb_sel, wql);
6470 }
6471 }
6472
6473 socket_unlock(so, 1);
6474 return revents;
6475 }
6476
6477 int
6478 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6479 {
6480 struct socket *so = (struct socket *)fp_get_data(fp);
6481 int result;
6482
6483 socket_lock(so, 1);
6484 so_update_last_owner_locked(so, PROC_NULL);
6485 so_update_policy(so);
6486
6487 switch (kn->kn_filter) {
6488 case EVFILT_READ:
6489 kn->kn_filtid = EVFILTID_SOREAD;
6490 break;
6491 case EVFILT_WRITE:
6492 kn->kn_filtid = EVFILTID_SOWRITE;
6493 break;
6494 case EVFILT_SOCK:
6495 kn->kn_filtid = EVFILTID_SCK;
6496 break;
6497 case EVFILT_EXCEPT:
6498 kn->kn_filtid = EVFILTID_SOEXCEPT;
6499 break;
6500 default:
6501 socket_unlock(so, 1);
6502 knote_set_error(kn, EINVAL);
6503 return 0;
6504 }
6505
6506 /*
6507 * call the appropriate sub-filter attach
6508 * with the socket still locked
6509 */
6510 result = knote_fops(kn)->f_attach(kn, kev);
6511
6512 socket_unlock(so, 1);
6513
6514 return result;
6515 }
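/*
 * Illustrative sketch (not part of the original source): the filter
 * dispatch above is where a user-space kevent() registration lands.
 * Registering for readability on a socket descriptor `s`:
 */
#if 0	/* user-space usage sketch only */
	int kq = kqueue();
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void) kevent(kq, &kev, 1, NULL, 0, NULL);	/* register only */
#endif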
6516
6517 static int
6518 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6519 {
6520 int retval = 0;
6521 int64_t data = 0;
6522
6523 if (so->so_options & SO_ACCEPTCONN) {
6524 /*
6525 * Radar 6615193: handle the listen case dynamically for
6526 * the kqueue read filter. This allows listen() to be called
6527 * after registering the kqueue EVFILT_READ filter.
6528 */
6529
6530 retval = !TAILQ_EMPTY(&so->so_comp);
6531 data = so->so_qlen;
6532 goto out;
6533 }
6534
6535 /* socket isn't a listener */
6536 /*
6537 * NOTE_LOWAT specifies new low water mark in data, i.e.
6538 * the bytes of protocol data. We therefore exclude any
6539 * control bytes.
6540 */
6541 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6542
6543 if (kn->kn_sfflags & NOTE_OOB) {
6544 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6545 kn->kn_fflags |= NOTE_OOB;
6546 data -= so->so_oobmark;
6547 retval = 1;
6548 goto out;
6549 }
6550 }
6551
6552 if ((so->so_state & SS_CANTRCVMORE)
6553 #if CONTENT_FILTER
6554 && cfil_sock_data_pending(&so->so_rcv) == 0
6555 #endif /* CONTENT_FILTER */
6556 ) {
6557 kn->kn_flags |= EV_EOF;
6558 kn->kn_fflags = so->so_error;
6559 retval = 1;
6560 goto out;
6561 }
6562
6563 if (so->so_error) { /* temporary udp error */
6564 retval = 1;
6565 goto out;
6566 }
6567
6568 int64_t lowwat = so->so_rcv.sb_lowat;
6569 /*
6570 * Ensure that when NOTE_LOWAT is used, the derived
6571 * low water mark is bounded by the receive buffer's
6572 * high and low water marks.
6573 */
6574 if (kn->kn_sfflags & NOTE_LOWAT) {
6575 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6576 lowwat = so->so_rcv.sb_hiwat;
6577 } else if (kn->kn_sdata > lowwat) {
6578 lowwat = kn->kn_sdata;
6579 }
6580 }
6581
6582 /*
6583 * While the `data` field is the amount of data to read,
6584 * 0-sized packets need to wake up the kqueue, see 58140856,
6585 * so we need to take control bytes into account too.
6586 */
6587 retval = (so->so_rcv.sb_cc >= lowwat);
6588
6589 out:
6590 if (retval && kev) {
6591 knote_fill_kevent(kn, kev, data);
6592 }
6593 return retval;
6594 }
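/*
 * Illustrative sketch (not part of the original source): the
 * NOTE_LOWAT clamping above applies when a registration carries a
 * low water mark in the kevent data field, e.g.:
 */
#if 0	/* user-space usage sketch only */
	struct kevent kev;

	/*
	 * Fire only once at least 4096 bytes are readable; the code
	 * above clamps this to the receive buffer's high water mark.
	 */
	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
#endif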
6595
6596 static int
6597 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6598 {
6599 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6600
6601 /* socket locked */
6602
6603 /*
6604 * If the caller explicitly asked for OOB results (e.g. poll())
6605 * from EVFILT_READ, then save that off in the hookid field
6606 * and reserve the kn_flags EV_OOBAND bit for output only.
6607 */
6608 if (kn->kn_filter == EVFILT_READ &&
6609 kn->kn_flags & EV_OOBAND) {
6610 kn->kn_flags &= ~EV_OOBAND;
6611 kn->kn_hook32 = EV_OOBAND;
6612 } else {
6613 kn->kn_hook32 = 0;
6614 }
6615 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6616 so->so_rcv.sb_flags |= SB_KNOTE;
6617 }
6618
6619 /* indicate if event is already fired */
6620 return filt_soread_common(kn, NULL, so);
6621 }
6622
6623 static void
6624 filt_sordetach(struct knote *kn)
6625 {
6626 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6627
6628 socket_lock(so, 1);
6629 if (so->so_rcv.sb_flags & SB_KNOTE) {
6630 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6631 so->so_rcv.sb_flags &= ~SB_KNOTE;
6632 }
6633 }
6634 socket_unlock(so, 1);
6635 }
6636
6637 /*ARGSUSED*/
6638 static int
6639 filt_soread(struct knote *kn, long hint)
6640 {
6641 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6642 int retval;
6643
6644 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6645 socket_lock(so, 1);
6646 }
6647
6648 retval = filt_soread_common(kn, NULL, so);
6649
6650 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6651 socket_unlock(so, 1);
6652 }
6653
6654 return retval;
6655 }
6656
6657 static int
6658 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6659 {
6660 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6661 int retval;
6662
6663 socket_lock(so, 1);
6664
6665 /* save off the new input fflags and data */
6666 kn->kn_sfflags = kev->fflags;
6667 kn->kn_sdata = kev->data;
6668
6669 /* determine if changes result in fired events */
6670 retval = filt_soread_common(kn, NULL, so);
6671
6672 socket_unlock(so, 1);
6673
6674 return retval;
6675 }
6676
6677 static int
6678 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6679 {
6680 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6681 int retval;
6682
6683 socket_lock(so, 1);
6684 retval = filt_soread_common(kn, kev, so);
6685 socket_unlock(so, 1);
6686
6687 return retval;
6688 }
6689
6690 int
6691 so_wait_for_if_feedback(struct socket *so)
6692 {
6693 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6694 (so->so_state & SS_ISCONNECTED)) {
6695 struct inpcb *inp = sotoinpcb(so);
6696 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6697 return 1;
6698 }
6699 }
6700 return 0;
6701 }
6702
6703 static int
6704 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6705 {
6706 int ret = 0;
6707 int64_t data = sbspace(&so->so_snd);
6708
6709 if (so->so_state & SS_CANTSENDMORE) {
6710 kn->kn_flags |= EV_EOF;
6711 kn->kn_fflags = so->so_error;
6712 ret = 1;
6713 goto out;
6714 }
6715
6716 if (so->so_error) { /* temporary udp error */
6717 ret = 1;
6718 goto out;
6719 }
6720
6721 if (!socanwrite(so)) {
6722 ret = 0;
6723 goto out;
6724 }
6725
6726 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6727 ret = 1;
6728 goto out;
6729 }
6730
6731 int64_t lowwat = so->so_snd.sb_lowat;
6732 const int64_t hiwat = so->so_snd.sb_hiwat;
6733 /*
6734 * Deal with connected UNIX domain sockets which
6735 * rely on the fact that the sender's socket buffer is
6736 * actually the receiver's socket buffer.
6737 */
6738 if (SOCK_DOM(so) == PF_LOCAL) {
6739 struct unpcb *unp = sotounpcb(so);
6740 if (unp != NULL && unp->unp_conn != NULL &&
6741 unp->unp_conn->unp_socket != NULL) {
6742 struct socket *so2 = unp->unp_conn->unp_socket;
6743 /*
6744 * At this point we know that `so' is locked
6745 * and that `unp_conn` isn't going to change.
6746 * However, we don't lock `so2` because doing so
6747 * may require unlocking `so'
6748 * (see unp_get_locks_in_order()).
6749 *
6750 * Two cases can happen:
6751 *
6752 * 1) we return 1 and tell the application that
6753 * it can write. Meanwhile, another thread
6754 * fills up the socket buffer. This will either
6755 * lead to a blocking send or EWOULDBLOCK
6756 * which the application should deal with.
6757 * 2) we return 0 and tell the application that
6758 * the socket is not writable. Meanwhile,
6759 * another thread depletes the receive socket
6760 * buffer. In this case the application will
6761 * be woken up by sb_notify().
6762 *
6763 * MIN() is required because otherwise sosendcheck()
6764 * may return EWOULDBLOCK since it only considers
6765 * so->so_snd.
6766 */
6767 data = MIN(data, sbspace(&so2->so_rcv));
6768 }
6769 }
6770
6771 if (kn->kn_sfflags & NOTE_LOWAT) {
6772 if (kn->kn_sdata > hiwat) {
6773 lowwat = hiwat;
6774 } else if (kn->kn_sdata > lowwat) {
6775 lowwat = kn->kn_sdata;
6776 }
6777 }
6778
6779 if (data > 0 && data >= lowwat) {
6780 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6781 #if (DEBUG || DEVELOPMENT)
6782 && so_notsent_lowat_check == 1
6783 #endif /* DEBUG || DEVELOPMENT */
6784 ) {
6785 if ((SOCK_DOM(so) == PF_INET ||
6786 SOCK_DOM(so) == PF_INET6) &&
6787 so->so_type == SOCK_STREAM) {
6788 ret = tcp_notsent_lowat_check(so);
6789 }
6790 #if MPTCP
6791 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6792 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6793 ret = mptcp_notsent_lowat_check(so);
6794 }
6795 #endif
6796 else {
6797 ret = 1;
6798 goto out;
6799 }
6800 } else {
6801 ret = 1;
6802 }
6803 }
6804 if (so_wait_for_if_feedback(so)) {
6805 ret = 0;
6806 }
6807
6808 out:
6809 if (ret && kev) {
6810 knote_fill_kevent(kn, kev, data);
6811 }
6812 return ret;
6813 }
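/*
 * Illustrative sketch (not part of the original source): the write
 * filter reports free buffer space (sbspace) rather than queued
 * bytes, so a NOTE_LOWAT registration here means "wake me once at
 * least this much send space is available":
 */
#if 0	/* user-space usage sketch only */
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_WRITE, EV_ADD, NOTE_LOWAT, 8192, NULL);
#endif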
6814
6815 static int
6816 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6817 {
6818 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6819
6820 /* socket locked */
6821 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6822 so->so_snd.sb_flags |= SB_KNOTE;
6823 }
6824
6825 /* determine if it's already fired */
6826 return filt_sowrite_common(kn, NULL, so);
6827 }
6828
6829 static void
6830 filt_sowdetach(struct knote *kn)
6831 {
6832 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6833 socket_lock(so, 1);
6834
6835 if (so->so_snd.sb_flags & SB_KNOTE) {
6836 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6837 so->so_snd.sb_flags &= ~SB_KNOTE;
6838 }
6839 }
6840 socket_unlock(so, 1);
6841 }
6842
6843 /*ARGSUSED*/
6844 static int
6845 filt_sowrite(struct knote *kn, long hint)
6846 {
6847 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6848 int ret;
6849
6850 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6851 socket_lock(so, 1);
6852 }
6853
6854 ret = filt_sowrite_common(kn, NULL, so);
6855
6856 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6857 socket_unlock(so, 1);
6858 }
6859
6860 return ret;
6861 }
6862
6863 static int
6864 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6865 {
6866 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6867 int ret;
6868
6869 socket_lock(so, 1);
6870
6871 /* save off the new input fflags and data */
6872 kn->kn_sfflags = kev->fflags;
6873 kn->kn_sdata = kev->data;
6874
6875 /* determine if these changes result in a triggered event */
6876 ret = filt_sowrite_common(kn, NULL, so);
6877
6878 socket_unlock(so, 1);
6879
6880 return ret;
6881 }
6882
6883 static int
6884 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6885 {
6886 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6887 int ret;
6888
6889 socket_lock(so, 1);
6890 ret = filt_sowrite_common(kn, kev, so);
6891 socket_unlock(so, 1);
6892
6893 return ret;
6894 }
6895
6896 static int
6897 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6898 struct socket *so, long ev_hint)
6899 {
6900 int ret = 0;
6901 int64_t data = 0;
6902 uint32_t level_trigger = 0;
6903
6904 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6905 kn->kn_fflags |= NOTE_CONNRESET;
6906 }
6907 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6908 kn->kn_fflags |= NOTE_TIMEOUT;
6909 }
6910 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6911 kn->kn_fflags |= NOTE_NOSRCADDR;
6912 }
6913 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6914 kn->kn_fflags |= NOTE_IFDENIED;
6915 }
6916 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6917 kn->kn_fflags |= NOTE_KEEPALIVE;
6918 }
6919 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6920 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6921 }
6922 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6923 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6924 }
6925 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6926 (so->so_state & SS_ISCONNECTED)) {
6927 kn->kn_fflags |= NOTE_CONNECTED;
6928 level_trigger |= NOTE_CONNECTED;
6929 }
6930 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6931 (so->so_state & SS_ISDISCONNECTED)) {
6932 kn->kn_fflags |= NOTE_DISCONNECTED;
6933 level_trigger |= NOTE_DISCONNECTED;
6934 }
6935 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6936 if (so->so_proto != NULL &&
6937 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6938 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6939 }
6940 }
6941 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6942 tcp_notify_ack_active(so)) {
6943 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6944 }
6945 if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
6946 kn->kn_fflags |= NOTE_WAKE_PKT;
6947 }
6948
6949 if ((so->so_state & SS_CANTRCVMORE)
6950 #if CONTENT_FILTER
6951 && cfil_sock_data_pending(&so->so_rcv) == 0
6952 #endif /* CONTENT_FILTER */
6953 ) {
6954 kn->kn_fflags |= NOTE_READCLOSED;
6955 level_trigger |= NOTE_READCLOSED;
6956 }
6957
6958 if (so->so_state & SS_CANTSENDMORE) {
6959 kn->kn_fflags |= NOTE_WRITECLOSED;
6960 level_trigger |= NOTE_WRITECLOSED;
6961 }
6962
6963 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6964 (so->so_flags & SOF_SUSPENDED)) {
6965 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6966
6967 /* If resume event was delivered before, reset it */
6968 kn->kn_hook32 &= ~NOTE_RESUME;
6969
6970 kn->kn_fflags |= NOTE_SUSPEND;
6971 level_trigger |= NOTE_SUSPEND;
6972 }
6973
6974 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6975 (so->so_flags & SOF_SUSPENDED) == 0) {
6976 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6977
6978 /* If suspend event was delivered before, reset it */
6979 kn->kn_hook32 &= ~NOTE_SUSPEND;
6980
6981 kn->kn_fflags |= NOTE_RESUME;
6982 level_trigger |= NOTE_RESUME;
6983 }
6984
6985 if (so->so_error != 0) {
6986 ret = 1;
6987 data = so->so_error;
6988 kn->kn_flags |= EV_EOF;
6989 } else {
6990 u_int32_t data32 = 0;
6991 get_sockev_state(so, &data32);
6992 data = data32;
6993 }
6994
6995 /* Reset any events that are not requested on this knote */
6996 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6997 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6998
6999 /* Find the level-triggered events that are already delivered */
7000 level_trigger &= kn->kn_hook32;
7001 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7002
7003 /* Do not deliver level-triggered events more than once */
7004 if ((kn->kn_fflags & ~level_trigger) != 0) {
7005 ret = 1;
7006 }
7007
7008 if (ret && kev) {
7009 /*
7010 * Store the state of the events being delivered. This
7011 * state can be used to deliver level-triggered events
7012 * at least once and still avoid waking up the application
7013 * multiple times as long as the event is active.
7014 */
7015 if (kn->kn_fflags != 0) {
7016 kn->kn_hook32 |= (kn->kn_fflags &
7017 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7018 }
7019
7020 /*
7021 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7022 * only one of them, and remember which one was
7023 * delivered last
7024 */
7025 if (kn->kn_fflags & NOTE_SUSPEND) {
7026 kn->kn_hook32 &= ~NOTE_RESUME;
7027 }
7028 if (kn->kn_fflags & NOTE_RESUME) {
7029 kn->kn_hook32 &= ~NOTE_SUSPEND;
7030 }
7031
7032 knote_fill_kevent(kn, kev, data);
7033 }
7034 return ret;
7035 }
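/*
 * Illustrative sketch (not part of the original source): EVFILT_SOCK
 * is private API, but a registration interested in connection state
 * changes would pass the desired NOTE_* mask in fflags; the
 * level-trigger bookkeeping above then keeps e.g. NOTE_CONNECTED
 * from being redelivered while it remains asserted:
 */
#if 0	/* usage sketch only; EVFILT_SOCK is private */
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_SOCK, EV_ADD,
	    NOTE_CONNECTED | NOTE_DISCONNECTED, 0, NULL);
#endif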
7036
7037 static int
7038 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7039 {
7040 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7041
7042 /* socket locked */
7043 kn->kn_hook32 = 0;
7044 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7045 so->so_flags |= SOF_KNOTE;
7046 }
7047
7048 /* determine if event already fired */
7049 return filt_sockev_common(kn, NULL, so, 0);
7050 }
7051
7052 static void
7053 filt_sockdetach(struct knote *kn)
7054 {
7055 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7056 socket_lock(so, 1);
7057
7058 if ((so->so_flags & SOF_KNOTE) != 0) {
7059 if (KNOTE_DETACH(&so->so_klist, kn)) {
7060 so->so_flags &= ~SOF_KNOTE;
7061 }
7062 }
7063 socket_unlock(so, 1);
7064 }
7065
7066 static int
7067 filt_sockev(struct knote *kn, long hint)
7068 {
7069 int ret = 0, locked = 0;
7070 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7071 long ev_hint = (hint & SO_FILT_HINT_EV);
7072
7073 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7074 socket_lock(so, 1);
7075 locked = 1;
7076 }
7077
7078 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7079
7080 if (locked) {
7081 socket_unlock(so, 1);
7082 }
7083
7084 return ret;
7085 }
7086
7087
7088
7089 /*
7090 * filt_socktouch - update event state
7091 */
7092 static int
7093 filt_socktouch(
7094 struct knote *kn,
7095 struct kevent_qos_s *kev)
7096 {
7097 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7098 uint32_t changed_flags;
7099 int ret;
7100
7101 socket_lock(so, 1);
7102
7103 /* save off the [result] data and fflags */
7104 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7105
7106 /* save off the new input fflags and data */
7107 kn->kn_sfflags = kev->fflags;
7108 kn->kn_sdata = kev->data;
7109
7110 /* restrict the current results to the (smaller?) set of new interest */
7111 /*
7112 * For compatibility with previous implementations, we leave kn_fflags
7113 * as they were before.
7114 */
7115 //kn->kn_fflags &= kev->fflags;
7116
7117 /*
7118 * Since we keep track of events that are already
7119 * delivered, if any of those events are not requested
7120 * anymore the state related to them can be reset
7121 */
7122 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7123
7124 /* determine if we have events to deliver */
7125 ret = filt_sockev_common(kn, NULL, so, 0);
7126
7127 socket_unlock(so, 1);
7128
7129 return ret;
7130 }
7131
7132 /*
7133 * filt_sockprocess - query event fired state and return data
7134 */
7135 static int
7136 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7137 {
7138 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7139 int ret = 0;
7140
7141 socket_lock(so, 1);
7142
7143 ret = filt_sockev_common(kn, kev, so, 0);
7144
7145 socket_unlock(so, 1);
7146
7147 return ret;
7148 }
7149
7150 void
7151 get_sockev_state(struct socket *so, u_int32_t *statep)
7152 {
7153 u_int32_t state = *(statep);
7154
7155 /*
7156 * If the state variable is already in use by a previous event,
7157 * leave it untouched.
7158 */
7159 if (state != 0) {
7160 return;
7161 }
7162
7163 if (so->so_state & SS_ISCONNECTED) {
7164 state |= SOCKEV_CONNECTED;
7165 } else {
7166 state &= ~(SOCKEV_CONNECTED);
7167 }
7168 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7169 *(statep) = state;
7170 }
7171
7172 #define SO_LOCK_HISTORY_STR_LEN \
7173 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7174
7175 __private_extern__ const char *
7176 solockhistory_nr(struct socket *so)
7177 {
7178 size_t n = 0;
7179 int i;
7180 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7181
7182 bzero(lock_history_str, sizeof(lock_history_str));
7183 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7184 n += scnprintf(lock_history_str + n,
7185 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7186 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7187 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7188 }
7189 return lock_history_str;
7190 }
7191
7192 lck_mtx_t *
7193 socket_getlock(struct socket *so, int flags)
7194 {
7195 if (so->so_proto->pr_getlock != NULL) {
7196 return (*so->so_proto->pr_getlock)(so, flags);
7197 } else {
7198 return so->so_proto->pr_domain->dom_mtx;
7199 }
7200 }
7201
7202 void
7203 socket_lock(struct socket *so, int refcount)
7204 {
7205 void *lr_saved;
7206
7207 lr_saved = __builtin_return_address(0);
7208
7209 if (so->so_proto->pr_lock) {
7210 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7211 } else {
7212 #ifdef MORE_LOCKING_DEBUG
7213 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7214 LCK_MTX_ASSERT_NOTOWNED);
7215 #endif
7216 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7217 if (refcount) {
7218 so->so_usecount++;
7219 }
7220 so->lock_lr[so->next_lock_lr] = lr_saved;
7221 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7222 }
7223 }
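/*
 * Illustrative sketch (not part of the original source): the
 * `refcount` argument must be balanced between lock and unlock, as
 * soreference()/sodereference() below demonstrate. A typical
 * kernel-side critical section:
 */
#if 0	/* kernel-side usage sketch only */
	socket_lock(so, 1);	/* lock and take a use count */
	/* ... operate on the socket ... */
	socket_unlock(so, 1);	/* drop the use count and unlock */
#endif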
7224
7225 void
7226 socket_lock_assert_owned(struct socket *so)
7227 {
7228 lck_mtx_t *mutex_held;
7229
7230 if (so->so_proto->pr_getlock != NULL) {
7231 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7232 } else {
7233 mutex_held = so->so_proto->pr_domain->dom_mtx;
7234 }
7235
7236 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7237 }
7238
7239 int
7240 socket_try_lock(struct socket *so)
7241 {
7242 lck_mtx_t *mtx;
7243
7244 if (so->so_proto->pr_getlock != NULL) {
7245 mtx = (*so->so_proto->pr_getlock)(so, 0);
7246 } else {
7247 mtx = so->so_proto->pr_domain->dom_mtx;
7248 }
7249
7250 return lck_mtx_try_lock(mtx);
7251 }
7252
7253 void
7254 socket_unlock(struct socket *so, int refcount)
7255 {
7256 void *lr_saved;
7257 lck_mtx_t *mutex_held;
7258
7259 lr_saved = __builtin_return_address(0);
7260
7261 if (so == NULL || so->so_proto == NULL) {
7262 panic("%s: null so_proto so=%p", __func__, so);
7263 /* NOTREACHED */
7264 }
7265
7266 if (so->so_proto->pr_unlock) {
7267 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7268 } else {
7269 mutex_held = so->so_proto->pr_domain->dom_mtx;
7270 #ifdef MORE_LOCKING_DEBUG
7271 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7272 #endif
7273 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7274 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7275
7276 if (refcount) {
7277 if (so->so_usecount <= 0) {
7278 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7279 "lrh=%s", __func__, so->so_usecount, so,
7280 SOCK_DOM(so), so->so_type,
7281 SOCK_PROTO(so), solockhistory_nr(so));
7282 /* NOTREACHED */
7283 }
7284
7285 so->so_usecount--;
7286 if (so->so_usecount == 0) {
7287 sofreelastref(so, 1);
7288 }
7289 }
7290 lck_mtx_unlock(mutex_held);
7291 }
7292 }
7293
7294 /* Called with socket locked, will unlock socket */
7295 void
7296 sofree(struct socket *so)
7297 {
7298 lck_mtx_t *mutex_held;
7299
7300 if (so->so_proto->pr_getlock != NULL) {
7301 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7302 } else {
7303 mutex_held = so->so_proto->pr_domain->dom_mtx;
7304 }
7305 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7306
7307 sofreelastref(so, 0);
7308 }
7309
7310 void
7311 soreference(struct socket *so)
7312 {
7313 socket_lock(so, 1); /* lock and take one reference on the socket */
7314 socket_unlock(so, 0); /* unlock only */
7315 }
7316
7317 void
7318 sodereference(struct socket *so)
7319 {
7320 socket_lock(so, 0);
7321 socket_unlock(so, 1);
7322 }
7323
7324 /*
7325 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7326 * possibility of using jumbo clusters. The caller must hold
7327 * the socket lock.
7328 */
7329 void
7330 somultipages(struct socket *so, boolean_t set)
7331 {
7332 if (set) {
7333 so->so_flags |= SOF_MULTIPAGES;
7334 } else {
7335 so->so_flags &= ~SOF_MULTIPAGES;
7336 }
7337 }
7338
7339 void
7340 soif2kcl(struct socket *so, boolean_t set)
7341 {
7342 if (set) {
7343 so->so_flags1 |= SOF1_IF_2KCL;
7344 } else {
7345 so->so_flags1 &= ~SOF1_IF_2KCL;
7346 }
7347 }
7348
7349 int
7350 so_isdstlocal(struct socket *so)
7351 {
7352 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7353
7354 if (SOCK_DOM(so) == PF_INET) {
7355 return inaddr_local(inp->inp_faddr);
7356 } else if (SOCK_DOM(so) == PF_INET6) {
7357 return in6addr_local(&inp->in6p_faddr);
7358 }
7359
7360 return 0;
7361 }
7362
7363 int
7364 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7365 {
7366 struct sockbuf *rcv, *snd;
7367 int err = 0, defunct;
7368
7369 rcv = &so->so_rcv;
7370 snd = &so->so_snd;
7371
7372 defunct = (so->so_flags & SOF_DEFUNCT);
7373 if (defunct) {
7374 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7375 panic("%s: SB_DROP not set", __func__);
7376 /* NOTREACHED */
7377 }
7378 goto done;
7379 }
7380
7381 if (so->so_flags & SOF_NODEFUNCT) {
7382 if (noforce) {
7383 err = EOPNOTSUPP;
7384 if (p != PROC_NULL) {
7385 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7386 "name %s level %d) so 0x%llu [%d,%d] "
7387 "is not eligible for defunct "
7388 "(%d)\n", __func__, proc_selfpid(),
7389 proc_best_name(current_proc()), proc_pid(p),
7390 proc_best_name(p), level,
7391 so->so_gencnt,
7392 SOCK_DOM(so), SOCK_TYPE(so), err);
7393 }
7394 return err;
7395 }
7396 so->so_flags &= ~SOF_NODEFUNCT;
7397 if (p != PROC_NULL) {
7398 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7399 "name %s level %d) so 0x%llu [%d,%d] "
7400 "defunct by force "
7401 "(%d)\n", __func__, proc_selfpid(),
7402 proc_best_name(current_proc()), proc_pid(p),
7403 proc_best_name(p), level,
7404 so->so_gencnt,
7405 SOCK_DOM(so), SOCK_TYPE(so), err);
7406 }
7407 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7408 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7409 struct ifnet *ifp = inp->inp_last_outifp;
7410
7411 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7412 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7413 } else if (so->so_flags & SOF_DELEGATED) {
7414 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7415 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7416 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7417 } else if (noforce && p != PROC_NULL) {
7418 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7419
7420 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7421 so->so_extended_bk_start = net_uptime();
7422 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7423
7424 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7425
7426 err = EOPNOTSUPP;
7427 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7428 "name %s level %d) so 0x%llu [%d,%d] "
7429 "extend bk idle "
7430 "(%d)\n", __func__, proc_selfpid(),
7431 proc_best_name(current_proc()), proc_pid(p),
7432 proc_best_name(p), level,
7433 so->so_gencnt,
7434 SOCK_DOM(so), SOCK_TYPE(so), err);
7435 return err;
7436 } else {
7437 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7438 }
7439 }
7440
7441 so->so_flags |= SOF_DEFUNCT;
7442
7443 /* Prevent further data from being appended to the socket buffers */
7444 snd->sb_flags |= SB_DROP;
7445 rcv->sb_flags |= SB_DROP;
7446
7447 /* Flush any existing data in the socket buffers */
7448 if (rcv->sb_cc != 0) {
7449 rcv->sb_flags &= ~SB_SEL;
7450 selthreadclear(&rcv->sb_sel);
7451 sbrelease(rcv);
7452 }
7453 if (snd->sb_cc != 0) {
7454 snd->sb_flags &= ~SB_SEL;
7455 selthreadclear(&snd->sb_sel);
7456 sbrelease(snd);
7457 }
7458
7459 done:
7460 if (p != PROC_NULL) {
7461 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7462 "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
7463 proc_selfpid(), proc_best_name(current_proc()),
7464 proc_pid(p), proc_best_name(p), level,
7465 so->so_gencnt, SOCK_DOM(so),
7466 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7467 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7468 " extbkidle" : "");
7469 }
7470 return err;
7471 }
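/*
 * Illustrative sketch (not part of the original source): defuncting
 * is a two-step protocol, as so_stop_extended_bk_idle() below also
 * shows: sosetdefunct() marks the socket (and may refuse), then
 * sodefunct() performs the actual teardown. Assuming `p`, `so` and
 * `level` in scope:
 */
#if 0	/* kernel-side usage sketch only */
	if (sosetdefunct(p, so, level, TRUE) == 0 &&
	    (so->so_flags & SOF_DEFUNCT)) {
		(void) sodefunct(p, so, level);
	}
#endif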
7472
7473 int
7474 sodefunct(struct proc *p, struct socket *so, int level)
7475 {
7476 struct sockbuf *rcv, *snd;
7477
7478 if (!(so->so_flags & SOF_DEFUNCT)) {
7479 panic("%s improperly called", __func__);
7480 /* NOTREACHED */
7481 }
7482 if (so->so_state & SS_DEFUNCT) {
7483 goto done;
7484 }
7485
7486 rcv = &so->so_rcv;
7487 snd = &so->so_snd;
7488
7489 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7490 char s[MAX_IPv6_STR_LEN];
7491 char d[MAX_IPv6_STR_LEN];
7492 struct inpcb *inp = sotoinpcb(so);
7493
7494 if (p != PROC_NULL) {
7495 SODEFUNCTLOG(
7496 "%s[%d, %s]: (target pid %d name %s level %d) "
7497 "so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
7498 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7499 " snd_fl 0x%x]\n", __func__,
7500 proc_selfpid(), proc_best_name(current_proc()),
7501 proc_pid(p), proc_best_name(p), level,
7502 so->so_gencnt,
7503 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7504 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7505 (void *)&inp->inp_laddr.s_addr :
7506 (void *)&inp->in6p_laddr),
7507 s, sizeof(s)), ntohs(inp->in6p_lport),
7508 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7509 (void *)&inp->inp_faddr.s_addr :
7510 (void *)&inp->in6p_faddr,
7511 d, sizeof(d)), ntohs(inp->in6p_fport),
7512 (uint32_t)rcv->sb_sel.si_flags,
7513 (uint32_t)snd->sb_sel.si_flags,
7514 rcv->sb_flags, snd->sb_flags);
7515 }
7516 } else if (p != PROC_NULL) {
7517 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7518 "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
7519 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7520 proc_selfpid(), proc_best_name(current_proc()),
7521 proc_pid(p), proc_best_name(p), level,
7522 so->so_gencnt,
7523 SOCK_DOM(so), SOCK_TYPE(so),
7524 (uint32_t)rcv->sb_sel.si_flags,
7525 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7526 snd->sb_flags);
7527 }
7528
7529 /*
7530 * First tell the protocol the flow is defunct
7531 */
7532 (void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);
7533
7534 /*
7535 * Unwedge threads blocked on sbwait() and sb_lock().
7536 */
7537 sbwakeup(rcv);
7538 sbwakeup(snd);
7539
7540 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7541 if (rcv->sb_flags & SB_LOCK) {
7542 sbunlock(rcv, TRUE); /* keep socket locked */
7543 }
7544 if (snd->sb_flags & SB_LOCK) {
7545 sbunlock(snd, TRUE); /* keep socket locked */
7546 }
7547 /*
7548 * Flush the buffers and disconnect. We explicitly call shutdown
7549 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7550 * states are set for the socket. This would also flush out data
7551 * hanging off the receive list of this socket.
7552 */
7553 (void) soshutdownlock_final(so, SHUT_RD);
7554 (void) soshutdownlock_final(so, SHUT_WR);
7555 (void) sodisconnectlocked(so);
7556
7557 /*
7558 * Explicitly handle connectionless-protocol disconnection
7559 * and release any remaining data in the socket buffers.
7560 */
7561 if (!(so->so_state & SS_ISDISCONNECTED)) {
7562 (void) soisdisconnected(so);
7563 }
7564
7565 if (so->so_error == 0) {
7566 so->so_error = EBADF;
7567 }
7568
7569 if (rcv->sb_cc != 0) {
7570 rcv->sb_flags &= ~SB_SEL;
7571 selthreadclear(&rcv->sb_sel);
7572 sbrelease(rcv);
7573 }
7574 if (snd->sb_cc != 0) {
7575 snd->sb_flags &= ~SB_SEL;
7576 selthreadclear(&snd->sb_sel);
7577 sbrelease(snd);
7578 }
7579 so->so_state |= SS_DEFUNCT;
7580 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7581
7582 done:
7583 return 0;
7584 }
7585
7586 int
7587 soresume(struct proc *p, struct socket *so, int locked)
7588 {
7589 if (locked == 0) {
7590 socket_lock(so, 1);
7591 }
7592
7593 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7594 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
7595 "[%d,%d] resumed from bk idle\n",
7596 __func__, proc_selfpid(), proc_best_name(current_proc()),
7597 proc_pid(p), proc_best_name(p),
7598 so->so_gencnt,
7599 SOCK_DOM(so), SOCK_TYPE(so));
7600
7601 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7602 so->so_extended_bk_start = 0;
7603 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7604
7605 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7606 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7607 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7608 }
7609 if (locked == 0) {
7610 socket_unlock(so, 1);
7611 }
7612
7613 return 0;
7614 }
7615
7616 /*
7617 * Does not attempt to account for sockets that are delegated from
7618 * the current process
7619 */
7620 int
7621 so_set_extended_bk_idle(struct socket *so, int optval)
7622 {
7623 int error = 0;
7624
7625 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7626 SOCK_PROTO(so) != IPPROTO_TCP) {
7627 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7628 error = EOPNOTSUPP;
7629 } else if (optval == 0) {
7630 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7631
7632 soresume(current_proc(), so, 1);
7633 } else {
7634 struct proc *p = current_proc();
7635 struct fileproc *fp;
7636 int count = 0;
7637
7638 /*
7639 * Unlock socket to avoid lock ordering issue with
7640 * the proc fd table lock
7641 */
7642 socket_unlock(so, 0);
7643
7644 proc_fdlock(p);
7645 fdt_foreach(fp, p) {
7646 struct socket *so2;
7647
7648 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7649 continue;
7650 }
7651
7652 so2 = (struct socket *)fp_get_data(fp);
7653 if (so != so2 &&
7654 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7655 count++;
7656 }
7657 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7658 break;
7659 }
7660 }
7661 proc_fdunlock(p);
7662
7663 socket_lock(so, 0);
7664
7665 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7666 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7667 error = EBUSY;
7668 } else if (so->so_flags & SOF_DELEGATED) {
7669 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7670 error = EBUSY;
7671 } else {
7672 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7673 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7674 }
7675 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7676 "%s marked for extended bk idle\n",
7677 __func__, proc_selfpid(), proc_best_name(current_proc()),
7678 so->so_gencnt,
7679 SOCK_DOM(so), SOCK_TYPE(so),
7680 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7681 "is" : "not");
7682 }
7683
7684 return error;
7685 }
7686
7687 static void
7688 so_stop_extended_bk_idle(struct socket *so)
7689 {
7690 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7691 so->so_extended_bk_start = 0;
7692
7693 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7694 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7695 /*
7696 * Force defunct
7697 */
7698 sosetdefunct(current_proc(), so,
7699 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7700 if (so->so_flags & SOF_DEFUNCT) {
7701 sodefunct(current_proc(), so,
7702 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7703 }
7704 }
7705
7706 void
7707 so_drain_extended_bk_idle(struct socket *so)
7708 {
7709 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7710 /*
7711 * Only penalize sockets that have outstanding data
7712 */
7713 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7714 so_stop_extended_bk_idle(so);
7715
7716 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7717 }
7718 }
7719 }
7720
7721 /*
7722 * Return value tells whether the socket is still in extended background idle mode
7723 */
7724 int
7725 so_check_extended_bk_idle_time(struct socket *so)
7726 {
7727 int ret = 1;
7728
7729 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7730 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7731 __func__, proc_selfpid(), proc_best_name(current_proc()),
7732 so->so_gencnt,
7733 SOCK_DOM(so), SOCK_TYPE(so));
7734 if (net_uptime() - so->so_extended_bk_start >
7735 soextbkidlestat.so_xbkidle_time) {
7736 so_stop_extended_bk_idle(so);
7737
7738 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7739
7740 ret = 0;
7741 } else {
7742 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7743
7744 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7745 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7746 }
7747 }
7748
7749 return ret;
7750 }
7751
7752 void
7753 resume_proc_sockets(proc_t p)
7754 {
7755 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7756 struct fileproc *fp;
7757 struct socket *so;
7758
7759 proc_fdlock(p);
7760 fdt_foreach(fp, p) {
7761 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7762 continue;
7763 }
7764
7765 so = (struct socket *)fp_get_data(fp);
7766 (void) soresume(p, so, 0);
7767 }
7768 proc_fdunlock(p);
7769
7770 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7771 }
7772 }
7773
7774 __private_extern__ int
7775 so_set_recv_anyif(struct socket *so, int optval)
7776 {
7777 int ret = 0;
7778
7779 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7780 if (optval) {
7781 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7782 } else {
7783 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7784 }
7785 #if SKYWALK
7786 inp_update_netns_flags(so);
7787 #endif /* SKYWALK */
7788 }
7789
7790
7791 return ret;
7792 }
7793
7794 __private_extern__ int
7795 so_get_recv_anyif(struct socket *so)
7796 {
7797 int ret = 0;
7798
7799 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7800 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7801 }
7802
7803 return ret;
7804 }
7805
7806 int
7807 so_set_restrictions(struct socket *so, uint32_t vals)
7808 {
7809 int nocell_old, nocell_new;
7810 int noexpensive_old, noexpensive_new;
7811 int noconstrained_old, noconstrained_new;
7812
7813 /*
7814 * Deny-type restrictions are trapdoors; once set they cannot be
7815 * unset for the lifetime of the socket. This allows them to be
7816 * issued by a framework on behalf of the application without
7817 * having to worry that they can be undone.
7818 *
7819 * Note here that socket-level restrictions override any protocol-
7820 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7821 * restriction issued on the socket takes precedence over
7822 * INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7823 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7824 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7825 */
7826 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7827 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7828 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7829 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7830 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7831 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7832 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7833 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7834 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7835
7836 /* we can only set, not clear restrictions */
7837 if ((nocell_new - nocell_old) == 0 &&
7838 (noexpensive_new - noexpensive_old) == 0 &&
7839 (noconstrained_new - noconstrained_old) == 0) {
7840 return 0;
7841 }
7842 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7843 if (nocell_new - nocell_old != 0) {
7844 /*
7845 * if deny cellular is now set, do what's needed
7846 * for INPCB
7847 */
7848 inp_set_nocellular(sotoinpcb(so));
7849 }
7850 if (noexpensive_new - noexpensive_old != 0) {
7851 inp_set_noexpensive(sotoinpcb(so));
7852 }
7853 if (noconstrained_new - noconstrained_old != 0) {
7854 inp_set_noconstrained(sotoinpcb(so));
7855 }
7856 }
7857
7858 if (SOCK_DOM(so) == PF_MULTIPATH) {
7859 mptcp_set_restrictions(so);
7860 }
7861
7862 return 0;
7863 }
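/*
 * Illustrative sketch (not part of the original source): because
 * these deny-type restrictions are trapdoors, a framework can apply
 * them once and rely on them sticking. Assuming the private
 * SO_RESTRICTIONS socket option that feeds this routine and a socket
 * descriptor `s`:
 */
#if 0	/* user-space usage sketch only; SO_RESTRICTIONS is private */
	uint32_t restrictions = SO_RESTRICT_DENY_CELLULAR;

	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS,
	    &restrictions, sizeof(restrictions));
#endif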
7864
7865 uint32_t
7866 so_get_restrictions(struct socket *so)
7867 {
7868 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7869 SO_RESTRICT_DENY_OUT |
7870 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7871 }
7872
7873 int
7874 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7875 {
7876 struct proc *ep = PROC_NULL;
7877 int error = 0;
7878
7879 /* pid 0 is reserved for kernel */
7880 if (epid == 0) {
7881 error = EINVAL;
7882 goto done;
7883 }
7884
7885 /*
7886 * If this is an in-kernel socket, prevent its delegate
7887 * association from changing unless the socket option is
7888 * coming from within the kernel itself.
7889 */
7890 if (so->last_pid == 0 && p != kernproc) {
7891 error = EACCES;
7892 goto done;
7893 }
7894
7895 /*
7896 * If this is issued by a process that's recorded as the
7897 * real owner of the socket, or if the pid is the same as
7898 * the process's own pid, then proceed. Otherwise ensure
7899 * that the issuing process has the necessary privileges.
7900 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option,
	 * which could be kernproc.  Given that we don't allow 0 for
	 * the effective pid, a delegated in-kernel socket stays
	 * delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif /* XNU_TARGET_OS_OSX */
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
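
/*
 * Illustrative sketch, not part of the original source: the pid-based
 * delegation path above is assumed to be reachable from userspace via
 * the (private) SO_DELEGATED socket option, roughly:
 *
 *	pid_t epid = <pid of the process the traffic is on behalf of>;
 *
 *	setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 *
 * Passing the caller's own pid clears the delegate association, per
 * the epid == proc_pid(p) case above.
 */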

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * Unless the specified effective UUID matches both the UUID
	 * recorded as the real owner of the socket and the issuing
	 * process's own executable UUID, ensure that the issuing
	 * process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option, which could be kernproc itself.  Given
	 * that we don't allow a null effective uuid, a delegated
	 * in-kernel socket stays delegated during its lifetime
	 * (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name, as it is
	 * the same as the real process.
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
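
/*
 * Illustrative sketch, not part of the original source: the UUID-based
 * variant above is assumed to be driven by the (private)
 * SO_DELEGATED_UUID socket option, roughly:
 *
 *	uuid_t euuid;	// executable UUID of the effective owner
 *
 *	setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID, euuid, sizeof(euuid));
 *
 * Passing the caller's own executable UUID clears the delegate
 * association, per the uuid_compare(euuid, uuid) == 0 case above.
 */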

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
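
/*
 * Illustrative sketch, an assumption rather than code from this file:
 * a typical caller embeds the common header in an event-specific
 * structure and posts it through this routine, e.g.:
 *
 *	struct kev_netpolicy_ifdenied ev_ifdenied;
 *
 *	bzero(&ev_ifdenied, sizeof(ev_ifdenied));
 *	// ... fill in ev_ifdenied.ev_data and the event-specific fields ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
 *	    sizeof(ev_ifdenied));
 *
 * The VERIFY() above holds because ev_data is the leading member and
 * ev_datalen covers the full event structure.
 */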

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
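
/*
 * Illustrative sketch, not part of the original source: a process is
 * assumed to opt in to the closed-socket event with the (private)
 * SO_WANT_KEV_SOCKET_CLOSED option, which sets the
 * SOF1_WANT_KEV_SOCK_CLOSED flag checked above:
 *
 *	int on = 1;
 *
 *	setsockopt(s, SOL_SOCKET, SO_WANT_KEV_SOCKET_CLOSED, &on, sizeof(on));
 */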

__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}

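/*
 * Note: assfail() is the slow path behind assertion macros such as
 * VERIFY(), used earlier in this file; under typical XNU definitions
 * it expands along the lines of
 *
 *	#define VERIFY(EX) \
 *	    ((void)(__probable((EX)) || assfail(#EX, __FILE__, __LINE__)))
 *
 * which is why assfail() is declared to return int even though it
 * never returns: the assertion text, file, and line number arrive
 * here verbatim from the macro expansion.
 */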