1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <sys/persona.h>
100 #include <net/route.h>
101 #include <net/init.h>
102 #include <net/net_api_stats.h>
103 #include <net/ntstat.h>
104 #include <net/content_filter.h>
105 #include <netinet/in.h>
106 #include <netinet/in_pcb.h>
107 #include <netinet/in_tclass.h>
108 #include <netinet/in_var.h>
109 #include <netinet/tcp_var.h>
110 #include <netinet/ip6.h>
111 #include <netinet6/ip6_var.h>
112 #include <netinet/flow_divert.h>
113 #include <kern/zalloc.h>
114 #include <kern/locks.h>
115 #include <machine/limits.h>
116 #include <libkern/OSAtomic.h>
117 #include <pexpert/pexpert.h>
118 #include <kern/assert.h>
119 #include <kern/task.h>
120 #include <kern/policy_internal.h>
121
122 #include <sys/kpi_mbuf.h>
123 #include <sys/mcache.h>
124 #include <sys/unpcb.h>
125 #include <libkern/section_keywords.h>
126
127 #include <os/log.h>
128
129 #if CONFIG_MACF
130 #include <security/mac_framework.h>
131 #endif /* MAC */
132
133 #if MULTIPATH
134 #include <netinet/mp_pcb.h>
135 #include <netinet/mptcp_var.h>
136 #endif /* MULTIPATH */
137
138 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
139
140 #if DEBUG || DEVELOPMENT
141 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
142 #else
143 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
144 #endif
145
146 /* TODO: this should be in a header file somewhere */
147 extern char *proc_name_address(void *p);
148
149 static u_int32_t so_cache_hw; /* High water mark for socache */
150 static u_int32_t so_cache_timeouts; /* number of timeouts */
151 static u_int32_t so_cache_max_freed; /* max freed per timeout */
152 static u_int32_t cached_sock_count = 0;
153 STAILQ_HEAD(, socket) so_cache_head;
154 int max_cached_sock_count = MAX_CACHED_SOCKETS;
155 static uint64_t so_cache_time;
156 static int socketinit_done;
157 static struct zone *so_cache_zone;
158
159 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
160 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
161
162 #include <machine/limits.h>
163
164 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
165 static void filt_sordetach(struct knote *kn);
166 static int filt_soread(struct knote *kn, long hint);
167 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
168 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
169
170 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
171 static void filt_sowdetach(struct knote *kn);
172 static int filt_sowrite(struct knote *kn, long hint);
173 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
174 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
175
176 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
177 static void filt_sockdetach(struct knote *kn);
178 static int filt_sockev(struct knote *kn, long hint);
179 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
180 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
181
182 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
183 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
184
185 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
186 .f_isfd = 1,
187 .f_attach = filt_sorattach,
188 .f_detach = filt_sordetach,
189 .f_event = filt_soread,
190 .f_touch = filt_sortouch,
191 .f_process = filt_sorprocess,
192 };
193
194 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
195 .f_isfd = 1,
196 .f_attach = filt_sowattach,
197 .f_detach = filt_sowdetach,
198 .f_event = filt_sowrite,
199 .f_touch = filt_sowtouch,
200 .f_process = filt_sowprocess,
201 };
202
203 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
204 .f_isfd = 1,
205 .f_attach = filt_sockattach,
206 .f_detach = filt_sockdetach,
207 .f_event = filt_sockev,
208 .f_touch = filt_socktouch,
209 .f_process = filt_sockprocess,
210 };
211
212 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
213 .f_isfd = 1,
214 .f_attach = filt_sorattach,
215 .f_detach = filt_sordetach,
216 .f_event = filt_soread,
217 .f_touch = filt_sortouch,
218 .f_process = filt_sorprocess,
219 };
220
221 SYSCTL_DECL(_kern_ipc);
222
223 #define EVEN_MORE_LOCKING_DEBUG 0
224
225 int socket_debug = 0;
226 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
227 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
228
229 #if (DEBUG || DEVELOPMENT)
230 #define DEFAULT_SOSEND_ASSERT_PANIC 1
231 #else
232 #define DEFAULT_SOSEND_ASSERT_PANIC 0
233 #endif /* (DEBUG || DEVELOPMENT) */
234
235 int sosend_assert_panic = 0;
236 SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
238
239 static unsigned long sodefunct_calls = 0;
240 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
241 &sodefunct_calls, "");
242
243 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
244 so_gen_t so_gencnt; /* generation count for sockets */
245
246 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
247
248 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
249 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
250 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
251 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
252 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
253 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
254 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
255 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
256 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
257
258 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
259
260 int somaxconn = SOMAXCONN;
261 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
262 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
263
264 /* Should we get a maximum also ??? */
265 static int sosendmaxchain = 65536;
266 static int sosendminchain = 16384;
267 static int sorecvmincopy = 16384;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
270 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
271 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
272
273 /*
274 * Set to enable jumbo clusters (if available) for large writes when
275 * the socket is marked with SOF_MULTIPAGES; see below.
276 */
277 int sosendjcl = 1;
278 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
279 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
280
281 /*
282 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
283 * writes on the socket for all protocols on any network interfaces,
284 * depending upon sosendjcl above. Be extra careful when setting this
285 * to 1, because sending down packets that cross physical pages down to
286 * broken drivers (those that falsely assume that the physical pages
287 * are contiguous) might lead to system panics or silent data corruption.
288 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
289 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
290 * capable. Set this to 1 only for testing/debugging purposes.
291 */
292 int sosendjcl_ignore_capab = 0;
293 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
294 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
295
296 /*
297 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
298 * writes on the socket for all protocols on any network interfaces.
299 * Be extra careful when setting this to 1, because sending down packets with
300 * clusters larger that 2 KB might lead to system panics or data corruption.
301 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
302 * on the outgoing interface
303 * Set this to 1 for testing/debugging purposes only.
304 */
305 int sosendbigcl_ignore_capab = 0;
306 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
307 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
308
309 int sodefunctlog = 0;
310 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
311 &sodefunctlog, 0, "");
312
313 int sothrottlelog = 0;
314 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
315 &sothrottlelog, 0, "");
316
317 int sorestrictrecv = 1;
318 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
319 &sorestrictrecv, 0, "Enable inbound interface restrictions");
320
321 int sorestrictsend = 1;
322 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
323 &sorestrictsend, 0, "Enable outbound interface restrictions");
324
325 int soreserveheadroom = 1;
326 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
327 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
328
329 #if (DEBUG || DEVELOPMENT)
330 int so_notsent_lowat_check = 1;
331 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
332 &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
333 #endif /* DEBUG || DEVELOPMENT */
334
335 int so_accept_list_waits = 0;
336 #if (DEBUG || DEVELOPMENT)
337 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
338 &so_accept_list_waits, 0, "number of waits for listener incomp list");
339 #endif /* DEBUG || DEVELOPMENT */
340
341 extern struct inpcbinfo tcbinfo;
342
343 /* TODO: these should be in header file */
344 extern int get_inpcb_str_size(void);
345 extern int get_tcp_str_size(void);
346
347 vm_size_t so_cache_zone_element_size;
348
349 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
350 user_ssize_t *);
351 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
352 static void cached_sock_free(struct socket *);
353
354 /*
355 * Maximum of extended background idle sockets per process
356 * Set to zero to disable further setting of the option
357 */
358
359 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
360 #define SO_IDLE_BK_IDLE_TIME 600
361 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
362
363 struct soextbkidlestat soextbkidlestat;
364
365 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
366 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
367 "Maximum of extended background idle sockets per process");
368
369 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
370 &soextbkidlestat.so_xbkidle_time, 0,
371 "Time in seconds to keep extended background idle sockets");
372
373 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
374 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
375 "High water mark for extended background idle sockets");
376
377 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
378 &soextbkidlestat, soextbkidlestat, "");
379
380 int so_set_extended_bk_idle(struct socket *, int);
381
382 #define SO_MAX_MSG_X 1024
383
384 /*
385 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
386 * setting the DSCP code on the packet based on the service class; see
387 * <rdar://problem/11277343> for details.
388 */
389 __private_extern__ u_int32_t sotcdb = 0;
390 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
391 &sotcdb, 0, "");
392
/*
 * socketinit
 *
 * One-time initialization of the socket layer: compile-time layout
 * checks, boot-arg overrides, the cached-socket free list and its
 * backing zone, and the extended-background-idle tunables.  Safe to
 * call more than once; repeat calls log and return early.
 */
void
socketinit(void)
{
	/* so_gencnt is bumped with 64-bit atomics; verify its size/alignment */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	/*
	 * The kernel's struct sa_endpoints must be layout-identical to the
	 * userland (user64) variant so it can be copied in/out directly.
	 */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	/* Same layout check against the user32 variant on 32-bit kernels */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow debug/panic behavior to be overridden from boot-args */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
	    sizeof(sosend_assert_panic));

	STAILQ_INIT(&so_cache_head);

	/*
	 * Each cache element carries the socket plus inpcb and tcpcb
	 * storage in one allocation; the "+ 4" padding leaves room for
	 * the longword alignment done in cached_sock_alloc().
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
442
/*
 * cached_sock_alloc
 *
 * Allocate a socket for the fast (TCP/IPv4) path.  Prefer recycling an
 * entry from the so_cache free list; otherwise carve a fresh element
 * out of so_cache_zone, whose elements bundle the socket together with
 * the saved inpcb/tcpcb storage (see so_cache_zone_element_size).
 * Always marks the result SOF1_CACHED_IN_SOCK_LAYER so sodealloc()
 * knows to return it via cached_sock_free().
 */
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		/*
		 * Zero the recycled socket but preserve so_saved_pcb, which
		 * points into the same allocation and was set up below on
		 * the first allocation of this element.
		 */
		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		/* Cache empty: drop the lock before going to the zone */
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		/* inpcb storage immediately follows the socket... */
		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		/* ...and the tcpcb storage follows the inpcb */
		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
492
/*
 * cached_sock_free
 *
 * Return a SOF1_CACHED_IN_SOCK_LAYER socket to the cache free list, or
 * free it back to the zone when the cache is already at
 * max_cached_sock_count.  Cached entries are timestamped so that
 * so_cache_timer() can reclaim ones that sit unused too long.
 */
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		/* Cache full: undo the count bump and free outside the lock */
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		/* Track the high-water mark for the socache sysctl stats */
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}
514
/*
 * so_update_last_owner_locked
 *
 * Record the process most recently operating on the socket (pid, unique
 * pid, executable UUID) and refresh the originator UUID.  Expects the
 * socket to be locked by the caller.  `self' may be PROC_NULL, in which
 * case the current process is used.
 */
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		/* Only refresh ownership info when the owner actually changed */
		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			/* Let the protocol mirror the ownership change, if it cares */
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}
540
541 void
so_update_policy(struct socket * so)542 so_update_policy(struct socket *so)
543 {
544 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
545 (void) inp_update_policy(sotoinpcb(so));
546 }
547 }
548
#if NECP
/*
 * so_update_necp_policy
 *
 * Re-evaluate NECP policy for internet-family sockets, optionally with
 * caller-supplied local/remote address overrides.  Non-inet domains are
 * ignored.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	const int domain = SOCK_DOM(so);

	if (domain != PF_INET && domain != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
560
/*
 * so_cache_timer
 *
 * Periodic reclaim of the cached-socket free list: frees entries whose
 * timestamp is older than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation.  Returns TRUE when entries
 * remain so the caller should schedule another pass.
 */
boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		/*
		 * Entries are inserted at the tail, so the list is ordered
		 * by age; stop at the first entry that is still fresh.
		 */
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		/* Bound the work done per tick; note it in the stats */
		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}
599
600 /*
601 * Get a socket structure from our zone, and initialize it.
602 * We don't implement `waitok' yet (see comments in uipc_domain.c).
603 * Note that it would probably be better to allocate socket
604 * and PCB at the same time, but I'm not convinced that all
605 * the protocols can be easily modified to do this.
606 */
607 struct socket *
soalloc(int waitok,int dom,int type)608 soalloc(int waitok, int dom, int type)
609 {
610 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
611 struct socket *so;
612
613 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
614 cached_sock_alloc(&so, how);
615 } else {
616 so = zalloc_flags(socket_zone, how | Z_ZERO);
617 }
618 if (so != NULL) {
619 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
620
621 /*
622 * Increment the socket allocation statistics
623 */
624 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
625 }
626
627 return so;
628 }
629
/*
 * socreate_internal
 *
 * Create a socket in domain `dom' of the given `type'/`proto' on behalf
 * of process `p', optionally delegated to effective process `ep'.
 * Looks up the protocol switch entry, allocates the socket, attaches
 * the protocol (which creates the PCB), installs socket filters, and
 * applies default traffic-class/background policy.  On success the new
 * socket is returned through *aso with a use count held for creation.
 *
 * Returns 0, or EAFNOSUPPORT / EPROTOTYPE / EPROTONOSUPPORT / ENOBUFS /
 * whatever the protocol's pru_attach returns.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	/* Resolve the protocol switch entry by protocol or by type */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/* Distinguish "no such domain" from "no such protocol" */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			/* Protocol exists in the domain, but not with this type */
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain allocation statistics */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP subflow sockets are always non-blocking */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	/* Record the creating process as the last owner */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	/* Record the delegated (effective) process when it differs from p */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	/* Record the responsible process, when one exists and can be found */
	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_persona_id = current_persona_get_id();
	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1); /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
852
853 /*
854 * Returns: 0 Success
855 * EAFNOSUPPORT
856 * EPROTOTYPE
857 * EPROTONOSUPPORT
858 * ENOBUFS
859 * <pru_attach>:ENOBUFS[AF_UNIX]
860 * <pru_attach>:ENOBUFS[TCP]
861 * <pru_attach>:ENOMEM[TCP]
862 * <pru_attach>:??? [other protocol families, IPSEC]
863 */
864 int
socreate(int dom,struct socket ** aso,int type,int proto)865 socreate(int dom, struct socket **aso, int type, int proto)
866 {
867 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
868 PROC_NULL);
869 }
870
871 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)872 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
873 {
874 int error = 0;
875 struct proc *ep = PROC_NULL;
876
877 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
878 error = ESRCH;
879 goto done;
880 }
881
882 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
883
884 /*
885 * It might not be wise to hold the proc reference when calling
886 * socreate_internal since it calls soalloc with M_WAITOK
887 */
888 done:
889 if (ep != PROC_NULL) {
890 proc_rele(ep);
891 }
892
893 return error;
894 }
895
896 /*
897 * Returns: 0 Success
898 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
899 * <pru_bind>:EAFNOSUPPORT Address family not supported
900 * <pru_bind>:EADDRNOTAVAIL Address not available.
901 * <pru_bind>:EINVAL Invalid argument
902 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
903 * <pru_bind>:EACCES Permission denied
904 * <pru_bind>:EADDRINUSE Address in use
905 * <pru_bind>:EAGAIN Resource unavailable, try again
906 * <pru_bind>:EPERM Operation not permitted
907 * <pru_bind>:???
908 * <sf_bind>:???
909 *
910 * Notes: It's not possible to fully enumerate the return codes above,
911 * since socket filter authors and protocol family authors may
912 * not choose to limit their error returns to those listed, even
913 * though this may result in some software operating incorrectly.
914 *
915 * The error codes which are enumerated above are those known to
916 * be returned by the tcp_usr_bind function supplied.
917 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	/* Optionally take the socket lock; some callers already hold it. */
	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy state now that we hold the lock. */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	/* Hand the request to the protocol only if no filter objected. */
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means "handled, report success". */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
965
/*
 * Final teardown of a socket structure: release the credential,
 * detach socket filters, and return the memory to its cache or zone.
 */
void
sodealloc(struct socket *so)
{
	/* Drop the credential reference taken at socket creation. */
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* Bump the global generation count so stale references are detectable. */
	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	/* Cached sockets go back to the socket-layer cache, others to the zone. */
	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}
982
983 /*
984 * Returns: 0 Success
985 * EINVAL
986 * EOPNOTSUPP
987 * <pru_listen>:EINVAL[AF_UNIX]
988 * <pru_listen>:EINVAL[TCP]
989 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
990 * <pru_listen>:EINVAL[TCP] Invalid argument
991 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
992 * <pru_listen>:EACCES[TCP] Permission denied
993 * <pru_listen>:EADDRINUSE[TCP] Address in use
994 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
995 * <pru_listen>:EPERM[TCP] Operation not permitted
996 * <sf_listen>:???
997 *
998 * Notes: Other <pru_listen> returns depend on the protocol family; all
999 * <sf_listen> returns depend on what the filter author causes
1000 * their filter to return.
1001 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/* Tentatively mark as accepting; every error path below clears it. */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* listen(2) only makes sense for connection-oriented protocols. */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Inbound traffic is administratively denied for this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Let socket filters veto first, then notify the protocol. */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled, report success". */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
1094
1095 /*
1096 * The "accept list lock" protects the fields related to the listener queues
1097 * because we can unlock a socket to respect the lock ordering between
1098 * the listener socket and its clients sockets. The lock ordering is first to
1099 * acquire the client socket before the listener socket.
1100 *
1101 * The accept list lock serializes access to the following fields:
1102 * - of the listener socket:
1103 * - so_comp
1104 * - so_incomp
1105 * - so_qlen
1106 * - so_inqlen
1107 * - of client sockets that are in so_comp or so_incomp:
1108 * - so_head
1109 * - so_list
1110 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
1113 *
1114 * Note that those fields may be read without holding the accept list lock
1115 * for a preflight provided the accept list lock is taken when committing
1116 * to take an action based on the result of the preflight. The preflight
1117 * saves the cost of doing the unlock/lock dance.
1118 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* Nothing to serialize when the protocol uses a domain-wide lock. */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the accept list, claim it and return. */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/*
	 * We are going to sleep on the listener: drop the client socket's
	 * lock first to respect the client-before-listener lock ordering
	 * described in the block comment above.
	 */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/* Reacquire in the proper order: client socket first, then listener. */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1149
1150 void
so_release_accept_list(struct socket * head)1151 so_release_accept_list(struct socket *head)
1152 {
1153 if (head->so_proto->pr_getlock != NULL) {
1154 lck_mtx_t *mutex_held;
1155
1156 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1157 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1158
1159 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1160 wakeup((caddr_t)&head->so_incomp);
1161 }
1162 }
1163
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready to be freed yet (the PCB is still attached or a file
	 * descriptor still references the socket): just quiesce the select
	 * threads and upcalls and return without deallocating.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete-connection queue. */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			/* Has a head but sits on neither queue: unexpected, log it. */
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	/* Flush both socket buffers before the final teardown. */
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1252
/*
 * Block the closing thread until pending socket upcalls have drained.
 * Called with the socket lock held; sleeps on so_upcallusecount.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Stop new upcalls from being armed while we wait. */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/* Sleep until the last in-flight upcall drops so_upcallusecount. */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1281
1282 /*
1283 * Close a socket on last file table reference removal.
1284 * Initiate disconnect if connected.
1285 * Free socket when disconnect complete.
1286 */
1287 int
soclose_locked(struct socket * so)1288 soclose_locked(struct socket *so)
1289 {
1290 int error = 0;
1291 struct timespec ts;
1292
1293 if (so->so_usecount == 0) {
1294 panic("soclose: so=%p refcount=0", so);
1295 /* NOTREACHED */
1296 }
1297
1298 sflt_notify(so, sock_evt_closing, NULL);
1299
1300 if (so->so_upcallusecount) {
1301 soclose_wait_locked(so);
1302 }
1303
1304 #if CONTENT_FILTER
1305 /*
1306 * We have to wait until the content filters are done
1307 */
1308 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1309 cfil_sock_close_wait(so);
1310 cfil_sock_is_closed(so);
1311 cfil_sock_detach(so);
1312 }
1313 #endif /* CONTENT_FILTER */
1314
1315 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1316 soflow_detach(so);
1317 }
1318
1319 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1320 soresume(current_proc(), so, 1);
1321 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1322 }
1323
1324 if ((so->so_options & SO_ACCEPTCONN)) {
1325 struct socket *sp, *sonext;
1326 int persocklock = 0;
1327 int incomp_overflow_only;
1328
1329 /*
1330 * We do not want new connection to be added
1331 * to the connection queues
1332 */
1333 so->so_options &= ~SO_ACCEPTCONN;
1334
1335 /*
1336 * We can drop the lock on the listener once
1337 * we've acquired the incoming list
1338 */
1339 if (so->so_proto->pr_getlock != NULL) {
1340 persocklock = 1;
1341 so_acquire_accept_list(so, NULL);
1342 socket_unlock(so, 0);
1343 }
1344 again:
1345 incomp_overflow_only = 1;
1346
1347 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1348 /*
1349 * Radar 5350314
1350 * skip sockets thrown away by tcpdropdropblreq
1351 * they will get cleanup by the garbage collection.
1352 * otherwise, remove the incomp socket from the queue
1353 * and let soabort trigger the appropriate cleanup.
1354 */
1355 if (sp->so_flags & SOF_OVERFLOW) {
1356 continue;
1357 }
1358
1359 if (persocklock != 0) {
1360 socket_lock(sp, 1);
1361 }
1362
1363 /*
1364 * Radar 27945981
1365 * The extra reference for the list insure the
1366 * validity of the socket pointer when we perform the
1367 * unlock of the head above
1368 */
1369 if (sp->so_state & SS_INCOMP) {
1370 sp->so_state &= ~SS_INCOMP;
1371 sp->so_head = NULL;
1372 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1373 so->so_incqlen--;
1374 so->so_qlen--;
1375
1376 (void) soabort(sp);
1377 } else {
1378 panic("%s sp %p in so_incomp but !SS_INCOMP",
1379 __func__, sp);
1380 }
1381
1382 if (persocklock != 0) {
1383 socket_unlock(sp, 1);
1384 }
1385 }
1386
1387 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1388 /* Dequeue from so_comp since sofree() won't do it */
1389 if (persocklock != 0) {
1390 socket_lock(sp, 1);
1391 }
1392
1393 if (sp->so_state & SS_COMP) {
1394 sp->so_state &= ~SS_COMP;
1395 sp->so_head = NULL;
1396 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1397 so->so_qlen--;
1398
1399 (void) soabort(sp);
1400 } else {
1401 panic("%s sp %p in so_comp but !SS_COMP",
1402 __func__, sp);
1403 }
1404
1405 if (persocklock) {
1406 socket_unlock(sp, 1);
1407 }
1408 }
1409
1410 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1411 #if (DEBUG | DEVELOPMENT)
1412 panic("%s head %p so_comp not empty", __func__, so);
1413 #endif /* (DEVELOPMENT || DEBUG) */
1414
1415 goto again;
1416 }
1417
1418 if (!TAILQ_EMPTY(&so->so_comp)) {
1419 #if (DEBUG | DEVELOPMENT)
1420 panic("%s head %p so_comp not empty", __func__, so);
1421 #endif /* (DEVELOPMENT || DEBUG) */
1422
1423 goto again;
1424 }
1425
1426 if (persocklock) {
1427 socket_lock(so, 0);
1428 so_release_accept_list(so);
1429 }
1430 }
1431 if (so->so_pcb == NULL) {
1432 /* 3915887: mark the socket as ready for dealloc */
1433 so->so_flags |= SOF_PCBCLEARING;
1434 goto discard;
1435 }
1436
1437 if (so->so_state & SS_ISCONNECTED) {
1438 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1439 error = sodisconnectlocked(so);
1440 if (error) {
1441 goto drop;
1442 }
1443 }
1444 if (so->so_options & SO_LINGER) {
1445 if ((so->so_state & SS_ISDISCONNECTING) &&
1446 (so->so_state & SS_NBIO)) {
1447 goto drop;
1448 }
1449 while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1450 lck_mtx_t *mutex_held;
1451
1452 if (so->so_proto->pr_getlock != NULL) {
1453 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1454 } else {
1455 mutex_held = so->so_proto->pr_domain->dom_mtx;
1456 }
1457 ts.tv_sec = (so->so_linger / 100);
1458 ts.tv_nsec = (so->so_linger % 100) *
1459 NSEC_PER_USEC * 1000 * 10;
1460 error = msleep((caddr_t)&so->so_timeo,
1461 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1462 if (error) {
1463 /*
1464 * It's OK when the time fires,
1465 * don't report an error
1466 */
1467 if (error == EWOULDBLOCK) {
1468 error = 0;
1469 }
1470 break;
1471 }
1472 }
1473 }
1474 }
1475 drop:
1476 if (so->so_usecount == 0) {
1477 panic("soclose: usecount is zero so=%p", so);
1478 /* NOTREACHED */
1479 }
1480 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1481 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1482 if (error == 0) {
1483 error = error2;
1484 }
1485 }
1486 if (so->so_usecount <= 0) {
1487 panic("soclose: usecount is zero so=%p", so);
1488 /* NOTREACHED */
1489 }
1490 discard:
1491 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1492 (so->so_state & SS_NOFDREF)) {
1493 panic("soclose: NOFDREF");
1494 /* NOTREACHED */
1495 }
1496 so->so_state |= SS_NOFDREF;
1497
1498 if ((so->so_flags & SOF_KNOTE) != 0) {
1499 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1500 }
1501
1502 os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1503
1504 VERIFY(so->so_usecount > 0);
1505 so->so_usecount--;
1506 sofree(so);
1507 return error;
1508 }
1509
1510 int
soclose(struct socket * so)1511 soclose(struct socket *so)
1512 {
1513 int error = 0;
1514 socket_lock(so, 1);
1515
1516 if (so->so_retaincnt == 0) {
1517 error = soclose_locked(so);
1518 } else {
1519 /*
1520 * if the FD is going away, but socket is
1521 * retained in kernel remove its reference
1522 */
1523 so->so_usecount--;
1524 if (so->so_usecount < 2) {
1525 panic("soclose: retaincnt non null and so=%p "
1526 "usecount=%d\n", so, so->so_usecount);
1527 }
1528 }
1529 socket_unlock(so, 1);
1530 return error;
1531 }
1532
1533 /*
1534 * Must be called at splnet...
1535 */
1536 /* Should already be locked */
1537 int
soabort(struct socket * so)1538 soabort(struct socket *so)
1539 {
1540 int error;
1541
1542 #ifdef MORE_LOCKING_DEBUG
1543 lck_mtx_t *mutex_held;
1544
1545 if (so->so_proto->pr_getlock != NULL) {
1546 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1547 } else {
1548 mutex_held = so->so_proto->pr_domain->dom_mtx;
1549 }
1550 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1551 #endif
1552
1553 if ((so->so_flags & SOF_ABORTED) == 0) {
1554 so->so_flags |= SOF_ABORTED;
1555 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1556 if (error) {
1557 sofree(so);
1558 return error;
1559 }
1560 }
1561 return 0;
1562 }
1563
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	/* Optionally take the socket lock; some callers already hold it. */
	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/* A socket being accepted must still be in the no-fd-reference state. */
	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	/* Let the protocol complete the accept and return the peer address. */
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1590
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	/* Convenience wrapper: accept with the socket lock taken and dropped here. */
	return soacceptlock(so, nam, 1);
}
1596
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Could not obtain the endpoint addresses: drop the connection. */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	/* Give attached socket filters a look at the new connection. */
	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1652
1653 /*
1654 * Returns: 0 Success
1655 * EOPNOTSUPP Operation not supported on socket
1656 * EISCONN Socket is connected
1657 * <pru_connect>:EADDRNOTAVAIL Address not available.
1658 * <pru_connect>:EINVAL Invalid argument
1659 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1660 * <pru_connect>:EACCES Permission denied
1661 * <pru_connect>:EADDRINUSE Address in use
1662 * <pru_connect>:EAGAIN Resource unavailable, try again
1663 * <pru_connect>:EPERM Operation not permitted
1664 * <sf_connect_out>:??? [anything a filter writer might set]
1665 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	/* Optionally take the socket lock; some callers already hold it. */
	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic is administratively denied for this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			/* Use the delegated app's UUID when the socket is delegated. */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

#if NECP
		/* Update NECP evaluation after setting any domain via the tracker checks */
		so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: a filter took over the connect. */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1762
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	/* Convenience wrapper: connect with the socket lock taken and dropped here. */
	return soconnectlock(so, nam, 1);
}
1768
1769 /*
1770 * Returns: 0 Success
1771 * <pru_connect2>:EINVAL[AF_UNIX]
1772 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1773 * <pru_connect2>:??? [other protocol families]
1774 *
1775 * Notes: <pru_connect2> is not supported by [TCP].
1776 */
1777 int
soconnect2(struct socket * so1,struct socket * so2)1778 soconnect2(struct socket *so1, struct socket *so2)
1779 {
1780 int error;
1781
1782 socket_lock(so1, 1);
1783 if (so2->so_proto->pr_lock) {
1784 socket_lock(so2, 1);
1785 }
1786
1787 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1788
1789 socket_unlock(so1, 1);
1790 if (so2->so_proto->pr_lock) {
1791 socket_unlock(so2, 1);
1792 }
1793 return error;
1794 }
1795
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic is administratively denied for this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set. Otherwise, if connected,
	 * try to disconnect first. This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			/* Use the delegated app's UUID when the socket is delegated. */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connectx() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/* Record the caller's request for idempotent (TFO-eligible) data. */
		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			/* EJUSTRETURN: a filter took over the connect. */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps the preconnect-data option armed. */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1918
1919 int
sodisconnectlocked(struct socket * so)1920 sodisconnectlocked(struct socket *so)
1921 {
1922 int error;
1923
1924 if ((so->so_state & SS_ISCONNECTED) == 0) {
1925 error = ENOTCONN;
1926 goto bad;
1927 }
1928 if (so->so_state & SS_ISDISCONNECTING) {
1929 error = EALREADY;
1930 goto bad;
1931 }
1932
1933 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1934 if (error == 0) {
1935 sflt_notify(so, sock_evt_disconnected, NULL);
1936 }
1937
1938 bad:
1939 return error;
1940 }
1941
/* Locking version of sodisconnectlocked(): takes and drops the socket lock. */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return error;
}
1953
1954 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1955 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1956 {
1957 int error;
1958
1959 /*
1960 * Call the protocol disconnectx handler; let it handle all
1961 * matters related to the connection state of this session.
1962 */
1963 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1964 if (error == 0) {
1965 /*
1966 * The event applies only for the session, not for
1967 * the disconnection of individual subflows.
1968 */
1969 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1970 sflt_notify(so, sock_evt_disconnected, NULL);
1971 }
1972 }
1973 return error;
1974 }
1975
int
sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	/* Locking wrapper around sodisconnectxlocked(). */
	int error;

	socket_lock(so, 1);
	error = sodisconnectxlocked(so, aid, cid);
	socket_unlock(so, 1);
	return error;
}
1986
/* Sleep-for-lock flag for sblock(): don't wait when MSG_DONTWAIT was passed. */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1988
1989 /*
1990 * sosendcheck will lock the socket buffer if it isn't locked and
1991 * verify that there is space for the data being inserted.
1992 *
1993 * Returns: 0 Success
1994 * EPIPE
1995 * sblock:EWOULDBLOCK
1996 * sblock:EINTR
1997 * sbwait:EBADF
1998 * sbwait:EINTR
1999 * [so_error]:???
2000 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;     /* set when we inherit the sb lock from a filter */

restart:
	/* Acquire the send-buffer lock unless the caller already holds it */
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	/* Pending asynchronous error: report it once and clear it */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Connection-oriented protocol that is not connected:
			 * allow only control-only sends, or data that rides on
			 * a pending connect (TFO-style SOF1_PRECONNECT_DATA).
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless protocol needs a destination address */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* OOB sends are allowed to slightly overcommit the buffer */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	/* An atomic send larger than the high-water mark can never succeed */
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		/* Non-blocking socket/call, or recursing from a filter: bail */
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		/* Release the sb lock and sleep until space frees up */
		sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		/* Re-validate everything from scratch after the sleep */
		goto restart;
	}
	return 0;
}
2124
2125 /*
2126 * Send on a socket.
2127 * If send must go all at once and message is larger than
2128 * send buffering, then hard error.
2129 * Lock against other senders.
2130 * If must go all at once and not enough room now, then
2131 * inform user that this would block and do nothing.
2132 * Otherwise, if nonblocking, send as much as possible.
2133 * The data to be sent is described by "uio" if nonzero,
2134 * otherwise by the mbuf chain "top" (which must be null
2135 * if uio is not). Data provided in mbuf chain must be small
2136 * enough to send all at once.
2137 *
2138 * Returns nonzero on error, timeout or signal; callers
2139 * must check for short counts if EINTR/ERESTART are returned.
2140 * Data and control buffers are freed on return.
2141 *
2142 * Returns: 0 Success
2143 * EOPNOTSUPP
2144 * EINVAL
2145 * ENOBUFS
2146 * uiomove:EFAULT
2147 * sosendcheck:EPIPE
2148 * sosendcheck:EWOULDBLOCK
2149 * sosendcheck:EINTR
2150 * sosendcheck:EBADF
2151 * sosendcheck:EINTR
2152 * sosendcheck:??? [value from so_error]
2153 * <pru_send>:ECONNRESET[TCP]
2154 * <pru_send>:EINVAL[TCP]
2155 * <pru_send>:ENOBUFS[TCP]
2156 * <pru_send>:EADDRINUSE[TCP]
2157 * <pru_send>:EADDRNOTAVAIL[TCP]
2158 * <pru_send>:EAFNOSUPPORT[TCP]
2159 * <pru_send>:EACCES[TCP]
2160 * <pru_send>:EAGAIN[TCP]
2161 * <pru_send>:EPERM[TCP]
2162 * <pru_send>:EMSGSIZE[TCP]
2163 * <pru_send>:EHOSTUNREACH[TCP]
2164 * <pru_send>:ENETUNREACH[TCP]
2165 * <pru_send>:ENETDOWN[TCP]
2166 * <pru_send>:ENOMEM[TCP]
2167 * <pru_send>:ENOBUFS[TCP]
2168 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2169 * <pru_send>:EINVAL[AF_UNIX]
2170 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2171 * <pru_send>:EPIPE[AF_UNIX]
2172 * <pru_send>:ENOTCONN[AF_UNIX]
2173 * <pru_send>:EISCONN[AF_UNIX]
2174 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2175 * <sf_data_out>:??? [whatever a filter author chooses]
2176 *
2177 * Notes: Other <pru_send> returns depend on the protocol family; all
2178 * <sf_data_out> returns depend on what the filter author causes
2179 * their filter to return.
2180 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/* Bytes remaining come either from the uio or the prepackaged chain */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}
	orig_resid = resid;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful on stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/*
	 * Outer loop: wait for send-buffer space, carve up to "space"
	 * bytes of the uio into an mbuf chain, then hand it to the
	 * protocol; repeat until all of resid has been sent.
	 */
	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				/* Only the first mbuf of a packet carries headroom */
				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				/* Drop the socket lock while copying in user data */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, then fall back to trying
					 * for mbufs if we fail early (or
					 * miscalculate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Pop the next mbuf off the local cache */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf/cluster */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data into the mbuf (socket unlocked) */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					/* EJUSTRETURN: filter swallowed the packet */
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Protocol takes ownership of top and control from here */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/* Chain and control now belong to the protocol/filter */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);


out_locked:
	/* Sanity check: we must never report more residual than we started with */
	if (resid > orig_resid) {
		char pname[MAXCOMLEN] = {};
		pid_t current_pid = proc_pid(current_proc());
		proc_name(current_pid, pname, sizeof(pname));

		if (sosend_assert_panic != 0) {
			panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
			    so, resid, orig_resid, pname, current_pid);
		} else {
			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
			    so->so_gencnt, resid, orig_resid, pname, current_pid);
		}
	}

	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	/* Free anything not consumed by the protocol or the filters */
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2651
/*
 * Re-inject a packet into the protocol's send path.  Unlike sosend(),
 * this hands the chain straight to pru_send with no filter processing.
 * The caller must already hold the socket lock.
 */
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header.
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				/* First packet-header mbuf marks the data */
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		/* Detach the control run from the data that follows it */
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());

	return error;
}
2697
2698 static struct mbuf *
mbuf_detach_control_from_list(struct mbuf ** mp)2699 mbuf_detach_control_from_list(struct mbuf **mp)
2700 {
2701 struct mbuf *control = NULL;
2702 struct mbuf *m = *mp;
2703
2704 if (m->m_type == MT_CONTROL) {
2705 struct mbuf *control_end;
2706 struct mbuf *n;
2707
2708 n = control_end = control = m;
2709
2710 /*
2711 * Break the chain per mbuf type
2712 */
2713 while (n != NULL && n->m_type == MT_CONTROL) {
2714 control_end = n;
2715 n = n->m_next;
2716 }
2717 control_end->m_next = NULL;
2718 *mp = n;
2719 }
2720 VERIFY(*mp != NULL);
2721
2722 return control;
2723 }
2724
2725 /*
2726 * Supported only connected sockets (no address) without ancillary data
2727 * (control mbuf) for atomic protocols
2728 */
2729 int
sosend_list(struct socket * so,struct mbuf * pktlist,size_t total_len,u_int * pktcnt,int flags)2730 sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2731 {
2732 struct mbuf *m;
2733 struct soflow_hash_entry *dgram_flow_entry = NULL;
2734 int error, dontroute;
2735 int atomic = sosendallatonce(so);
2736 int sblocked = 0;
2737 struct proc *p = current_proc();
2738 struct mbuf *top = pktlist;
2739 bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2740
2741 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2742 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2743
2744 if (so->so_type != SOCK_DGRAM) {
2745 error = EINVAL;
2746 os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2747 error);
2748 goto out;
2749 }
2750 if (atomic == 0) {
2751 error = EINVAL;
2752 os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2753 error);
2754 goto out;
2755 }
2756 if ((so->so_state & SS_ISCONNECTED) == 0) {
2757 error = ENOTCONN;
2758 os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2759 error);
2760 goto out;
2761 }
2762 if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2763 error = EINVAL;
2764 os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2765 flags, error);
2766 goto out;
2767 }
2768
2769 socket_lock(so, 1);
2770 so_update_last_owner_locked(so, p);
2771 so_update_policy(so);
2772
2773 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2774 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0);
2775 }
2776
2777 #if NECP
2778 so_update_necp_policy(so, NULL, NULL);
2779 #endif /* NECP */
2780
2781 dontroute = (flags & MSG_DONTROUTE) &&
2782 (so->so_options & SO_DONTROUTE) == 0 &&
2783 (so->so_proto->pr_flags & PR_ATOMIC);
2784 if (dontroute) {
2785 so->so_options |= SO_DONTROUTE;
2786 }
2787
2788 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2789
2790 error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked);
2791 if (error) {
2792 os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2793 error);
2794 goto release;
2795 }
2796
2797 if (!skip_filt) {
2798 struct mbuf **prevnextp = NULL;
2799
2800 for (m = top; m != NULL; m = m->m_nextpkt) {
2801 struct mbuf *control = NULL;
2802 struct mbuf *last_control = NULL;
2803 struct mbuf *nextpkt;
2804
2805 /*
2806 * Remove packet from the list of packets
2807 */
2808 nextpkt = m->m_nextpkt;
2809 if (prevnextp != NULL) {
2810 *prevnextp = nextpkt;
2811 } else {
2812 top = nextpkt;
2813 }
2814 m->m_nextpkt = NULL;
2815
2816 /*
2817 * Break the chain per mbuf type
2818 */
2819 if (m->m_type == MT_CONTROL) {
2820 control = mbuf_detach_control_from_list(&m);
2821 }
2822 /*
2823 * Socket filter processing
2824 */
2825 error = sflt_data_out(so, NULL, &m,
2826 &control, 0);
2827 if (error != 0 && error != EJUSTRETURN) {
2828 os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2829 error);
2830 goto release;
2831 }
2832
2833 #if CONTENT_FILTER
2834 if (error == 0) {
2835 /*
2836 * Content filter processing
2837 */
2838 error = cfil_sock_data_out(so, NULL, m,
2839 control, 0, dgram_flow_entry);
2840 if (error != 0 && error != EJUSTRETURN) {
2841 os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2842 error);
2843 goto release;
2844 }
2845 }
2846 #endif /* CONTENT_FILTER */
2847 if (error == EJUSTRETURN) {
2848 /*
2849 * When swallowed by a filter, the packet is not
2850 * in the list anymore
2851 */
2852 error = 0;
2853 } else {
2854 /*
2855 * Rebuild the mbuf chain of the packet
2856 */
2857 if (control != NULL) {
2858 last_control->m_next = m;
2859 m = control;
2860 }
2861 /*
2862 * Reinsert the packet in the list of packets
2863 */
2864 m->m_nextpkt = nextpkt;
2865 if (prevnextp != NULL) {
2866 *prevnextp = m;
2867 } else {
2868 top = m;
2869 }
2870 prevnextp = &m->m_nextpkt;
2871 }
2872 }
2873 }
2874
2875 if (top != NULL) {
2876 if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2877 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2878 (so, top, pktcnt, flags);
2879 if (error != 0) {
2880 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2881 error);
2882 }
2883 top = NULL;
2884 } else {
2885 *pktcnt = 0;
2886 for (m = top; m != NULL; m = top) {
2887 struct mbuf *control = NULL;
2888
2889 top = m->m_nextpkt;
2890 m->m_nextpkt = NULL;
2891
2892 /*
2893 * Break the chain per mbuf type
2894 */
2895 if (m->m_type == MT_CONTROL) {
2896 control = mbuf_detach_control_from_list(&m);
2897 }
2898
2899 error = (*so->so_proto->pr_usrreqs->pru_send)
2900 (so, 0, m, NULL, control, current_proc());
2901 if (error != 0) {
2902 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2903 error);
2904 goto release;
2905 }
2906 *pktcnt += 1;
2907 }
2908 }
2909 }
2910
2911 release:
2912 if (dontroute) {
2913 so->so_options &= ~SO_DONTROUTE;
2914 }
2915 if (sblocked) {
2916 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2917 } else {
2918 socket_unlock(so, 1);
2919 }
2920 out:
2921 if (top != NULL) {
2922 os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2923 error);
2924 m_freem_list(top);
2925 }
2926
2927 if (dgram_flow_entry != NULL) {
2928 soflow_free_flow(dgram_flow_entry);
2929 }
2930
2931 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2932 so->so_snd.sb_cc, 0, error);
2933
2934 return error;
2935 }
2936
/*
 * Consume (or peek at) the leading MT_SONAME mbuf of the current record,
 * returning the sender's address via *psa (duplicated) or *maddrp (the
 * raw mbuf).  Updates *mp/*nextrecordp to reflect what was consumed.
 * May return ERESTART when packet is dropped by MAC policy check.
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    struct mbuf **maddrp,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list. Upon MAC policy failure, the record
		 * will be freed. Otherwise, we'll add it back to
		 * the head of the list. We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		/* Policy check runs without the socket lock held */
		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available). We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE);    /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	if (psa != NULL) {
		/* Hand the caller a private copy of the sender's address */
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	} else if (maddrp != NULL) {
		/* Caller wants the raw MT_SONAME mbuf itself */
		*maddrp = m;
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf in the socket buffer */
		m = m->m_next;
	} else {
		/* Consuming: unlink the address mbuf from the record */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		if (maddrp == NULL) {
			MFREE(m, so->so_rcv.sb_mb);
		} else {
			/* Ownership of the mbuf moved to *maddrp above */
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
		}
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3066
3067 /*
3068 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3069 * so clear the data portion in order not to leak the file pointers
3070 */
3071 static void
sopeek_scm_rights(struct mbuf * rights)3072 sopeek_scm_rights(struct mbuf *rights)
3073 {
3074 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3075
3076 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3077 VERIFY(cm->cmsg_len <= rights->m_len);
3078 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3079 }
3080 }
3081
3082 /*
3083 * Process one or more MT_CONTROL mbufs present before any data mbufs
3084 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3085 * just copy the data; if !MSG_PEEK, we call into the protocol to
3086 * perform externalization.
3087 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below. Once we re-acquire the
	 * lock, the mbuf chain might change. In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			/* Peeking: copy each control mbuf, leave originals queued */
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* Remember where our copies start for cleanup */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/* Scrub SCM_RIGHTS payload: fds are not created on peek */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Consuming: unlink the control mbuf onto the local chain */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Re-link the remainder of the record into the socket buffer */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	/* Walk the detached control chain one message at a time */
	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success. Otherwise, all other control messages are
		 * returned unmodified to the caller. Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171. This
			 * would also allow more records to be appended
			 * to the socket buffer. We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			/* Hand the message to the caller */
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed: drop */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3222
3223 /*
3224 * If we have less data than requested, block awaiting more
3225 * (subject to any timeout) if:
3226 * 1. the current count is less than the low water mark, or
3227 * 2. MSG_WAITALL is set, and it is possible to do the entire
3228 * receive operation at once if we block (resid <= hiwat).
3229 * 3. MSG_DONTWAIT is not set
3230 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3231 * we have to do the receive in sections, and thus risk returning
3232 * a short count if a timeout or signal occurs after we start.
3233 */
3234 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3235 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3236 {
3237 struct protosw *pr = so->so_proto;
3238
3239 /* No mbufs in the receive-queue? Wait! */
3240 if (m == NULL) {
3241 return true;
3242 }
3243
3244 /* Not enough data in the receive socket-buffer - we may have to wait */
3245 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3246 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3247 /*
3248 * Application did set the lowater-mark, so we should wait for
3249 * this data to be present.
3250 */
3251 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3252 return true;
3253 }
3254
3255 /*
3256 * Application wants all the data - so let's try to do the
3257 * receive-operation at once by waiting for everything to
3258 * be there.
3259 */
3260 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3261 return true;
3262 }
3263 }
3264
3265 return false;
3266 }
3267
3268 /*
3269 * Implement receive operations on a socket.
3270 * We depend on the way that records are added to the sockbuf
3271 * by sbappend*. In particular, each record (mbufs linked through m_next)
3272 * must begin with an address if the protocol so specifies,
3273 * followed by an optional mbuf or mbufs containing ancillary data,
3274 * and then zero or more mbufs of data.
3275 * In order to avoid blocking network interrupts for the entire time here,
3276 * we splx() while doing the actual copy to user space.
3277 * Although the sockbuf is locked, new data may still be appended,
3278 * and thus we must maintain consistency of the sockbuf during that time.
3279 *
3280 * The caller may receive the data as a single mbuf chain by supplying
3281 * an mbuf **mp0 for use in returning the chain. The uio is then used
3282 * only for the count in uio_resid.
3283 *
3284 * Returns: 0 Success
3285 * ENOBUFS
3286 * ENOTCONN
3287 * EWOULDBLOCK
3288 * uiomove:EFAULT
3289 * sblock:EWOULDBLOCK
3290 * sblock:EINTR
3291 * sbwait:EBADF
3292 * sbwait:EINTR
3293 * sodelayed_copy:EFAULT
3294 * <pru_rcvoob>:EINVAL[TCP]
3295 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3296 * <pru_rcvoob>:???
3297 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3298 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3299 * <pr_domain->dom_externalize>:???
3300 *
3301 * Notes: Additional return values from calls through <pru_rcvoob> and
3302 * <pr_domain->dom_externalize> depend on protocols other than
3303 * TCP or AF_UNIX, which are documented above.
3304 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp, *ml = NULL;
	struct mbuf *nextrecord, *free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * NOTE(review): "0x%llu" pairs a hex prefix with a decimal
		 * conversion; "0x%llx" was presumably intended — confirm
		 * before changing the log format.
		 */
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinitely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/* Drop the socket lock while faulting user memory in uiomove(). */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns. Therefore, we only sleep in
		 * sbwait() below if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above. In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/*
	 * Delay copying out to user space (batching consumed mbufs onto
	 * free_list for sodelayed_copy()) only when we are consuming the
	 * data (not MSG_PEEK) and the request is large enough to amortize
	 * dropping and retaking the socket lock.
	 */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
		    m->m_type != MT_HEADER) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints; moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error. Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns. Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			/*
			 * sbwait() failure (signal/timeout) returns the short
			 * count accumulated so far, without an error — see the
			 * MSG_WAITALL comment above.
			 */
			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP(). Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/*
	 * Nothing was consumed and more may still arrive: release only the
	 * sockbuf lock and retry from the top.
	 */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
4059
4060 /*
4061 * Returns: 0 Success
4062 * uiomove:EFAULT
4063 */
4064 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4065 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4066 user_ssize_t *resid)
4067 {
4068 int error = 0;
4069 struct mbuf *m;
4070
4071 m = *free_list;
4072
4073 socket_unlock(so, 0);
4074
4075 while (m != NULL && error == 0) {
4076 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4077 m = m->m_next;
4078 }
4079 m_freem_list(*free_list);
4080
4081 *free_list = NULL;
4082 *resid = 0;
4083
4084 socket_lock(so, 0);
4085
4086 return error;
4087 }
4088
4089 int
soreceive_m_list(struct socket * so,u_int * pktcntp,struct mbuf ** maddrp,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)4090 soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
4091 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
4092 {
4093 struct mbuf *m, **mp;
4094 struct mbuf *nextrecord;
4095 int flags, error;
4096 struct protosw *pr = so->so_proto;
4097 struct proc *p = current_proc();
4098 u_int npkts = 0;
4099 struct mbuf *free_list = NULL;
4100 int sblocked = 0;
4101
4102 /*
4103 * Sanity check on the parameters passed by caller
4104 */
4105 if (mp0 == NULL || pktcntp == NULL) {
4106 return EINVAL;
4107 }
4108 if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
4109 return EINVAL;
4110 }
4111
4112 mp = mp0;
4113 *mp0 = NULL;
4114 if (controlp != NULL) {
4115 *controlp = NULL;
4116 }
4117 if (maddrp != NULL) {
4118 *maddrp = NULL;
4119 }
4120 if (flagsp != NULL) {
4121 flags = *flagsp;
4122 } else {
4123 flags = 0;
4124 }
4125
4126 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
4127 *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
4128 so->so_rcv.sb_hiwat);
4129
4130 socket_lock(so, 1);
4131 so_update_last_owner_locked(so, p);
4132 so_update_policy(so);
4133
4134 #if NECP
4135 so_update_necp_policy(so, NULL, NULL);
4136 #endif /* NECP */
4137
4138 /*
4139 * If a recv attempt is made on a previously-accepted socket
4140 * that has been marked as inactive (disconnected), reject
4141 * the request.
4142 */
4143 if (so->so_flags & SOF_DEFUNCT) {
4144 struct sockbuf *sb = &so->so_rcv;
4145
4146 error = ENOTCONN;
4147 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
4148 __func__, proc_pid(p), proc_best_name(p),
4149 so->so_gencnt,
4150 SOCK_DOM(so), SOCK_TYPE(so), error);
4151 /*
4152 * This socket should have been disconnected and flushed
4153 * prior to being returned from sodefunct(); there should
4154 * be no data on its receive list, so panic otherwise.
4155 */
4156 if (so->so_state & SS_DEFUNCT) {
4157 sb_empty_assert(sb, __func__);
4158 }
4159 goto release;
4160 }
4161
4162 *mp = NULL;
4163
4164 restart:
4165 /*
4166 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4167 * and if so just return to the caller. This could happen when
4168 * soreceive() is called by a socket upcall function during the
4169 * time the socket is freed. The socket buffer would have been
4170 * locked across the upcall, therefore we cannot put this thread
4171 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4172 * we may livelock), because the lock on the socket buffer will
4173 * only be released when the upcall routine returns to its caller.
4174 * Because the socket has been officially closed, there can be
4175 * no further read on it.
4176 */
4177 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4178 (SS_NOFDREF | SS_CANTRCVMORE)) {
4179 error = 0;
4180 goto out;
4181 }
4182
4183 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4184 if (error) {
4185 goto out;
4186 }
4187 sblocked = 1;
4188
4189 m = so->so_rcv.sb_mb;
4190 /*
4191 * Block awaiting more datagram if needed
4192 */
4193 if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
4194 so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
4195 /*
4196 * Panic if we notice inconsistencies in the socket's
4197 * receive list; both sb_mb and sb_cc should correctly
4198 * reflect the contents of the list, otherwise we may
4199 * end up with false positives during select() or poll()
4200 * which could put the application in a bad state.
4201 */
4202 SB_MB_CHECK(&so->so_rcv);
4203
4204 if (so->so_error) {
4205 if (m != NULL) {
4206 goto dontblock;
4207 }
4208 error = so->so_error;
4209 if ((flags & MSG_PEEK) == 0) {
4210 so->so_error = 0;
4211 }
4212 goto release;
4213 }
4214 if (so->so_state & SS_CANTRCVMORE) {
4215 if (m != NULL) {
4216 goto dontblock;
4217 } else {
4218 goto release;
4219 }
4220 }
4221 for (; m != NULL; m = m->m_next) {
4222 if (m->m_flags & M_EOR) {
4223 m = so->so_rcv.sb_mb;
4224 goto dontblock;
4225 }
4226 }
4227 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4228 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4229 error = ENOTCONN;
4230 goto release;
4231 }
4232 if ((so->so_state & SS_NBIO) ||
4233 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4234 error = EWOULDBLOCK;
4235 goto release;
4236 }
4237 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4238 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4239
4240 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4241 sblocked = 0;
4242
4243 error = sbwait(&so->so_rcv);
4244 if (error != 0) {
4245 goto release;
4246 }
4247 goto restart;
4248 }
4249 dontblock:
4250 m = so->so_rcv.sb_mb;
4251 if (m == NULL) {
4252 goto release;
4253 }
4254
4255 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4256 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4257 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4258 nextrecord = m->m_nextpkt;
4259
4260 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4261 struct mbuf *maddr = NULL;
4262
4263 error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
4264 &nextrecord, 1);
4265 if (error == ERESTART) {
4266 goto restart;
4267 } else if (error != 0) {
4268 goto release;
4269 }
4270
4271 if (maddr != NULL) {
4272 maddr->m_nextpkt = NULL;
4273 maddr->m_next = NULL;
4274 if (maddrp != NULL) {
4275 *maddrp = maddr;
4276 maddrp = &maddr->m_nextpkt;
4277 } else {
4278 maddr->m_next = free_list;
4279 free_list = maddr;
4280 }
4281 }
4282 }
4283
4284 /*
4285 * Process one or more MT_CONTROL mbufs present before any data mbufs
4286 * in the first mbuf chain on the socket buffer.
4287 * We call into the protocol to perform externalization.
4288 */
4289 if (m != NULL && m->m_type == MT_CONTROL) {
4290 struct mbuf *control = NULL;
4291
4292 error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
4293 if (error != 0) {
4294 goto release;
4295 }
4296 if (control != NULL) {
4297 control->m_nextpkt = NULL;
4298 control->m_next = NULL;
4299 if (controlp != NULL) {
4300 *controlp = control;
4301 controlp = &control->m_nextpkt;
4302 } else {
4303 control->m_next = free_list;
4304 free_list = control;
4305 }
4306 }
4307 }
4308
4309 /*
4310 * Link the packet to the list
4311 */
4312 if (m != NULL) {
4313 if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
4314 m->m_type != MT_HEADER) {
4315 panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
4316 }
4317 m->m_nextpkt = NULL;
4318 *mp = m;
4319 mp = &m->m_nextpkt;
4320 }
4321 while (m != NULL) {
4322 sbfree(&so->so_rcv, m);
4323
4324 m = m->m_next;
4325 }
4326
4327 so->so_rcv.sb_mb = nextrecord;
4328 /*
4329 * First part is an inline SB_EMPTY_FIXUP(). Second
4330 * part makes sure sb_lastrecord is up-to-date if
4331 * there is still data in the socket buffer.
4332 */
4333 if (so->so_rcv.sb_mb == NULL) {
4334 so->so_rcv.sb_mbtail = NULL;
4335 so->so_rcv.sb_lastrecord = NULL;
4336 } else if (nextrecord->m_nextpkt == NULL) {
4337 so->so_rcv.sb_lastrecord = nextrecord;
4338 }
4339 SB_MB_CHECK(&so->so_rcv);
4340
4341 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4342 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4343
4344 npkts += 1;
4345
4346 /*
	 * We continue as long as we have fewer packets than requested
	 * and the socket buffer is not empty
4349 */
4350 if (npkts < *pktcntp) {
4351 if (so->so_rcv.sb_mb != NULL) {
4352 goto dontblock;
4353 }
4354 if ((flags & MSG_WAITALL) != 0) {
4355 goto restart;
4356 }
4357 }
4358
4359 if (flagsp != NULL) {
4360 *flagsp |= flags;
4361 }
4362
4363 release:
4364 /*
4365 * pru_rcvd may cause more data to be received if the socket lock
4366 * is dropped so we set MSG_HAVEMORE now based on what we know.
4367 * That way the caller won't be surprised if it receives less data
4368 * than requested.
4369 */
4370 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4371 flags |= MSG_HAVEMORE;
4372 }
4373
4374 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
4375 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4376 }
4377
4378 if (sblocked) {
4379 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4380 } else {
4381 socket_unlock(so, 1);
4382 }
4383
4384 out:
4385 *pktcntp = npkts;
4386 /*
4387 * Amortize the cost of freeing the mbufs
4388 */
4389 if (free_list != NULL) {
4390 m_freem_list(free_list);
4391 }
4392
4393 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4394 0, 0, 0, 0);
4395 return error;
4396 }
4397
4398 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4399 so_statistics_event_to_nstat_event(int64_t *input_options,
4400 uint64_t *nstat_event)
4401 {
4402 int error = 0;
4403 switch (*input_options) {
4404 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4405 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4406 break;
4407 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4408 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4409 break;
4410 #if (DEBUG || DEVELOPMENT)
4411 case SO_STATISTICS_EVENT_RESERVED_1:
4412 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4413 break;
4414 case SO_STATISTICS_EVENT_RESERVED_2:
4415 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4416 break;
4417 #endif /* (DEBUG || DEVELOPMENT) */
4418 default:
4419 error = EINVAL;
4420 break;
4421 }
4422 return error;
4423 }
4424
4425 /*
4426 * Returns: 0 Success
4427 * EINVAL
4428 * ENOTCONN
4429 * <pru_shutdown>:EINVAL
4430 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4431 * <pru_shutdown>:ENOBUFS[TCP]
4432 * <pru_shutdown>:EMSGSIZE[TCP]
4433 * <pru_shutdown>:EHOSTUNREACH[TCP]
4434 * <pru_shutdown>:ENETUNREACH[TCP]
4435 * <pru_shutdown>:ENETDOWN[TCP]
4436 * <pru_shutdown>:ENOMEM[TCP]
4437 * <pru_shutdown>:EACCES[TCP]
4438 * <pru_shutdown>:EMSGSIZE[TCP]
4439 * <pru_shutdown>:ENOBUFS[TCP]
4440 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4441 * <pru_shutdown>:??? [other protocol families]
4442 */
4443 int
soshutdown(struct socket * so,int how)4444 soshutdown(struct socket *so, int how)
4445 {
4446 int error;
4447
4448 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4449
4450 switch (how) {
4451 case SHUT_RD:
4452 case SHUT_WR:
4453 case SHUT_RDWR:
4454 socket_lock(so, 1);
4455 if ((so->so_state &
4456 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4457 error = ENOTCONN;
4458 } else {
4459 error = soshutdownlock(so, how);
4460 }
4461 socket_unlock(so, 1);
4462 break;
4463 default:
4464 error = EINVAL;
4465 break;
4466 }
4467
4468 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4469
4470 return error;
4471 }
4472
4473 int
soshutdownlock_final(struct socket * so,int how)4474 soshutdownlock_final(struct socket *so, int how)
4475 {
4476 struct protosw *pr = so->so_proto;
4477 int error = 0;
4478
4479 sflt_notify(so, sock_evt_shutdown, &how);
4480
4481 if (how != SHUT_WR) {
4482 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4483 /* read already shut down */
4484 error = ENOTCONN;
4485 goto done;
4486 }
4487 sorflush(so);
4488 }
4489 if (how != SHUT_RD) {
4490 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4491 /* write already shut down */
4492 error = ENOTCONN;
4493 goto done;
4494 }
4495 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4496 }
4497 done:
4498 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4499 return error;
4500 }
4501
/*
 * Shut down a socket that is already locked by the caller.
 * Consults the content filter first (if attached), which may defer
 * the shutdown until pending data has been processed.
 */
int
soshutdownlock(struct socket *so, int how)
{
	int err = 0;

#if CONTENT_FILTER
	/*
	 * An attached content filter may delay the actual shutdown until
	 * it has drained its pending data; EJUSTRETURN means the filter
	 * has taken ownership of completing the shutdown later.
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		err = cfil_sock_shutdown(so, &how);
		if (err == EJUSTRETURN) {
			err = 0;
			goto done;
		}
		if (err != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	err = soshutdownlock_final(so, how);

done:
	return err;
}
4528
4529 void
sowflush(struct socket * so)4530 sowflush(struct socket *so)
4531 {
4532 struct sockbuf *sb = &so->so_snd;
4533
4534 /*
4535 * Obtain lock on the socket buffer (SB_LOCK). This is required
4536 * to prevent the socket buffer from being unexpectedly altered
4537 * while it is used by another thread in socket send/receive.
4538 *
4539 * sblock() must not fail here, hence the assertion.
4540 */
4541 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4542 VERIFY(sb->sb_flags & SB_LOCK);
4543
4544 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4545 sb->sb_flags |= SB_DROP;
4546 sb->sb_upcall = NULL;
4547 sb->sb_upcallarg = NULL;
4548
4549 sbunlock(sb, TRUE); /* keep socket locked */
4550
4551 selthreadclear(&sb->sb_sel);
4552 sbrelease(sb);
4553 }
4554
/*
 * Flush the receive buffer of a socket: mark it unable to receive
 * more data, dispose of queued mbufs (letting the protocol reclaim
 * any in-flight access rights for PR_RIGHTS domains), and reset the
 * buffer's accounting.  Called with the socket locked.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;	/* local snapshot used for the final release */
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/* Let attached socket filters observe the flush of the read side. */
	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function. In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented. Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let the protocol reclaim access rights still queued in the data. */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	/* Free the snapshotted mbuf chain and its accounting. */
	sbrelease(&asb);
}
4655
4656 /*
4657 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4658 * an additional variant to handle the case where the option value needs
4659 * to be some kind of integer, but not a specific size.
4660 * In addition to their use here, these functions are also called by the
4661 * protocol-level pr_ctloutput() routines.
4662 *
4663 * Returns: 0 Success
4664 * EINVAL
4665 * copyin:EFAULT
4666 */
4667 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4668 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4669 {
4670 size_t valsize;
4671
4672 /*
4673 * If the user gives us more than we wanted, we ignore it,
4674 * but if we don't get the minimum length the caller
4675 * wants, we return EINVAL. On success, sopt->sopt_valsize
4676 * is set to however much we actually retrieved.
4677 */
4678 if ((valsize = sopt->sopt_valsize) < minlen) {
4679 return EINVAL;
4680 }
4681 if (valsize > len) {
4682 sopt->sopt_valsize = valsize = len;
4683 }
4684
4685 if (sopt->sopt_p != kernproc) {
4686 return copyin(sopt->sopt_val, buf, valsize);
4687 }
4688
4689 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4690 return 0;
4691 }
4692
4693 /*
4694 * sooptcopyin_timeval
 * Copy in a timeval value into tv_p, and take into account whether
 * the calling process is 64-bit or 32-bit.  Moved the sanity checking
4697 * code here so that we can verify the 64-bit tv_sec value before we lose
4698 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4699 */
4700 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4701 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4702 {
4703 int error;
4704
4705 if (proc_is64bit(sopt->sopt_p)) {
4706 struct user64_timeval tv64;
4707
4708 if (sopt->sopt_valsize < sizeof(tv64)) {
4709 return EINVAL;
4710 }
4711
4712 sopt->sopt_valsize = sizeof(tv64);
4713 if (sopt->sopt_p != kernproc) {
4714 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4715 if (error != 0) {
4716 return error;
4717 }
4718 } else {
4719 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4720 sizeof(tv64));
4721 }
4722 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4723 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4724 return EDOM;
4725 }
4726
4727 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4728 tv_p->tv_usec = tv64.tv_usec;
4729 } else {
4730 struct user32_timeval tv32;
4731
4732 if (sopt->sopt_valsize < sizeof(tv32)) {
4733 return EINVAL;
4734 }
4735
4736 sopt->sopt_valsize = sizeof(tv32);
4737 if (sopt->sopt_p != kernproc) {
4738 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4739 if (error != 0) {
4740 return error;
4741 }
4742 } else {
4743 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4744 sizeof(tv32));
4745 }
4746 #ifndef __LP64__
4747 /*
4748 * K64todo "comparison is always false due to
4749 * limited range of data type"
4750 */
4751 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4752 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4753 return EDOM;
4754 }
4755 #endif
4756 tv_p->tv_sec = tv32.tv_sec;
4757 tv_p->tv_usec = tv32.tv_usec;
4758 }
4759 return 0;
4760 }
4761
4762 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4763 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4764 boolean_t ignore_delegate)
4765 {
4766 kauth_cred_t cred = NULL;
4767 proc_t ep = PROC_NULL;
4768 uid_t uid;
4769 int error = 0;
4770
4771 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4772 ep = proc_find(so->e_pid);
4773 if (ep) {
4774 cred = kauth_cred_proc_ref(ep);
4775 }
4776 }
4777
4778 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4779
4780 /* uid is 0 for root */
4781 if (uid != 0 || !allow_root) {
4782 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4783 }
4784 if (cred) {
4785 kauth_cred_unref(&cred);
4786 }
4787 if (ep != PROC_NULL) {
4788 proc_rele(ep);
4789 }
4790
4791 return error;
4792 }
4793
4794 /*
4795 * Returns: 0 Success
4796 * EINVAL
4797 * ENOPROTOOPT
4798 * ENOBUFS
4799 * EDOM
4800 * sooptcopyin:EINVAL
4801 * sooptcopyin:EFAULT
4802 * sooptcopyin_timeval:EINVAL
4803 * sooptcopyin_timeval:EFAULT
4804 * sooptcopyin_timeval:EDOM
4805 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *		<pr_ctloutput>:???
4807 * sflt_attach_private:??? [whatever a filter author chooses]
4808 * <sf_setoption>:??? [whatever a filter author chooses]
4809 *
4810 * Notes: Other <pru_listen> returns depend on the protocol family; all
4811 * <sf_listen> returns depend on what the filter author causes
4812 * their filter to return.
4813 */
4814 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)4815 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4816 {
4817 int error, optval;
4818 int64_t long_optval;
4819 struct linger l;
4820 struct timeval tv;
4821
4822 if (sopt->sopt_dir != SOPT_SET) {
4823 sopt->sopt_dir = SOPT_SET;
4824 }
4825
4826 if (dolock) {
4827 socket_lock(so, 1);
4828 }
4829
4830 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4831 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4832 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4833 /* the socket has been shutdown, no more sockopt's */
4834 error = EINVAL;
4835 goto out;
4836 }
4837
4838 error = sflt_setsockopt(so, sopt);
4839 if (error != 0) {
4840 if (error == EJUSTRETURN) {
4841 error = 0;
4842 }
4843 goto out;
4844 }
4845
4846 if (sopt->sopt_level != SOL_SOCKET) {
4847 if (so->so_proto != NULL &&
4848 so->so_proto->pr_ctloutput != NULL) {
4849 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4850 goto out;
4851 }
4852 error = ENOPROTOOPT;
4853 } else {
4854 /*
4855 * Allow socket-level (SOL_SOCKET) options to be filtered by
4856 * the protocol layer, if needed. A zero value returned from
4857 * the handler means use default socket-level processing as
4858 * done by the rest of this routine. Otherwise, any other
4859 * return value indicates that the option is unsupported.
4860 */
4861 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4862 pru_socheckopt(so, sopt)) != 0) {
4863 goto out;
4864 }
4865
4866 error = 0;
4867 switch (sopt->sopt_name) {
4868 case SO_LINGER:
4869 case SO_LINGER_SEC: {
4870 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4871 if (error != 0) {
4872 goto out;
4873 }
4874 /* Make sure to use sane values */
4875 if (sopt->sopt_name == SO_LINGER) {
4876 so->so_linger = (short)l.l_linger;
4877 } else {
4878 so->so_linger = (short)((long)l.l_linger * hz);
4879 }
4880 if (l.l_onoff != 0) {
4881 so->so_options |= SO_LINGER;
4882 } else {
4883 so->so_options &= ~SO_LINGER;
4884 }
4885 break;
4886 }
4887 case SO_DEBUG:
4888 case SO_KEEPALIVE:
4889 case SO_DONTROUTE:
4890 case SO_USELOOPBACK:
4891 case SO_BROADCAST:
4892 case SO_REUSEADDR:
4893 case SO_REUSEPORT:
4894 case SO_OOBINLINE:
4895 case SO_TIMESTAMP:
4896 case SO_TIMESTAMP_MONOTONIC:
4897 case SO_TIMESTAMP_CONTINUOUS:
4898 case SO_DONTTRUNC:
4899 case SO_WANTMORE:
4900 case SO_WANTOOBFLAG:
4901 case SO_NOWAKEFROMSLEEP:
4902 case SO_NOAPNFALLBK:
4903 error = sooptcopyin(sopt, &optval, sizeof(optval),
4904 sizeof(optval));
4905 if (error != 0) {
4906 goto out;
4907 }
4908 if (optval) {
4909 so->so_options |= sopt->sopt_name;
4910 } else {
4911 so->so_options &= ~sopt->sopt_name;
4912 }
4913 #if SKYWALK
4914 inp_update_netns_flags(so);
4915 #endif /* SKYWALK */
4916 break;
4917
4918 case SO_SNDBUF:
4919 case SO_RCVBUF:
4920 case SO_SNDLOWAT:
4921 case SO_RCVLOWAT:
4922 error = sooptcopyin(sopt, &optval, sizeof(optval),
4923 sizeof(optval));
4924 if (error != 0) {
4925 goto out;
4926 }
4927
4928 /*
4929 * Values < 1 make no sense for any of these
4930 * options, so disallow them.
4931 */
4932 if (optval < 1) {
4933 error = EINVAL;
4934 goto out;
4935 }
4936
4937 switch (sopt->sopt_name) {
4938 case SO_SNDBUF:
4939 case SO_RCVBUF: {
4940 struct sockbuf *sb =
4941 (sopt->sopt_name == SO_SNDBUF) ?
4942 &so->so_snd : &so->so_rcv;
4943 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4944 error = ENOBUFS;
4945 goto out;
4946 }
4947 sb->sb_flags |= SB_USRSIZE;
4948 sb->sb_flags &= ~SB_AUTOSIZE;
4949 sb->sb_idealsize = (u_int32_t)optval;
4950 break;
4951 }
4952 /*
4953 * Make sure the low-water is never greater than
4954 * the high-water.
4955 */
4956 case SO_SNDLOWAT: {
4957 int space = sbspace(&so->so_snd);
4958 uint32_t hiwat = so->so_snd.sb_hiwat;
4959
4960 if (so->so_snd.sb_flags & SB_UNIX) {
4961 struct unpcb *unp =
4962 (struct unpcb *)(so->so_pcb);
4963 if (unp != NULL &&
4964 unp->unp_conn != NULL) {
4965 struct socket *so2 = unp->unp_conn->unp_socket;
4966 hiwat += unp->unp_conn->unp_cc;
4967 space = sbspace(&so2->so_rcv);
4968 }
4969 }
4970
4971 so->so_snd.sb_lowat =
4972 (optval > hiwat) ?
4973 hiwat : optval;
4974
4975 if (space >= so->so_snd.sb_lowat) {
4976 sowwakeup(so);
4977 }
4978 break;
4979 }
4980 case SO_RCVLOWAT: {
4981 int64_t data_len;
4982 so->so_rcv.sb_lowat =
4983 (optval > so->so_rcv.sb_hiwat) ?
4984 so->so_rcv.sb_hiwat : optval;
4985 if (so->so_rcv.sb_flags & SB_UNIX) {
4986 struct unpcb *unp =
4987 (struct unpcb *)(so->so_pcb);
4988 if (unp != NULL &&
4989 unp->unp_conn != NULL) {
4990 struct socket *so2 = unp->unp_conn->unp_socket;
4991 data_len = so2->so_snd.sb_cc
4992 - so2->so_snd.sb_ctl;
4993 } else {
4994 data_len = so->so_rcv.sb_cc
4995 - so->so_rcv.sb_ctl;
4996 }
4997 } else {
4998 data_len = so->so_rcv.sb_cc
4999 - so->so_rcv.sb_ctl;
5000 }
5001
5002 if (data_len >= so->so_rcv.sb_lowat) {
5003 sorwakeup(so);
5004 }
5005 break;
5006 }
5007 }
5008 break;
5009
5010 case SO_SNDTIMEO:
5011 case SO_RCVTIMEO:
5012 error = sooptcopyin_timeval(sopt, &tv);
5013 if (error != 0) {
5014 goto out;
5015 }
5016
5017 switch (sopt->sopt_name) {
5018 case SO_SNDTIMEO:
5019 so->so_snd.sb_timeo = tv;
5020 break;
5021 case SO_RCVTIMEO:
5022 so->so_rcv.sb_timeo = tv;
5023 break;
5024 }
5025 break;
5026
5027 case SO_NKE: {
5028 struct so_nke nke;
5029
5030 error = sooptcopyin(sopt, &nke, sizeof(nke),
5031 sizeof(nke));
5032 if (error != 0) {
5033 goto out;
5034 }
5035
5036 error = sflt_attach_internal(so, nke.nke_handle);
5037 break;
5038 }
5039
5040 case SO_NOSIGPIPE:
5041 error = sooptcopyin(sopt, &optval, sizeof(optval),
5042 sizeof(optval));
5043 if (error != 0) {
5044 goto out;
5045 }
5046 if (optval != 0) {
5047 so->so_flags |= SOF_NOSIGPIPE;
5048 } else {
5049 so->so_flags &= ~SOF_NOSIGPIPE;
5050 }
5051 break;
5052
5053 case SO_NOADDRERR:
5054 error = sooptcopyin(sopt, &optval, sizeof(optval),
5055 sizeof(optval));
5056 if (error != 0) {
5057 goto out;
5058 }
5059 if (optval != 0) {
5060 so->so_flags |= SOF_NOADDRAVAIL;
5061 } else {
5062 so->so_flags &= ~SOF_NOADDRAVAIL;
5063 }
5064 break;
5065
5066 case SO_REUSESHAREUID:
5067 error = sooptcopyin(sopt, &optval, sizeof(optval),
5068 sizeof(optval));
5069 if (error != 0) {
5070 goto out;
5071 }
5072 if (optval != 0) {
5073 so->so_flags |= SOF_REUSESHAREUID;
5074 } else {
5075 so->so_flags &= ~SOF_REUSESHAREUID;
5076 }
5077 break;
5078
5079 case SO_NOTIFYCONFLICT:
5080 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5081 error = EPERM;
5082 goto out;
5083 }
5084 error = sooptcopyin(sopt, &optval, sizeof(optval),
5085 sizeof(optval));
5086 if (error != 0) {
5087 goto out;
5088 }
5089 if (optval != 0) {
5090 so->so_flags |= SOF_NOTIFYCONFLICT;
5091 } else {
5092 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5093 }
5094 break;
5095
5096 case SO_RESTRICTIONS:
5097 error = sooptcopyin(sopt, &optval, sizeof(optval),
5098 sizeof(optval));
5099 if (error != 0) {
5100 goto out;
5101 }
5102
5103 error = so_set_restrictions(so, optval);
5104 break;
5105
5106 case SO_AWDL_UNRESTRICTED:
5107 if (SOCK_DOM(so) != PF_INET &&
5108 SOCK_DOM(so) != PF_INET6) {
5109 error = EOPNOTSUPP;
5110 goto out;
5111 }
5112 error = sooptcopyin(sopt, &optval, sizeof(optval),
5113 sizeof(optval));
5114 if (error != 0) {
5115 goto out;
5116 }
5117 if (optval != 0) {
5118 error = soopt_cred_check(so,
5119 PRIV_NET_RESTRICTED_AWDL, false, false);
5120 if (error == 0) {
5121 inp_set_awdl_unrestricted(
5122 sotoinpcb(so));
5123 }
5124 } else {
5125 inp_clear_awdl_unrestricted(sotoinpcb(so));
5126 }
5127 break;
5128 case SO_INTCOPROC_ALLOW:
5129 if (SOCK_DOM(so) != PF_INET6) {
5130 error = EOPNOTSUPP;
5131 goto out;
5132 }
5133 error = sooptcopyin(sopt, &optval, sizeof(optval),
5134 sizeof(optval));
5135 if (error != 0) {
5136 goto out;
5137 }
5138 if (optval != 0 &&
5139 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5140 error = soopt_cred_check(so,
5141 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5142 if (error == 0) {
5143 inp_set_intcoproc_allowed(
5144 sotoinpcb(so));
5145 }
5146 } else if (optval == 0) {
5147 inp_clear_intcoproc_allowed(sotoinpcb(so));
5148 }
5149 break;
5150
5151 case SO_LABEL:
5152 error = EOPNOTSUPP;
5153 break;
5154
5155 case SO_UPCALLCLOSEWAIT:
5156 error = sooptcopyin(sopt, &optval, sizeof(optval),
5157 sizeof(optval));
5158 if (error != 0) {
5159 goto out;
5160 }
5161 if (optval != 0) {
5162 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5163 } else {
5164 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5165 }
5166 break;
5167
5168 case SO_RANDOMPORT:
5169 error = sooptcopyin(sopt, &optval, sizeof(optval),
5170 sizeof(optval));
5171 if (error != 0) {
5172 goto out;
5173 }
5174 if (optval != 0) {
5175 so->so_flags |= SOF_BINDRANDOMPORT;
5176 } else {
5177 so->so_flags &= ~SOF_BINDRANDOMPORT;
5178 }
5179 break;
5180
5181 case SO_NP_EXTENSIONS: {
5182 struct so_np_extensions sonpx;
5183
5184 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5185 sizeof(sonpx));
5186 if (error != 0) {
5187 goto out;
5188 }
5189 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5190 error = EINVAL;
5191 goto out;
5192 }
5193 /*
5194 * Only one bit defined for now
5195 */
5196 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5197 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5198 so->so_flags |= SOF_NPX_SETOPTSHUT;
5199 } else {
5200 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5201 }
5202 }
5203 break;
5204 }
5205
5206 case SO_TRAFFIC_CLASS: {
5207 error = sooptcopyin(sopt, &optval, sizeof(optval),
5208 sizeof(optval));
5209 if (error != 0) {
5210 goto out;
5211 }
5212 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5213 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5214 error = so_set_net_service_type(so, netsvc);
5215 goto out;
5216 }
5217 error = so_set_traffic_class(so, optval);
5218 if (error != 0) {
5219 goto out;
5220 }
5221 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5222 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5223 break;
5224 }
5225
5226 case SO_RECV_TRAFFIC_CLASS: {
5227 error = sooptcopyin(sopt, &optval, sizeof(optval),
5228 sizeof(optval));
5229 if (error != 0) {
5230 goto out;
5231 }
5232 if (optval == 0) {
5233 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5234 } else {
5235 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5236 }
5237 break;
5238 }
5239
5240 #if (DEVELOPMENT || DEBUG)
5241 case SO_TRAFFIC_CLASS_DBG: {
5242 struct so_tcdbg so_tcdbg;
5243
5244 error = sooptcopyin(sopt, &so_tcdbg,
5245 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5246 if (error != 0) {
5247 goto out;
5248 }
5249 error = so_set_tcdbg(so, &so_tcdbg);
5250 if (error != 0) {
5251 goto out;
5252 }
5253 break;
5254 }
5255 #endif /* (DEVELOPMENT || DEBUG) */
5256
5257 case SO_PRIVILEGED_TRAFFIC_CLASS:
5258 error = priv_check_cred(kauth_cred_get(),
5259 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5260 if (error != 0) {
5261 goto out;
5262 }
5263 error = sooptcopyin(sopt, &optval, sizeof(optval),
5264 sizeof(optval));
5265 if (error != 0) {
5266 goto out;
5267 }
5268 if (optval == 0) {
5269 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5270 } else {
5271 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5272 }
5273 break;
5274
5275 #if (DEVELOPMENT || DEBUG)
5276 case SO_DEFUNCTIT:
5277 error = sosetdefunct(current_proc(), so, 0, FALSE);
5278 if (error == 0) {
5279 error = sodefunct(current_proc(), so, 0);
5280 }
5281
5282 break;
5283 #endif /* (DEVELOPMENT || DEBUG) */
5284
5285 case SO_DEFUNCTOK:
5286 error = sooptcopyin(sopt, &optval, sizeof(optval),
5287 sizeof(optval));
5288 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5289 if (error == 0) {
5290 error = EBADF;
5291 }
5292 goto out;
5293 }
5294 /*
5295 * Any process can set SO_DEFUNCTOK (clear
5296 * SOF_NODEFUNCT), but only root can clear
5297 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5298 */
5299 if (optval == 0 &&
5300 kauth_cred_issuser(kauth_cred_get()) == 0) {
5301 error = EPERM;
5302 goto out;
5303 }
5304 if (optval) {
5305 so->so_flags &= ~SOF_NODEFUNCT;
5306 } else {
5307 so->so_flags |= SOF_NODEFUNCT;
5308 }
5309
5310 if (SOCK_DOM(so) == PF_INET ||
5311 SOCK_DOM(so) == PF_INET6) {
5312 char s[MAX_IPv6_STR_LEN];
5313 char d[MAX_IPv6_STR_LEN];
5314 struct inpcb *inp = sotoinpcb(so);
5315
5316 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5317 "[%s %s:%d -> %s:%d] is now marked "
5318 "as %seligible for "
5319 "defunct\n", __func__, proc_selfpid(),
5320 proc_best_name(current_proc()),
5321 so->so_gencnt,
5322 (SOCK_TYPE(so) == SOCK_STREAM) ?
5323 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5324 ((SOCK_DOM(so) == PF_INET) ?
5325 (void *)&inp->inp_laddr.s_addr :
5326 (void *)&inp->in6p_laddr), s, sizeof(s)),
5327 ntohs(inp->in6p_lport),
5328 inet_ntop(SOCK_DOM(so),
5329 (SOCK_DOM(so) == PF_INET) ?
5330 (void *)&inp->inp_faddr.s_addr :
5331 (void *)&inp->in6p_faddr, d, sizeof(d)),
5332 ntohs(inp->in6p_fport),
5333 (so->so_flags & SOF_NODEFUNCT) ?
5334 "not " : "");
5335 } else {
5336 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5337 "is now marked as %seligible for "
5338 "defunct\n",
5339 __func__, proc_selfpid(),
5340 proc_best_name(current_proc()),
5341 so->so_gencnt,
5342 SOCK_DOM(so), SOCK_TYPE(so),
5343 (so->so_flags & SOF_NODEFUNCT) ?
5344 "not " : "");
5345 }
5346 break;
5347
5348 case SO_ISDEFUNCT:
5349 /* This option is not settable */
5350 error = EINVAL;
5351 break;
5352
5353 case SO_OPPORTUNISTIC:
5354 error = sooptcopyin(sopt, &optval, sizeof(optval),
5355 sizeof(optval));
5356 if (error == 0) {
5357 error = so_set_opportunistic(so, optval);
5358 }
5359 break;
5360
5361 case SO_FLUSH:
5362 /* This option is handled by lower layer(s) */
5363 error = 0;
5364 break;
5365
5366 case SO_RECV_ANYIF:
5367 error = sooptcopyin(sopt, &optval, sizeof(optval),
5368 sizeof(optval));
5369 if (error == 0) {
5370 error = so_set_recv_anyif(so, optval);
5371 }
5372 break;
5373
5374 case SO_TRAFFIC_MGT_BACKGROUND: {
5375 /* This option is handled by lower layer(s) */
5376 error = 0;
5377 break;
5378 }
5379
5380 #if FLOW_DIVERT
5381 case SO_FLOW_DIVERT_TOKEN:
5382 error = flow_divert_token_set(so, sopt);
5383 break;
5384 #endif /* FLOW_DIVERT */
5385
5386
5387 case SO_DELEGATED:
5388 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5389 sizeof(optval))) != 0) {
5390 break;
5391 }
5392
5393 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5394 break;
5395
5396 case SO_DELEGATED_UUID: {
5397 uuid_t euuid;
5398
5399 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5400 sizeof(euuid))) != 0) {
5401 break;
5402 }
5403
5404 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5405 break;
5406 }
5407
5408 #if NECP
5409 case SO_NECP_ATTRIBUTES:
5410 if (SOCK_DOM(so) == PF_MULTIPATH) {
5411 /* Handled by MPTCP itself */
5412 break;
5413 }
5414
5415 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5416 error = EINVAL;
5417 goto out;
5418 }
5419
5420 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5421 break;
5422
5423 case SO_NECP_CLIENTUUID: {
5424 if (SOCK_DOM(so) == PF_MULTIPATH) {
5425 /* Handled by MPTCP itself */
5426 break;
5427 }
5428
5429 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5430 error = EINVAL;
5431 goto out;
5432 }
5433
5434 struct inpcb *inp = sotoinpcb(so);
5435 if (!uuid_is_null(inp->necp_client_uuid)) {
5436 // Clear out the old client UUID if present
5437 necp_inpcb_remove_cb(inp);
5438 }
5439
5440 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5441 sizeof(uuid_t), sizeof(uuid_t));
5442 if (error != 0) {
5443 goto out;
5444 }
5445
5446 if (uuid_is_null(inp->necp_client_uuid)) {
5447 error = EINVAL;
5448 goto out;
5449 }
5450
5451 pid_t current_pid = proc_pid(current_proc());
5452 error = necp_client_register_socket_flow(current_pid,
5453 inp->necp_client_uuid, inp);
5454 if (error != 0) {
5455 uuid_clear(inp->necp_client_uuid);
5456 goto out;
5457 }
5458
5459 if (inp->inp_lport != 0) {
5460 // There is a bound local port, so this is not
5461 // a fresh socket. Assign to the client.
5462 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5463 }
5464
5465 break;
5466 }
5467 case SO_NECP_LISTENUUID: {
5468 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5469 error = EINVAL;
5470 goto out;
5471 }
5472
5473 struct inpcb *inp = sotoinpcb(so);
5474 if (!uuid_is_null(inp->necp_client_uuid)) {
5475 error = EINVAL;
5476 goto out;
5477 }
5478
5479 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5480 sizeof(uuid_t), sizeof(uuid_t));
5481 if (error != 0) {
5482 goto out;
5483 }
5484
5485 if (uuid_is_null(inp->necp_client_uuid)) {
5486 error = EINVAL;
5487 goto out;
5488 }
5489
5490 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5491 inp->necp_client_uuid, inp);
5492 if (error != 0) {
5493 uuid_clear(inp->necp_client_uuid);
5494 goto out;
5495 }
5496
5497 // Mark that the port registration is held by NECP
5498 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5499
5500 break;
5501 }
5502
5503 case SO_RESOLVER_SIGNATURE: {
5504 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5505 error = EINVAL;
5506 goto out;
5507 }
5508 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5509 break;
5510 }
5511 #endif /* NECP */
5512
5513 case SO_EXTENDED_BK_IDLE:
5514 error = sooptcopyin(sopt, &optval, sizeof(optval),
5515 sizeof(optval));
5516 if (error == 0) {
5517 error = so_set_extended_bk_idle(so, optval);
5518 }
5519 break;
5520
5521 case SO_MARK_CELLFALLBACK:
5522 error = sooptcopyin(sopt, &optval, sizeof(optval),
5523 sizeof(optval));
5524 if (error != 0) {
5525 goto out;
5526 }
5527 if (optval < 0) {
5528 error = EINVAL;
5529 goto out;
5530 }
5531 if (optval == 0) {
5532 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5533 } else {
5534 so->so_flags1 |= SOF1_CELLFALLBACK;
5535 }
5536 break;
5537
5538 case SO_MARK_CELLFALLBACK_UUID:
5539 {
5540 struct so_mark_cellfallback_uuid_args args;
5541
5542 error = sooptcopyin(sopt, &args, sizeof(args),
5543 sizeof(args));
5544 if (error != 0) {
5545 goto out;
5546 }
5547 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5548 args.flow_cellfallback);
5549 break;
5550 }
5551
5552 case SO_FALLBACK_MODE:
5553 error = sooptcopyin(sopt, &optval, sizeof(optval),
5554 sizeof(optval));
5555 if (error != 0) {
5556 goto out;
5557 }
5558 if (optval < SO_FALLBACK_MODE_NONE ||
5559 optval > SO_FALLBACK_MODE_PREFER) {
5560 error = EINVAL;
5561 goto out;
5562 }
5563 so->so_fallback_mode = (u_int8_t)optval;
5564 break;
5565
5566 case SO_MARK_KNOWN_TRACKER: {
5567 error = sooptcopyin(sopt, &optval, sizeof(optval),
5568 sizeof(optval));
5569 if (error != 0) {
5570 goto out;
5571 }
5572 if (optval < 0) {
5573 error = EINVAL;
5574 goto out;
5575 }
5576 if (optval == 0) {
5577 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5578 } else {
5579 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5580 }
5581 break;
5582 }
5583
5584 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5585 error = sooptcopyin(sopt, &optval, sizeof(optval),
5586 sizeof(optval));
5587 if (error != 0) {
5588 goto out;
5589 }
5590 if (optval < 0) {
5591 error = EINVAL;
5592 goto out;
5593 }
5594 if (optval == 0) {
5595 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5596 } else {
5597 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5598 }
5599 break;
5600 }
5601
5602 case SO_MARK_APPROVED_APP_DOMAIN: {
5603 error = sooptcopyin(sopt, &optval, sizeof(optval),
5604 sizeof(optval));
5605 if (error != 0) {
5606 goto out;
5607 }
5608 if (optval < 0) {
5609 error = EINVAL;
5610 goto out;
5611 }
5612 if (optval == 0) {
5613 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5614 } else {
5615 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5616 }
5617 break;
5618 }
5619
5620 case SO_STATISTICS_EVENT:
5621 error = sooptcopyin(sopt, &long_optval,
5622 sizeof(long_optval), sizeof(long_optval));
5623 if (error != 0) {
5624 goto out;
5625 }
5626 u_int64_t nstat_event = 0;
5627 error = so_statistics_event_to_nstat_event(
5628 &long_optval, &nstat_event);
5629 if (error != 0) {
5630 goto out;
5631 }
5632 nstat_pcb_event(sotoinpcb(so), nstat_event);
5633 break;
5634
5635 case SO_NET_SERVICE_TYPE: {
5636 error = sooptcopyin(sopt, &optval, sizeof(optval),
5637 sizeof(optval));
5638 if (error != 0) {
5639 goto out;
5640 }
5641 error = so_set_net_service_type(so, optval);
5642 break;
5643 }
5644
5645 case SO_QOSMARKING_POLICY_OVERRIDE:
5646 error = priv_check_cred(kauth_cred_get(),
5647 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5648 if (error != 0) {
5649 goto out;
5650 }
5651 error = sooptcopyin(sopt, &optval, sizeof(optval),
5652 sizeof(optval));
5653 if (error != 0) {
5654 goto out;
5655 }
5656 if (optval == 0) {
5657 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5658 } else {
5659 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5660 }
5661 break;
5662
5663 case SO_MPKL_SEND_INFO: {
5664 struct so_mpkl_send_info so_mpkl_send_info;
5665
5666 error = sooptcopyin(sopt, &so_mpkl_send_info,
5667 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5668 if (error != 0) {
5669 goto out;
5670 }
5671 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5672 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5673
5674 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5675 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5676 } else {
5677 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5678 }
5679 break;
5680 }
5681 case SO_WANT_KEV_SOCKET_CLOSED: {
5682 error = sooptcopyin(sopt, &optval, sizeof(optval),
5683 sizeof(optval));
5684 if (error != 0) {
5685 goto out;
5686 }
5687 if (optval == 0) {
5688 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5689 } else {
5690 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5691 }
5692 break;
5693 }
5694 case SO_MARK_WAKE_PKT: {
5695 error = sooptcopyin(sopt, &optval, sizeof(optval),
5696 sizeof(optval));
5697 if (error != 0) {
5698 goto out;
5699 }
5700 if (optval == 0) {
5701 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5702 } else {
5703 so->so_flags |= SOF_MARK_WAKE_PKT;
5704 }
5705 break;
5706 }
5707 case SO_RECV_WAKE_PKT: {
5708 error = sooptcopyin(sopt, &optval, sizeof(optval),
5709 sizeof(optval));
5710 if (error != 0) {
5711 goto out;
5712 }
5713 if (optval == 0) {
5714 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5715 } else {
5716 so->so_flags |= SOF_RECV_WAKE_PKT;
5717 }
5718 break;
5719 }
5720 case SO_APPLICATION_ID: {
5721 so_application_id_t application_id = { 0 };
5722
5723 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5724 error = EINVAL;
5725 goto out;
5726 }
5727 error = sooptcopyin(sopt, &application_id, sizeof(application_id),
5728 sizeof(application_id));
5729 if (error != 0) {
5730 goto out;
5731 }
5732
5733 // The user needs to match
5734 if (kauth_cred_getuid(so->so_cred) != application_id.uid) {
5735 error = EINVAL;
5736 printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5737 goto out;
5738 }
5739 error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true);
5740 if (error != 0) {
5741 printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5742 goto out;
5743 }
5744 if (application_id.persona_id != PERSONA_ID_NONE) {
5745 so->so_persona_id = application_id.persona_id;
5746 }
5747 break;
5748 }
5749 default:
5750 error = ENOPROTOOPT;
5751 break;
5752 }
5753 if (error == 0 && so->so_proto != NULL &&
5754 so->so_proto->pr_ctloutput != NULL) {
5755 (void) so->so_proto->pr_ctloutput(so, sopt);
5756 }
5757 }
5758 out:
5759 if (dolock) {
5760 socket_unlock(so, 1);
5761 }
5762 return error;
5763 }
5764
5765 /* Helper routines for getsockopt */
5766 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5767 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5768 {
5769 int error;
5770 size_t valsize;
5771
5772 error = 0;
5773
5774 /*
5775 * Documented get behavior is that we always return a value,
5776 * possibly truncated to fit in the user's buffer.
5777 * Traditional behavior is that we always tell the user
5778 * precisely how much we copied, rather than something useful
5779 * like the total amount we had available for her.
5780 * Note that this interface is not idempotent; the entire answer must
5781 * generated ahead of time.
5782 */
5783 valsize = MIN(len, sopt->sopt_valsize);
5784 sopt->sopt_valsize = valsize;
5785 if (sopt->sopt_val != USER_ADDR_NULL) {
5786 if (sopt->sopt_p != kernproc) {
5787 error = copyout(buf, sopt->sopt_val, valsize);
5788 } else {
5789 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5790 }
5791 }
5792 return error;
5793 }
5794
5795 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5796 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5797 {
5798 int error;
5799 size_t len;
5800 struct user64_timeval tv64 = {};
5801 struct user32_timeval tv32 = {};
5802 const void * val;
5803 size_t valsize;
5804
5805 error = 0;
5806 if (proc_is64bit(sopt->sopt_p)) {
5807 len = sizeof(tv64);
5808 tv64.tv_sec = tv_p->tv_sec;
5809 tv64.tv_usec = tv_p->tv_usec;
5810 val = &tv64;
5811 } else {
5812 len = sizeof(tv32);
5813 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5814 tv32.tv_usec = tv_p->tv_usec;
5815 val = &tv32;
5816 }
5817 valsize = MIN(len, sopt->sopt_valsize);
5818 sopt->sopt_valsize = valsize;
5819 if (sopt->sopt_val != USER_ADDR_NULL) {
5820 if (sopt->sopt_p != kernproc) {
5821 error = copyout(val, sopt->sopt_val, valsize);
5822 } else {
5823 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5824 }
5825 }
5826 return error;
5827 }
5828
5829 /*
5830 * Return: 0 Success
5831 * ENOPROTOOPT
5832 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5833 * <pr_ctloutput>:???
5834 * <sf_getoption>:???
5835 */
/*
 * Handle getsockopt(2) for a socket.  Socket filters are consulted first,
 * then non-SOL_SOCKET levels are forwarded to the protocol; SOL_SOCKET
 * options are handled here (after giving the protocol a veto via
 * pru_socheckopt).  `dolock` controls whether the socket lock is taken.
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	/* Normalize the direction in case the caller did not set it. */
	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Socket filters get first crack; EJUSTRETURN means "handled". */
	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		/* Non-socket-level option: defer entirely to the protocol. */
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			/* so_linger is kept in ticks for SO_LINGER_SEC. */
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			/* These option names double as so_options flag bits. */
			optval = so->so_options & sopt->sopt_name;
/* Shared exit: copy the int in `optval` out to the caller. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				/* Datagram socket: count only data bytes. */
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				/* Count packets (mbuf chains) in the rcv buffer. */
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				/* Only meaningful for record-oriented sockets. */
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			/* Reading SO_ERROR clears the pending error. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			/*
			 * For connected UNIX-domain sockets the effective send
			 * space also includes the peer's receive buffer.
			 */
			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;


		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			/* Only meaningful for IP sockets with an inpcb. */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_PEERLABEL:
			error = EOPNOTSUPP;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			/* Note the inversion: flag set means defunct NOT ok. */
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif  /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			/* The client UUID lives in the per-domain PCB. */
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			/* Only valid when NECP holds the port registration. */
			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}

		case SO_RESOLVER_SIGNATURE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
			break;
		}

#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif  /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;
		case SO_MARK_CELLFALLBACK:
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_FALLBACK_MODE:
			optval = so->so_fallback_mode;
			goto integer;
		case SO_MARK_KNOWN_TRACKER: {
			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_APPROVED_APP_DOMAIN: {
			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_NET_SERVICE_TYPE: {
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				/* No explicit type set: report best-effort. */
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}
		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			/*
			 * NOTE(review): struct padding is not explicitly
			 * zeroed before copyout — verify the struct has no
			 * padding or that this cannot leak kernel bytes.
			 */
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}
		case SO_MARK_WAKE_PKT:
			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
			goto integer;
		case SO_RECV_WAKE_PKT:
			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
			goto integer;
		case SO_APPLICATION_ID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			so_application_id_t application_id = { 0 };
			application_id.uid = kauth_cred_getuid(so->so_cred);
			/* Prefer the effective UUID; fall back to last UUID. */
			uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid);
			application_id.persona_id = so->so_persona_id;
			error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t));
			break;
		}
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
6267
6268 /*
6269 * The size limits on our soopt_getm is different from that on FreeBSD.
6270 * We limit the size of options to MCLBYTES. This will have to change
6271 * if we need to define options that need more space than MCLBYTES.
6272 */
/*
 * Allocate an mbuf chain large enough to hold sopt->sopt_valsize bytes
 * of option data, storing the head in *mp.  Returns EMSGSIZE for sizes
 * outside (0, MCLBYTES], ENOBUFS on allocation failure.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = (int)sopt->sopt_valsize;
	int how;

	/* Option data larger than one cluster is not supported. */
	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
		return EMSGSIZE;
	}

	/* Only block waiting for memory on behalf of user processes. */
	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL) {
		return ENOBUFS;
	}
	if (sopt_size > MLEN) {
		/* Too big for a plain mbuf: attach a cluster. */
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Chain additional mbufs until the whole value fits. */
	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			/* Release everything built so far. */
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
6326
6327 /* copyin sopt data into mbuf chain */
/*
 * Fill an mbuf chain with option data from sopt (user or kernel source).
 * The chain is consumed in order; on a copyin fault the whole chain is
 * freed and the error returned.  Advances sopt_val / sopt_valsize.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	/* Nothing to copy when the caller supplied no value buffer. */
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				/* Fault: release the entire chain. */
				m_freem(m0);
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* The chain should have been sized adequately by ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return 0;
}
6361
6362 /* copyout mbuf chain data into soopt */
/*
 * Copy option data from an mbuf chain out to sopt's buffer (user or
 * kernel).  On success sopt_valsize is set to the total bytes copied;
 * if the caller's buffer is too small for the chain, the chain is freed
 * and EINVAL returned.  The chain is always consumed.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	/* No destination buffer: nothing to do. */
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				/* Fault: release the entire chain. */
				m_freem(m0);
				return error;
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return EINVAL;
	}
	/* Report how many bytes were actually delivered. */
	sopt->sopt_valsize = valsize;
	return 0;
}
6399
6400 void
sohasoutofband(struct socket * so)6401 sohasoutofband(struct socket *so)
6402 {
6403 if (so->so_pgid < 0) {
6404 gsignal(-so->so_pgid, SIGURG);
6405 } else if (so->so_pgid > 0) {
6406 proc_signal(so->so_pgid, SIGURG);
6407 }
6408 selwakeup(&so->so_rcv.sb_sel);
6409 if (so->so_rcv.sb_flags & SB_KNOTE) {
6410 KNOTE(&so->so_rcv.sb_sel.si_note,
6411 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6412 }
6413 }
6414
/*
 * poll/select support: report which of `events` are currently ready on
 * `so`; if none are, register the caller for wakeup on the relevant
 * socket buffers.  `cred` is unused; `wql` is the wait-queue link.
 */
int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM)) {
		if (soreadable(so)) {
			revents |= events & (POLLIN | POLLRDNORM);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (sowriteable(so)) {
			revents |= events & (POLLOUT | POLLWRNORM);
		}
	}

	if (events & (POLLPRI | POLLRDBAND)) {
		/* Priority data: at or before the out-of-band mark. */
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			revents |= events & (POLLPRI | POLLRDBAND);
		}
	}

	if (revents == 0) {
		/* Nothing ready: arrange to be woken when something is. */
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return revents;
}
6467
6468 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6469 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6470 {
6471 struct socket *so = (struct socket *)fp_get_data(fp);
6472 int result;
6473
6474 socket_lock(so, 1);
6475 so_update_last_owner_locked(so, PROC_NULL);
6476 so_update_policy(so);
6477
6478 switch (kn->kn_filter) {
6479 case EVFILT_READ:
6480 kn->kn_filtid = EVFILTID_SOREAD;
6481 break;
6482 case EVFILT_WRITE:
6483 kn->kn_filtid = EVFILTID_SOWRITE;
6484 break;
6485 case EVFILT_SOCK:
6486 kn->kn_filtid = EVFILTID_SCK;
6487 break;
6488 case EVFILT_EXCEPT:
6489 kn->kn_filtid = EVFILTID_SOEXCEPT;
6490 break;
6491 default:
6492 socket_unlock(so, 1);
6493 knote_set_error(kn, EINVAL);
6494 return 0;
6495 }
6496
6497 /*
6498 * call the appropriate sub-filter attach
6499 * with the socket still locked
6500 */
6501 result = knote_fops(kn)->f_attach(kn, kev);
6502
6503 socket_unlock(so, 1);
6504
6505 return result;
6506 }
6507
/*
 * Shared readiness check for the socket read filter.  Returns non-zero
 * when the knote should fire; when it does and `kev` is non-NULL, the
 * event is filled with `data` (bytes readable, or listen queue length).
 * Caller must hold the socket lock.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		/* OOB data pending fires immediately, reporting bytes
		 * up to the mark. */
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		/* EOF: no more data will ever arrive. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6586
6587 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6588 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6589 {
6590 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6591
6592 /* socket locked */
6593
6594 /*
6595 * If the caller explicitly asked for OOB results (e.g. poll())
6596 * from EVFILT_READ, then save that off in the hookid field
6597 * and reserve the kn_flags EV_OOBAND bit for output only.
6598 */
6599 if (kn->kn_filter == EVFILT_READ &&
6600 kn->kn_flags & EV_OOBAND) {
6601 kn->kn_flags &= ~EV_OOBAND;
6602 kn->kn_hook32 = EV_OOBAND;
6603 } else {
6604 kn->kn_hook32 = 0;
6605 }
6606 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6607 so->so_rcv.sb_flags |= SB_KNOTE;
6608 }
6609
6610 /* indicate if event is already fired */
6611 return filt_soread_common(kn, NULL, so);
6612 }
6613
6614 static void
filt_sordetach(struct knote * kn)6615 filt_sordetach(struct knote *kn)
6616 {
6617 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6618
6619 socket_lock(so, 1);
6620 if (so->so_rcv.sb_flags & SB_KNOTE) {
6621 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6622 so->so_rcv.sb_flags &= ~SB_KNOTE;
6623 }
6624 }
6625 socket_unlock(so, 1);
6626 }
6627
6628 /*ARGSUSED*/
6629 static int
filt_soread(struct knote * kn,long hint)6630 filt_soread(struct knote *kn, long hint)
6631 {
6632 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6633 int retval;
6634
6635 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6636 socket_lock(so, 1);
6637 }
6638
6639 retval = filt_soread_common(kn, NULL, so);
6640
6641 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6642 socket_unlock(so, 1);
6643 }
6644
6645 return retval;
6646 }
6647
6648 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6649 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6650 {
6651 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6652 int retval;
6653
6654 socket_lock(so, 1);
6655
6656 /* save off the new input fflags and data */
6657 kn->kn_sfflags = kev->fflags;
6658 kn->kn_sdata = kev->data;
6659
6660 /* determine if changes result in fired events */
6661 retval = filt_soread_common(kn, NULL, so);
6662
6663 socket_unlock(so, 1);
6664
6665 return retval;
6666 }
6667
6668 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6669 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6670 {
6671 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6672 int retval;
6673
6674 socket_lock(so, 1);
6675 retval = filt_soread_common(kn, kev, so);
6676 socket_unlock(so, 1);
6677
6678 return retval;
6679 }
6680
6681 int
so_wait_for_if_feedback(struct socket * so)6682 so_wait_for_if_feedback(struct socket *so)
6683 {
6684 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6685 (so->so_state & SS_ISCONNECTED)) {
6686 struct inpcb *inp = sotoinpcb(so);
6687 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6688 return 1;
6689 }
6690 }
6691 return 0;
6692 }
6693
/*
 * Shared readiness check for the socket write filter.  Returns non-zero
 * when the knote should fire; when it does and `kev` is non-NULL, the
 * event is filled with `data` (available send space).  Caller must hold
 * the socket lock.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	if (so->so_state & SS_CANTSENDMORE) {
		/* Writing side shut down: report EOF with pending error. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		/* Preconnect data can be queued before the connect completes. */
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* Bound a caller-supplied NOTE_LOWAT to [sb_lowat, sb_hiwat]. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/* Not-sent low-water mode: defer to the transport. */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress writability while waiting on interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6805
6806 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6807 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6808 {
6809 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6810
6811 /* socket locked */
6812 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6813 so->so_snd.sb_flags |= SB_KNOTE;
6814 }
6815
6816 /* determine if its already fired */
6817 return filt_sowrite_common(kn, NULL, so);
6818 }
6819
6820 static void
filt_sowdetach(struct knote * kn)6821 filt_sowdetach(struct knote *kn)
6822 {
6823 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6824 socket_lock(so, 1);
6825
6826 if (so->so_snd.sb_flags & SB_KNOTE) {
6827 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6828 so->so_snd.sb_flags &= ~SB_KNOTE;
6829 }
6830 }
6831 socket_unlock(so, 1);
6832 }
6833
6834 /*ARGSUSED*/
6835 static int
filt_sowrite(struct knote * kn,long hint)6836 filt_sowrite(struct knote *kn, long hint)
6837 {
6838 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6839 int ret;
6840
6841 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6842 socket_lock(so, 1);
6843 }
6844
6845 ret = filt_sowrite_common(kn, NULL, so);
6846
6847 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6848 socket_unlock(so, 1);
6849 }
6850
6851 return ret;
6852 }
6853
6854 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6855 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6856 {
6857 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6858 int ret;
6859
6860 socket_lock(so, 1);
6861
6862 /*save off the new input fflags and data */
6863 kn->kn_sfflags = kev->fflags;
6864 kn->kn_sdata = kev->data;
6865
6866 /* determine if these changes result in a triggered event */
6867 ret = filt_sowrite_common(kn, NULL, so);
6868
6869 socket_unlock(so, 1);
6870
6871 return ret;
6872 }
6873
6874 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6875 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6876 {
6877 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6878 int ret;
6879
6880 socket_lock(so, 1);
6881 ret = filt_sowrite_common(kn, kev, so);
6882 socket_unlock(so, 1);
6883
6884 return ret;
6885 }
6886
/*
 * Common EVFILT_SOCK evaluation shared by the attach/touch/process/
 * event callbacks. Translates the one-shot bits in `ev_hint' and the
 * socket's current state into NOTE_* flags on the knote, restricts
 * them to the caller's interest set, and suppresses re-delivery of
 * level-triggered events already recorded in kn_hook32. Returns
 * non-zero when there is an event to deliver; fills `kev' when it is
 * non-NULL. Called with the socket locked.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* One-shot (edge) events carried solely by the hint bits. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/*
	 * Level-triggered events: derived from hint bits OR current
	 * socket state, and tracked in `level_trigger' so they are not
	 * delivered more than once while the condition persists.
	 */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		/* only protocols that publish connection info */
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	/*
	 * Read side closed; with content filtering, only once all
	 * filtered data has been drained.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/*
	 * NOTE_SUSPEND and NOTE_RESUME are mutually exclusive; setting
	 * one clears the other, both on the knote and in the
	 * delivered-event state (kn_hook32).
	 */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* `data' carries the error if any, else the connection state. */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7027
7028 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7029 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7030 {
7031 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7032
7033 /* socket locked */
7034 kn->kn_hook32 = 0;
7035 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7036 so->so_flags |= SOF_KNOTE;
7037 }
7038
7039 /* determine if event already fired */
7040 return filt_sockev_common(kn, NULL, so, 0);
7041 }
7042
7043 static void
filt_sockdetach(struct knote * kn)7044 filt_sockdetach(struct knote *kn)
7045 {
7046 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7047 socket_lock(so, 1);
7048
7049 if ((so->so_flags & SOF_KNOTE) != 0) {
7050 if (KNOTE_DETACH(&so->so_klist, kn)) {
7051 so->so_flags &= ~SOF_KNOTE;
7052 }
7053 }
7054 socket_unlock(so, 1);
7055 }
7056
7057 static int
filt_sockev(struct knote * kn,long hint)7058 filt_sockev(struct knote *kn, long hint)
7059 {
7060 int ret = 0, locked = 0;
7061 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7062 long ev_hint = (hint & SO_FILT_HINT_EV);
7063
7064 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7065 socket_lock(so, 1);
7066 locked = 1;
7067 }
7068
7069 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7070
7071 if (locked) {
7072 socket_unlock(so, 1);
7073 }
7074
7075 return ret;
7076 }
7077
7078
7079
/*
 * filt_socktouch - update event state
 *
 * Accept new filter parameters from userspace, clear the
 * delivered-event state (kn_hook32) for any level-triggered events
 * whose interest bits changed (so they may be delivered again), and
 * re-evaluate whether an event should fire.
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}
7122
7123 /*
7124 * filt_sockprocess - query event fired state and return data
7125 */
7126 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7127 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7128 {
7129 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7130 int ret = 0;
7131
7132 socket_lock(so, 1);
7133
7134 ret = filt_sockev_common(kn, kev, so, 0);
7135
7136 socket_unlock(so, 1);
7137
7138 return ret;
7139 }
7140
7141 void
get_sockev_state(struct socket * so,u_int32_t * statep)7142 get_sockev_state(struct socket *so, u_int32_t *statep)
7143 {
7144 u_int32_t state = *(statep);
7145
7146 /*
7147 * If the state variable is already used by a previous event,
7148 * reset it.
7149 */
7150 if (state != 0) {
7151 return;
7152 }
7153
7154 if (so->so_state & SS_ISCONNECTED) {
7155 state |= SOCKEV_CONNECTED;
7156 } else {
7157 state &= ~(SOCKEV_CONNECTED);
7158 }
7159 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7160 *(statep) = state;
7161 }
7162
/*
 * Buffer size for the formatted lock/unlock history: SO_LCKDBG_MAX
 * entries of two pointers each ("%p:%p "), plus a terminating NUL.
 */
#define SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

/*
 * Format the socket's recorded lock/unlock caller return addresses as
 * "lock:unlock " pairs, most recent pair first. Returns a pointer to
 * a static buffer, so concurrent callers would race; used from panic
 * messages (see socket_unlock()).
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	/* next_*_lr is the next write slot, so offset i walks newest-first */
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return lock_history_str;
}
7182
7183 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7184 socket_getlock(struct socket *so, int flags)
7185 {
7186 if (so->so_proto->pr_getlock != NULL) {
7187 return (*so->so_proto->pr_getlock)(so, flags);
7188 } else {
7189 return so->so_proto->pr_domain->dom_mtx;
7190 }
7191 }
7192
/*
 * Lock a socket, recording the caller's return address for debugging
 * (see solockhistory_nr()). Protocols with a pr_lock callback handle
 * the locking (and `refcount') themselves; otherwise the domain mutex
 * is taken and, when `refcount' is non-zero, so_usecount is bumped.
 */
void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		/* remember who locked us, for debugging */
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7215
7216 void
socket_lock_assert_owned(struct socket * so)7217 socket_lock_assert_owned(struct socket *so)
7218 {
7219 lck_mtx_t *mutex_held;
7220
7221 if (so->so_proto->pr_getlock != NULL) {
7222 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7223 } else {
7224 mutex_held = so->so_proto->pr_domain->dom_mtx;
7225 }
7226
7227 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7228 }
7229
7230 int
socket_try_lock(struct socket * so)7231 socket_try_lock(struct socket *so)
7232 {
7233 lck_mtx_t *mtx;
7234
7235 if (so->so_proto->pr_getlock != NULL) {
7236 mtx = (*so->so_proto->pr_getlock)(so, 0);
7237 } else {
7238 mtx = so->so_proto->pr_domain->dom_mtx;
7239 }
7240
7241 return lck_mtx_try_lock(mtx);
7242 }
7243
/*
 * Unlock a socket, recording the caller's return address for
 * debugging. Protocols with a pr_unlock callback handle everything
 * themselves; otherwise, when `refcount' is non-zero, so_usecount is
 * dropped and the last reference frees the socket via sofreelastref()
 * before the domain mutex is released.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* remember who unlocked us, for debugging */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			/* unbalanced unlock: dump the lock history and panic */
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7284
/* Called with socket locked, will unlock socket */
/*
 * Release the socket via sofreelastref() after asserting that the
 * caller holds the socket's mutex. (The unlock noted above happens
 * inside sofreelastref() — NOTE(review): confirm against its
 * definition, which is outside this file section.)
 */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}
7300
/*
 * Take a use-count reference on the socket: lock with refcount
 * (bumping so_usecount), then release only the lock.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7307
/*
 * Drop a use-count reference on the socket: lock without taking a
 * reference, then unlock with refcount (dropping so_usecount; the
 * last reference frees the socket — see socket_unlock()).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7314
7315 /*
7316 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7317 * possibility of using jumbo clusters. Caller must ensure to hold
7318 * the socket lock.
7319 */
7320 void
somultipages(struct socket * so,boolean_t set)7321 somultipages(struct socket *so, boolean_t set)
7322 {
7323 if (set) {
7324 so->so_flags |= SOF_MULTIPAGES;
7325 } else {
7326 so->so_flags &= ~SOF_MULTIPAGES;
7327 }
7328 }
7329
7330 void
soif2kcl(struct socket * so,boolean_t set)7331 soif2kcl(struct socket *so, boolean_t set)
7332 {
7333 if (set) {
7334 so->so_flags1 |= SOF1_IF_2KCL;
7335 } else {
7336 so->so_flags1 &= ~SOF1_IF_2KCL;
7337 }
7338 }
7339
7340 int
so_isdstlocal(struct socket * so)7341 so_isdstlocal(struct socket *so)
7342 {
7343 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7344
7345 if (SOCK_DOM(so) == PF_INET) {
7346 return inaddr_local(inp->inp_faddr);
7347 } else if (SOCK_DOM(so) == PF_INET6) {
7348 return in6addr_local(&inp->in6p_faddr);
7349 }
7350
7351 return 0;
7352 }
7353
/*
 * Mark a socket eligible for defunct (the actual teardown is done by
 * sodefunct()): set SOF_DEFUNCT, set SB_DROP on both socket buffers so
 * no further data is appended, and flush any data already queued.
 *
 * SOF_NODEFUNCT sockets are exempt unless `noforce' is FALSE, in which
 * case the flag is cleared and the socket is defuncted by force.
 * Sockets that requested extended background idle time may instead be
 * granted the extension (defunct deferred, EOPNOTSUPP returned, lazy
 * inpcb timer scheduled).
 *
 * Returns 0 on success (or if already defunct), EOPNOTSUPP when the
 * socket is exempted or granted extended idle time.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: just sanity-check that both buffers drop data. */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* forced: the exemption is revoked permanently */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		/* extension is denied on cellular / delegated / disabled */
		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* grant the extension and defer the defunct */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7463
/*
 * Actually defunct a socket previously marked by sosetdefunct():
 * notify the protocol, unwedge threads blocked on the socket buffers,
 * shut down both data directions, disconnect, and release any
 * remaining buffered data. Sets SS_DEFUNCT when done; idempotent once
 * that flag is set. Called with the socket locked. Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect. We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket. This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* release whatever the shutdown/disconnect left behind */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7576
/*
 * Resume a socket that is in its extended background idle period:
 * clear SOF1_EXTEND_BK_IDLE_INPROG, reset the start timestamp, clear
 * the process flag, and update the statistics. `locked' indicates
 * whether the caller already holds the socket lock. Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7606
7607 /*
7608 * Does not attempt to account for sockets that are delegated from
7609 * the current process
7610 */
7611 int
so_set_extended_bk_idle(struct socket * so,int optval)7612 so_set_extended_bk_idle(struct socket *so, int optval)
7613 {
7614 int error = 0;
7615
7616 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7617 SOCK_PROTO(so) != IPPROTO_TCP) {
7618 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7619 error = EOPNOTSUPP;
7620 } else if (optval == 0) {
7621 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7622
7623 soresume(current_proc(), so, 1);
7624 } else {
7625 struct proc *p = current_proc();
7626 struct fileproc *fp;
7627 int count = 0;
7628
7629 /*
7630 * Unlock socket to avoid lock ordering issue with
7631 * the proc fd table lock
7632 */
7633 socket_unlock(so, 0);
7634
7635 proc_fdlock(p);
7636 fdt_foreach(fp, p) {
7637 struct socket *so2;
7638
7639 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7640 continue;
7641 }
7642
7643 so2 = (struct socket *)fp_get_data(fp);
7644 if (so != so2 &&
7645 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7646 count++;
7647 }
7648 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7649 break;
7650 }
7651 }
7652 proc_fdunlock(p);
7653
7654 socket_lock(so, 0);
7655
7656 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7657 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7658 error = EBUSY;
7659 } else if (so->so_flags & SOF_DELEGATED) {
7660 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7661 error = EBUSY;
7662 } else {
7663 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7664 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7665 }
7666 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7667 "%s marked for extended bk idle\n",
7668 __func__, proc_selfpid(), proc_best_name(current_proc()),
7669 so->so_gencnt,
7670 SOCK_DOM(so), SOCK_TYPE(so),
7671 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7672 "is" : "not");
7673 }
7674
7675 return error;
7676 }
7677
/*
 * Terminate a socket's extended background idle period: clear the
 * in-progress state, update the statistics, and force the socket to
 * be defuncted.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7696
7697 void
so_drain_extended_bk_idle(struct socket * so)7698 so_drain_extended_bk_idle(struct socket *so)
7699 {
7700 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7701 /*
7702 * Only penalize sockets that have outstanding data
7703 */
7704 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7705 so_stop_extended_bk_idle(so);
7706
7707 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7708 }
7709 }
7710 }
7711
/*
 * Return values tells if socket is still in extended background idle:
 * 1 when the idle period is still running (or was never started),
 * 0 when it has expired and the socket was stopped/defuncted.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));
		/* expired: stop the idle period and defunct the socket */
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			/* not yet: re-arm the lazy inpcb timer */
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}
7742
/*
 * Resume every socket of `p' that was in extended background idle,
 * then clear the process-level P_LXBKIDLEINPROG flag. The fd table is
 * walked under the proc fd lock.
 */
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			/* only socket descriptors are of interest */
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
7764
7765 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7766 so_set_recv_anyif(struct socket *so, int optval)
7767 {
7768 int ret = 0;
7769
7770 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7771 if (optval) {
7772 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7773 } else {
7774 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7775 }
7776 #if SKYWALK
7777 inp_update_netns_flags(so);
7778 #endif /* SKYWALK */
7779 }
7780
7781
7782 return ret;
7783 }
7784
7785 __private_extern__ int
so_get_recv_anyif(struct socket * so)7786 so_get_recv_anyif(struct socket *so)
7787 {
7788 int ret = 0;
7789
7790 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7791 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7792 }
7793
7794 return ret;
7795 }
7796
/*
 * Apply deny-type socket restrictions (deny in/out, cellular,
 * expensive, constrained) from `vals'. Restrictions are one-way: they
 * can be set but never cleared. Newly-set cellular/expensive/
 * constrained denials are propagated to the INPCB (inet domains) or
 * to MPTCP. Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7855
/*
 * Return the deny-type restrictions currently set on the socket.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED can be set via
 * so_set_restrictions() but is not included in the mask below, so it
 * can never be read back here — confirm whether that is intentional.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
7863
/*
 * Associate an "effective" (delegate) process with a socket by pid.
 *
 * On success the socket's e_pid/e_upid/e_uuid describe the delegate and
 * SOF_DELEGATED is set — unless epid is the caller's own pid, which
 * clears the delegation instead.  Returns 0 on success, or EINVAL,
 * EACCES or ESRCH on failure.  Always refreshes the socket's policy on
 * success.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with `||` the privilege check is skipped only
	 * when epid matches BOTH last_pid and the caller's pid, which
	 * is stricter than the "or" described above — confirm intent.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		/* self-delegation: clear any existing delegate association */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		/* record the delegate's identity on the socket */
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* track the responsible process when it differs from the delegate */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	/* let the protocol refresh its cached owner info */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Flip the sign of the generation count — presumably to
		 * force the policy machinery to see a stale value and
		 * re-evaluate; TODO confirm against so_update_policy().
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	/* drop the reference taken by proc_find() */
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
7979
/*
 * Associate an "effective" (delegate) identity with a socket by UUID.
 *
 * Unlike so_set_effective_pid(), only the executable UUID of the
 * delegate is known here, so e_pid/e_upid inherit the socket's real
 * owner.  Passing the caller's own executable UUID clears the
 * delegation.  Returns 0 on success, or EINVAL/EACCES on failure.
 * Always refreshes the socket's policy on success.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with `||` the privilege check is skipped only
	 * when euuid matches BOTH last_uuid and the caller's own uuid,
	 * which is stricter than the "or" described above — confirm.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		/* self-delegation: clear any existing delegate association */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/*
		 * Flip the sign of the generation count — presumably to
		 * force the policy machinery to re-evaluate; TODO confirm
		 * against so_update_policy().
		 */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8083
8084 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8085 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8086 uint32_t ev_datalen)
8087 {
8088 struct kev_msg ev_msg;
8089
8090 /*
8091 * A netpolicy event always starts with a netpolicy_event_data
8092 * structure, but the caller can provide for a longer event
8093 * structure to post, depending on the event code.
8094 */
8095 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8096
8097 bzero(&ev_msg, sizeof(ev_msg));
8098 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8099 ev_msg.kev_class = KEV_NETWORK_CLASS;
8100 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8101 ev_msg.event_code = ev_code;
8102
8103 ev_msg.dv[0].data_ptr = ev_data;
8104 ev_msg.dv[0].data_length = ev_datalen;
8105
8106 kev_post_msg(&ev_msg);
8107 }
8108
8109 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8110 socket_post_kev_msg(uint32_t ev_code,
8111 struct kev_socket_event_data *ev_data,
8112 uint32_t ev_datalen)
8113 {
8114 struct kev_msg ev_msg;
8115
8116 bzero(&ev_msg, sizeof(ev_msg));
8117 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8118 ev_msg.kev_class = KEV_NETWORK_CLASS;
8119 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8120 ev_msg.event_code = ev_code;
8121
8122 ev_msg.dv[0].data_ptr = ev_data;
8123 ev_msg.dv[0].data_length = ev_datalen;
8124
8125 kev_post_msg(&ev_msg);
8126 }
8127
/*
 * Post a KEV_SOCKET_CLOSED event for a socket that opted in via
 * SOF1_WANT_KEV_SOCK_CLOSED, carrying its local and peer addresses.
 * Best-effort: if either address cannot be obtained, no event is sent.
 */
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	/* only sockets that asked for the closed event post one */
	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* copy at most the event field's size; long sockaddrs are truncated */
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			/*
			 * NOTE(review): passes &ev.ev_data with sizeof(ev) —
			 * presumably ev_data is the sole member of
			 * kev_socket_closed so the sizes coincide; confirm.
			 */
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* socksa/peersa may be NULL here if the calls above failed */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8156
/*
 * Assertion-failure handler: panic with the failed expression text,
 * source file and line.  Declared to return int only so it can appear
 * in expression contexts; it never returns.
 */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
8165