1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #include <os/log.h>
127
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
#endif /* CONFIG_MACF */
131
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136
137 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
138
139 #if DEBUG || DEVELOPMENT
140 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
141 #else
142 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 #endif
144
145 /* TODO: this should be in a header file somewhere */
146 extern char *proc_name_address(void *p);
147
148 static u_int32_t so_cache_hw; /* High water mark for socache */
149 static u_int32_t so_cache_timeouts; /* number of timeouts */
150 static u_int32_t so_cache_max_freed; /* max freed per timeout */
151 static u_int32_t cached_sock_count = 0;
152 STAILQ_HEAD(, socket) so_cache_head;
153 int max_cached_sock_count = MAX_CACHED_SOCKETS;
154 static uint64_t so_cache_time;
155 static int socketinit_done;
156 static struct zone *so_cache_zone;
157
158 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
159 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending down packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger that 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
#if (DEBUG || DEVELOPMENT)
/* Debug knob: allows disabling the TCP_NOTSENT_LOWAT check entirely. */
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */
323
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329
330 extern struct inpcbinfo tcbinfo;
331
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
341 static void cached_sock_free(struct socket *);
342
343 /*
344 * Maximum of extended background idle sockets per process
345 * Set to zero to disable further setting of the option
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371
372 /*
373 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374 * setting the DSCP code on the packet based on the service class; see
375 * <rdar://problem/11277343> for details.
376 */
377 __private_extern__ u_int32_t sotcdb = 0;
378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379 &sotcdb, 0, "");
380
/*
 * One-time initialization of the socket layer: compile-time layout
 * checks, the cached-socket free list, and the socache zone.  Safe to
 * call more than once; subsequent calls are ignored.
 */
void
socketinit(void)
{
	/*
	 * so_gencnt is updated with 64-bit atomics; make sure it has the
	 * expected size and at least 32-bit alignment.
	 */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	/*
	 * The kernel's sa_endpoints must be field-for-field identical to
	 * the 64-bit user-space layout so it can be copied in verbatim.
	 */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	/* Same layout guarantees against the 32-bit user-space structure. */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	/* Guard against duplicate initialization. */
	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow socket_debug to be set from the boot arguments. */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	STAILQ_INIT(&so_cache_head);

	/*
	 * A socache element carries the socket plus its inpcb and tcpcb in
	 * one allocation; the two extra 4-byte pads leave room for the
	 * longword alignment performed in cached_sock_alloc().
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	/* Seed the extended-background-idle tunables with their defaults. */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
427
/*
 * Allocate a TCP/IPv4 socket, preferring the per-socket cache.  A cache
 * hit reuses a previously freed socket (and its attached pcb storage);
 * a miss carves a socket + inpcb + tcpcb out of one socache zone
 * element.  In both cases the socket is marked SOF1_CACHED_IN_SOCK_LAYER
 * so sodealloc() knows to return it via cached_sock_free().
 */
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		/* Cache hit: pop the head of the free list. */
		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		/*
		 * Zero the socket for reuse, but preserve the pointer to
		 * the co-allocated pcb storage that follows it in memory.
		 */
		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		/* The inpcb lives immediately after the socket... */
		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		/* ...and the tcpcb immediately after the inpcb. */
		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
477
/*
 * Return a cache-layer socket to the free list, or release it to the
 * zone when the cache is already at max_cached_sock_count.  Counterpart
 * of cached_sock_alloc().
 */
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	/*
	 * Optimistically claim a cache slot; back the increment out if it
	 * pushed the count past the limit and free to the zone instead.
	 */
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		/* Track the cache high-water mark. */
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		/* Timestamp so so_cache_timer() can age this entry out. */
		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}
499
/*
 * Record the calling (or given) process as the socket's most recent
 * owner: pid, unique pid, executable UUID and originator UUID.  The
 * socket must be locked.  A PROC_NULL 'self' means the current process.
 */
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		/* Only refresh when ownership actually changed. */
		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			/* Give the protocol a chance to react to the change. */
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}
525
526 void
so_update_policy(struct socket * so)527 so_update_policy(struct socket *so)
528 {
529 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
530 (void) inp_update_policy(sotoinpcb(so));
531 }
532 }
533
#if NECP
/*
 * Re-evaluate the NECP policy for an IPv4/IPv6 socket, optionally with
 * caller-supplied local/remote address overrides.  A no-op for every
 * other socket domain.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	int dom = SOCK_DOM(so);

	if (dom != PF_INET && dom != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
545
/*
 * Periodic reclamation of the cached-socket free list: free entries
 * older than SO_CACHE_TIME_LIMIT, at most SO_CACHE_MAX_FREE_BATCH per
 * invocation.  Returns TRUE when entries remain and the timer should
 * be rescheduled.
 */
boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		/*
		 * Entries are inserted at the tail, so the list is ordered
		 * by age; stop at the first one that is still fresh.
		 */
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		/* Cap the work done per tick to bound time under the lock. */
		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}
584
585 /*
586 * Get a socket structure from our zone, and initialize it.
587 * We don't implement `waitok' yet (see comments in uipc_domain.c).
588 * Note that it would probably be better to allocate socket
589 * and PCB at the same time, but I'm not convinced that all
590 * the protocols can be easily modified to do this.
591 */
592 struct socket *
soalloc(int waitok,int dom,int type)593 soalloc(int waitok, int dom, int type)
594 {
595 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
596 struct socket *so;
597
598 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
599 cached_sock_alloc(&so, how);
600 } else {
601 so = zalloc_flags(socket_zone, how | Z_ZERO);
602 }
603 if (so != NULL) {
604 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
605
606 /*
607 * Increment the socket allocation statistics
608 */
609 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
610 }
611
612 return so;
613 }
614
/*
 * Common socket creation path for socreate() and socreate_delegate().
 * Resolves the protocol switch entry, allocates and initializes the
 * socket, stamps ownership/delegation information, and invokes the
 * protocol's attach routine.  On success, *aso points at the new socket
 * holding one use count; on failure the socket is released and an
 * errno value is returned.
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;	/* "responsible" pid, when distinct from the owner */
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	/*
	 * Look up the protocol: by explicit protocol number when one was
	 * requested, otherwise by socket type alone.
	 */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/*
		 * Distinguish the failure: unknown domain, protocol known
		 * but wrong type, or plainly unsupported protocol.
		 */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain socket creation statistics. */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	/* MPTCP-created sockets start out non-blocking. */
	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	/* Record the creating process as the socket's last owner. */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	/* When creating on behalf of another process, record the delegate. */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	/* Record the responsible process' pid and UUID, when one exists. */
	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	/* Hold a credential reference; released in sodealloc(). */
	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	/* Keep the protocol's domain referenced while sockets use it. */
	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
836
837 /*
838 * Returns: 0 Success
839 * EAFNOSUPPORT
840 * EPROTOTYPE
841 * EPROTONOSUPPORT
842 * ENOBUFS
843 * <pru_attach>:ENOBUFS[AF_UNIX]
844 * <pru_attach>:ENOBUFS[TCP]
845 * <pru_attach>:ENOMEM[TCP]
846 * <pru_attach>:??? [other protocol families, IPSEC]
847 */
848 int
socreate(int dom,struct socket ** aso,int type,int proto)849 socreate(int dom, struct socket **aso, int type, int proto)
850 {
851 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
852 PROC_NULL);
853 }
854
855 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)856 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
857 {
858 int error = 0;
859 struct proc *ep = PROC_NULL;
860
861 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
862 error = ESRCH;
863 goto done;
864 }
865
866 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
867
868 /*
869 * It might not be wise to hold the proc reference when calling
870 * socreate_internal since it calls soalloc with M_WAITOK
871 */
872 done:
873 if (ep != PROC_NULL) {
874 proc_rele(ep);
875 }
876
877 return error;
878 }
879
880 /*
881 * Returns: 0 Success
882 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
883 * <pru_bind>:EAFNOSUPPORT Address family not supported
884 * <pru_bind>:EADDRNOTAVAIL Address not available.
885 * <pru_bind>:EINVAL Invalid argument
886 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
887 * <pru_bind>:EACCES Permission denied
888 * <pru_bind>:EADDRINUSE Address in use
889 * <pru_bind>:EAGAIN Resource unavailable, try again
890 * <pru_bind>:EPERM Operation not permitted
891 * <pru_bind>:???
892 * <sf_bind>:???
893 *
894 * Notes: It's not possible to fully enumerate the return codes above,
895 * since socket filter authors and protocol family authors may
896 * not choose to limit their error returns to those listed, even
897 * though this may result in some software operating incorrectly.
898 *
899 * The error codes which are enumerated above are those known to
900 * be returned by the tcp_usr_bind function supplied.
901 */
/*
 * Bind a socket to an address, optionally taking the socket lock
 * (dolock != 0).  Socket filters run first and may intercept the bind;
 * a filter returning EJUSTRETURN suppresses the protocol call and is
 * reported as success.  Defunct sockets are rejected with EINVAL.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	/* Re-evaluate NECP policy with the requested local address. */
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means "handled"; report success. */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
949
/*
 * Final teardown of a socket structure: release the credential
 * reference, detach any remaining socket filters, bump the global
 * generation count, then return the memory to its cache or zone.
 * Caller must hold the last reference to 'so'.
 */
void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* New generation count invalidates stale references to this slot */
	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		/* Came from the socket-layer cache; recycle it there */
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}
966
967 /*
968 * Returns: 0 Success
969 * EINVAL
970 * EOPNOTSUPP
971 * <pru_listen>:EINVAL[AF_UNIX]
972 * <pru_listen>:EINVAL[TCP]
973 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
974 * <pru_listen>:EINVAL[TCP] Invalid argument
975 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
976 * <pru_listen>:EACCES[TCP] Permission denied
977 * <pru_listen>:EADDRINUSE[TCP] Address in use
978 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
979 * <pru_listen>:EPERM[TCP] Operation not permitted
980 * <sf_listen>:???
981 *
982 * Notes: Other <pru_listen> returns depend on the protocol family; all
983 * <sf_listen> returns depend on what the filter author causes
984 * their filter to return.
985 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	/* Refresh last-owner and policy state while the lock is held */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * Optimistically mark the socket as accepting connections; note
	 * that every error path below clears SO_ACCEPTCONN again.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}
	/* Listening only makes sense for connection-oriented protocols */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Inbound connections may be administratively denied */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/* Socket filters get a say first, then the protocol's handler */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled": not an error */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		so->so_options &= ~SO_ACCEPTCONN;
		goto out;
	}

	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue — either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
1078
1079 /*
1080 * The "accept list lock" protects the fields related to the listener queues
1081 * because we can unlock a socket to respect the lock ordering between
1082 * the listener socket and its clients sockets. The lock ordering is first to
1083 * acquire the client socket before the listener socket.
1084 *
1085 * The accept list lock serializes access to the following fields:
1086 * - of the listener socket:
1087 * - so_comp
1088 * - so_incomp
1089 * - so_qlen
1090 * - so_inqlen
1091 * - of client sockets that are in so_comp or so_incomp:
1092 * - so_head
1093 * - so_list
1094 *
 * As one can see the accept list lock protects the consistency of the
 * linkage of the client sockets.
1097 *
1098 * Note that those fields may be read without holding the accept list lock
1099 * for a preflight provided the accept list lock is taken when committing
1100 * to take an action based on the result of the preflight. The preflight
1101 * saves the cost of doing the unlock/lock dance.
1102 */
/*
 * Acquire the "accept list lock" of listener 'head' (see the block
 * comment above).  Caller holds the listener's socket lock.  Protocols
 * without pr_getlock share a single domain mutex and need no extra
 * serialization, so this is a no-op for them.
 *
 * If another thread holds the flag, sleep on &head->so_incomp until it
 * is released.  Because msleep() drops the listener's mutex, the client
 * socket 'so' (may be NULL) is unlocked first and re-acquired afterwards
 * in the required client-before-listener lock order.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: flag is free; claim it without unlocking anything */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;	/* contention statistic */
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		/* Re-acquire both locks in client-then-listener order */
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1133
1134 void
so_release_accept_list(struct socket * head)1135 so_release_accept_list(struct socket *head)
1136 {
1137 if (head->so_proto->pr_getlock != NULL) {
1138 lck_mtx_t *mutex_held;
1139
1140 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1141 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1142
1143 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1144 wakeup((caddr_t)&head->so_incomp);
1145 }
1146 }
1147
/*
 * Tear down a socket once its last use reference has been dropped.
 * Detaches flow-divert, content-filter and datagram flow-tracking
 * state, unlinks the socket from its listener's queues if it is an
 * embryonic child, flushes the buffers, and frees the memory when
 * 'dealloc' is set.  Assumes the socket is locked.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

#if CONTENT_FILTER
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not ready to be freed yet: the pcb is still attached or a file
	 * descriptor still references the socket.  Just clear select and
	 * upcall state and wait for a later sofree().
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink the half-open child from the incomplete queue */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1236
/*
 * Wait for outstanding socket upcalls to drain before closing.
 * Caller must hold the socket's lock (asserted below).  Only waits
 * when there is an outstanding upcall AND the socket opted in via
 * SOF_UPCALLCLOSEWAIT; otherwise returns immediately.
 */
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Stop new upcalls from being armed while we drain the current one */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/* Sleep until the in-flight upcall drops so_upcallusecount */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1265
1266 /*
1267 * Close a socket on last file table reference removal.
1268 * Initiate disconnect if connected.
1269 * Free socket when disconnect complete.
1270 */
1271 int
soclose_locked(struct socket * so)1272 soclose_locked(struct socket *so)
1273 {
1274 int error = 0;
1275 struct timespec ts;
1276
1277 if (so->so_usecount == 0) {
1278 panic("soclose: so=%p refcount=0", so);
1279 /* NOTREACHED */
1280 }
1281
1282 sflt_notify(so, sock_evt_closing, NULL);
1283
1284 if (so->so_upcallusecount) {
1285 soclose_wait_locked(so);
1286 }
1287
1288 #if CONTENT_FILTER
1289 /*
1290 * We have to wait until the content filters are done
1291 */
1292 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1293 cfil_sock_close_wait(so);
1294 cfil_sock_is_closed(so);
1295 cfil_sock_detach(so);
1296 }
1297 #endif /* CONTENT_FILTER */
1298
1299 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1300 soflow_detach(so);
1301 }
1302
1303 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1304 soresume(current_proc(), so, 1);
1305 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1306 }
1307
1308 if ((so->so_options & SO_ACCEPTCONN)) {
1309 struct socket *sp, *sonext;
1310 int persocklock = 0;
1311 int incomp_overflow_only;
1312
1313 /*
1314 * We do not want new connection to be added
1315 * to the connection queues
1316 */
1317 so->so_options &= ~SO_ACCEPTCONN;
1318
1319 /*
1320 * We can drop the lock on the listener once
1321 * we've acquired the incoming list
1322 */
1323 if (so->so_proto->pr_getlock != NULL) {
1324 persocklock = 1;
1325 so_acquire_accept_list(so, NULL);
1326 socket_unlock(so, 0);
1327 }
1328 again:
1329 incomp_overflow_only = 1;
1330
1331 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1332 /*
1333 * Radar 5350314
1334 * skip sockets thrown away by tcpdropdropblreq
1335 * they will get cleanup by the garbage collection.
1336 * otherwise, remove the incomp socket from the queue
1337 * and let soabort trigger the appropriate cleanup.
1338 */
1339 if (sp->so_flags & SOF_OVERFLOW) {
1340 continue;
1341 }
1342
1343 if (persocklock != 0) {
1344 socket_lock(sp, 1);
1345 }
1346
1347 /*
1348 * Radar 27945981
1349 * The extra reference for the list insure the
1350 * validity of the socket pointer when we perform the
1351 * unlock of the head above
1352 */
1353 if (sp->so_state & SS_INCOMP) {
1354 sp->so_state &= ~SS_INCOMP;
1355 sp->so_head = NULL;
1356 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1357 so->so_incqlen--;
1358 so->so_qlen--;
1359
1360 (void) soabort(sp);
1361 } else {
1362 panic("%s sp %p in so_incomp but !SS_INCOMP",
1363 __func__, sp);
1364 }
1365
1366 if (persocklock != 0) {
1367 socket_unlock(sp, 1);
1368 }
1369 }
1370
1371 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1372 /* Dequeue from so_comp since sofree() won't do it */
1373 if (persocklock != 0) {
1374 socket_lock(sp, 1);
1375 }
1376
1377 if (sp->so_state & SS_COMP) {
1378 sp->so_state &= ~SS_COMP;
1379 sp->so_head = NULL;
1380 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1381 so->so_qlen--;
1382
1383 (void) soabort(sp);
1384 } else {
1385 panic("%s sp %p in so_comp but !SS_COMP",
1386 __func__, sp);
1387 }
1388
1389 if (persocklock) {
1390 socket_unlock(sp, 1);
1391 }
1392 }
1393
1394 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1395 #if (DEBUG | DEVELOPMENT)
1396 panic("%s head %p so_comp not empty", __func__, so);
1397 #endif /* (DEVELOPMENT || DEBUG) */
1398
1399 goto again;
1400 }
1401
1402 if (!TAILQ_EMPTY(&so->so_comp)) {
1403 #if (DEBUG | DEVELOPMENT)
1404 panic("%s head %p so_comp not empty", __func__, so);
1405 #endif /* (DEVELOPMENT || DEBUG) */
1406
1407 goto again;
1408 }
1409
1410 if (persocklock) {
1411 socket_lock(so, 0);
1412 so_release_accept_list(so);
1413 }
1414 }
1415 if (so->so_pcb == NULL) {
1416 /* 3915887: mark the socket as ready for dealloc */
1417 so->so_flags |= SOF_PCBCLEARING;
1418 goto discard;
1419 }
1420
1421 if (so->so_state & SS_ISCONNECTED) {
1422 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1423 error = sodisconnectlocked(so);
1424 if (error) {
1425 goto drop;
1426 }
1427 }
1428 if (so->so_options & SO_LINGER) {
1429 if ((so->so_state & SS_ISDISCONNECTING) &&
1430 (so->so_state & SS_NBIO)) {
1431 goto drop;
1432 }
1433 while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1434 lck_mtx_t *mutex_held;
1435
1436 if (so->so_proto->pr_getlock != NULL) {
1437 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1438 } else {
1439 mutex_held = so->so_proto->pr_domain->dom_mtx;
1440 }
1441 ts.tv_sec = (so->so_linger / 100);
1442 ts.tv_nsec = (so->so_linger % 100) *
1443 NSEC_PER_USEC * 1000 * 10;
1444 error = msleep((caddr_t)&so->so_timeo,
1445 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1446 if (error) {
1447 /*
1448 * It's OK when the time fires,
1449 * don't report an error
1450 */
1451 if (error == EWOULDBLOCK) {
1452 error = 0;
1453 }
1454 break;
1455 }
1456 }
1457 }
1458 }
1459 drop:
1460 if (so->so_usecount == 0) {
1461 panic("soclose: usecount is zero so=%p", so);
1462 /* NOTREACHED */
1463 }
1464 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1465 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1466 if (error == 0) {
1467 error = error2;
1468 }
1469 }
1470 if (so->so_usecount <= 0) {
1471 panic("soclose: usecount is zero so=%p", so);
1472 /* NOTREACHED */
1473 }
1474 discard:
1475 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1476 (so->so_state & SS_NOFDREF)) {
1477 panic("soclose: NOFDREF");
1478 /* NOTREACHED */
1479 }
1480 so->so_state |= SS_NOFDREF;
1481
1482 if ((so->so_flags & SOF_KNOTE) != 0) {
1483 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1484 }
1485
1486 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1487
1488 VERIFY(so->so_usecount > 0);
1489 so->so_usecount--;
1490 sofree(so);
1491 return error;
1492 }
1493
1494 int
soclose(struct socket * so)1495 soclose(struct socket *so)
1496 {
1497 int error = 0;
1498 socket_lock(so, 1);
1499
1500 if (so->so_retaincnt == 0) {
1501 error = soclose_locked(so);
1502 } else {
1503 /*
1504 * if the FD is going away, but socket is
1505 * retained in kernel remove its reference
1506 */
1507 so->so_usecount--;
1508 if (so->so_usecount < 2) {
1509 panic("soclose: retaincnt non null and so=%p "
1510 "usecount=%d\n", so, so->so_usecount);
1511 }
1512 }
1513 socket_unlock(so, 1);
1514 return error;
1515 }
1516
1517 /*
1518 * Must be called at splnet...
1519 */
1520 /* Should already be locked */
1521 int
soabort(struct socket * so)1522 soabort(struct socket *so)
1523 {
1524 int error;
1525
1526 #ifdef MORE_LOCKING_DEBUG
1527 lck_mtx_t *mutex_held;
1528
1529 if (so->so_proto->pr_getlock != NULL) {
1530 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1531 } else {
1532 mutex_held = so->so_proto->pr_domain->dom_mtx;
1533 }
1534 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1535 #endif
1536
1537 if ((so->so_flags & SOF_ABORTED) == 0) {
1538 so->so_flags |= SOF_ABORTED;
1539 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1540 if (error) {
1541 sofree(so);
1542 return error;
1543 }
1544 }
1545 return 0;
1546 }
1547
1548 int
soacceptlock(struct socket * so,struct sockaddr ** nam,int dolock)1549 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1550 {
1551 int error;
1552
1553 if (dolock) {
1554 socket_lock(so, 1);
1555 }
1556
1557 so_update_last_owner_locked(so, PROC_NULL);
1558 so_update_policy(so);
1559 #if NECP
1560 so_update_necp_policy(so, NULL, NULL);
1561 #endif /* NECP */
1562
1563 if ((so->so_state & SS_NOFDREF) == 0) {
1564 panic("soaccept: !NOFDREF");
1565 }
1566 so->so_state &= ~SS_NOFDREF;
1567 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1568
1569 if (dolock) {
1570 socket_unlock(so, 1);
1571 }
1572 return error;
1573 }
1574
/*
 * Locking convenience wrapper: accept with the socket lock taken and
 * released on our behalf.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	error = soacceptlock(so, nam, 1);
	return error;
}
1580
/*
 * Present a newly accepted socket 'so' to the socket filters attached
 * to listener 'head'.  On filter rejection the socket is closed and the
 * filter's error is propagated; EJUSTRETURN from a filter marks the
 * socket defunct but still hands it to the caller as a success.
 */
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Couldn't fetch addresses: drop the connection quietly */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	/* Give each attached filter a chance to veto the accept */
	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1636
1637 /*
1638 * Returns: 0 Success
1639 * EOPNOTSUPP Operation not supported on socket
1640 * EISCONN Socket is connected
1641 * <pru_connect>:EADDRNOTAVAIL Address not available.
1642 * <pru_connect>:EINVAL Invalid argument
1643 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1644 * <pru_connect>:EACCES Permission denied
1645 * <pru_connect>:EADDRINUSE Address in use
1646 * <pru_connect>:EAGAIN Resource unavailable, try again
1647 * <pru_connect>:EPERM Operation not permitted
1648 * <sf_connect_out>:??? [anything a filter writer might set]
1649 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh last-owner and policy state while the lock is held */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic may be administratively denied for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				/* Best effort: log, but don't fail the connect */
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: a filter took over the connect */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				/* Protocol refused; clear in-progress state */
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1745
/*
 * Locking convenience wrapper: connect with the socket lock taken and
 * released on our behalf.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int error;

	error = soconnectlock(so, nam, 1);
	return error;
}
1751
1752 /*
1753 * Returns: 0 Success
1754 * <pru_connect2>:EINVAL[AF_UNIX]
1755 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1756 * <pru_connect2>:??? [other protocol families]
1757 *
1758 * Notes: <pru_connect2> is not supported by [TCP].
1759 */
/*
 * Connect two sockets to each other (socketpair-style) via the
 * protocol's pru_connect2.  so1 is locked unconditionally; so2 is
 * locked only for protocols with per-socket locks (pr_lock set) —
 * otherwise both sockets share the domain mutex already held via so1.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1778
/*
 * connectx(2) backend: initiate a connection, possibly with preconnect
 * (TFO) data supplied in 'auio'.  Caller holds the socket lock.
 * Mirrors soconnectlock() but adds idempotent-data / resume-on-rdwr
 * flag handling and supports multi-connection protocols (PR_MULTICONN).
 */
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	/* Refresh last-owner and policy state while the lock is held */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic may be administratively denied for this socket */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set. Otherwise, if connected,
	 * try to disconnect first. This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				/* Best effort: log, but don't fail the connect */
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connectx() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/* Idempotent-data (TFO) only for protocols that support it */
		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			/* EJUSTRETURN: a filter took over the connect */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data pending */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1901
1902 int
sodisconnectlocked(struct socket * so)1903 sodisconnectlocked(struct socket *so)
1904 {
1905 int error;
1906
1907 if ((so->so_state & SS_ISCONNECTED) == 0) {
1908 error = ENOTCONN;
1909 goto bad;
1910 }
1911 if (so->so_state & SS_ISDISCONNECTING) {
1912 error = EALREADY;
1913 goto bad;
1914 }
1915
1916 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1917 if (error == 0) {
1918 sflt_notify(so, sock_evt_disconnected, NULL);
1919 }
1920
1921 bad:
1922 return error;
1923 }
1924
1925 /* Locking version */
/*
 * Locking wrapper around sodisconnectlocked().
 */
int
sodisconnect(struct socket *so)
{
	int err;

	socket_lock(so, 1);
	err = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return err;
}
1936
1937 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1938 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1939 {
1940 int error;
1941
1942 /*
1943 * Call the protocol disconnectx handler; let it handle all
1944 * matters related to the connection state of this session.
1945 */
1946 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1947 if (error == 0) {
1948 /*
1949 * The event applies only for the session, not for
1950 * the disconnection of individual subflows.
1951 */
1952 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1953 sflt_notify(so, sock_evt_disconnected, NULL);
1954 }
1955 }
1956 return error;
1957 }
1958
1959 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1960 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1961 {
1962 int error;
1963
1964 socket_lock(so, 1);
1965 error = sodisconnectxlocked(so, aid, cid);
1966 socket_unlock(so, 1);
1967 return error;
1968 }
1969
1970 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1971
1972 /*
1973 * sosendcheck will lock the socket buffer if it isn't locked and
1974 * verify that there is space for the data being inserted.
1975 *
1976 * Returns: 0 Success
1977 * EPIPE
1978 * sblock:EWOULDBLOCK
1979 * sblock:EINTR
1980 * sbwait:EBADF
1981 * sbwait:EINTR
1982 * [so_error]:???
1983 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;

restart:
	/* Acquire the send-buffer lock once; *sblocked records ownership */
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	if (so->so_error) {
		/* Report and consume any pending asynchronous error */
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/*
			 * Unconnected connection-oriented socket: only OK
			 * when confirming, sending control-only data, or
			 * using preconnect (TFO) data.
			 */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless send needs a destination address */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* OOB data is allowed to slightly exceed the buffer limit */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		/* Non-blocking (or recursing from a filter): don't sleep */
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		/* Space may have opened up; re-validate from the top */
		goto restart;
	}
	return 0;
}
2107
2108 /*
2109 * Send on a socket.
2110 * If send must go all at once and message is larger than
2111 * send buffering, then hard error.
2112 * Lock against other senders.
2113 * If must go all at once and not enough room now, then
2114 * inform user that this would block and do nothing.
2115 * Otherwise, if nonblocking, send as much as possible.
2116 * The data to be sent is described by "uio" if nonzero,
2117 * otherwise by the mbuf chain "top" (which must be null
2118 * if uio is not). Data provided in mbuf chain must be small
2119 * enough to send all at once.
2120 *
2121 * Returns nonzero on error, timeout or signal; callers
2122 * must check for short counts if EINTR/ERESTART are returned.
2123 * Data and control buffers are freed on return.
2124 *
2125 * Returns: 0 Success
2126 * EOPNOTSUPP
2127 * EINVAL
2128 * ENOBUFS
2129 * uiomove:EFAULT
2130 * sosendcheck:EPIPE
2131 * sosendcheck:EWOULDBLOCK
2132 * sosendcheck:EINTR
2133 * sosendcheck:EBADF
2134 * sosendcheck:EINTR
2135 * sosendcheck:??? [value from so_error]
2136 * <pru_send>:ECONNRESET[TCP]
2137 * <pru_send>:EINVAL[TCP]
2138 * <pru_send>:ENOBUFS[TCP]
2139 * <pru_send>:EADDRINUSE[TCP]
2140 * <pru_send>:EADDRNOTAVAIL[TCP]
2141 * <pru_send>:EAFNOSUPPORT[TCP]
2142 * <pru_send>:EACCES[TCP]
2143 * <pru_send>:EAGAIN[TCP]
2144 * <pru_send>:EPERM[TCP]
2145 * <pru_send>:EMSGSIZE[TCP]
2146 * <pru_send>:EHOSTUNREACH[TCP]
2147 * <pru_send>:ENETUNREACH[TCP]
2148 * <pru_send>:ENETDOWN[TCP]
2149 * <pru_send>:ENOMEM[TCP]
2150 * <pru_send>:ENOBUFS[TCP]
2151 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2152 * <pru_send>:EINVAL[AF_UNIX]
2153 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2154 * <pru_send>:EPIPE[AF_UNIX]
2155 * <pru_send>:ENOTCONN[AF_UNIX]
2156 * <pru_send>:EISCONN[AF_UNIX]
2157 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2158 * <sf_data_out>:??? [whatever a filter author chooses]
2159 *
2160 * Notes: Other <pru_send> returns depend on the protocol family; all
2161 * <sf_data_out> returns depend on what the filter author causes
2162 * their filter to return.
2163 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	/* Atomic: whole message must be handed to the protocol in one call. */
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/* Bytes to send: from the uio, or from the pre-packaged chain "top". */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
			/* orig_resid is only valid (and only read) when en_tracing. */
			orig_resid = resid;
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful on stream sockets. */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/*
	 * Outer loop: one pass per batch handed to the protocol, until all
	 * of resid has been consumed. Inner loop: fill an mbuf chain from
	 * the uio up to the available socket-buffer space.
	 */
	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				/*
				 * chainlength starts at sosendmaxchain when
				 * chaining is disabled so the inner loop's
				 * length test fails immediately.
				 */
				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				/*
				 * Drop the socket lock for the (possibly
				 * blocking) allocation + copyin; SB_LOCK is
				 * still held so the send path stays exclusive.
				 */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Detach the head of the freelist for this fill. */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf/cluster after leading space. */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data in; may fault (EFAULT). */
					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					/* EJUSTRETURN: a filter swallowed the packet. */
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Hand the chain to the protocol; it now owns top/control. */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			/*
			 * top/control were consumed (or freed) above; clear
			 * them so the cleanup path doesn't double-free.
			 */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);

out_locked:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2619
/*
 * Re-inject a previously filtered/queued mbuf chain directly to the
 * protocol's pru_send, bypassing sosend()'s buffering and filter passes.
 * Caller must hold the socket lock.  The protocol consumes top/control.
 */
int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
{
	struct mbuf *m0 = NULL, *control_end = NULL;

	socket_lock_assert_owned(so);

	/*
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header.
	 */
	VERIFY(top != NULL &&
	    (control == NULL || top->m_flags & M_PKTHDR));

	/*
	 * If control is not passed in, see if we can get it
	 * from top.
	 */
	if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
		// Locate start of control if present and start of data
		for (m0 = top; m0 != NULL; m0 = m0->m_next) {
			if (m0->m_flags & M_PKTHDR) {
				/* First packet-header mbuf marks the data. */
				top = m0;
				break;
			} else if (m0->m_type == MT_CONTROL) {
				if (control == NULL) {
					// Found start of control
					control = m0;
				}
				if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
					// Found end of control
					control_end = m0;
				}
			}
		}
		/* Detach control run from the data mbufs that follow it. */
		if (control_end != NULL) {
			control_end->m_next = NULL;
		}
	}

	int error = (*so->so_proto->pr_usrreqs->pru_send)
	    (so, sendflags, top, addr, control, current_proc());

	return error;
}
2665
2666 /*
2667 * Supported only connected sockets (no address) without ancillary data
2668 * (control mbuf) for atomic protocols
2669 */
int
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
{
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t len, resid;
	int error, dontroute;
	int atomic = sosendallatonce(so);
	int sblocked = 0;
	struct proc *p = current_proc();
	u_int uiofirst = 0;             /* first uio of the current batch */
	u_int uiolast = 0;              /* one past the last uio of the batch */
	struct mbuf *top = NULL;
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t bigcl;

	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	/* Batched send is restricted to connected, atomic datagram sockets. */
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (atomic == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
		error = EINVAL;
		goto out;
	}
	resid = uio_array_resid(uioarray, uiocnt);

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
	if (error) {
		goto release;
	}

	/*
	 * Use big 4 KB clusters when the outgoing interface does not prefer
	 * 2 KB clusters
	 */
	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/* One pass per batch of uios until everything is sent or an error. */
	do {
		int i;
		int num_needed = 0;
		int chainlength;
		size_t maxpktlen = 0;
		int bytes_to_alloc;

		if (sosendminchain > 0) {
			chainlength = 0;
		} else {
			chainlength = sosendmaxchain;
		}

		/* Allocation and copyin run without the socket lock; SB_LOCK held. */
		socket_unlock(so, 0);

		/*
		 * Find a set of uio that fit in a reasonable number
		 * of mbuf packets
		 */
		for (i = uiofirst; i < uiocnt; i++) {
			struct uio *auio = uioarray[i];

			len = uio_resid(auio);

			/* Do nothing for empty messages */
			if (len == 0) {
				continue;
			}

			num_needed += 1;
			uiolast += 1;

			if (len > maxpktlen) {
				maxpktlen = len;
			}

			chainlength += len;
			if (chainlength > sosendmaxchain) {
				break;
			}
		}
		/*
		 * Nothing left to send
		 */
		if (num_needed == 0) {
			socket_lock(so, 0);
			break;
		}
		/*
		 * Allocate buffer large enough to include headroom space for
		 * network and link header
		 *
		 */
		bytes_to_alloc = (int) maxpktlen + headroom;

		/*
		 * Allocate a single contiguous buffer of the smallest available
		 * size when possible
		 */
		if (bytes_to_alloc > MCLBYTES &&
		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MBIGCLBYTES);
		} else if (bytes_to_alloc > _MHLEN &&
		    bytes_to_alloc <= MCLBYTES) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MCLBYTES);
		} else {
			freelist = m_allocpacket_internal(
				(unsigned int *)&num_needed,
				bytes_to_alloc, NULL, M_WAIT, 1, 0);
		}

		if (freelist == NULL) {
			socket_lock(so, 0);
			error = ENOMEM;
			goto release;
		}
		/*
		 * Copy each uio of the set into its own mbuf packet
		 */
		for (i = uiofirst, m = freelist;
		    i < uiolast && m != NULL;
		    i++) {
			int bytes_to_copy;
			struct mbuf *n;
			struct uio *auio = uioarray[i];

			bytes_to_copy = (int)uio_resid(auio);

			/* Do nothing for empty messages */
			if (bytes_to_copy == 0) {
				continue;
			}
			/*
			 * Leave headroom for protocol headers
			 * in the first mbuf of the chain
			 */
			m->m_data += headroom;

			for (n = m; n != NULL; n = n->m_next) {
				/*
				 * NOTE(review): mlen is computed from the packet
				 * head 'm', not the current mbuf 'n' being filled.
				 * This is only correct while every mbuf of the
				 * packet has the same usable size — confirm
				 * against m_allocpacket_internal()'s chains.
				 */
				if ((m->m_flags & M_EXT)) {
					mlen = m->m_ext.ext_size -
					    M_LEADINGSPACE(m);
				} else if ((m->m_flags & M_PKTHDR)) {
					mlen =
					    MHLEN - M_LEADINGSPACE(m);
				} else {
					mlen = MLEN - M_LEADINGSPACE(m);
				}
				len = imin((int)mlen, bytes_to_copy);

				/*
				 * Note: uiomove() decrements the iovec
				 * length
				 */
				error = uiomove(mtod(n, caddr_t),
				    (int)len, auio);
				if (error != 0) {
					break;
				}
				n->m_len = (int32_t)len;
				m->m_pkthdr.len += len;

				VERIFY(m->m_pkthdr.len <= maxpktlen);

				bytes_to_copy -= len;
				resid -= len;
			}
			if (m->m_pkthdr.len == 0) {
				printf(
					"%s:%d so %llx pkt %llx type %u len null\n",
					__func__, __LINE__,
					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
					m->m_type);
			}
			if (error != 0) {
				break;
			}
			m = m->m_nextpkt;
		}

		socket_lock(so, 0);

		if (error) {
			goto release;
		}
		/* Ownership moves: the filled packets are now the send chain. */
		top = freelist;
		freelist = NULL;

		if (dontroute) {
			so->so_options |= SO_DONTROUTE;
		}

		if ((flags & MSG_SKIPCFIL) == 0) {
			struct mbuf **prevnextp = NULL;

			for (i = uiofirst, m = top;
			    i < uiolast && m != NULL;
			    i++) {
				struct mbuf *nextpkt = m->m_nextpkt;

				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, NULL, &m,
				    NULL, 0);
				if (error != 0 && error != EJUSTRETURN) {
					goto release;
				}

#if CONTENT_FILTER
				if (error == 0) {
					/*
					 * Content filter processing
					 */
					error = cfil_sock_data_out(so, NULL, m,
					    NULL, 0, dgram_flow_entry);
					if (error != 0 && error != EJUSTRETURN) {
						goto release;
					}
				}
#endif /* CONTENT_FILTER */
				/*
				 * Remove packet from the list when
				 * swallowed by a filter
				 */
				if (error == EJUSTRETURN) {
					error = 0;
					if (prevnextp != NULL) {
						*prevnextp = nextpkt;
					} else {
						top = nextpkt;
					}
				}

				m = nextpkt;
				if (m != NULL) {
					prevnextp = &m->m_nextpkt;
				}
			}
		}
		if (top != NULL) {
			/* Protocol consumes the whole packet list. */
			error = (*so->so_proto->pr_usrreqs->pru_send_list)
			    (so, 0, top, NULL, NULL, p);
		}

		if (dontroute) {
			so->so_options &= ~SO_DONTROUTE;
		}

		top = NULL;
		uiofirst = uiolast;
	} while (resid > 0 && error == 0);
release:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
out:
	if (top != NULL) {
		m_freem(top);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, 0, error);

	return error;
}
2997
2998 /*
2999 * May return ERESTART when packet is dropped by MAC policy check
3000 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	/* The first mbuf of the record must carry the peer's address. */
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list. Upon MAC policy failure, the record
		 * will be freed. Otherwise, we'll add it back to
		 * the head of the list. We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		/* MAC check runs unlocked; record is privately held in m0. */
		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available). We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE);    /* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	/* Hand the sender's address to the caller if requested. */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf in place, just step past it. */
		m = m->m_next;
	} else {
		/* Consuming: unlink and free the address mbuf. */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3119
3120 /*
3121 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3122 * so clear the data portion in order not to leak the file pointers
3123 */
3124 static void
sopeek_scm_rights(struct mbuf * rights)3125 sopeek_scm_rights(struct mbuf *rights)
3126 {
3127 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3128
3129 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3130 VERIFY(cm->cmsg_len <= rights->m_len);
3131 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3132 }
3133 }
3134
3135 /*
3136 * Process one or more MT_CONTROL mbufs present before any data mbufs
3137 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3138 * just copy the data; if !MSG_PEEK, we call into the protocol to
3139 * perform externalization.
3140 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;        /* tail pointer of the unlinked control chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;    /* first control mbuf copied for this record (PEEK) */
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below. Once we re-acquire the
	 * lock, the mbuf chain might change. In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/*
				 * Peeked SCM_RIGHTS are not externalized;
				 * scrub the raw file pointers from the copy.
				 */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Consuming: unlink this control mbuf onto the cm chain. */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Repair the receive buffer's record linkage after the unlink. */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	/* Deliver each unlinked control mbuf, externalizing SCM_RIGHTS. */
	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success. Otherwise, all other control messages are
		 * returned unmodified to the caller. Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171. This
			 * would also allow more records to be appended
			 * to the socket buffer. We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* Caller doesn't want it, or externalize failed: drop it. */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3275
3276 /*
3277 * If we have less data than requested, block awaiting more
3278 * (subject to any timeout) if:
3279 * 1. the current count is less than the low water mark, or
3280 * 2. MSG_WAITALL is set, and it is possible to do the entire
3281 * receive operation at once if we block (resid <= hiwat).
3282 * 3. MSG_DONTWAIT is not set
3283 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3284 * we have to do the receive in sections, and thus risk returning
3285 * a short count if a timeout or signal occurs after we start.
3286 */
3287 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3288 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3289 {
3290 struct protosw *pr = so->so_proto;
3291
3292 /* No mbufs in the receive-queue? Wait! */
3293 if (m == NULL) {
3294 return true;
3295 }
3296
3297 /* Not enough data in the receive socket-buffer - we may have to wait */
3298 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3299 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3300 /*
3301 * Application did set the lowater-mark, so we should wait for
3302 * this data to be present.
3303 */
3304 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3305 return true;
3306 }
3307
3308 /*
3309 * Application wants all the data - so let's try to do the
3310 * receive-operation at once by waiting for everything to
3311 * be there.
3312 */
3313 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3314 return true;
3315 }
3316 }
3317
3318 return false;
3319 }
3320
3321 /*
3322 * Implement receive operations on a socket.
3323 * We depend on the way that records are added to the sockbuf
3324 * by sbappend*. In particular, each record (mbufs linked through m_next)
3325 * must begin with an address if the protocol so specifies,
3326 * followed by an optional mbuf or mbufs containing ancillary data,
3327 * and then zero or more mbufs of data.
3328 * In order to avoid blocking network interrupts for the entire time here,
3329 * we splx() while doing the actual copy to user space.
3330 * Although the sockbuf is locked, new data may still be appended,
3331 * and thus we must maintain consistency of the sockbuf during that time.
3332 *
3333 * The caller may receive the data as a single mbuf chain by supplying
3334 * an mbuf **mp0 for use in returning the chain. The uio is then used
3335 * only for the count in uio_resid.
3336 *
3337 * Returns: 0 Success
3338 * ENOBUFS
3339 * ENOTCONN
3340 * EWOULDBLOCK
3341 * uiomove:EFAULT
3342 * sblock:EWOULDBLOCK
3343 * sblock:EINTR
3344 * sbwait:EBADF
3345 * sbwait:EINTR
3346 * sodelayed_copy:EFAULT
3347 * <pru_rcvoob>:EINVAL[TCP]
3348 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3349 * <pru_rcvoob>:???
3350 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3351 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3352 * <pr_domain->dom_externalize>:???
3353 *
3354 * Notes: Additional return values from calls through <pru_rcvoob> and
3355 * <pr_domain->dom_externalize> depend on protocols other than
3356 * TCP or AF_UNIX, which are documented above.
3357 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp, *ml = NULL;
	struct mbuf *nextrecord, *free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX) {
		return EINVAL;
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	/* NULL out the caller's result pointers up front. */
	if (psa != NULL) {
		*psa = NULL;
	}
	if (controlp != NULL) {
		*controlp = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		socket_unlock(so, 1);
		return error;
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this. *If* the app calls a read
		 * we do not want to block this read indefinitely. Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return error;
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return ENOBUFS;
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error) {
			goto bad;
		}
		/* Drop the lock around the (possibly faulting) copyout. */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL) {
			m_freem(m);
		}

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return error;
	}
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		/* OOB data or a record boundary means we can't wait here. */
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns. Therefore, we only sleep in
		 * sbwait() below if and only if the wait-condition is still
		 * true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		/* Re-check: pru_rcvd above may have let new data arrive. */
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	/* Data is available; account the receive and peel off any prefix. */
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above. In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/*
	 * Delayed copies (batching uiomove outside the lock) only when we
	 * are consuming data (no MSG_PEEK) and the request is big enough
	 * to be worth the lock churn.
	 */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	/*
	 * Main copy loop: walk the first record's mbuf chain, copying to
	 * the uio (or handing mbufs back via mp) until the request is
	 * satisfied, the record ends, or an error occurs.
	 */
	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
		    m->m_type != MT_HEADER) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		/* Never copy past the OOB mark in a single pass. */
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy. perhaps
					 * a RST on the stream. in any event,
					 * the stream has been interrupted. it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* Entire mbuf consumed: advance (peek) or unlink. */
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed: trim it in place. */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, (int)len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error. Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns. Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	/* Atomic protocol with leftover data: truncate or flag MSG_RCVMORE. */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP(). Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	/* Flush any remaining batched data to the caller. */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	/* Nothing was transferred and nothing ended the record: retry. */
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	/* Error paths may still have batched data/mbufs to dispose of. */
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE); /* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
4112
4113 /*
4114 * Returns: 0 Success
4115 * uiomove:EFAULT
4116 */
4117 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4118 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4119 user_ssize_t *resid)
4120 {
4121 int error = 0;
4122 struct mbuf *m;
4123
4124 m = *free_list;
4125
4126 socket_unlock(so, 0);
4127
4128 while (m != NULL && error == 0) {
4129 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4130 m = m->m_next;
4131 }
4132 m_freem_list(*free_list);
4133
4134 *free_list = NULL;
4135 *resid = 0;
4136
4137 socket_lock(so, 0);
4138
4139 return error;
4140 }
4141
4142 static int
sodelayed_copy_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,struct mbuf ** free_list,user_ssize_t * resid)4143 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4144 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4145 {
4146 #pragma unused(so)
4147 int error = 0;
4148 struct mbuf *ml, *m;
4149 int i = 0;
4150 struct uio *auio;
4151
4152 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4153 ml = ml->m_nextpkt, i++) {
4154 auio = msgarray[i].uio;
4155 for (m = ml; m != NULL; m = m->m_next) {
4156 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4157 if (error != 0) {
4158 goto out;
4159 }
4160 }
4161 }
4162 out:
4163 m_freem_list(*free_list);
4164
4165 *free_list = NULL;
4166 *resid = 0;
4167
4168 return error;
4169 }
4170
4171 int
soreceive_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,int * flagsp)4172 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4173 int *flagsp)
4174 {
4175 struct mbuf *m;
4176 struct mbuf *nextrecord;
4177 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4178 int error;
4179 user_ssize_t len, pktlen, delayed_copy_len = 0;
4180 struct protosw *pr = so->so_proto;
4181 user_ssize_t resid;
4182 struct proc *p = current_proc();
4183 struct uio *auio = NULL;
4184 int npkts = 0;
4185 int sblocked = 0;
4186 struct sockaddr **psa = NULL;
4187 struct mbuf **controlp = NULL;
4188 int can_delay;
4189 int flags;
4190 struct mbuf *free_others = NULL;
4191
4192 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4193 so, uiocnt,
4194 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4195
4196 /*
4197 * Sanity checks:
4198 * - Only supports don't wait flags
4199 * - Only support datagram sockets (could be extended to raw)
4200 * - Must be atomic
4201 * - Protocol must support packet chains
4202 * - The uio array is NULL (should we panic?)
4203 */
4204 if (flagsp != NULL) {
4205 flags = *flagsp;
4206 } else {
4207 flags = 0;
4208 }
4209 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4210 MSG_NBIO)) {
4211 printf("%s invalid flags 0x%x\n", __func__, flags);
4212 error = EINVAL;
4213 goto out;
4214 }
4215 if (so->so_type != SOCK_DGRAM) {
4216 error = EINVAL;
4217 goto out;
4218 }
4219 if (sosendallatonce(so) == 0) {
4220 error = EINVAL;
4221 goto out;
4222 }
4223 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4224 error = EPROTONOSUPPORT;
4225 goto out;
4226 }
4227 if (msgarray == NULL) {
4228 printf("%s uioarray is NULL\n", __func__);
4229 error = EINVAL;
4230 goto out;
4231 }
4232 if (uiocnt == 0) {
4233 printf("%s uiocnt is 0\n", __func__);
4234 error = EINVAL;
4235 goto out;
4236 }
4237 /*
4238 * Sanity check on the length passed by caller as we are making 'int'
4239 * comparisons
4240 */
4241 resid = recv_msg_array_resid(msgarray, uiocnt);
4242 if (resid < 0 || resid > INT_MAX) {
4243 error = EINVAL;
4244 goto out;
4245 }
4246
4247 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4248 can_delay = 1;
4249 } else {
4250 can_delay = 0;
4251 }
4252
4253 socket_lock(so, 1);
4254 so_update_last_owner_locked(so, p);
4255 so_update_policy(so);
4256
4257 #if NECP
4258 so_update_necp_policy(so, NULL, NULL);
4259 #endif /* NECP */
4260
4261 /*
4262 * If a recv attempt is made on a previously-accepted socket
4263 * that has been marked as inactive (disconnected), reject
4264 * the request.
4265 */
4266 if (so->so_flags & SOF_DEFUNCT) {
4267 struct sockbuf *sb = &so->so_rcv;
4268
4269 error = ENOTCONN;
4270 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
4271 __func__, proc_pid(p), proc_best_name(p),
4272 so->so_gencnt,
4273 SOCK_DOM(so), SOCK_TYPE(so), error);
4274 /*
4275 * This socket should have been disconnected and flushed
4276 * prior to being returned from sodefunct(); there should
4277 * be no data on its receive list, so panic otherwise.
4278 */
4279 if (so->so_state & SS_DEFUNCT) {
4280 sb_empty_assert(sb, __func__);
4281 }
4282 goto release;
4283 }
4284
4285 next:
4286 /*
4287 * The uio may be empty
4288 */
4289 if (npkts >= uiocnt) {
4290 error = 0;
4291 goto release;
4292 }
4293 restart:
4294 /*
4295 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4296 * and if so just return to the caller. This could happen when
4297 * soreceive() is called by a socket upcall function during the
4298 * time the socket is freed. The socket buffer would have been
4299 * locked across the upcall, therefore we cannot put this thread
4300 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4301 * we may livelock), because the lock on the socket buffer will
4302 * only be released when the upcall routine returns to its caller.
4303 * Because the socket has been officially closed, there can be
4304 * no further read on it.
4305 */
4306 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4307 (SS_NOFDREF | SS_CANTRCVMORE)) {
4308 error = 0;
4309 goto release;
4310 }
4311
4312 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4313 if (error) {
4314 goto release;
4315 }
4316 sblocked = 1;
4317
4318 m = so->so_rcv.sb_mb;
4319 /*
4320 * Block awaiting more datagram if needed
4321 */
4322 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4323 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4324 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4325 /*
4326 * Panic if we notice inconsistencies in the socket's
4327 * receive list; both sb_mb and sb_cc should correctly
4328 * reflect the contents of the list, otherwise we may
4329 * end up with false positives during select() or poll()
4330 * which could put the application in a bad state.
4331 */
4332 SB_MB_CHECK(&so->so_rcv);
4333
4334 if (so->so_error) {
4335 error = so->so_error;
4336 if ((flags & MSG_PEEK) == 0) {
4337 so->so_error = 0;
4338 }
4339 goto release;
4340 }
4341 if (so->so_state & SS_CANTRCVMORE) {
4342 goto release;
4343 }
4344 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4345 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4346 error = ENOTCONN;
4347 goto release;
4348 }
4349 if ((so->so_state & SS_NBIO) ||
4350 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4351 error = EWOULDBLOCK;
4352 goto release;
4353 }
4354 /*
4355 * Do not block if we got some data
4356 */
4357 if (free_list != NULL) {
4358 error = 0;
4359 goto release;
4360 }
4361
4362 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4363 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4364
4365 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4366 sblocked = 0;
4367
4368 error = sbwait(&so->so_rcv);
4369 if (error) {
4370 goto release;
4371 }
4372 goto restart;
4373 }
4374
4375 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4376 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4377 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4378
4379 /*
4380 * Consume the current uio index as we have a datagram
4381 */
4382 auio = msgarray[npkts].uio;
4383 resid = uio_resid(auio);
4384 msgarray[npkts].which |= SOCK_MSG_DATA;
4385 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4386 &msgarray[npkts].psa : NULL;
4387 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4388 &msgarray[npkts].controlp : NULL;
4389 npkts += 1;
4390 nextrecord = m->m_nextpkt;
4391
4392 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4393 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4394 if (error == ERESTART) {
4395 goto restart;
4396 } else if (error != 0) {
4397 goto release;
4398 }
4399 }
4400
4401 if (m != NULL && m->m_type == MT_CONTROL) {
4402 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4403 if (error != 0) {
4404 goto release;
4405 }
4406 }
4407
4408 if (m->m_pkthdr.len == 0) {
4409 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4410 __func__, __LINE__,
4411 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4412 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4413 m->m_type);
4414 }
4415
4416 /*
4417 * Loop to copy the mbufs of the current record
4418 * Support zero length packets
4419 */
4420 ml = NULL;
4421 pktlen = 0;
4422 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4423 if (m->m_len == 0) {
4424 panic("%p m_len zero", m);
4425 }
4426 if (m->m_type == 0) {
4427 panic("%p m_type zero", m);
4428 }
4429 /*
4430 * Clip to the residual length
4431 */
4432 if (len > m->m_len) {
4433 len = m->m_len;
4434 }
4435 pktlen += len;
4436 /*
4437 * Copy the mbufs via the uio or delay the copy
4438 * Sockbuf must be consistent here (points to current mbuf,
4439 * it points to next record) when we drop priority;
4440 * we must note any additions to the sockbuf when we
4441 * block interrupts again.
4442 */
4443 if (len > 0 && can_delay == 0) {
4444 socket_unlock(so, 0);
4445 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4446 socket_lock(so, 0);
4447 if (error) {
4448 goto release;
4449 }
4450 } else {
4451 delayed_copy_len += len;
4452 }
4453
4454 if (len == m->m_len) {
4455 /*
4456 * m was entirely copied
4457 */
4458 sbfree(&so->so_rcv, m);
4459 nextrecord = m->m_nextpkt;
4460 m->m_nextpkt = NULL;
4461
4462 /*
4463 * Set the first packet to the head of the free list
4464 */
4465 if (free_list == NULL) {
4466 free_list = m;
4467 }
4468 /*
4469 * Link current packet to tail of free list
4470 */
4471 if (ml == NULL) {
4472 if (free_tail != NULL) {
4473 free_tail->m_nextpkt = m;
4474 }
4475 free_tail = m;
4476 }
4477 /*
4478 * Link current mbuf to last mbuf of current packet
4479 */
4480 if (ml != NULL) {
4481 ml->m_next = m;
4482 }
4483 ml = m;
4484
4485 /*
4486 * Move next buf to head of socket buffer
4487 */
4488 so->so_rcv.sb_mb = m = ml->m_next;
4489 ml->m_next = NULL;
4490
4491 if (m != NULL) {
4492 m->m_nextpkt = nextrecord;
4493 if (nextrecord == NULL) {
4494 so->so_rcv.sb_lastrecord = m;
4495 }
4496 } else {
4497 so->so_rcv.sb_mb = nextrecord;
4498 SB_EMPTY_FIXUP(&so->so_rcv);
4499 }
4500 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4501 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4502 } else {
4503 /*
4504 * Stop the loop on partial copy
4505 */
4506 break;
4507 }
4508 }
4509 #ifdef MORE_LOCKING_DEBUG
4510 if (so->so_usecount <= 1) {
4511 panic("%s: after big while so=%llx ref=%d on socket",
4512 __func__,
4513 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4514 /* NOTREACHED */
4515 }
4516 #endif
4517 /*
4518 * Tell the caller we made a partial copy
4519 */
4520 if (m != NULL) {
4521 if (so->so_options & SO_DONTTRUNC) {
4522 /*
4523 * Copyout first the freelist then the partial mbuf
4524 */
4525 socket_unlock(so, 0);
4526 if (delayed_copy_len) {
4527 error = sodelayed_copy_list(so, msgarray,
4528 uiocnt, &free_list, &delayed_copy_len);
4529 }
4530
4531 if (error == 0) {
4532 error = uiomove(mtod(m, caddr_t), (int)len,
4533 auio);
4534 }
4535 socket_lock(so, 0);
4536 if (error) {
4537 goto release;
4538 }
4539
4540 m->m_data += len;
4541 m->m_len -= len;
4542 so->so_rcv.sb_cc -= len;
4543 flags |= MSG_RCVMORE;
4544 } else {
4545 (void) sbdroprecord(&so->so_rcv);
4546 nextrecord = so->so_rcv.sb_mb;
4547 m = NULL;
4548 flags |= MSG_TRUNC;
4549 }
4550 }
4551
4552 if (m == NULL) {
4553 so->so_rcv.sb_mb = nextrecord;
4554 /*
4555 * First part is an inline SB_EMPTY_FIXUP(). Second
4556 * part makes sure sb_lastrecord is up-to-date if
4557 * there is still data in the socket buffer.
4558 */
4559 if (so->so_rcv.sb_mb == NULL) {
4560 so->so_rcv.sb_mbtail = NULL;
4561 so->so_rcv.sb_lastrecord = NULL;
4562 } else if (nextrecord->m_nextpkt == NULL) {
4563 so->so_rcv.sb_lastrecord = nextrecord;
4564 }
4565 SB_MB_CHECK(&so->so_rcv);
4566 }
4567 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4568 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4569
4570 /*
4571 * We can continue to the next packet as long as:
4572 * - We haven't exhausted the uio array
4573 * - There was no error
4574 * - A packet was not truncated
4575 * - We can still receive more data
4576 */
4577 if (npkts < uiocnt && error == 0 &&
4578 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4579 (so->so_state & SS_CANTRCVMORE) == 0) {
4580 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4581 sblocked = 0;
4582
4583 goto next;
4584 }
4585 if (flagsp != NULL) {
4586 *flagsp |= flags;
4587 }
4588
4589 release:
4590 /*
4591 * pru_rcvd may cause more data to be received if the socket lock
4592 * is dropped so we set MSG_HAVEMORE now based on what we know.
4593 * That way the caller won't be surprised if it receives less data
4594 * than requested.
4595 */
4596 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4597 flags |= MSG_HAVEMORE;
4598 }
4599
4600 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4601 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4602 }
4603
4604 if (sblocked) {
4605 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4606 } else {
4607 socket_unlock(so, 1);
4608 }
4609
4610 if (delayed_copy_len) {
4611 error = sodelayed_copy_list(so, msgarray, uiocnt,
4612 &free_list, &delayed_copy_len);
4613 }
4614 out:
4615 /*
4616 * Amortize the cost of freeing the mbufs
4617 */
4618 if (free_list != NULL) {
4619 m_freem_list(free_list);
4620 }
4621 if (free_others != NULL) {
4622 m_freem_list(free_others);
4623 }
4624
4625 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4626 0, 0, 0, 0);
4627 return error;
4628 }
4629
4630 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4631 so_statistics_event_to_nstat_event(int64_t *input_options,
4632 uint64_t *nstat_event)
4633 {
4634 int error = 0;
4635 switch (*input_options) {
4636 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4637 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4638 break;
4639 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4640 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4641 break;
4642 #if (DEBUG || DEVELOPMENT)
4643 case SO_STATISTICS_EVENT_RESERVED_1:
4644 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4645 break;
4646 case SO_STATISTICS_EVENT_RESERVED_2:
4647 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4648 break;
4649 #endif /* (DEBUG || DEVELOPMENT) */
4650 default:
4651 error = EINVAL;
4652 break;
4653 }
4654 return error;
4655 }
4656
4657 /*
4658 * Returns: 0 Success
4659 * EINVAL
4660 * ENOTCONN
4661 * <pru_shutdown>:EINVAL
4662 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4663 * <pru_shutdown>:ENOBUFS[TCP]
4664 * <pru_shutdown>:EMSGSIZE[TCP]
4665 * <pru_shutdown>:EHOSTUNREACH[TCP]
4666 * <pru_shutdown>:ENETUNREACH[TCP]
4667 * <pru_shutdown>:ENETDOWN[TCP]
4668 * <pru_shutdown>:ENOMEM[TCP]
4669 * <pru_shutdown>:EACCES[TCP]
4670 * <pru_shutdown>:EMSGSIZE[TCP]
4671 * <pru_shutdown>:ENOBUFS[TCP]
4672 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4673 * <pru_shutdown>:??? [other protocol families]
4674 */
4675 int
soshutdown(struct socket * so,int how)4676 soshutdown(struct socket *so, int how)
4677 {
4678 int error;
4679
4680 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4681
4682 switch (how) {
4683 case SHUT_RD:
4684 case SHUT_WR:
4685 case SHUT_RDWR:
4686 socket_lock(so, 1);
4687 if ((so->so_state &
4688 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4689 error = ENOTCONN;
4690 } else {
4691 error = soshutdownlock(so, how);
4692 }
4693 socket_unlock(so, 1);
4694 break;
4695 default:
4696 error = EINVAL;
4697 break;
4698 }
4699
4700 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4701
4702 return error;
4703 }
4704
4705 int
soshutdownlock_final(struct socket * so,int how)4706 soshutdownlock_final(struct socket *so, int how)
4707 {
4708 struct protosw *pr = so->so_proto;
4709 int error = 0;
4710
4711 sflt_notify(so, sock_evt_shutdown, &how);
4712
4713 if (how != SHUT_WR) {
4714 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4715 /* read already shut down */
4716 error = ENOTCONN;
4717 goto done;
4718 }
4719 sorflush(so);
4720 }
4721 if (how != SHUT_RD) {
4722 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4723 /* write already shut down */
4724 error = ENOTCONN;
4725 goto done;
4726 }
4727 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4728 }
4729 done:
4730 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4731 return error;
4732 }
4733
/*
 * Shutdown a locked socket, first offering the request to any content
 * filter attached to it.
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data; cfil_sock_shutdown() may
	 * also adjust `how' in place.
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* Filter took ownership of the shutdown. */
			return 0;
		}
		if (error != 0) {
			return error;
		}
	}
#endif /* CONTENT_FILTER */

	return soshutdownlock_final(so, how);
}
4760
4761 void
sowflush(struct socket * so)4762 sowflush(struct socket *so)
4763 {
4764 struct sockbuf *sb = &so->so_snd;
4765
4766 /*
4767 * Obtain lock on the socket buffer (SB_LOCK). This is required
4768 * to prevent the socket buffer from being unexpectedly altered
4769 * while it is used by another thread in socket send/receive.
4770 *
4771 * sblock() must not fail here, hence the assertion.
4772 */
4773 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4774 VERIFY(sb->sb_flags & SB_LOCK);
4775
4776 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4777 sb->sb_flags |= SB_DROP;
4778 sb->sb_upcall = NULL;
4779 sb->sb_upcallarg = NULL;
4780
4781 sbunlock(sb, TRUE); /* keep socket locked */
4782
4783 selthreadclear(&sb->sb_sel);
4784 sbrelease(sb);
4785 }
4786
/*
 * Flush the receive side of a socket: notify filters, mark the socket
 * unable to receive, then move the receive buffer's contents into a
 * local sockbuf copy ("asb") so they can be disposed of and released
 * after the buffer lock is dropped.  The copy-then-zero ordering below
 * is deliberate and must not be reordered.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/* Let attached socket filters observe the read-side flush. */
	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);     /* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Let the protocol dispose of any rights (e.g. passed fds) first. */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}
4887
4888 /*
4889 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4890 * an additional variant to handle the case where the option value needs
4891 * to be some kind of integer, but not a specific size.
4892 * In addition to their use here, these functions are also called by the
4893 * protocol-level pr_ctloutput() routines.
4894 *
4895 * Returns: 0 Success
4896 * EINVAL
4897 * copyin:EFAULT
4898 */
4899 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4900 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4901 {
4902 size_t valsize;
4903
4904 /*
4905 * If the user gives us more than we wanted, we ignore it,
4906 * but if we don't get the minimum length the caller
4907 * wants, we return EINVAL. On success, sopt->sopt_valsize
4908 * is set to however much we actually retrieved.
4909 */
4910 if ((valsize = sopt->sopt_valsize) < minlen) {
4911 return EINVAL;
4912 }
4913 if (valsize > len) {
4914 sopt->sopt_valsize = valsize = len;
4915 }
4916
4917 if (sopt->sopt_p != kernproc) {
4918 return copyin(sopt->sopt_val, buf, valsize);
4919 }
4920
4921 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4922 return 0;
4923 }
4924
4925 /*
4926 * sooptcopyin_timeval
4927 * Copy in a timeval value into tv_p, and take into account whether the
4928 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4929 * code here so that we can verify the 64-bit tv_sec value before we lose
4930 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4931 */
4932 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4933 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4934 {
4935 int error;
4936
4937 if (proc_is64bit(sopt->sopt_p)) {
4938 struct user64_timeval tv64;
4939
4940 if (sopt->sopt_valsize < sizeof(tv64)) {
4941 return EINVAL;
4942 }
4943
4944 sopt->sopt_valsize = sizeof(tv64);
4945 if (sopt->sopt_p != kernproc) {
4946 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4947 if (error != 0) {
4948 return error;
4949 }
4950 } else {
4951 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4952 sizeof(tv64));
4953 }
4954 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4955 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4956 return EDOM;
4957 }
4958
4959 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4960 tv_p->tv_usec = tv64.tv_usec;
4961 } else {
4962 struct user32_timeval tv32;
4963
4964 if (sopt->sopt_valsize < sizeof(tv32)) {
4965 return EINVAL;
4966 }
4967
4968 sopt->sopt_valsize = sizeof(tv32);
4969 if (sopt->sopt_p != kernproc) {
4970 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4971 if (error != 0) {
4972 return error;
4973 }
4974 } else {
4975 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4976 sizeof(tv32));
4977 }
4978 #ifndef __LP64__
4979 /*
4980 * K64todo "comparison is always false due to
4981 * limited range of data type"
4982 */
4983 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4984 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4985 return EDOM;
4986 }
4987 #endif
4988 tv_p->tv_sec = tv32.tv_sec;
4989 tv_p->tv_usec = tv32.tv_usec;
4990 }
4991 return 0;
4992 }
4993
4994 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4995 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4996 boolean_t ignore_delegate)
4997 {
4998 kauth_cred_t cred = NULL;
4999 proc_t ep = PROC_NULL;
5000 uid_t uid;
5001 int error = 0;
5002
5003 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
5004 ep = proc_find(so->e_pid);
5005 if (ep) {
5006 cred = kauth_cred_proc_ref(ep);
5007 }
5008 }
5009
5010 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5011
5012 /* uid is 0 for root */
5013 if (uid != 0 || !allow_root) {
5014 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5015 }
5016 if (cred) {
5017 kauth_cred_unref(&cred);
5018 }
5019 if (ep != PROC_NULL) {
5020 proc_rele(ep);
5021 }
5022
5023 return error;
5024 }
5025
5026 /*
5027 * Returns: 0 Success
5028 * EINVAL
5029 * ENOPROTOOPT
5030 * ENOBUFS
5031 * EDOM
5032 * sooptcopyin:EINVAL
5033 * sooptcopyin:EFAULT
5034 * sooptcopyin_timeval:EINVAL
5035 * sooptcopyin_timeval:EFAULT
5036 * sooptcopyin_timeval:EDOM
5037 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
5039 * sflt_attach_private:??? [whatever a filter author chooses]
5040 * <sf_setoption>:??? [whatever a filter author chooses]
5041 *
5042 * Notes: Other <pru_listen> returns depend on the protocol family; all
5043 * <sf_listen> returns depend on what the filter author causes
5044 * their filter to return.
5045 */
5046 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5047 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5048 {
5049 int error, optval;
5050 int64_t long_optval;
5051 struct linger l;
5052 struct timeval tv;
5053
5054 if (sopt->sopt_dir != SOPT_SET) {
5055 sopt->sopt_dir = SOPT_SET;
5056 }
5057
5058 if (dolock) {
5059 socket_lock(so, 1);
5060 }
5061
5062 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5063 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5064 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5065 /* the socket has been shutdown, no more sockopt's */
5066 error = EINVAL;
5067 goto out;
5068 }
5069
5070 error = sflt_setsockopt(so, sopt);
5071 if (error != 0) {
5072 if (error == EJUSTRETURN) {
5073 error = 0;
5074 }
5075 goto out;
5076 }
5077
5078 if (sopt->sopt_level != SOL_SOCKET) {
5079 if (so->so_proto != NULL &&
5080 so->so_proto->pr_ctloutput != NULL) {
5081 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5082 goto out;
5083 }
5084 error = ENOPROTOOPT;
5085 } else {
5086 /*
5087 * Allow socket-level (SOL_SOCKET) options to be filtered by
5088 * the protocol layer, if needed. A zero value returned from
5089 * the handler means use default socket-level processing as
5090 * done by the rest of this routine. Otherwise, any other
5091 * return value indicates that the option is unsupported.
5092 */
5093 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5094 pru_socheckopt(so, sopt)) != 0) {
5095 goto out;
5096 }
5097
5098 error = 0;
5099 switch (sopt->sopt_name) {
5100 case SO_LINGER:
5101 case SO_LINGER_SEC: {
5102 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5103 if (error != 0) {
5104 goto out;
5105 }
5106 /* Make sure to use sane values */
5107 if (sopt->sopt_name == SO_LINGER) {
5108 so->so_linger = (short)l.l_linger;
5109 } else {
5110 so->so_linger = (short)((long)l.l_linger * hz);
5111 }
5112 if (l.l_onoff != 0) {
5113 so->so_options |= SO_LINGER;
5114 } else {
5115 so->so_options &= ~SO_LINGER;
5116 }
5117 break;
5118 }
5119 case SO_DEBUG:
5120 case SO_KEEPALIVE:
5121 case SO_DONTROUTE:
5122 case SO_USELOOPBACK:
5123 case SO_BROADCAST:
5124 case SO_REUSEADDR:
5125 case SO_REUSEPORT:
5126 case SO_OOBINLINE:
5127 case SO_TIMESTAMP:
5128 case SO_TIMESTAMP_MONOTONIC:
5129 case SO_TIMESTAMP_CONTINUOUS:
5130 case SO_DONTTRUNC:
5131 case SO_WANTMORE:
5132 case SO_WANTOOBFLAG:
5133 case SO_NOWAKEFROMSLEEP:
5134 case SO_NOAPNFALLBK:
5135 error = sooptcopyin(sopt, &optval, sizeof(optval),
5136 sizeof(optval));
5137 if (error != 0) {
5138 goto out;
5139 }
5140 if (optval) {
5141 so->so_options |= sopt->sopt_name;
5142 } else {
5143 so->so_options &= ~sopt->sopt_name;
5144 }
5145 #if SKYWALK
5146 inp_update_netns_flags(so);
5147 #endif /* SKYWALK */
5148 break;
5149
5150 case SO_SNDBUF:
5151 case SO_RCVBUF:
5152 case SO_SNDLOWAT:
5153 case SO_RCVLOWAT:
5154 error = sooptcopyin(sopt, &optval, sizeof(optval),
5155 sizeof(optval));
5156 if (error != 0) {
5157 goto out;
5158 }
5159
5160 /*
5161 * Values < 1 make no sense for any of these
5162 * options, so disallow them.
5163 */
5164 if (optval < 1) {
5165 error = EINVAL;
5166 goto out;
5167 }
5168
5169 switch (sopt->sopt_name) {
5170 case SO_SNDBUF:
5171 case SO_RCVBUF: {
5172 struct sockbuf *sb =
5173 (sopt->sopt_name == SO_SNDBUF) ?
5174 &so->so_snd : &so->so_rcv;
5175 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5176 error = ENOBUFS;
5177 goto out;
5178 }
5179 sb->sb_flags |= SB_USRSIZE;
5180 sb->sb_flags &= ~SB_AUTOSIZE;
5181 sb->sb_idealsize = (u_int32_t)optval;
5182 break;
5183 }
5184 /*
5185 * Make sure the low-water is never greater than
5186 * the high-water.
5187 */
5188 case SO_SNDLOWAT: {
5189 int space = sbspace(&so->so_snd);
5190 uint32_t hiwat = so->so_snd.sb_hiwat;
5191
5192 if (so->so_snd.sb_flags & SB_UNIX) {
5193 struct unpcb *unp =
5194 (struct unpcb *)(so->so_pcb);
5195 if (unp != NULL &&
5196 unp->unp_conn != NULL) {
5197 struct socket *so2 = unp->unp_conn->unp_socket;
5198 hiwat += unp->unp_conn->unp_cc;
5199 space = sbspace(&so2->so_rcv);
5200 }
5201 }
5202
5203 so->so_snd.sb_lowat =
5204 (optval > hiwat) ?
5205 hiwat : optval;
5206
5207 if (space >= so->so_snd.sb_lowat) {
5208 sowwakeup(so);
5209 }
5210 break;
5211 }
5212 case SO_RCVLOWAT: {
5213 int64_t data_len;
5214 so->so_rcv.sb_lowat =
5215 (optval > so->so_rcv.sb_hiwat) ?
5216 so->so_rcv.sb_hiwat : optval;
5217 if (so->so_rcv.sb_flags & SB_UNIX) {
5218 struct unpcb *unp =
5219 (struct unpcb *)(so->so_pcb);
5220 if (unp != NULL &&
5221 unp->unp_conn != NULL) {
5222 struct socket *so2 = unp->unp_conn->unp_socket;
5223 data_len = so2->so_snd.sb_cc
5224 - so2->so_snd.sb_ctl;
5225 } else {
5226 data_len = so->so_rcv.sb_cc
5227 - so->so_rcv.sb_ctl;
5228 }
5229 } else {
5230 data_len = so->so_rcv.sb_cc
5231 - so->so_rcv.sb_ctl;
5232 }
5233
5234 if (data_len >= so->so_rcv.sb_lowat) {
5235 sorwakeup(so);
5236 }
5237 break;
5238 }
5239 }
5240 break;
5241
5242 case SO_SNDTIMEO:
5243 case SO_RCVTIMEO:
5244 error = sooptcopyin_timeval(sopt, &tv);
5245 if (error != 0) {
5246 goto out;
5247 }
5248
5249 switch (sopt->sopt_name) {
5250 case SO_SNDTIMEO:
5251 so->so_snd.sb_timeo = tv;
5252 break;
5253 case SO_RCVTIMEO:
5254 so->so_rcv.sb_timeo = tv;
5255 break;
5256 }
5257 break;
5258
5259 case SO_NKE: {
5260 struct so_nke nke;
5261
5262 error = sooptcopyin(sopt, &nke, sizeof(nke),
5263 sizeof(nke));
5264 if (error != 0) {
5265 goto out;
5266 }
5267
5268 error = sflt_attach_internal(so, nke.nke_handle);
5269 break;
5270 }
5271
5272 case SO_NOSIGPIPE:
5273 error = sooptcopyin(sopt, &optval, sizeof(optval),
5274 sizeof(optval));
5275 if (error != 0) {
5276 goto out;
5277 }
5278 if (optval != 0) {
5279 so->so_flags |= SOF_NOSIGPIPE;
5280 } else {
5281 so->so_flags &= ~SOF_NOSIGPIPE;
5282 }
5283 break;
5284
5285 case SO_NOADDRERR:
5286 error = sooptcopyin(sopt, &optval, sizeof(optval),
5287 sizeof(optval));
5288 if (error != 0) {
5289 goto out;
5290 }
5291 if (optval != 0) {
5292 so->so_flags |= SOF_NOADDRAVAIL;
5293 } else {
5294 so->so_flags &= ~SOF_NOADDRAVAIL;
5295 }
5296 break;
5297
5298 case SO_REUSESHAREUID:
5299 error = sooptcopyin(sopt, &optval, sizeof(optval),
5300 sizeof(optval));
5301 if (error != 0) {
5302 goto out;
5303 }
5304 if (optval != 0) {
5305 so->so_flags |= SOF_REUSESHAREUID;
5306 } else {
5307 so->so_flags &= ~SOF_REUSESHAREUID;
5308 }
5309 break;
5310
5311 case SO_NOTIFYCONFLICT:
5312 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5313 error = EPERM;
5314 goto out;
5315 }
5316 error = sooptcopyin(sopt, &optval, sizeof(optval),
5317 sizeof(optval));
5318 if (error != 0) {
5319 goto out;
5320 }
5321 if (optval != 0) {
5322 so->so_flags |= SOF_NOTIFYCONFLICT;
5323 } else {
5324 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5325 }
5326 break;
5327
5328 case SO_RESTRICTIONS:
5329 error = sooptcopyin(sopt, &optval, sizeof(optval),
5330 sizeof(optval));
5331 if (error != 0) {
5332 goto out;
5333 }
5334
5335 error = so_set_restrictions(so, optval);
5336 break;
5337
5338 case SO_AWDL_UNRESTRICTED:
5339 if (SOCK_DOM(so) != PF_INET &&
5340 SOCK_DOM(so) != PF_INET6) {
5341 error = EOPNOTSUPP;
5342 goto out;
5343 }
5344 error = sooptcopyin(sopt, &optval, sizeof(optval),
5345 sizeof(optval));
5346 if (error != 0) {
5347 goto out;
5348 }
5349 if (optval != 0) {
5350 error = soopt_cred_check(so,
5351 PRIV_NET_RESTRICTED_AWDL, false, false);
5352 if (error == 0) {
5353 inp_set_awdl_unrestricted(
5354 sotoinpcb(so));
5355 }
5356 } else {
5357 inp_clear_awdl_unrestricted(sotoinpcb(so));
5358 }
5359 break;
5360 case SO_INTCOPROC_ALLOW:
5361 if (SOCK_DOM(so) != PF_INET6) {
5362 error = EOPNOTSUPP;
5363 goto out;
5364 }
5365 error = sooptcopyin(sopt, &optval, sizeof(optval),
5366 sizeof(optval));
5367 if (error != 0) {
5368 goto out;
5369 }
5370 if (optval != 0 &&
5371 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5372 error = soopt_cred_check(so,
5373 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5374 if (error == 0) {
5375 inp_set_intcoproc_allowed(
5376 sotoinpcb(so));
5377 }
5378 } else if (optval == 0) {
5379 inp_clear_intcoproc_allowed(sotoinpcb(so));
5380 }
5381 break;
5382
5383 case SO_LABEL:
5384 error = EOPNOTSUPP;
5385 break;
5386
5387 case SO_UPCALLCLOSEWAIT:
5388 error = sooptcopyin(sopt, &optval, sizeof(optval),
5389 sizeof(optval));
5390 if (error != 0) {
5391 goto out;
5392 }
5393 if (optval != 0) {
5394 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5395 } else {
5396 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5397 }
5398 break;
5399
5400 case SO_RANDOMPORT:
5401 error = sooptcopyin(sopt, &optval, sizeof(optval),
5402 sizeof(optval));
5403 if (error != 0) {
5404 goto out;
5405 }
5406 if (optval != 0) {
5407 so->so_flags |= SOF_BINDRANDOMPORT;
5408 } else {
5409 so->so_flags &= ~SOF_BINDRANDOMPORT;
5410 }
5411 break;
5412
5413 case SO_NP_EXTENSIONS: {
5414 struct so_np_extensions sonpx;
5415
5416 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5417 sizeof(sonpx));
5418 if (error != 0) {
5419 goto out;
5420 }
5421 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5422 error = EINVAL;
5423 goto out;
5424 }
5425 /*
5426 * Only one bit defined for now
5427 */
5428 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5429 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5430 so->so_flags |= SOF_NPX_SETOPTSHUT;
5431 } else {
5432 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5433 }
5434 }
5435 break;
5436 }
5437
5438 case SO_TRAFFIC_CLASS: {
5439 error = sooptcopyin(sopt, &optval, sizeof(optval),
5440 sizeof(optval));
5441 if (error != 0) {
5442 goto out;
5443 }
5444 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5445 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5446 error = so_set_net_service_type(so, netsvc);
5447 goto out;
5448 }
5449 error = so_set_traffic_class(so, optval);
5450 if (error != 0) {
5451 goto out;
5452 }
5453 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5454 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5455 break;
5456 }
5457
5458 case SO_RECV_TRAFFIC_CLASS: {
5459 error = sooptcopyin(sopt, &optval, sizeof(optval),
5460 sizeof(optval));
5461 if (error != 0) {
5462 goto out;
5463 }
5464 if (optval == 0) {
5465 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5466 } else {
5467 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5468 }
5469 break;
5470 }
5471
5472 #if (DEVELOPMENT || DEBUG)
5473 case SO_TRAFFIC_CLASS_DBG: {
5474 struct so_tcdbg so_tcdbg;
5475
5476 error = sooptcopyin(sopt, &so_tcdbg,
5477 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5478 if (error != 0) {
5479 goto out;
5480 }
5481 error = so_set_tcdbg(so, &so_tcdbg);
5482 if (error != 0) {
5483 goto out;
5484 }
5485 break;
5486 }
5487 #endif /* (DEVELOPMENT || DEBUG) */
5488
5489 case SO_PRIVILEGED_TRAFFIC_CLASS:
5490 error = priv_check_cred(kauth_cred_get(),
5491 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5492 if (error != 0) {
5493 goto out;
5494 }
5495 error = sooptcopyin(sopt, &optval, sizeof(optval),
5496 sizeof(optval));
5497 if (error != 0) {
5498 goto out;
5499 }
5500 if (optval == 0) {
5501 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5502 } else {
5503 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5504 }
5505 break;
5506
5507 #if (DEVELOPMENT || DEBUG)
5508 case SO_DEFUNCTIT:
5509 error = sosetdefunct(current_proc(), so, 0, FALSE);
5510 if (error == 0) {
5511 error = sodefunct(current_proc(), so, 0);
5512 }
5513
5514 break;
5515 #endif /* (DEVELOPMENT || DEBUG) */
5516
5517 case SO_DEFUNCTOK:
5518 error = sooptcopyin(sopt, &optval, sizeof(optval),
5519 sizeof(optval));
5520 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5521 if (error == 0) {
5522 error = EBADF;
5523 }
5524 goto out;
5525 }
5526 /*
5527 * Any process can set SO_DEFUNCTOK (clear
5528 * SOF_NODEFUNCT), but only root can clear
5529 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5530 */
5531 if (optval == 0 &&
5532 kauth_cred_issuser(kauth_cred_get()) == 0) {
5533 error = EPERM;
5534 goto out;
5535 }
5536 if (optval) {
5537 so->so_flags &= ~SOF_NODEFUNCT;
5538 } else {
5539 so->so_flags |= SOF_NODEFUNCT;
5540 }
5541
5542 if (SOCK_DOM(so) == PF_INET ||
5543 SOCK_DOM(so) == PF_INET6) {
5544 char s[MAX_IPv6_STR_LEN];
5545 char d[MAX_IPv6_STR_LEN];
5546 struct inpcb *inp = sotoinpcb(so);
5547
5548 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5549 "[%s %s:%d -> %s:%d] is now marked "
5550 "as %seligible for "
5551 "defunct\n", __func__, proc_selfpid(),
5552 proc_best_name(current_proc()),
5553 so->so_gencnt,
5554 (SOCK_TYPE(so) == SOCK_STREAM) ?
5555 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5556 ((SOCK_DOM(so) == PF_INET) ?
5557 (void *)&inp->inp_laddr.s_addr :
5558 (void *)&inp->in6p_laddr), s, sizeof(s)),
5559 ntohs(inp->in6p_lport),
5560 inet_ntop(SOCK_DOM(so),
5561 (SOCK_DOM(so) == PF_INET) ?
5562 (void *)&inp->inp_faddr.s_addr :
5563 (void *)&inp->in6p_faddr, d, sizeof(d)),
5564 ntohs(inp->in6p_fport),
5565 (so->so_flags & SOF_NODEFUNCT) ?
5566 "not " : "");
5567 } else {
5568 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5569 "is now marked as %seligible for "
5570 "defunct\n",
5571 __func__, proc_selfpid(),
5572 proc_best_name(current_proc()),
5573 so->so_gencnt,
5574 SOCK_DOM(so), SOCK_TYPE(so),
5575 (so->so_flags & SOF_NODEFUNCT) ?
5576 "not " : "");
5577 }
5578 break;
5579
5580 case SO_ISDEFUNCT:
5581 /* This option is not settable */
5582 error = EINVAL;
5583 break;
5584
5585 case SO_OPPORTUNISTIC:
5586 error = sooptcopyin(sopt, &optval, sizeof(optval),
5587 sizeof(optval));
5588 if (error == 0) {
5589 error = so_set_opportunistic(so, optval);
5590 }
5591 break;
5592
5593 case SO_FLUSH:
5594 /* This option is handled by lower layer(s) */
5595 error = 0;
5596 break;
5597
5598 case SO_RECV_ANYIF:
5599 error = sooptcopyin(sopt, &optval, sizeof(optval),
5600 sizeof(optval));
5601 if (error == 0) {
5602 error = so_set_recv_anyif(so, optval);
5603 }
5604 break;
5605
5606 case SO_TRAFFIC_MGT_BACKGROUND: {
5607 /* This option is handled by lower layer(s) */
5608 error = 0;
5609 break;
5610 }
5611
5612 #if FLOW_DIVERT
5613 case SO_FLOW_DIVERT_TOKEN:
5614 error = flow_divert_token_set(so, sopt);
5615 break;
5616 #endif /* FLOW_DIVERT */
5617
5618
5619 case SO_DELEGATED:
5620 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5621 sizeof(optval))) != 0) {
5622 break;
5623 }
5624
5625 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5626 break;
5627
5628 case SO_DELEGATED_UUID: {
5629 uuid_t euuid;
5630
5631 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5632 sizeof(euuid))) != 0) {
5633 break;
5634 }
5635
5636 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5637 break;
5638 }
5639
5640 #if NECP
5641 case SO_NECP_ATTRIBUTES:
5642 if (SOCK_DOM(so) == PF_MULTIPATH) {
5643 /* Handled by MPTCP itself */
5644 break;
5645 }
5646
5647 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5648 error = EINVAL;
5649 goto out;
5650 }
5651
5652 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5653 break;
5654
5655 case SO_NECP_CLIENTUUID: {
5656 if (SOCK_DOM(so) == PF_MULTIPATH) {
5657 /* Handled by MPTCP itself */
5658 break;
5659 }
5660
5661 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5662 error = EINVAL;
5663 goto out;
5664 }
5665
5666 struct inpcb *inp = sotoinpcb(so);
5667 if (!uuid_is_null(inp->necp_client_uuid)) {
5668 // Clear out the old client UUID if present
5669 necp_inpcb_remove_cb(inp);
5670 }
5671
5672 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5673 sizeof(uuid_t), sizeof(uuid_t));
5674 if (error != 0) {
5675 goto out;
5676 }
5677
5678 if (uuid_is_null(inp->necp_client_uuid)) {
5679 error = EINVAL;
5680 goto out;
5681 }
5682
5683 pid_t current_pid = proc_pid(current_proc());
5684 error = necp_client_register_socket_flow(current_pid,
5685 inp->necp_client_uuid, inp);
5686 if (error != 0) {
5687 uuid_clear(inp->necp_client_uuid);
5688 goto out;
5689 }
5690
5691 if (inp->inp_lport != 0) {
5692 // There is a bound local port, so this is not
5693 // a fresh socket. Assign to the client.
5694 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5695 }
5696
5697 break;
5698 }
5699 case SO_NECP_LISTENUUID: {
5700 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5701 error = EINVAL;
5702 goto out;
5703 }
5704
5705 struct inpcb *inp = sotoinpcb(so);
5706 if (!uuid_is_null(inp->necp_client_uuid)) {
5707 error = EINVAL;
5708 goto out;
5709 }
5710
5711 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5712 sizeof(uuid_t), sizeof(uuid_t));
5713 if (error != 0) {
5714 goto out;
5715 }
5716
5717 if (uuid_is_null(inp->necp_client_uuid)) {
5718 error = EINVAL;
5719 goto out;
5720 }
5721
5722 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5723 inp->necp_client_uuid, inp);
5724 if (error != 0) {
5725 uuid_clear(inp->necp_client_uuid);
5726 goto out;
5727 }
5728
5729 // Mark that the port registration is held by NECP
5730 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5731
5732 break;
5733 }
5734
5735 case SO_RESOLVER_SIGNATURE: {
5736 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5737 error = EINVAL;
5738 goto out;
5739 }
5740 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5741 break;
5742 }
5743 #endif /* NECP */
5744
5745 case SO_EXTENDED_BK_IDLE:
5746 error = sooptcopyin(sopt, &optval, sizeof(optval),
5747 sizeof(optval));
5748 if (error == 0) {
5749 error = so_set_extended_bk_idle(so, optval);
5750 }
5751 break;
5752
5753 case SO_MARK_CELLFALLBACK:
5754 error = sooptcopyin(sopt, &optval, sizeof(optval),
5755 sizeof(optval));
5756 if (error != 0) {
5757 goto out;
5758 }
5759 if (optval < 0) {
5760 error = EINVAL;
5761 goto out;
5762 }
5763 if (optval == 0) {
5764 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5765 } else {
5766 so->so_flags1 |= SOF1_CELLFALLBACK;
5767 }
5768 break;
5769
5770 case SO_MARK_CELLFALLBACK_UUID:
5771 {
5772 struct so_mark_cellfallback_uuid_args args;
5773
5774 error = sooptcopyin(sopt, &args, sizeof(args),
5775 sizeof(args));
5776 if (error != 0) {
5777 goto out;
5778 }
5779 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5780 args.flow_cellfallback);
5781 break;
5782 }
5783
5784 case SO_FALLBACK_MODE:
5785 error = sooptcopyin(sopt, &optval, sizeof(optval),
5786 sizeof(optval));
5787 if (error != 0) {
5788 goto out;
5789 }
5790 if (optval < SO_FALLBACK_MODE_NONE ||
5791 optval > SO_FALLBACK_MODE_PREFER) {
5792 error = EINVAL;
5793 goto out;
5794 }
5795 so->so_fallback_mode = (u_int8_t)optval;
5796 break;
5797
5798 case SO_MARK_KNOWN_TRACKER: {
5799 error = sooptcopyin(sopt, &optval, sizeof(optval),
5800 sizeof(optval));
5801 if (error != 0) {
5802 goto out;
5803 }
5804 if (optval < 0) {
5805 error = EINVAL;
5806 goto out;
5807 }
5808 if (optval == 0) {
5809 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5810 } else {
5811 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5812 }
5813 break;
5814 }
5815
5816 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5817 error = sooptcopyin(sopt, &optval, sizeof(optval),
5818 sizeof(optval));
5819 if (error != 0) {
5820 goto out;
5821 }
5822 if (optval < 0) {
5823 error = EINVAL;
5824 goto out;
5825 }
5826 if (optval == 0) {
5827 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5828 } else {
5829 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5830 }
5831 break;
5832 }
5833
5834 case SO_MARK_APPROVED_APP_DOMAIN: {
5835 error = sooptcopyin(sopt, &optval, sizeof(optval),
5836 sizeof(optval));
5837 if (error != 0) {
5838 goto out;
5839 }
5840 if (optval < 0) {
5841 error = EINVAL;
5842 goto out;
5843 }
5844 if (optval == 0) {
5845 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5846 } else {
5847 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5848 }
5849 break;
5850 }
5851
5852 case SO_STATISTICS_EVENT:
5853 error = sooptcopyin(sopt, &long_optval,
5854 sizeof(long_optval), sizeof(long_optval));
5855 if (error != 0) {
5856 goto out;
5857 }
5858 u_int64_t nstat_event = 0;
5859 error = so_statistics_event_to_nstat_event(
5860 &long_optval, &nstat_event);
5861 if (error != 0) {
5862 goto out;
5863 }
5864 nstat_pcb_event(sotoinpcb(so), nstat_event);
5865 break;
5866
5867 case SO_NET_SERVICE_TYPE: {
5868 error = sooptcopyin(sopt, &optval, sizeof(optval),
5869 sizeof(optval));
5870 if (error != 0) {
5871 goto out;
5872 }
5873 error = so_set_net_service_type(so, optval);
5874 break;
5875 }
5876
5877 case SO_QOSMARKING_POLICY_OVERRIDE:
5878 error = priv_check_cred(kauth_cred_get(),
5879 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5880 if (error != 0) {
5881 goto out;
5882 }
5883 error = sooptcopyin(sopt, &optval, sizeof(optval),
5884 sizeof(optval));
5885 if (error != 0) {
5886 goto out;
5887 }
5888 if (optval == 0) {
5889 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5890 } else {
5891 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5892 }
5893 break;
5894
5895 case SO_MPKL_SEND_INFO: {
5896 struct so_mpkl_send_info so_mpkl_send_info;
5897
5898 error = sooptcopyin(sopt, &so_mpkl_send_info,
5899 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5900 if (error != 0) {
5901 goto out;
5902 }
5903 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5904 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5905
5906 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5907 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5908 } else {
5909 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5910 }
5911 break;
5912 }
5913 case SO_WANT_KEV_SOCKET_CLOSED: {
5914 error = sooptcopyin(sopt, &optval, sizeof(optval),
5915 sizeof(optval));
5916 if (error != 0) {
5917 goto out;
5918 }
5919 if (optval == 0) {
5920 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5921 } else {
5922 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5923 }
5924 break;
5925 }
5926 case SO_MARK_WAKE_PKT: {
5927 error = sooptcopyin(sopt, &optval, sizeof(optval),
5928 sizeof(optval));
5929 if (error != 0) {
5930 goto out;
5931 }
5932 if (optval == 0) {
5933 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5934 } else {
5935 so->so_flags |= SOF_MARK_WAKE_PKT;
5936 }
5937 break;
5938 }
5939 case SO_RECV_WAKE_PKT: {
5940 error = sooptcopyin(sopt, &optval, sizeof(optval),
5941 sizeof(optval));
5942 if (error != 0) {
5943 goto out;
5944 }
5945 if (optval == 0) {
5946 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5947 } else {
5948 so->so_flags |= SOF_RECV_WAKE_PKT;
5949 }
5950 break;
5951 }
5952 default:
5953 error = ENOPROTOOPT;
5954 break;
5955 }
5956 if (error == 0 && so->so_proto != NULL &&
5957 so->so_proto->pr_ctloutput != NULL) {
5958 (void) so->so_proto->pr_ctloutput(so, sopt);
5959 }
5960 }
5961 out:
5962 if (dolock) {
5963 socket_unlock(so, 1);
5964 }
5965 return error;
5966 }
5967
5968 /* Helper routines for getsockopt */
5969 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5970 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5971 {
5972 int error;
5973 size_t valsize;
5974
5975 error = 0;
5976
5977 /*
5978 * Documented get behavior is that we always return a value,
5979 * possibly truncated to fit in the user's buffer.
5980 * Traditional behavior is that we always tell the user
5981 * precisely how much we copied, rather than something useful
5982 * like the total amount we had available for her.
5983 * Note that this interface is not idempotent; the entire answer must
5984 * generated ahead of time.
5985 */
5986 valsize = MIN(len, sopt->sopt_valsize);
5987 sopt->sopt_valsize = valsize;
5988 if (sopt->sopt_val != USER_ADDR_NULL) {
5989 if (sopt->sopt_p != kernproc) {
5990 error = copyout(buf, sopt->sopt_val, valsize);
5991 } else {
5992 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5993 }
5994 }
5995 return error;
5996 }
5997
5998 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5999 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
6000 {
6001 int error;
6002 size_t len;
6003 struct user64_timeval tv64 = {};
6004 struct user32_timeval tv32 = {};
6005 const void * val;
6006 size_t valsize;
6007
6008 error = 0;
6009 if (proc_is64bit(sopt->sopt_p)) {
6010 len = sizeof(tv64);
6011 tv64.tv_sec = tv_p->tv_sec;
6012 tv64.tv_usec = tv_p->tv_usec;
6013 val = &tv64;
6014 } else {
6015 len = sizeof(tv32);
6016 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
6017 tv32.tv_usec = tv_p->tv_usec;
6018 val = &tv32;
6019 }
6020 valsize = MIN(len, sopt->sopt_valsize);
6021 sopt->sopt_valsize = valsize;
6022 if (sopt->sopt_val != USER_ADDR_NULL) {
6023 if (sopt->sopt_p != kernproc) {
6024 error = copyout(val, sopt->sopt_val, valsize);
6025 } else {
6026 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6027 }
6028 }
6029 return error;
6030 }
6031
6032 /*
6033 * Return: 0 Success
6034 * ENOPROTOOPT
6035 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
6036 * <pr_ctloutput>:???
6037 * <sf_getoption>:???
6038 */
/*
 * Get a socket option.  SOL_SOCKET options are handled here; all other
 * levels are passed through to the protocol's pr_ctloutput.  When
 * 'dolock' is non-zero the socket is locked/unlocked around the work.
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/* Give attached socket filters a chance to intercept the request;
	 * EJUSTRETURN means a filter fully handled it (success). */
	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			/* SO_LINGER reports ticks; SO_LINGER_SEC seconds. */
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			/* These option names are also so_options flag bits. */
			optval = so->so_options & sopt->sopt_name;
/* Shared exit: copy the int in 'optval' out to the caller. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			/* Bytes of protocol data available to read (excludes
			 * control/mbuf overhead). */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			/* Count of queued records; only meaningful for
			 * record-oriented (PR_ATOMIC) protocols. */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			/* Reading SO_ERROR clears the pending error. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			/* For connected AF_UNIX sockets, include the peer's
			 * buffered byte count in the reported size. */
			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;


		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			/* Only meaningful for inet/inet6 sockets. */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_PEERLABEL:
			error = EOPNOTSUPP;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			/* Inverted sense: option is "ok to defunct". */
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif  /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			/* Client UUID lives in the mppcb for multipath
			 * sockets, in the inpcb for inet/inet6. */
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			/* Only valid when NECP holds the port registration. */
			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}

		case SO_RESOLVER_SIGNATURE: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}
			error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
			break;
		}

#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif  /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;
		case SO_MARK_CELLFALLBACK:
			/* Normalize flag bit to 0/1. */
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_FALLBACK_MODE:
			optval = so->so_fallback_mode;
			goto integer;
		case SO_MARK_KNOWN_TRACKER: {
			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_APPROVED_APP_DOMAIN: {
			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_NET_SERVICE_TYPE: {
			/* Default to best-effort unless explicitly set. */
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}
		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}
		case SO_MARK_WAKE_PKT:
			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
			goto integer;
		case SO_RECV_WAKE_PKT:
			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
			goto integer;
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
6458
6459 /*
6460 * The size limits on our soopt_getm is different from that on FreeBSD.
6461 * We limit the size of options to MCLBYTES. This will have to change
6462 * if we need to define options that need more space than MCLBYTES.
6463 */
6464 int
soopt_getm(struct sockopt * sopt,struct mbuf ** mp)6465 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6466 {
6467 struct mbuf *m, *m_prev;
6468 int sopt_size = (int)sopt->sopt_valsize;
6469 int how;
6470
6471 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6472 return EMSGSIZE;
6473 }
6474
6475 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6476 MGET(m, how, MT_DATA);
6477 if (m == NULL) {
6478 return ENOBUFS;
6479 }
6480 if (sopt_size > MLEN) {
6481 MCLGET(m, how);
6482 if ((m->m_flags & M_EXT) == 0) {
6483 m_free(m);
6484 return ENOBUFS;
6485 }
6486 m->m_len = min(MCLBYTES, sopt_size);
6487 } else {
6488 m->m_len = min(MLEN, sopt_size);
6489 }
6490 sopt_size -= m->m_len;
6491 *mp = m;
6492 m_prev = m;
6493
6494 while (sopt_size > 0) {
6495 MGET(m, how, MT_DATA);
6496 if (m == NULL) {
6497 m_freem(*mp);
6498 return ENOBUFS;
6499 }
6500 if (sopt_size > MLEN) {
6501 MCLGET(m, how);
6502 if ((m->m_flags & M_EXT) == 0) {
6503 m_freem(*mp);
6504 m_freem(m);
6505 return ENOBUFS;
6506 }
6507 m->m_len = min(MCLBYTES, sopt_size);
6508 } else {
6509 m->m_len = min(MLEN, sopt_size);
6510 }
6511 sopt_size -= m->m_len;
6512 m_prev->m_next = m;
6513 m_prev = m;
6514 }
6515 return 0;
6516 }
6517
6518 /* copyin sopt data into mbuf chain */
6519 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6520 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6521 {
6522 struct mbuf *m0 = m;
6523
6524 if (sopt->sopt_val == USER_ADDR_NULL) {
6525 return 0;
6526 }
6527 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6528 if (sopt->sopt_p != kernproc) {
6529 int error;
6530
6531 error = copyin(sopt->sopt_val, mtod(m, char *),
6532 m->m_len);
6533 if (error != 0) {
6534 m_freem(m0);
6535 return error;
6536 }
6537 } else {
6538 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6539 mtod(m, char *), m->m_len);
6540 }
6541 sopt->sopt_valsize -= m->m_len;
6542 sopt->sopt_val += m->m_len;
6543 m = m->m_next;
6544 }
6545 /* should be allocated enoughly at ip6_sooptmcopyin() */
6546 if (m != NULL) {
6547 panic("soopt_mcopyin");
6548 /* NOTREACHED */
6549 }
6550 return 0;
6551 }
6552
6553 /* copyout mbuf chain data into soopt */
6554 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6555 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6556 {
6557 struct mbuf *m0 = m;
6558 size_t valsize = 0;
6559
6560 if (sopt->sopt_val == USER_ADDR_NULL) {
6561 return 0;
6562 }
6563 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6564 if (sopt->sopt_p != kernproc) {
6565 int error;
6566
6567 error = copyout(mtod(m, char *), sopt->sopt_val,
6568 m->m_len);
6569 if (error != 0) {
6570 m_freem(m0);
6571 return error;
6572 }
6573 } else {
6574 bcopy(mtod(m, char *),
6575 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6576 }
6577 sopt->sopt_valsize -= m->m_len;
6578 sopt->sopt_val += m->m_len;
6579 valsize += m->m_len;
6580 m = m->m_next;
6581 }
6582 if (m != NULL) {
6583 /* enough soopt buffer should be given from user-land */
6584 m_freem(m0);
6585 return EINVAL;
6586 }
6587 sopt->sopt_valsize = valsize;
6588 return 0;
6589 }
6590
6591 void
sohasoutofband(struct socket * so)6592 sohasoutofband(struct socket *so)
6593 {
6594 if (so->so_pgid < 0) {
6595 gsignal(-so->so_pgid, SIGURG);
6596 } else if (so->so_pgid > 0) {
6597 proc_signal(so->so_pgid, SIGURG);
6598 }
6599 selwakeup(&so->so_rcv.sb_sel);
6600 if (so->so_rcv.sb_flags & SB_KNOTE) {
6601 KNOTE(&so->so_rcv.sb_sel.si_note,
6602 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6603 }
6604 }
6605
6606 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6607 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6608 {
6609 #pragma unused(cred)
6610 struct proc *p = current_proc();
6611 int revents = 0;
6612
6613 socket_lock(so, 1);
6614 so_update_last_owner_locked(so, PROC_NULL);
6615 so_update_policy(so);
6616
6617 if (events & (POLLIN | POLLRDNORM)) {
6618 if (soreadable(so)) {
6619 revents |= events & (POLLIN | POLLRDNORM);
6620 }
6621 }
6622
6623 if (events & (POLLOUT | POLLWRNORM)) {
6624 if (sowriteable(so)) {
6625 revents |= events & (POLLOUT | POLLWRNORM);
6626 }
6627 }
6628
6629 if (events & (POLLPRI | POLLRDBAND)) {
6630 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6631 revents |= events & (POLLPRI | POLLRDBAND);
6632 }
6633 }
6634
6635 if (revents == 0) {
6636 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6637 /*
6638 * Darwin sets the flag first,
6639 * BSD calls selrecord first
6640 */
6641 so->so_rcv.sb_flags |= SB_SEL;
6642 selrecord(p, &so->so_rcv.sb_sel, wql);
6643 }
6644
6645 if (events & (POLLOUT | POLLWRNORM)) {
6646 /*
6647 * Darwin sets the flag first,
6648 * BSD calls selrecord first
6649 */
6650 so->so_snd.sb_flags |= SB_SEL;
6651 selrecord(p, &so->so_snd.sb_sel, wql);
6652 }
6653 }
6654
6655 socket_unlock(so, 1);
6656 return revents;
6657 }
6658
6659 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6660 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6661 {
6662 struct socket *so = (struct socket *)fp_get_data(fp);
6663 int result;
6664
6665 socket_lock(so, 1);
6666 so_update_last_owner_locked(so, PROC_NULL);
6667 so_update_policy(so);
6668
6669 switch (kn->kn_filter) {
6670 case EVFILT_READ:
6671 kn->kn_filtid = EVFILTID_SOREAD;
6672 break;
6673 case EVFILT_WRITE:
6674 kn->kn_filtid = EVFILTID_SOWRITE;
6675 break;
6676 case EVFILT_SOCK:
6677 kn->kn_filtid = EVFILTID_SCK;
6678 break;
6679 case EVFILT_EXCEPT:
6680 kn->kn_filtid = EVFILTID_SOEXCEPT;
6681 break;
6682 default:
6683 socket_unlock(so, 1);
6684 knote_set_error(kn, EINVAL);
6685 return 0;
6686 }
6687
6688 /*
6689 * call the appropriate sub-filter attach
6690 * with the socket still locked
6691 */
6692 result = knote_fops(kn)->f_attach(kn, kev);
6693
6694 socket_unlock(so, 1);
6695
6696 return result;
6697 }
6698
/*
 * Shared readiness test for the socket read filter.  Returns non-zero
 * when the knote should fire; when 'kev' is non-NULL and the event
 * fires, the kevent is filled in with 'data' (bytes readable, or the
 * listen backlog length for listening sockets).  Caller must hold the
 * socket lock.
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* Listener is readable when a completed connection waits. */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			/* Fire on out-of-band data; report bytes up to mark. */
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		/* EOF: no more data will arrive (and none is buffered by
		 * a content filter); report any pending socket error. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6777
6778 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6779 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6780 {
6781 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6782
6783 /* socket locked */
6784
6785 /*
6786 * If the caller explicitly asked for OOB results (e.g. poll())
6787 * from EVFILT_READ, then save that off in the hookid field
6788 * and reserve the kn_flags EV_OOBAND bit for output only.
6789 */
6790 if (kn->kn_filter == EVFILT_READ &&
6791 kn->kn_flags & EV_OOBAND) {
6792 kn->kn_flags &= ~EV_OOBAND;
6793 kn->kn_hook32 = EV_OOBAND;
6794 } else {
6795 kn->kn_hook32 = 0;
6796 }
6797 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6798 so->so_rcv.sb_flags |= SB_KNOTE;
6799 }
6800
6801 /* indicate if event is already fired */
6802 return filt_soread_common(kn, NULL, so);
6803 }
6804
6805 static void
filt_sordetach(struct knote * kn)6806 filt_sordetach(struct knote *kn)
6807 {
6808 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6809
6810 socket_lock(so, 1);
6811 if (so->so_rcv.sb_flags & SB_KNOTE) {
6812 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6813 so->so_rcv.sb_flags &= ~SB_KNOTE;
6814 }
6815 }
6816 socket_unlock(so, 1);
6817 }
6818
6819 /*ARGSUSED*/
6820 static int
filt_soread(struct knote * kn,long hint)6821 filt_soread(struct knote *kn, long hint)
6822 {
6823 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6824 int retval;
6825
6826 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6827 socket_lock(so, 1);
6828 }
6829
6830 retval = filt_soread_common(kn, NULL, so);
6831
6832 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6833 socket_unlock(so, 1);
6834 }
6835
6836 return retval;
6837 }
6838
6839 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6840 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6841 {
6842 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6843 int retval;
6844
6845 socket_lock(so, 1);
6846
6847 /* save off the new input fflags and data */
6848 kn->kn_sfflags = kev->fflags;
6849 kn->kn_sdata = kev->data;
6850
6851 /* determine if changes result in fired events */
6852 retval = filt_soread_common(kn, NULL, so);
6853
6854 socket_unlock(so, 1);
6855
6856 return retval;
6857 }
6858
6859 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6860 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6861 {
6862 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6863 int retval;
6864
6865 socket_lock(so, 1);
6866 retval = filt_soread_common(kn, kev, so);
6867 socket_unlock(so, 1);
6868
6869 return retval;
6870 }
6871
6872 int
so_wait_for_if_feedback(struct socket * so)6873 so_wait_for_if_feedback(struct socket *so)
6874 {
6875 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6876 (so->so_state & SS_ISCONNECTED)) {
6877 struct inpcb *inp = sotoinpcb(so);
6878 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6879 return 1;
6880 }
6881 }
6882 return 0;
6883 }
6884
/*
 * Common EVFILT_WRITE evaluation for a socket knote.
 *
 * Returns 1 when the socket is considered writable for this knote,
 * 0 otherwise.  `data' (the available send-buffer space) is delivered
 * through `kev' when it is non-NULL and the event fired.
 * Called with the socket locked.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	/* free space in the send buffer is what we report as kevent data */
	int64_t data = sbspace(&so->so_snd);

	if (so->so_state & SS_CANTSENDMORE) {
		/* write side shut down: flag EOF along with any pending error */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		/* socket accepts data before the connection completes */
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;
	const int64_t hiwat = so->so_snd.sb_hiwat;
	/*
	 * Deal with connected UNIX domain sockets which
	 * rely on the fact that the sender's socket buffer is
	 * actually the receiver's socket buffer.
	 */
	if (SOCK_DOM(so) == PF_LOCAL) {
		struct unpcb *unp = sotounpcb(so);
		if (unp != NULL && unp->unp_conn != NULL &&
		    unp->unp_conn->unp_socket != NULL) {
			struct socket *so2 = unp->unp_conn->unp_socket;
			/*
			 * At this point we know that `so' is locked
			 * and that `unp_conn` isn't going to change.
			 * However, we don't lock `so2` because doing so
			 * may require unlocking `so'
			 * (see unp_get_locks_in_order()).
			 *
			 * Two cases can happen:
			 *
			 * 1) we return 1 and tell the application that
			 *    it can write.  Meanwhile, another thread
			 *    fills up the socket buffer.  This will either
			 *    lead to a blocking send or EWOULDBLOCK
			 *    which the application should deal with.
			 * 2) we return 0 and tell the application that
			 *    the socket is not writable.  Meanwhile,
			 *    another thread depletes the receive socket
			 *    buffer. In this case the application will
			 *    be woken up by sb_notify().
			 *
			 * MIN() is required because otherwise sosendcheck()
			 * may return EWOULDBLOCK since it only considers
			 * so->so_snd.
			 */
			data = MIN(data, sbspace(&so2->so_rcv));
		}
	}

	/* clamp a NOTE_LOWAT request between the sockbuf's low/high marks */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > hiwat) {
			lowwat = hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data > 0 && data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/*
			 * TCP (and MPTCP) may defer writability until the
			 * amount of unsent data drops below its own
			 * not-sent low-water mark.
			 */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* suppress writability while waiting for interface feedback */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6996
6997 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6998 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6999 {
7000 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7001
7002 /* socket locked */
7003 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
7004 so->so_snd.sb_flags |= SB_KNOTE;
7005 }
7006
7007 /* determine if its already fired */
7008 return filt_sowrite_common(kn, NULL, so);
7009 }
7010
7011 static void
filt_sowdetach(struct knote * kn)7012 filt_sowdetach(struct knote *kn)
7013 {
7014 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7015 socket_lock(so, 1);
7016
7017 if (so->so_snd.sb_flags & SB_KNOTE) {
7018 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
7019 so->so_snd.sb_flags &= ~SB_KNOTE;
7020 }
7021 }
7022 socket_unlock(so, 1);
7023 }
7024
7025 /*ARGSUSED*/
7026 static int
filt_sowrite(struct knote * kn,long hint)7027 filt_sowrite(struct knote *kn, long hint)
7028 {
7029 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7030 int ret;
7031
7032 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7033 socket_lock(so, 1);
7034 }
7035
7036 ret = filt_sowrite_common(kn, NULL, so);
7037
7038 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7039 socket_unlock(so, 1);
7040 }
7041
7042 return ret;
7043 }
7044
7045 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)7046 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
7047 {
7048 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7049 int ret;
7050
7051 socket_lock(so, 1);
7052
7053 /*save off the new input fflags and data */
7054 kn->kn_sfflags = kev->fflags;
7055 kn->kn_sdata = kev->data;
7056
7057 /* determine if these changes result in a triggered event */
7058 ret = filt_sowrite_common(kn, NULL, so);
7059
7060 socket_unlock(so, 1);
7061
7062 return ret;
7063 }
7064
7065 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)7066 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
7067 {
7068 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7069 int ret;
7070
7071 socket_lock(so, 1);
7072 ret = filt_sowrite_common(kn, kev, so);
7073 socket_unlock(so, 1);
7074
7075 return ret;
7076 }
7077
/*
 * Common EVFILT_SOCK evaluation.
 *
 * Merges the one-shot hint bits (`ev_hint') and the socket's current
 * state into kn_fflags, then decides whether the knote fires.  Events
 * that reflect persistent state (connected/disconnected, read/write
 * closed, suspend/resume) are level-triggered: once delivered they are
 * remembered in kn_hook32 so the application is not woken repeatedly
 * while the condition persists.  When `kev' is non-NULL and the event
 * fired, the kevent is filled in with `data' (the socket error, or a
 * connection-state snapshot from get_sockev_state()).
 * Called with the socket locked.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* translate edge-triggered hint bits into note flags */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/*
	 * The following conditions are derived from the socket state as
	 * well as the hint, so they are level-triggered.
	 */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		/* only protocols that publish connection info report this */
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}
	if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
		kn->kn_fflags |= NOTE_WAKE_PKT;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* NOTE_SUSPEND and NOTE_RESUME are mutually exclusive */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* a pending socket error always fires and is reported as data */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7218
7219 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7220 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7221 {
7222 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7223
7224 /* socket locked */
7225 kn->kn_hook32 = 0;
7226 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7227 so->so_flags |= SOF_KNOTE;
7228 }
7229
7230 /* determine if event already fired */
7231 return filt_sockev_common(kn, NULL, so, 0);
7232 }
7233
7234 static void
filt_sockdetach(struct knote * kn)7235 filt_sockdetach(struct knote *kn)
7236 {
7237 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7238 socket_lock(so, 1);
7239
7240 if ((so->so_flags & SOF_KNOTE) != 0) {
7241 if (KNOTE_DETACH(&so->so_klist, kn)) {
7242 so->so_flags &= ~SOF_KNOTE;
7243 }
7244 }
7245 socket_unlock(so, 1);
7246 }
7247
7248 static int
filt_sockev(struct knote * kn,long hint)7249 filt_sockev(struct knote *kn, long hint)
7250 {
7251 int ret = 0, locked = 0;
7252 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7253 long ev_hint = (hint & SO_FILT_HINT_EV);
7254
7255 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7256 socket_lock(so, 1);
7257 locked = 1;
7258 }
7259
7260 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7261
7262 if (locked) {
7263 socket_unlock(so, 1);
7264 }
7265
7266 return ret;
7267 }
7268
7269
7270
7271 /*
7272 * filt_socktouch - update event state
7273 */
7274 static int
filt_socktouch(struct knote * kn,struct kevent_qos_s * kev)7275 filt_socktouch(
7276 struct knote *kn,
7277 struct kevent_qos_s *kev)
7278 {
7279 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7280 uint32_t changed_flags;
7281 int ret;
7282
7283 socket_lock(so, 1);
7284
7285 /* save off the [result] data and fflags */
7286 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7287
7288 /* save off the new input fflags and data */
7289 kn->kn_sfflags = kev->fflags;
7290 kn->kn_sdata = kev->data;
7291
7292 /* restrict the current results to the (smaller?) set of new interest */
7293 /*
7294 * For compatibility with previous implementations, we leave kn_fflags
7295 * as they were before.
7296 */
7297 //kn->kn_fflags &= kev->fflags;
7298
7299 /*
7300 * Since we keep track of events that are already
7301 * delivered, if any of those events are not requested
7302 * anymore the state related to them can be reset
7303 */
7304 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7305
7306 /* determine if we have events to deliver */
7307 ret = filt_sockev_common(kn, NULL, so, 0);
7308
7309 socket_unlock(so, 1);
7310
7311 return ret;
7312 }
7313
7314 /*
7315 * filt_sockprocess - query event fired state and return data
7316 */
7317 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7318 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7319 {
7320 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7321 int ret = 0;
7322
7323 socket_lock(so, 1);
7324
7325 ret = filt_sockev_common(kn, kev, so, 0);
7326
7327 socket_unlock(so, 1);
7328
7329 return ret;
7330 }
7331
7332 void
get_sockev_state(struct socket * so,u_int32_t * statep)7333 get_sockev_state(struct socket *so, u_int32_t *statep)
7334 {
7335 u_int32_t state = *(statep);
7336
7337 /*
7338 * If the state variable is already used by a previous event,
7339 * reset it.
7340 */
7341 if (state != 0) {
7342 return;
7343 }
7344
7345 if (so->so_state & SS_ISCONNECTED) {
7346 state |= SOCKEV_CONNECTED;
7347 } else {
7348 state &= ~(SOCKEV_CONNECTED);
7349 }
7350 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7351 *(statep) = state;
7352 }
7353
7354 #define SO_LOCK_HISTORY_STR_LEN \
7355 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7356
7357 __private_extern__ const char *
solockhistory_nr(struct socket * so)7358 solockhistory_nr(struct socket *so)
7359 {
7360 size_t n = 0;
7361 int i;
7362 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7363
7364 bzero(lock_history_str, sizeof(lock_history_str));
7365 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7366 n += scnprintf(lock_history_str + n,
7367 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7368 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7369 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7370 }
7371 return lock_history_str;
7372 }
7373
7374 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7375 socket_getlock(struct socket *so, int flags)
7376 {
7377 if (so->so_proto->pr_getlock != NULL) {
7378 return (*so->so_proto->pr_getlock)(so, flags);
7379 } else {
7380 return so->so_proto->pr_domain->dom_mtx;
7381 }
7382 }
7383
/*
 * Lock a socket; when `refcount' is non-zero also take a use-count
 * reference on it.  Protocols with their own pr_lock routine handle
 * both; otherwise the domain mutex is used and the caller's return
 * address is recorded in the lock-debugging ring buffer.
 */
void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	/* remember who locked us, for solockhistory_nr() diagnostics */
	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		/* ring buffer of the SO_LCKDBG_MAX most recent lockers */
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7406
7407 void
socket_lock_assert_owned(struct socket * so)7408 socket_lock_assert_owned(struct socket *so)
7409 {
7410 lck_mtx_t *mutex_held;
7411
7412 if (so->so_proto->pr_getlock != NULL) {
7413 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7414 } else {
7415 mutex_held = so->so_proto->pr_domain->dom_mtx;
7416 }
7417
7418 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7419 }
7420
7421 int
socket_try_lock(struct socket * so)7422 socket_try_lock(struct socket *so)
7423 {
7424 lck_mtx_t *mtx;
7425
7426 if (so->so_proto->pr_getlock != NULL) {
7427 mtx = (*so->so_proto->pr_getlock)(so, 0);
7428 } else {
7429 mtx = so->so_proto->pr_domain->dom_mtx;
7430 }
7431
7432 return lck_mtx_try_lock(mtx);
7433 }
7434
/*
 * Unlock a socket; when `refcount' is non-zero also drop a use-count
 * reference, freeing the socket when the last reference goes away.
 * Mirrors socket_lock(): protocols with their own pr_unlock routine
 * handle both; otherwise the domain mutex is released here and the
 * caller's return address is recorded for diagnostics.
 */
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	/* remember who unlocked us, for solockhistory_nr() diagnostics */
	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		/* ring buffer of the SO_LCKDBG_MAX most recent unlockers */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			/* last reference gone: free before dropping the mutex */
			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7475
7476 /* Called with socket locked, will unlock socket */
7477 void
sofree(struct socket * so)7478 sofree(struct socket *so)
7479 {
7480 lck_mtx_t *mutex_held;
7481
7482 if (so->so_proto->pr_getlock != NULL) {
7483 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7484 } else {
7485 mutex_held = so->so_proto->pr_domain->dom_mtx;
7486 }
7487 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7488
7489 sofreelastref(so, 0);
7490 }
7491
/*
 * Take a use-count reference on the socket: lock with refcount (which
 * bumps so_usecount), then unlock without dropping the reference.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7498
/*
 * Drop a use-count reference on the socket: lock without taking a
 * reference, then unlock with refcount (which drops so_usecount and
 * may free the socket on the last reference).
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
7505
7506 /*
7507 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7508 * possibility of using jumbo clusters. Caller must ensure to hold
7509 * the socket lock.
7510 */
7511 void
somultipages(struct socket * so,boolean_t set)7512 somultipages(struct socket *so, boolean_t set)
7513 {
7514 if (set) {
7515 so->so_flags |= SOF_MULTIPAGES;
7516 } else {
7517 so->so_flags &= ~SOF_MULTIPAGES;
7518 }
7519 }
7520
7521 void
soif2kcl(struct socket * so,boolean_t set)7522 soif2kcl(struct socket *so, boolean_t set)
7523 {
7524 if (set) {
7525 so->so_flags1 |= SOF1_IF_2KCL;
7526 } else {
7527 so->so_flags1 &= ~SOF1_IF_2KCL;
7528 }
7529 }
7530
7531 int
so_isdstlocal(struct socket * so)7532 so_isdstlocal(struct socket *so)
7533 {
7534 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7535
7536 if (SOCK_DOM(so) == PF_INET) {
7537 return inaddr_local(inp->inp_faddr);
7538 } else if (SOCK_DOM(so) == PF_INET6) {
7539 return in6addr_local(&inp->in6p_faddr);
7540 }
7541
7542 return 0;
7543 }
7544
/*
 * First phase of making a socket defunct: decide eligibility and mark
 * the socket.  Returns 0 on success (or if already defunct) and
 * EOPNOTSUPP when the socket is exempt (SOF_NODEFUNCT with `noforce',
 * or granted an extended background-idle grace period).  On success
 * SOF_DEFUNCT is set, SB_DROP is set on both sockbufs and any queued
 * data is flushed; sodefunct() performs the actual teardown.
 * Called with the socket locked.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* already defunct: both sockbufs must already be dropping data */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			/* socket opted out and we may not force it */
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    so->so_gencnt,
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* forced: strip the opt-out flag and proceed */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/* socket asked for an extended background-idle grace period */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			/* no grace period over cellular */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* grant the grace period and schedule its expiry */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    so->so_gencnt,
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt, SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7654
/*
 * Second phase of making a socket defunct: tear the connection down.
 * Must be called after sosetdefunct() has set SOF_DEFUNCT (panics
 * otherwise); idempotent once SS_DEFUNCT is set.  Shuts down both
 * directions, disconnects, flushes both sockbufs and leaves EBADF in
 * so_error so blocked callers fail out.  Called with the socket locked.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				so->so_gencnt,
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * First tell the protocol the flow is defunct
	 */
	(void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	/* make subsequent operations on the socket fail */
	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7767
/*
 * Take the socket out of the extended background-idle grace period,
 * if one is in progress, and update the per-process flag and global
 * statistics.  `locked' indicates whether the caller already holds
 * the socket lock.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		/* clear the in-progress state and the per-process marker */
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7797
7798 /*
7799 * Does not attempt to account for sockets that are delegated from
7800 * the current process
7801 */
7802 int
so_set_extended_bk_idle(struct socket * so,int optval)7803 so_set_extended_bk_idle(struct socket *so, int optval)
7804 {
7805 int error = 0;
7806
7807 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7808 SOCK_PROTO(so) != IPPROTO_TCP) {
7809 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7810 error = EOPNOTSUPP;
7811 } else if (optval == 0) {
7812 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7813
7814 soresume(current_proc(), so, 1);
7815 } else {
7816 struct proc *p = current_proc();
7817 struct fileproc *fp;
7818 int count = 0;
7819
7820 /*
7821 * Unlock socket to avoid lock ordering issue with
7822 * the proc fd table lock
7823 */
7824 socket_unlock(so, 0);
7825
7826 proc_fdlock(p);
7827 fdt_foreach(fp, p) {
7828 struct socket *so2;
7829
7830 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7831 continue;
7832 }
7833
7834 so2 = (struct socket *)fp_get_data(fp);
7835 if (so != so2 &&
7836 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7837 count++;
7838 }
7839 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7840 break;
7841 }
7842 }
7843 proc_fdunlock(p);
7844
7845 socket_lock(so, 0);
7846
7847 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7848 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7849 error = EBUSY;
7850 } else if (so->so_flags & SOF_DELEGATED) {
7851 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7852 error = EBUSY;
7853 } else {
7854 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7855 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7856 }
7857 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7858 "%s marked for extended bk idle\n",
7859 __func__, proc_selfpid(), proc_best_name(current_proc()),
7860 so->so_gencnt,
7861 SOCK_DOM(so), SOCK_TYPE(so),
7862 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7863 "is" : "not");
7864 }
7865
7866 return error;
7867 }
7868
/*
 * End the socket's extended background-idle grace period immediately
 * and force it into the defunct state.  Called with the socket locked.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7887
7888 void
so_drain_extended_bk_idle(struct socket * so)7889 so_drain_extended_bk_idle(struct socket *so)
7890 {
7891 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7892 /*
7893 * Only penalize sockets that have outstanding data
7894 */
7895 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7896 so_stop_extended_bk_idle(so);
7897
7898 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7899 }
7900 }
7901 }
7902
7903 /*
7904 * Return values tells if socket is still in extended background idle
7905 */
7906 int
so_check_extended_bk_idle_time(struct socket * so)7907 so_check_extended_bk_idle_time(struct socket *so)
7908 {
7909 int ret = 1;
7910
7911 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7912 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7913 __func__, proc_selfpid(), proc_best_name(current_proc()),
7914 so->so_gencnt,
7915 SOCK_DOM(so), SOCK_TYPE(so));
7916 if (net_uptime() - so->so_extended_bk_start >
7917 soextbkidlestat.so_xbkidle_time) {
7918 so_stop_extended_bk_idle(so);
7919
7920 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7921
7922 ret = 0;
7923 } else {
7924 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7925
7926 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7927 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7928 }
7929 }
7930
7931 return ret;
7932 }
7933
7934 void
resume_proc_sockets(proc_t p)7935 resume_proc_sockets(proc_t p)
7936 {
7937 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7938 struct fileproc *fp;
7939 struct socket *so;
7940
7941 proc_fdlock(p);
7942 fdt_foreach(fp, p) {
7943 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7944 continue;
7945 }
7946
7947 so = (struct socket *)fp_get_data(fp);
7948 (void) soresume(p, so, 0);
7949 }
7950 proc_fdunlock(p);
7951
7952 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7953 }
7954 }
7955
7956 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7957 so_set_recv_anyif(struct socket *so, int optval)
7958 {
7959 int ret = 0;
7960
7961 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7962 if (optval) {
7963 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7964 } else {
7965 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7966 }
7967 #if SKYWALK
7968 inp_update_netns_flags(so);
7969 #endif /* SKYWALK */
7970 }
7971
7972
7973 return ret;
7974 }
7975
7976 __private_extern__ int
so_get_recv_anyif(struct socket * so)7977 so_get_recv_anyif(struct socket *so)
7978 {
7979 int ret = 0;
7980
7981 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7982 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7983 }
7984
7985 return ret;
7986 }
7987
/*
 * Apply deny-type restrictions (SO_RESTRICT_DENY_*) to a socket.
 * Restrictions are one-way: bits can be set but never cleared for the
 * lifetime of the socket.  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	/* Snapshot the masked bits before OR-ing in the new values */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/*
	 * we can only set, not clear restrictions
	 * (new - old != 0 means the corresponding bit was newly set)
	 */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	/* Propagate restrictions to the subflows of a multipath socket */
	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
8046
/*
 * Return the deny-type restriction bits currently set on the socket.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED can be set via
 * so_set_restrictions() but is absent from this mask, so it is never
 * reported back -- confirm whether that omission is intentional.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
8054
/*
 * Associate an effective (delegate) process with a socket, by pid.
 *
 * so         - the socket being delegated
 * epid       - pid of the effective process; 0 is rejected (reserved
 *              for the kernel)
 * p          - the process issuing the socket option (may be kernproc)
 * check_cred - when TRUE, require PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *              unless epid matches the socket's recorded owner or p's
 *              own pid
 *
 * Returns 0 on success; EINVAL (epid == 0), EACCES (in-kernel socket
 * or missing privilege), or ESRCH (no such process) on failure.
 * On success the socket's NECP/policy state is re-evaluated.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		/* Self-delegation clears the delegate association */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		/* Record the effective process's identity on the socket */
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/* Track the responsible process's identity, if distinct */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	/* Let the protocol update its notion of the socket owner */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Negating the gencnt forces so_update_policy() to re-evaluate */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	/* Drop the reference taken by proc_find() above */
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
8170
/*
 * Associate an effective (delegate) process with a socket, by UUID.
 *
 * so         - the socket being delegated
 * euuid      - executable UUID of the effective process; the null UUID
 *              is rejected (reserved for the kernel)
 * p          - the process issuing the socket option (may be kernproc)
 * check_cred - when TRUE, require PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *              unless euuid matches the socket's recorded owner UUID
 *              or p's own executable UUID
 *
 * Returns 0 on success; EINVAL (null UUID) or EACCES (in-kernel socket
 * or missing privilege) on failure.  On success the socket's
 * NECP/policy state is re-evaluated.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		/* Self-delegation clears the delegate association */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Negating the gencnt forces so_update_policy() to re-evaluate */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8274
8275 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8276 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8277 uint32_t ev_datalen)
8278 {
8279 struct kev_msg ev_msg;
8280
8281 /*
8282 * A netpolicy event always starts with a netpolicy_event_data
8283 * structure, but the caller can provide for a longer event
8284 * structure to post, depending on the event code.
8285 */
8286 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8287
8288 bzero(&ev_msg, sizeof(ev_msg));
8289 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8290 ev_msg.kev_class = KEV_NETWORK_CLASS;
8291 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8292 ev_msg.event_code = ev_code;
8293
8294 ev_msg.dv[0].data_ptr = ev_data;
8295 ev_msg.dv[0].data_length = ev_datalen;
8296
8297 kev_post_msg(&ev_msg);
8298 }
8299
8300 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8301 socket_post_kev_msg(uint32_t ev_code,
8302 struct kev_socket_event_data *ev_data,
8303 uint32_t ev_datalen)
8304 {
8305 struct kev_msg ev_msg;
8306
8307 bzero(&ev_msg, sizeof(ev_msg));
8308 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8309 ev_msg.kev_class = KEV_NETWORK_CLASS;
8310 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8311 ev_msg.event_code = ev_code;
8312
8313 ev_msg.dv[0].data_ptr = ev_data;
8314 ev_msg.dv[0].data_length = ev_datalen;
8315
8316 kev_post_msg(&ev_msg);
8317 }
8318
/*
 * Post a KEV_SOCKET_CLOSED kernel event carrying the socket's local and
 * peer addresses, for sockets that opted in via SOF1_WANT_KEV_SOCK_CLOSED.
 * Best effort: if either address lookup fails, no event is posted.
 */
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	/* Fetch the local address (allocated into socksa on success) */
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		/* Fetch the peer address (allocated into peersa on success) */
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* Copy at most the event buffer size of each address */
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			/*
			 * NOTE(review): the length passed is sizeof(ev), not
			 * sizeof(ev.ev_data); these only agree if ev_data is
			 * effectively the whole of kev_socket_closed -- confirm
			 * against the struct definition.
			 */
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* free_sockaddr() handles NULL, so unconditional cleanup is safe */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8347