1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #include <os/log.h>
127
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif /* CONFIG_MACF */
131
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136
137 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
138
139 #if DEBUG || DEVELOPMENT
140 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
141 #else
142 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 #endif
144
145 /* TODO: this should be in a header file somewhere */
146 extern char *proc_name_address(void *p);
147
148 static u_int32_t so_cache_hw; /* High water mark for socache */
149 static u_int32_t so_cache_timeouts; /* number of timeouts */
150 static u_int32_t so_cache_max_freed; /* max freed per timeout */
151 static u_int32_t cached_sock_count = 0;
152 STAILQ_HEAD(, socket) so_cache_head;
153 int max_cached_sock_count = MAX_CACHED_SOCKETS;
154 static uint64_t so_cache_time;
155 static int socketinit_done;
156 static struct zone *so_cache_zone;
157
158 static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
159 static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
160
161
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending down packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger than 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface.
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
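
/*
 * A minimal userland sketch (not part of this file) showing how these
 * jumbo-cluster knobs can be inspected through the standard
 * sysctlbyname(3) interface; the knob name comes from the SYSCTL_INT
 * declarations above:
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int val;
 *		size_t len = sizeof(val);
 *
 *		// Read the current value of the sosendjcl toggle.
 *		if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0)
 *			printf("kern.ipc.sosendjcl = %d\n", val);
 *		else
 *			perror("sysctlbyname");
 *		return 0;
 *	}
 */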
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
321 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329
330 extern struct inpcbinfo tcbinfo;
331
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
341 static void cached_sock_free(struct socket *);
342
343 /*
344 * Maximum number of extended background idle sockets per process.
345 * Set to zero to disable further setting of the option.
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371
372 /*
373 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374 * setting the DSCP code on the packet based on the service class; see
375 * <rdar://problem/11277343> for details.
376 */
377 __private_extern__ u_int32_t sotcdb = 0;
378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379 &sotcdb, 0, "");
380
381 void
382 socketinit(void)
383 {
384 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
385 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
386
387 #ifdef __LP64__
388 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
389 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
392 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
394 #else
395 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
396 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
399 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
401 #endif
402
403 if (socketinit_done) {
404 printf("socketinit: already called...\n");
405 return;
406 }
407 socketinit_done = 1;
408
409 PE_parse_boot_argn("socket_debug", &socket_debug,
410 sizeof(socket_debug));
411
412 STAILQ_INIT(&so_cache_head);
413
414 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
415 + get_inpcb_str_size() + 4 + get_tcp_str_size());
416
417 so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
418 ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
419
420 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
421 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
422 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
423 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
424
425 in_pcbinit();
426 }
427
428 static void
429 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
430 {
431 caddr_t temp;
432 uintptr_t offset;
433
434 lck_mtx_lock(&so_cache_mtx);
435
436 if (!STAILQ_EMPTY(&so_cache_head)) {
437 VERIFY(cached_sock_count > 0);
438
439 *so = STAILQ_FIRST(&so_cache_head);
440 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
441 STAILQ_NEXT((*so), so_cache_ent) = NULL;
442
443 cached_sock_count--;
444 lck_mtx_unlock(&so_cache_mtx);
445
446 temp = (*so)->so_saved_pcb;
447 bzero((caddr_t)*so, sizeof(struct socket));
448
449 (*so)->so_saved_pcb = temp;
450 } else {
451 lck_mtx_unlock(&so_cache_mtx);
452
453 *so = zalloc_flags(so_cache_zone, how | Z_ZERO);
454
455 /*
456 * Define offsets for extra structures into our
457 * single block of memory. Align extra structures
458 * on longword boundaries.
459 */
460
461 offset = (uintptr_t)*so;
462 offset += sizeof(struct socket);
463
464 offset = ALIGN(offset);
465
466 (*so)->so_saved_pcb = (caddr_t)offset;
467 offset += get_inpcb_str_size();
468
469 offset = ALIGN(offset);
470
471 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
472 (caddr_t)offset;
473 }
474
475 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
476 }
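
/*
 * cached_sock_alloc() carves one zone element into a socket followed by
 * saved PCB storage at aligned offsets. A standalone sketch of the same
 * pattern (assumptions: malloc in place of zalloc, toy sizes, and the
 * ROUNDUP macro defined earlier in this file standing in for ALIGN):
 *
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *
 *	#define ROUNDUP(a, b)	(((a) + ((b) - 1)) & (~((b) - 1)))
 *	#define TOY_SOCK_SIZE	96
 *	#define TOY_PCB_SIZE	64
 *
 *	static void *
 *	carve(void **pcb, void **ppcb)
 *	{
 *		// One block holds all three regions, like so_cache_zone.
 *		char *base = calloc(1, TOY_SOCK_SIZE + TOY_PCB_SIZE + 64);
 *		uintptr_t off = (uintptr_t)base + TOY_SOCK_SIZE;
 *
 *		off = ROUNDUP(off, sizeof(uintptr_t));	// longword boundary
 *		*pcb = (void *)off;
 *		off += TOY_PCB_SIZE;
 *		off = ROUNDUP(off, sizeof(uintptr_t));
 *		*ppcb = (void *)off;
 *		return base;	// caller frees base only
 *	}
 */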
477
478 static void
479 cached_sock_free(struct socket *so)
480 {
481 lck_mtx_lock(&so_cache_mtx);
482
483 so_cache_time = net_uptime();
484 if (++cached_sock_count > max_cached_sock_count) {
485 --cached_sock_count;
486 lck_mtx_unlock(&so_cache_mtx);
487 zfree(so_cache_zone, so);
488 } else {
489 if (so_cache_hw < cached_sock_count) {
490 so_cache_hw = cached_sock_count;
491 }
492
493 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
494
495 so->cache_timestamp = so_cache_time;
496 lck_mtx_unlock(&so_cache_mtx);
497 }
498 }
499
500 void
501 so_update_last_owner_locked(struct socket *so, proc_t self)
502 {
503 if (so->last_pid != 0) {
504 /*
505 * last_pid and last_upid should remain zero for sockets
506 * created using sock_socket. The check above achieves that
507 */
508 if (self == PROC_NULL) {
509 self = current_proc();
510 }
511
512 if (so->last_upid != proc_uniqueid(self) ||
513 so->last_pid != proc_pid(self)) {
514 so->last_upid = proc_uniqueid(self);
515 so->last_pid = proc_pid(self);
516 proc_getexecutableuuid(self, so->last_uuid,
517 sizeof(so->last_uuid));
518 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
519 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
520 }
521 }
522 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
523 }
524 }
525
526 void
527 so_update_policy(struct socket *so)
528 {
529 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
530 (void) inp_update_policy(sotoinpcb(so));
531 }
532 }
533
534 #if NECP
535 static void
536 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
537 struct sockaddr *override_remote_addr)
538 {
539 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
540 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
541 override_remote_addr, 0);
542 }
543 }
544 #endif /* NECP */
545
546 boolean_t
547 so_cache_timer(void)
548 {
549 struct socket *p;
550 int n_freed = 0;
551 boolean_t rc = FALSE;
552
553 lck_mtx_lock(&so_cache_mtx);
554 so_cache_timeouts++;
555 so_cache_time = net_uptime();
556
557 while (!STAILQ_EMPTY(&so_cache_head)) {
558 VERIFY(cached_sock_count > 0);
559 p = STAILQ_FIRST(&so_cache_head);
560 if ((so_cache_time - p->cache_timestamp) <
561 SO_CACHE_TIME_LIMIT) {
562 break;
563 }
564
565 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
566 --cached_sock_count;
567
568 zfree(so_cache_zone, p);
569
570 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
571 so_cache_max_freed++;
572 break;
573 }
574 }
575
576 /* Schedule again if there is more to cleanup */
577 if (!STAILQ_EMPTY(&so_cache_head)) {
578 rc = TRUE;
579 }
580
581 lck_mtx_unlock(&so_cache_mtx);
582 return rc;
583 }
584
585 /*
586 * Get a socket structure from our zone, and initialize it.
587 * We don't implement `waitok' yet (see comments in uipc_domain.c).
588 * Note that it would probably be better to allocate socket
589 * and PCB at the same time, but I'm not convinced that all
590 * the protocols can be easily modified to do this.
591 */
592 struct socket *
593 soalloc(int waitok, int dom, int type)
594 {
595 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
596 struct socket *so;
597
598 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
599 cached_sock_alloc(&so, how);
600 } else {
601 so = zalloc_flags(socket_zone, how | Z_ZERO);
602 }
603 if (so != NULL) {
604 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
605
606 /*
607 * Increment the socket allocation statistics
608 */
609 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
610 }
611
612 return so;
613 }
614
615 int
616 socreate_internal(int dom, struct socket **aso, int type, int proto,
617 struct proc *p, uint32_t flags, struct proc *ep)
618 {
619 struct protosw *prp;
620 struct socket *so;
621 int error = 0;
622 #if defined(XNU_TARGET_OS_OSX)
623 pid_t rpid = -1;
624 #endif
625
626 #if TCPDEBUG
627 extern int tcpconsdebug;
628 #endif
629
630 VERIFY(aso != NULL);
631 *aso = NULL;
632
633 if (proto != 0) {
634 prp = pffindproto(dom, proto, type);
635 } else {
636 prp = pffindtype(dom, type);
637 }
638
639 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
640 if (pffinddomain(dom) == NULL) {
641 return EAFNOSUPPORT;
642 }
643 if (proto != 0) {
644 if (pffindprotonotype(dom, proto) != NULL) {
645 return EPROTOTYPE;
646 }
647 }
648 return EPROTONOSUPPORT;
649 }
650 if (prp->pr_type != type) {
651 return EPROTOTYPE;
652 }
653 so = soalloc(1, dom, type);
654 if (so == NULL) {
655 return ENOBUFS;
656 }
657
658 switch (dom) {
659 case PF_LOCAL:
660 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
661 break;
662 case PF_INET:
663 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
664 if (type == SOCK_STREAM) {
665 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
666 } else {
667 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
668 }
669 break;
670 case PF_ROUTE:
671 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
672 break;
673 case PF_NDRV:
674 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
675 break;
676 case PF_KEY:
677 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
678 break;
679 case PF_INET6:
680 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
681 if (type == SOCK_STREAM) {
682 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
683 } else {
684 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
685 }
686 break;
687 case PF_SYSTEM:
688 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
689 break;
690 case PF_MULTIPATH:
691 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
692 break;
693 default:
694 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
695 break;
696 }
697
698 if (flags & SOCF_MPTCP) {
699 so->so_state |= SS_NBIO;
700 }
701
702 TAILQ_INIT(&so->so_incomp);
703 TAILQ_INIT(&so->so_comp);
704 so->so_type = (short)type;
705 so->last_upid = proc_uniqueid(p);
706 so->last_pid = proc_pid(p);
707 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
708 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
709
710 if (ep != PROC_NULL && ep != p) {
711 so->e_upid = proc_uniqueid(ep);
712 so->e_pid = proc_pid(ep);
713 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
714 so->so_flags |= SOF_DELEGATED;
715 #if defined(XNU_TARGET_OS_OSX)
716 if (ep->p_responsible_pid != so->e_pid) {
717 rpid = ep->p_responsible_pid;
718 }
719 #endif
720 }
721
722 #if defined(XNU_TARGET_OS_OSX)
723 if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
724 rpid = p->p_responsible_pid;
725 }
726
727 so->so_rpid = -1;
728 uuid_clear(so->so_ruuid);
729 if (rpid >= 0) {
730 proc_t rp = proc_find(rpid);
731 if (rp != PROC_NULL) {
732 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
733 so->so_rpid = rpid;
734 proc_rele(rp);
735 }
736 }
737 #endif
738
739 so->so_cred = kauth_cred_proc_ref(p);
740 if (!suser(kauth_cred_get(), NULL)) {
741 so->so_state |= SS_PRIV;
742 }
743
744 so->so_proto = prp;
745 so->so_rcv.sb_flags |= SB_RECV;
746 so->so_rcv.sb_so = so->so_snd.sb_so = so;
747 so->next_lock_lr = 0;
748 so->next_unlock_lr = 0;
749
750 /*
751 * Attachment will create the per pcb lock if necessary and
752 * increase the refcount for creation; make sure it's done before
753 * the socket is inserted in the lists.
754 */
755 so->so_usecount++;
756
757 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
758 if (error != 0) {
759 /*
760 * Warning:
761 * If so_pcb is not zero, the socket will be leaked,
762 * so the protocol attachment handler must be coded carefully.
763 */
764 if (so->so_pcb != NULL) {
765 os_log_error(OS_LOG_DEFAULT,
766 "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
767 error, dom, proto, type);
768 }
769 /*
770 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
771 */
772 so->so_state |= SS_NOFDREF;
773 so->so_flags |= SOF_PCBCLEARING;
774 VERIFY(so->so_usecount > 0);
775 so->so_usecount--;
776 sofreelastref(so, 1); /* will deallocate the socket */
777 return error;
778 }
779
780 /*
781 * Note: needs so_pcb to be set after pru_attach
782 */
783 if (prp->pr_update_last_owner != NULL) {
784 (*prp->pr_update_last_owner)(so, p, ep);
785 }
786
787 atomic_add_32(&prp->pr_domain->dom_refs, 1);
788
789 /* Attach socket filters for this protocol */
790 sflt_initsock(so);
791 #if TCPDEBUG
792 if (tcpconsdebug == 2) {
793 so->so_options |= SO_DEBUG;
794 }
795 #endif
796 so_set_default_traffic_class(so);
797
798 /*
799 * If this thread or task is marked to create backgrounded sockets,
800 * mark the socket as background.
801 */
802 if (!(flags & SOCF_MPTCP) &&
803 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
804 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
805 so->so_background_thread = current_thread();
806 }
807
808 switch (dom) {
809 /*
810 * Don't mark Unix domain or system
811 * eligible for defunct by default.
812 */
813 case PF_LOCAL:
814 case PF_SYSTEM:
815 so->so_flags |= SOF_NODEFUNCT;
816 break;
817 default:
818 break;
819 }
820
821 /*
822 * Entitlements can't be checked at socket creation time except if the
823 * application requested a feature guarded by a privilege (c.f., socket
824 * delegation).
825 * The priv(9) and the Sandboxing APIs are designed with the idea that
826 * a privilege check should only be triggered by a userland request.
827 * A privilege check at socket creation time is time consuming and
828 * could trigger many authorisation error messages from the security
829 * APIs.
830 */
831
832 *aso = so;
833
834 return 0;
835 }
836
837 /*
838 * Returns: 0 Success
839 * EAFNOSUPPORT
840 * EPROTOTYPE
841 * EPROTONOSUPPORT
842 * ENOBUFS
843 * <pru_attach>:ENOBUFS[AF_UNIX]
844 * <pru_attach>:ENOBUFS[TCP]
845 * <pru_attach>:ENOMEM[TCP]
846 * <pru_attach>:??? [other protocol families, IPSEC]
847 */
848 int
849 socreate(int dom, struct socket **aso, int type, int proto)
850 {
851 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
852 PROC_NULL);
853 }
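
/*
 * socreate() is the kernel-side backend of the socket(2) system call; the
 * errno values enumerated above are what userland sees. A minimal
 * counterpart sketch:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int s = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (s == -1) {
 *			// e.g. EAFNOSUPPORT or EPROTONOSUPPORT from above
 *			fprintf(stderr, "socket: %s\n", strerror(errno));
 *			return 1;
 *		}
 *		close(s);
 *		return 0;
 *	}
 */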
854
855 int
856 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
857 {
858 int error = 0;
859 struct proc *ep = PROC_NULL;
860
861 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
862 error = ESRCH;
863 goto done;
864 }
865
866 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
867
868 /*
869 * It might not be wise to hold the proc reference when calling
870 * socreate_internal since it calls soalloc with M_WAITOK
871 */
872 done:
873 if (ep != PROC_NULL) {
874 proc_rele(ep);
875 }
876
877 return error;
878 }
879
880 /*
881 * Returns: 0 Success
882 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
883 * <pru_bind>:EAFNOSUPPORT Address family not supported
884 * <pru_bind>:EADDRNOTAVAIL Address not available.
885 * <pru_bind>:EINVAL Invalid argument
886 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
887 * <pru_bind>:EACCES Permission denied
888 * <pru_bind>:EADDRINUSE Address in use
889 * <pru_bind>:EAGAIN Resource unavailable, try again
890 * <pru_bind>:EPERM Operation not permitted
891 * <pru_bind>:???
892 * <sf_bind>:???
893 *
894 * Notes: It's not possible to fully enumerate the return codes above,
895 * since socket filter authors and protocol family authors may
896 * not choose to limit their error returns to those listed, even
897 * though this may result in some software operating incorrectly.
898 *
899 * The error codes which are enumerated above are those known to
900 * be returned by the tcp_usr_bind function supplied.
901 */
902 int
903 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
904 {
905 struct proc *p = current_proc();
906 int error = 0;
907
908 if (dolock) {
909 socket_lock(so, 1);
910 }
911
912 so_update_last_owner_locked(so, p);
913 so_update_policy(so);
914
915 #if NECP
916 so_update_necp_policy(so, nam, NULL);
917 #endif /* NECP */
918
919 /*
920 * If this is a bind request on a socket that has been marked
921 * as inactive, reject it now before we go any further.
922 */
923 if (so->so_flags & SOF_DEFUNCT) {
924 error = EINVAL;
925 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
926 __func__, proc_pid(p), proc_best_name(p),
927 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
928 SOCK_DOM(so), SOCK_TYPE(so), error);
929 goto out;
930 }
931
932 /* Socket filter */
933 error = sflt_bind(so, nam);
934
935 if (error == 0) {
936 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
937 }
938 out:
939 if (dolock) {
940 socket_unlock(so, 1);
941 }
942
943 if (error == EJUSTRETURN) {
944 error = 0;
945 }
946
947 return error;
948 }
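
/*
 * A userland sketch (illustration only) exercising the <pru_bind> error
 * returns listed above; binding two sockets to the same address/port is
 * one way to provoke EADDRINUSE:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <errno.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static int
 *	bind_loopback(int s, uint16_t port)
 *	{
 *		struct sockaddr_in sin;
 *
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_len = sizeof(sin);
 *		sin.sin_port = htons(port);
 *		sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 *		if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1) {
 *			// EADDRINUSE, EACCES, EADDRNOTAVAIL, ... as above
 *			fprintf(stderr, "bind: %s\n", strerror(errno));
 *			return -1;
 *		}
 *		return 0;
 *	}
 */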
949
950 void
951 sodealloc(struct socket *so)
952 {
953 kauth_cred_unref(&so->so_cred);
954
955 /* Remove any filters */
956 sflt_termsock(so);
957
958 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
959
960 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
961 cached_sock_free(so);
962 } else {
963 zfree(socket_zone, so);
964 }
965 }
966
967 /*
968 * Returns: 0 Success
969 * EINVAL
970 * EOPNOTSUPP
971 * <pru_listen>:EINVAL[AF_UNIX]
972 * <pru_listen>:EINVAL[TCP]
973 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
974 * <pru_listen>:EINVAL[TCP] Invalid argument
975 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
976 * <pru_listen>:EACCES[TCP] Permission denied
977 * <pru_listen>:EADDRINUSE[TCP] Address in use
978 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
979 * <pru_listen>:EPERM[TCP] Operation not permitted
980 * <sf_listen>:???
981 *
982 * Notes: Other <pru_listen> returns depend on the protocol family; all
983 * <sf_listen> returns depend on what the filter author causes
984 * their filter to return.
985 */
986 int
987 solisten(struct socket *so, int backlog)
988 {
989 struct proc *p = current_proc();
990 int error = 0;
991
992 socket_lock(so, 1);
993
994 so_update_last_owner_locked(so, p);
995 so_update_policy(so);
996
997 #if NECP
998 so_update_necp_policy(so, NULL, NULL);
999 #endif /* NECP */
1000
1001 if (so->so_proto == NULL) {
1002 error = EINVAL;
1003 goto out;
1004 }
1005 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1006 error = EOPNOTSUPP;
1007 goto out;
1008 }
1009
1010 /*
1011 * If the listen request is made on a socket that is not fully
1012 * disconnected, or on a socket that has been marked as inactive,
1013 * reject the request now.
1014 */
1015 if ((so->so_state &
1016 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1017 (so->so_flags & SOF_DEFUNCT)) {
1018 error = EINVAL;
1019 if (so->so_flags & SOF_DEFUNCT) {
1020 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1021 "(%d)\n", __func__, proc_pid(p),
1022 proc_best_name(p),
1023 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1024 SOCK_DOM(so), SOCK_TYPE(so), error);
1025 }
1026 goto out;
1027 }
1028
1029 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1030 error = EPERM;
1031 goto out;
1032 }
1033
1034 error = sflt_listen(so);
1035 if (error == 0) {
1036 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1037 }
1038
1039 if (error) {
1040 if (error == EJUSTRETURN) {
1041 error = 0;
1042 }
1043 goto out;
1044 }
1045
1046 if (TAILQ_EMPTY(&so->so_comp)) {
1047 so->so_options |= SO_ACCEPTCONN;
1048 }
1049 /*
1050 * POSIX: The implementation may have an upper limit on the length of
1051 * the listen queue, either global or per accepting socket. If backlog
1052 * exceeds this limit, the length of the listen queue is set to the
1053 * limit.
1054 *
1055 * If listen() is called with a backlog argument value that is less
1056 * than 0, the function behaves as if it had been called with a backlog
1057 * argument value of 0.
1058 *
1059 * A backlog argument of 0 may allow the socket to accept connections,
1060 * in which case the length of the listen queue may be set to an
1061 * implementation-defined minimum value.
1062 */
1063 if (backlog <= 0 || backlog > somaxconn) {
1064 backlog = somaxconn;
1065 }
1066
1067 so->so_qlimit = (short)backlog;
1068 out:
1069 socket_unlock(so, 1);
1070 return error;
1071 }
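
/*
 * Userland view of the clamping above (a sketch, not part of this file):
 * any backlog outside (0, kern.ipc.somaxconn] is silently replaced with
 * somaxconn, so listen(2) still succeeds:
 *
 *	#include <sys/socket.h>
 *
 *	// Assumes 's' is a bound, connection-oriented socket. A negative
 *	// or oversized backlog is legal; solisten() clamps it above.
 *	static int
 *	start_listening(int s)
 *	{
 *		return listen(s, -1);
 *	}
 */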
1072
1073 /*
1074 * The "accept list lock" protects the fields related to the listener queues
1075 * because we can unlock a socket to respect the lock ordering between
1076 * the listener socket and its clients sockets. The lock ordering is first to
1077 * acquire the client socket before the listener socket.
1078 *
1079 * The accept list lock serializes access to the following fields:
1080 * - of the listener socket:
1081 * - so_comp
1082 * - so_incomp
1083 * - so_qlen
1084 * - so_inqlen
1085 * - of client sockets that are in so_comp or so_incomp:
1086 * - so_head
1087 * - so_list
1088 *
1089 * As one can see, the accept list lock protects the consistency of the
1090 * linkage of the client sockets.
1091 *
1092 * Note that those fields may be read without holding the accept list lock
1093 * for a preflight provided the accept list lock is taken when committing
1094 * to take an action based on the result of the preflight. The preflight
1095 * saves the cost of doing the unlock/lock dance.
1096 */
1097 void
1098 so_acquire_accept_list(struct socket *head, struct socket *so)
1099 {
1100 lck_mtx_t *mutex_held;
1101
1102 if (head->so_proto->pr_getlock == NULL) {
1103 return;
1104 }
1105 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1106 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1107
1108 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1109 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1110 return;
1111 }
1112 if (so != NULL) {
1113 socket_unlock(so, 0);
1114 }
1115 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1116 so_accept_list_waits += 1;
1117 msleep((caddr_t)&head->so_incomp, mutex_held,
1118 PSOCK | PCATCH, __func__, NULL);
1119 }
1120 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1121 if (so != NULL) {
1122 socket_unlock(head, 0);
1123 socket_lock(so, 0);
1124 socket_lock(head, 0);
1125 }
1126 }
1127
1128 void
1129 so_release_accept_list(struct socket *head)
1130 {
1131 if (head->so_proto->pr_getlock != NULL) {
1132 lck_mtx_t *mutex_held;
1133
1134 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1135 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1136
1137 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1138 wakeup((caddr_t)&head->so_incomp);
1139 }
1140 }
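
/*
 * so_acquire_accept_list() and so_release_accept_list() form a simple
 * gate: a flag bit guarded by the socket mutex, with msleep()/wakeup()
 * as the wait/notify pair. A standalone pthread sketch of the same
 * shape (assumed names, for illustration only):
 *
 *	#include <pthread.h>
 *	#include <stdbool.h>
 *
 *	static pthread_mutex_t gate_mtx = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_cond_t gate_cv = PTHREAD_COND_INITIALIZER;
 *	static bool gate_held = false;
 *
 *	static void
 *	gate_acquire(void)	// like SOF1_ACCEPT_LIST_HELD + msleep()
 *	{
 *		pthread_mutex_lock(&gate_mtx);
 *		while (gate_held)
 *			pthread_cond_wait(&gate_cv, &gate_mtx);
 *		gate_held = true;
 *		pthread_mutex_unlock(&gate_mtx);
 *	}
 *
 *	static void
 *	gate_release(void)	// like clearing the flag + wakeup()
 *	{
 *		pthread_mutex_lock(&gate_mtx);
 *		gate_held = false;
 *		pthread_cond_broadcast(&gate_cv);
 *		pthread_mutex_unlock(&gate_mtx);
 *	}
 */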
1141
1142 void
1143 sofreelastref(struct socket *so, int dealloc)
1144 {
1145 struct socket *head = so->so_head;
1146
1147 /* Assume socket is locked */
1148
1149 #if FLOW_DIVERT
1150 if (so->so_flags & SOF_FLOW_DIVERT) {
1151 flow_divert_detach(so);
1152 }
1153 #endif /* FLOW_DIVERT */
1154
1155 #if CONTENT_FILTER
1156 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1157 cfil_sock_detach(so);
1158 }
1159 #endif /* CONTENT_FILTER */
1160
1161 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1162 soflow_detach(so);
1163 }
1164
1165 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1166 selthreadclear(&so->so_snd.sb_sel);
1167 selthreadclear(&so->so_rcv.sb_sel);
1168 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1169 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1170 so->so_event = sonullevent;
1171 return;
1172 }
1173 if (head != NULL) {
1174 /*
1175 * Need to lock the listener when the protocol has
1176 * per socket locks
1177 */
1178 if (head->so_proto->pr_getlock != NULL) {
1179 socket_lock(head, 1);
1180 so_acquire_accept_list(head, so);
1181 }
1182 if (so->so_state & SS_INCOMP) {
1183 so->so_state &= ~SS_INCOMP;
1184 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1185 head->so_incqlen--;
1186 head->so_qlen--;
1187 so->so_head = NULL;
1188
1189 if (head->so_proto->pr_getlock != NULL) {
1190 so_release_accept_list(head);
1191 socket_unlock(head, 1);
1192 }
1193 } else if (so->so_state & SS_COMP) {
1194 if (head->so_proto->pr_getlock != NULL) {
1195 so_release_accept_list(head);
1196 socket_unlock(head, 1);
1197 }
1198 /*
1199 * We must not decommission a socket that's
1200 * on the accept(2) queue. If we do, then
1201 * accept(2) may hang after select(2) indicated
1202 * that the listening socket was ready.
1203 */
1204 selthreadclear(&so->so_snd.sb_sel);
1205 selthreadclear(&so->so_rcv.sb_sel);
1206 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1207 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1208 so->so_event = sonullevent;
1209 return;
1210 } else {
1211 if (head->so_proto->pr_getlock != NULL) {
1212 so_release_accept_list(head);
1213 socket_unlock(head, 1);
1214 }
1215 printf("sofree: not queued\n");
1216 }
1217 }
1218 sowflush(so);
1219 sorflush(so);
1220
1221 /* 3932268: disable upcall */
1222 so->so_rcv.sb_flags &= ~SB_UPCALL;
1223 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1224 so->so_event = sonullevent;
1225
1226 if (dealloc) {
1227 sodealloc(so);
1228 }
1229 }
1230
1231 void
1232 soclose_wait_locked(struct socket *so)
1233 {
1234 lck_mtx_t *mutex_held;
1235
1236 if (so->so_proto->pr_getlock != NULL) {
1237 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1238 } else {
1239 mutex_held = so->so_proto->pr_domain->dom_mtx;
1240 }
1241 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1242
1243 /*
1244 * Double check here and return if there's no outstanding upcall;
1245 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1246 */
1247 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1248 return;
1249 }
1250 so->so_rcv.sb_flags &= ~SB_UPCALL;
1251 so->so_snd.sb_flags &= ~SB_UPCALL;
1252 so->so_flags |= SOF_CLOSEWAIT;
1253
1254 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1255 "soclose_wait_locked", NULL);
1256 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1257 so->so_flags &= ~SOF_CLOSEWAIT;
1258 }
1259
1260 /*
1261 * Close a socket on last file table reference removal.
1262 * Initiate disconnect if connected.
1263 * Free socket when disconnect complete.
1264 */
1265 int
1266 soclose_locked(struct socket *so)
1267 {
1268 int error = 0;
1269 struct timespec ts;
1270
1271 if (so->so_usecount == 0) {
1272 panic("soclose: so=%p refcount=0", so);
1273 /* NOTREACHED */
1274 }
1275
1276 sflt_notify(so, sock_evt_closing, NULL);
1277
1278 if (so->so_upcallusecount) {
1279 soclose_wait_locked(so);
1280 }
1281
1282 #if CONTENT_FILTER
1283 /*
1284 * We have to wait until the content filters are done
1285 */
1286 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1287 cfil_sock_close_wait(so);
1288 cfil_sock_is_closed(so);
1289 cfil_sock_detach(so);
1290 }
1291 #endif /* CONTENT_FILTER */
1292
1293 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1294 soflow_detach(so);
1295 }
1296
1297 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1298 soresume(current_proc(), so, 1);
1299 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1300 }
1301
1302 if ((so->so_options & SO_ACCEPTCONN)) {
1303 struct socket *sp, *sonext;
1304 int persocklock = 0;
1305 int incomp_overflow_only;
1306
1307 /*
1308 * We do not want new connections to be added
1309 * to the connection queues
1310 */
1311 so->so_options &= ~SO_ACCEPTCONN;
1312
1313 /*
1314 * We can drop the lock on the listener once
1315 * we've acquired the incoming list
1316 */
1317 if (so->so_proto->pr_getlock != NULL) {
1318 persocklock = 1;
1319 so_acquire_accept_list(so, NULL);
1320 socket_unlock(so, 0);
1321 }
1322 again:
1323 incomp_overflow_only = 1;
1324
1325 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1326 /*
1327 * Radar 5350314
1328 * Skip sockets thrown away by tcpdropdropblreq;
1329 * they will get cleaned up by the garbage collection.
1330 * Otherwise, remove the incomp socket from the queue
1331 * and let soabort trigger the appropriate cleanup.
1332 */
1333 if (sp->so_flags & SOF_OVERFLOW) {
1334 continue;
1335 }
1336
1337 if (persocklock != 0) {
1338 socket_lock(sp, 1);
1339 }
1340
1341 /*
1342 * Radar 27945981
1343 * The extra reference for the list ensures the
1344 * validity of the socket pointer when we perform the
1345 * unlock of the head above.
1346 */
1347 if (sp->so_state & SS_INCOMP) {
1348 sp->so_state &= ~SS_INCOMP;
1349 sp->so_head = NULL;
1350 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1351 so->so_incqlen--;
1352 so->so_qlen--;
1353
1354 (void) soabort(sp);
1355 } else {
1356 panic("%s sp %p in so_incomp but !SS_INCOMP",
1357 __func__, sp);
1358 }
1359
1360 if (persocklock != 0) {
1361 socket_unlock(sp, 1);
1362 }
1363 }
1364
1365 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1366 /* Dequeue from so_comp since sofree() won't do it */
1367 if (persocklock != 0) {
1368 socket_lock(sp, 1);
1369 }
1370
1371 if (sp->so_state & SS_COMP) {
1372 sp->so_state &= ~SS_COMP;
1373 sp->so_head = NULL;
1374 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1375 so->so_qlen--;
1376
1377 (void) soabort(sp);
1378 } else {
1379 panic("%s sp %p in so_comp but !SS_COMP",
1380 __func__, sp);
1381 }
1382
1383 if (persocklock) {
1384 socket_unlock(sp, 1);
1385 }
1386 }
1387
1388 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1389 #if (DEBUG | DEVELOPMENT)
1390 panic("%s head %p so_incomp not empty", __func__, so);
1391 #endif /* (DEVELOPMENT || DEBUG) */
1392
1393 goto again;
1394 }
1395
1396 if (!TAILQ_EMPTY(&so->so_comp)) {
1397 #if (DEBUG | DEVELOPMENT)
1398 panic("%s head %p so_comp not empty", __func__, so);
1399 #endif /* (DEVELOPMENT || DEBUG) */
1400
1401 goto again;
1402 }
1403
1404 if (persocklock) {
1405 socket_lock(so, 0);
1406 so_release_accept_list(so);
1407 }
1408 }
1409 if (so->so_pcb == NULL) {
1410 /* 3915887: mark the socket as ready for dealloc */
1411 so->so_flags |= SOF_PCBCLEARING;
1412 goto discard;
1413 }
1414
1415 if (so->so_state & SS_ISCONNECTED) {
1416 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1417 error = sodisconnectlocked(so);
1418 if (error) {
1419 goto drop;
1420 }
1421 }
1422 if (so->so_options & SO_LINGER) {
1423 if ((so->so_state & SS_ISDISCONNECTING) &&
1424 (so->so_state & SS_NBIO)) {
1425 goto drop;
1426 }
1427 while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1428 lck_mtx_t *mutex_held;
1429
1430 if (so->so_proto->pr_getlock != NULL) {
1431 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1432 } else {
1433 mutex_held = so->so_proto->pr_domain->dom_mtx;
1434 }
1435 ts.tv_sec = (so->so_linger / 100);
1436 ts.tv_nsec = (so->so_linger % 100) *
1437 NSEC_PER_USEC * 1000 * 10;
1438 error = msleep((caddr_t)&so->so_timeo,
1439 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1440 if (error) {
1441 /*
1442 * It's OK when the timer fires;
1443 * don't report an error
1444 */
1445 if (error == EWOULDBLOCK) {
1446 error = 0;
1447 }
1448 break;
1449 }
1450 }
1451 }
1452 }
1453 drop:
1454 if (so->so_usecount == 0) {
1455 panic("soclose: usecount is zero so=%p", so);
1456 /* NOTREACHED */
1457 }
1458 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1459 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1460 if (error == 0) {
1461 error = error2;
1462 }
1463 }
1464 if (so->so_usecount <= 0) {
1465 panic("soclose: usecount is zero so=%p", so);
1466 /* NOTREACHED */
1467 }
1468 discard:
1469 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1470 (so->so_state & SS_NOFDREF)) {
1471 panic("soclose: NOFDREF");
1472 /* NOTREACHED */
1473 }
1474 so->so_state |= SS_NOFDREF;
1475
1476 if ((so->so_flags & SOF_KNOTE) != 0) {
1477 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1478 }
1479
1480 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1481
1482 VERIFY(so->so_usecount > 0);
1483 so->so_usecount--;
1484 sofree(so);
1485 return error;
1486 }
1487
1488 int
1489 soclose(struct socket *so)
1490 {
1491 int error = 0;
1492 socket_lock(so, 1);
1493
1494 if (so->so_retaincnt == 0) {
1495 error = soclose_locked(so);
1496 } else {
1497 /*
1498 * if the FD is going away, but the socket is
1499 * retained in the kernel, remove its reference
1500 */
1501 so->so_usecount--;
1502 if (so->so_usecount < 2) {
1503 panic("soclose: retaincnt non null and so=%p "
1504 "usecount=%d\n", so, so->so_usecount);
1505 }
1506 }
1507 socket_unlock(so, 1);
1508 return error;
1509 }
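
/*
 * The SO_LINGER msleep loop in soclose_locked() is what a blocking
 * close(2) with lingering enabled waits in. A userland sketch (the
 * 5-second value is only an illustration):
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int
 *	close_with_linger(int s)
 *	{
 *		struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *		// With l_onoff set, close() may block up to l_linger
 *		// seconds while unsent data drains.
 *		if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
 *			return -1;
 *		return close(s);
 *	}
 */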
1510
1511 /*
1512 * Must be called at splnet...
1513 */
1514 /* Should already be locked */
1515 int
1516 soabort(struct socket *so)
1517 {
1518 int error;
1519
1520 #ifdef MORE_LOCKING_DEBUG
1521 lck_mtx_t *mutex_held;
1522
1523 if (so->so_proto->pr_getlock != NULL) {
1524 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1525 } else {
1526 mutex_held = so->so_proto->pr_domain->dom_mtx;
1527 }
1528 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1529 #endif
1530
1531 if ((so->so_flags & SOF_ABORTED) == 0) {
1532 so->so_flags |= SOF_ABORTED;
1533 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1534 if (error) {
1535 sofree(so);
1536 return error;
1537 }
1538 }
1539 return 0;
1540 }
1541
1542 int
1543 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1544 {
1545 int error;
1546
1547 if (dolock) {
1548 socket_lock(so, 1);
1549 }
1550
1551 so_update_last_owner_locked(so, PROC_NULL);
1552 so_update_policy(so);
1553 #if NECP
1554 so_update_necp_policy(so, NULL, NULL);
1555 #endif /* NECP */
1556
1557 if ((so->so_state & SS_NOFDREF) == 0) {
1558 panic("soaccept: !NOFDREF");
1559 }
1560 so->so_state &= ~SS_NOFDREF;
1561 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1562
1563 if (dolock) {
1564 socket_unlock(so, 1);
1565 }
1566 return error;
1567 }
1568
1569 int
1570 soaccept(struct socket *so, struct sockaddr **nam)
1571 {
1572 return soacceptlock(so, nam, 1);
1573 }
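
/*
 * soaccept() backs the accept(2) system call. A minimal userland loop
 * (sketch only) sitting on top of the kernel path above:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void
 *	accept_loop(int listener)
 *	{
 *		for (;;) {
 *			struct sockaddr_storage ss;
 *			socklen_t len = sizeof(ss);
 *			int c = accept(listener, (struct sockaddr *)&ss, &len);
 *
 *			if (c == -1)
 *				continue;	// e.g. ECONNABORTED, see soacceptfilter()
 *			close(c);
 *		}
 *	}
 */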
1574
1575 int
1576 soacceptfilter(struct socket *so, struct socket *head)
1577 {
1578 struct sockaddr *local = NULL, *remote = NULL;
1579 int error = 0;
1580
1581 /*
1582 * Hold the lock even if this socket has not been made visible
1583 * to the filter(s). For sockets with global locks, this protects
1584 * against the head or peer going away
1585 */
1586 socket_lock(so, 1);
1587 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1588 sogetaddr_locked(so, &local, 0) != 0) {
1589 so->so_state &= ~SS_NOFDREF;
1590 socket_unlock(so, 1);
1591 soclose(so);
1592 /* Out of resources; try it again next time */
1593 error = ECONNABORTED;
1594 goto done;
1595 }
1596
1597 error = sflt_accept(head, so, local, remote);
1598
1599 /*
1600 * If we get EJUSTRETURN from one of the filters, mark this socket
1601 * as inactive and return it anyway. This newly accepted socket
1602 * will be disconnected later before we hand it off to the caller.
1603 */
1604 if (error == EJUSTRETURN) {
1605 error = 0;
1606 (void) sosetdefunct(current_proc(), so,
1607 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1608 }
1609
1610 if (error != 0) {
1611 /*
1612 * This may seem like a duplication to the above error
1613 * handling part when we return ECONNABORTED, except
1614 * the following is done while holding the lock since
1615 * the socket has been exposed to the filter(s) earlier.
1616 */
1617 so->so_state &= ~SS_NOFDREF;
1618 socket_unlock(so, 1);
1619 soclose(so);
1620 /* Propagate socket filter's error code to the caller */
1621 } else {
1622 socket_unlock(so, 1);
1623 }
1624 done:
1625 /* Callee checks for NULL pointer */
1626 sock_freeaddr(remote);
1627 sock_freeaddr(local);
1628 return error;
1629 }
1630
1631 /*
1632 * Returns: 0 Success
1633 * EOPNOTSUPP Operation not supported on socket
1634 * EISCONN Socket is connected
1635 * <pru_connect>:EADDRNOTAVAIL Address not available.
1636 * <pru_connect>:EINVAL Invalid argument
1637 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1638 * <pru_connect>:EACCES Permission denied
1639 * <pru_connect>:EADDRINUSE Address in use
1640 * <pru_connect>:EAGAIN Resource unavailable, try again
1641 * <pru_connect>:EPERM Operation not permitted
1642 * <sf_connect_out>:??? [anything a filter writer might set]
1643 */
1644 int
1645 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1646 {
1647 int error;
1648 struct proc *p = current_proc();
1649 tracker_metadata_t metadata = { };
1650
1651 if (dolock) {
1652 socket_lock(so, 1);
1653 }
1654
1655 so_update_last_owner_locked(so, p);
1656 so_update_policy(so);
1657
1658 #if NECP
1659 so_update_necp_policy(so, NULL, nam);
1660 #endif /* NECP */
1661
1662 /*
1663 * If this is a listening socket or if this is a previously-accepted
1664 * socket that has been marked as inactive, reject the connect request.
1665 */
1666 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1667 error = EOPNOTSUPP;
1668 if (so->so_flags & SOF_DEFUNCT) {
1669 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1670 "(%d)\n", __func__, proc_pid(p),
1671 proc_best_name(p),
1672 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1673 SOCK_DOM(so), SOCK_TYPE(so), error);
1674 }
1675 if (dolock) {
1676 socket_unlock(so, 1);
1677 }
1678 return error;
1679 }
1680
1681 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1682 if (dolock) {
1683 socket_unlock(so, 1);
1684 }
1685 return EPERM;
1686 }
1687
1688 /*
1689 * If protocol is connection-based, can only connect once.
1690 * Otherwise, if connected, try to disconnect first.
1691 * This allows user to disconnect by connecting to, e.g.,
1692 * a null address.
1693 */
1694 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1695 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1696 (error = sodisconnectlocked(so)))) {
1697 error = EISCONN;
1698 } else {
1699 /*
1700 * For connected v4/v6 sockets, check if the destination address is associated with a domain name and if it is
1701 * a tracker domain. Mark the socket accordingly. Skip the lookup if the socket has already been marked a tracker.
1702 */
1703 if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1704 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1705 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1706 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1707 }
1708 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1709 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1710 }
1711 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1712 printf("connect() - failed necp_set_socket_domain_attributes");
1713 }
1714 }
1715 }
1716
1717 /*
1718 * Run connect filter before calling protocol:
1719 * - non-blocking connect returns before completion;
1720 */
1721 error = sflt_connectout(so, nam);
1722 if (error != 0) {
1723 if (error == EJUSTRETURN) {
1724 error = 0;
1725 }
1726 } else {
1727 error = (*so->so_proto->pr_usrreqs->pru_connect)
1728 (so, nam, p);
1729 if (error != 0) {
1730 so->so_state &= ~SS_ISCONNECTING;
1731 }
1732 }
1733 }
1734 if (dolock) {
1735 socket_unlock(so, 1);
1736 }
1737 return error;
1738 }
1739
1740 int
1741 soconnect(struct socket *so, struct sockaddr *nam)
1742 {
1743 return soconnectlock(so, nam, 1);
1744 }
1745
1746 /*
1747 * Returns: 0 Success
1748 * <pru_connect2>:EINVAL[AF_UNIX]
1749 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1750 * <pru_connect2>:??? [other protocol families]
1751 *
1752 * Notes: <pru_connect2> is not supported by [TCP].
1753 */
1754 int
1755 soconnect2(struct socket *so1, struct socket *so2)
1756 {
1757 int error;
1758
1759 socket_lock(so1, 1);
1760 if (so2->so_proto->pr_lock) {
1761 socket_lock(so2, 1);
1762 }
1763
1764 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1765
1766 socket_unlock(so1, 1);
1767 if (so2->so_proto->pr_lock) {
1768 socket_unlock(so2, 1);
1769 }
1770 return error;
1771 }
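
/*
 * <pru_connect2> is reached from socketpair(2) for connection-oriented
 * AF_UNIX sockets, which is the usual way soconnect2() gets exercised.
 * A userland sketch:
 *
 *	#include <sys/socket.h>
 *
 *	static int
 *	make_pair(int sv[2])
 *	{
 *		// EINVAL / EPROTOTYPE here surface from <pru_connect2>.
 *		return socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	}
 */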
1772
1773 int
1774 soconnectxlocked(struct socket *so, struct sockaddr *src,
1775 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1776 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1777 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1778 {
1779 int error;
1780 tracker_metadata_t metadata = { };
1781
1782 so_update_last_owner_locked(so, p);
1783 so_update_policy(so);
1784
1785 /*
1786 * If this is a listening socket or if this is a previously-accepted
1787 * socket that has been marked as inactive, reject the connect request.
1788 */
1789 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1790 error = EOPNOTSUPP;
1791 if (so->so_flags & SOF_DEFUNCT) {
1792 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1793 "(%d)\n", __func__, proc_pid(p),
1794 proc_best_name(p),
1795 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1796 SOCK_DOM(so), SOCK_TYPE(so), error);
1797 }
1798 return error;
1799 }
1800
1801 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1802 return EPERM;
1803 }
1804
1805 /*
1806 * If protocol is connection-based, can only connect once
1807 * unless PR_MULTICONN is set. Otherwise, if connected,
1808 * try to disconnect first. This allows user to disconnect
1809 * by connecting to, e.g., a null address.
1810 */
1811 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1812 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1813 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1814 (error = sodisconnectlocked(so)) != 0)) {
1815 error = EISCONN;
1816 } else {
1817 /*
1818 * For TCP, check if the destination address is a tracker and mark the socket accordingly
1819 * (only if it hasn't been marked yet).
1820 */
1821 if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1822 !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1823 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1824 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1825 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1826 }
1827 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1828 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1829 }
1830 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1831 printf("connectx() - failed necp_set_socket_domain_attributes");
1832 }
1833 }
1834 }
1835
1836 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1837 (flags & CONNECT_DATA_IDEMPOTENT)) {
1838 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1839
1840 if (flags & CONNECT_DATA_AUTHENTICATED) {
1841 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1842 }
1843 }
1844
1845 /*
1846 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1847 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1848 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1849 * Case 3 allows user to combine write with connect even if they have
1850 * no use for TFO (such as regular TCP, and UDP).
1851 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1852 */
1853 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1854 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1855 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1856 }
1857
1858 /*
1859 * If a user sets data idempotent but neither passes an uio nor
1860 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1861 * SOF1_DATA_IDEMPOTENT.
1862 */
1863 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1864 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1865 /* We should return EINVAL instead perhaps. */
1866 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1867 }
1868
1869 /*
1870 * Run connect filter before calling protocol:
1871 * - non-blocking connect returns before completion;
1872 */
1873 error = sflt_connectout(so, dst);
1874 if (error != 0) {
1875 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1876 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1877 if (error == EJUSTRETURN) {
1878 error = 0;
1879 }
1880 } else {
1881 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1882 (so, src, dst, p, ifscope, aid, pcid,
1883 flags, arg, arglen, auio, bytes_written);
1884 if (error != 0) {
1885 so->so_state &= ~SS_ISCONNECTING;
1886 if (error != EINPROGRESS) {
1887 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1888 }
1889 }
1890 }
1891 }
1892
1893 return error;
1894 }
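
/*
 * Summary of the preconnect-data decisions above (illustrative, derived
 * from the code in soconnectxlocked()):
 *
 *	PR_PRECONN_WRITE  CONNECT_RESUME_ON_READ_WRITE   auio     SOF1_PRECONNECT_DATA
 *	      no                      -                    -           not set
 *	     yes                     set                   -             set
 *	     yes                    clear               non-NULL         set
 *	     yes                    clear                 NULL         not set
 *
 * When SOF1_PRECONNECT_DATA does not get set, SOF1_DATA_IDEMPOTENT is
 * cleared again, since there is no data to carry on the initial packet.
 */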
1895
1896 int
1897 sodisconnectlocked(struct socket *so)
1898 {
1899 int error;
1900
1901 if ((so->so_state & SS_ISCONNECTED) == 0) {
1902 error = ENOTCONN;
1903 goto bad;
1904 }
1905 if (so->so_state & SS_ISDISCONNECTING) {
1906 error = EALREADY;
1907 goto bad;
1908 }
1909
1910 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1911 if (error == 0) {
1912 sflt_notify(so, sock_evt_disconnected, NULL);
1913 }
1914
1915 bad:
1916 return error;
1917 }
1918
1919 /* Locking version */
1920 int
1921 sodisconnect(struct socket *so)
1922 {
1923 int error;
1924
1925 socket_lock(so, 1);
1926 error = sodisconnectlocked(so);
1927 socket_unlock(so, 1);
1928 return error;
1929 }
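
/*
 * Illustrative sketch (hypothetical caller): sodisconnect() is the
 * locking wrapper around sodisconnectlocked(). A typical in-kernel
 * teardown path would look like:
 *
 *	error = sodisconnect(so);
 *	if (error == ENOTCONN)
 *		// socket was never connected
 *	else if (error == EALREADY)
 *		// a disconnect is already in progress
 */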
1930
1931 int
1932 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1933 {
1934 int error;
1935
1936 /*
1937 * Call the protocol disconnectx handler; let it handle all
1938 * matters related to the connection state of this session.
1939 */
1940 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1941 if (error == 0) {
1942 /*
1943 * The event applies only for the session, not for
1944 * the disconnection of individual subflows.
1945 */
1946 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1947 sflt_notify(so, sock_evt_disconnected, NULL);
1948 }
1949 }
1950 return error;
1951 }
1952
1953 int
1954 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1955 {
1956 int error;
1957
1958 socket_lock(so, 1);
1959 error = sodisconnectxlocked(so, aid, cid);
1960 socket_unlock(so, 1);
1961 return error;
1962 }
1963
1964 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
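
/*
 * SBLOCKWAIT() maps the caller's MSG_DONTWAIT into the sblock() wait
 * policy: without SBL_WAIT the lock attempt fails immediately instead of
 * sleeping. Illustrative use (as in sosendcheck() below):
 *
 *	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
 *	if (error)	// e.g. EWOULDBLOCK for a MSG_DONTWAIT caller
 *		return error;
 */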
1965
1966 /*
1967 * sosendcheck will lock the socket buffer if it isn't locked and
1968 * verify that there is space for the data being inserted.
1969 *
1970 * Returns: 0 Success
1971 * EPIPE
1972 * sblock:EWOULDBLOCK
1973 * sblock:EINTR
1974 * sbwait:EBADF
1975 * sbwait:EINTR
1976 * [so_error]:???
1977 */
1978 int
1979 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1980 int32_t clen, int32_t atomic, int flags, int *sblocked)
1981 {
1982 int error = 0;
1983 int32_t space;
1984 int assumelock = 0;
1985
1986 restart:
1987 if (*sblocked == 0) {
1988 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1989 so->so_send_filt_thread != 0 &&
1990 so->so_send_filt_thread == current_thread()) {
1991 /*
1992 * We're being called recursively from a filter,
1993 * allow this to continue. Radar 4150520.
1994 * Don't set sblocked because we don't want
1995 * to perform an unlock later.
1996 */
1997 assumelock = 1;
1998 } else {
1999 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2000 if (error) {
2001 if (so->so_flags & SOF_DEFUNCT) {
2002 goto defunct;
2003 }
2004 return error;
2005 }
2006 *sblocked = 1;
2007 }
2008 }
2009
2010 /*
2011 * If a send attempt is made on a socket that has been marked
2012 * as inactive (disconnected), reject the request.
2013 */
2014 if (so->so_flags & SOF_DEFUNCT) {
2015 defunct:
2016 error = EPIPE;
2017 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2018 __func__, proc_selfpid(), proc_best_name(current_proc()),
2019 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2020 SOCK_DOM(so), SOCK_TYPE(so), error);
2021 return error;
2022 }
2023
2024 if (so->so_state & SS_CANTSENDMORE) {
2025 #if CONTENT_FILTER
2026 /*
2027 * Can re-inject data of half-closed connections
2028 */
2029 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2030 so->so_snd.sb_cfil_thread == current_thread() &&
2031 cfil_sock_data_pending(&so->so_snd) != 0) {
2032 CFIL_LOG(LOG_INFO,
2033 "so %llx ignore SS_CANTSENDMORE",
2034 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2035 } else
2036 #endif /* CONTENT_FILTER */
2037 return EPIPE;
2038 }
2039 if (so->so_error) {
2040 error = so->so_error;
2041 so->so_error = 0;
2042 return error;
2043 }
2044
2045 if ((so->so_state & SS_ISCONNECTED) == 0) {
2046 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2047 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2048 (resid != 0 || clen == 0) &&
2049 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2050 return ENOTCONN;
2051 }
2052 } else if (addr == 0) {
2053 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2054 ENOTCONN : EDESTADDRREQ;
2055 }
2056 }
2057
2058 space = sbspace(&so->so_snd);
2059
2060 if (flags & MSG_OOB) {
2061 space += 1024;
2062 }
2063 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2064 clen > so->so_snd.sb_hiwat) {
2065 return EMSGSIZE;
2066 }
2067
2068 if ((space < resid + clen &&
2069 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2070 space < clen)) ||
2071 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2072 /*
2073 * don't block the connectx call when there's more data
2074 * than can be copied.
2075 */
2076 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2077 if (space == 0) {
2078 return EWOULDBLOCK;
2079 }
2080 if (space < (int32_t)so->so_snd.sb_lowat) {
2081 return 0;
2082 }
2083 }
2084 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2085 assumelock) {
2086 return EWOULDBLOCK;
2087 }
2088 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2089 *sblocked = 0;
2090 error = sbwait(&so->so_snd);
2091 if (error) {
2092 if (so->so_flags & SOF_DEFUNCT) {
2093 goto defunct;
2094 }
2095 return error;
2096 }
2097 goto restart;
2098 }
2099 return 0;
2100 }
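
/*
 * Illustrative caller pattern for sosendcheck() (a condensed sketch of
 * the send loop in sosend() below; mbuf construction elided):
 *
 *	int sblocked = 0;
 *	do {
 *		error = sosendcheck(so, addr, resid, clen, atomic, flags,
 *		    &sblocked);
 *		if (error)
 *			goto out;
 *		... build the mbuf chain and call pru_send ...
 *	} while (resid);
 * out:
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	// drops the socket lock
 *	else
 *		socket_unlock(so, 1);
 */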
2101
2102 /*
2103 * Send on a socket.
2104 * If send must go all at once and message is larger than
2105 * send buffering, then hard error.
2106 * Lock against other senders.
2107 * If must go all at once and not enough room now, then
2108 * inform user that this would block and do nothing.
2109 * Otherwise, if nonblocking, send as much as possible.
2110 * The data to be sent is described by "uio" if nonzero,
2111 * otherwise by the mbuf chain "top" (which must be null
2112 * if uio is not). Data provided in mbuf chain must be small
2113 * enough to send all at once.
2114 *
2115 * Returns nonzero on error, timeout or signal; callers
2116 * must check for short counts if EINTR/ERESTART are returned.
2117 * Data and control buffers are freed on return.
2118 *
2119 * Returns: 0 Success
2120 * EOPNOTSUPP
2121 * EINVAL
2122 * ENOBUFS
2123 * uiomove:EFAULT
2124 * sosendcheck:EPIPE
2125 * sosendcheck:EWOULDBLOCK
2126 * sosendcheck:EINTR
2127 * sosendcheck:EBADF
2128 * sosendcheck:EINTR
2129 * sosendcheck:??? [value from so_error]
2130 * <pru_send>:ECONNRESET[TCP]
2131 * <pru_send>:EINVAL[TCP]
2132 * <pru_send>:ENOBUFS[TCP]
2133 * <pru_send>:EADDRINUSE[TCP]
2134 * <pru_send>:EADDRNOTAVAIL[TCP]
2135 * <pru_send>:EAFNOSUPPORT[TCP]
2136 * <pru_send>:EACCES[TCP]
2137 * <pru_send>:EAGAIN[TCP]
2138 * <pru_send>:EPERM[TCP]
2139 * <pru_send>:EMSGSIZE[TCP]
2140 * <pru_send>:EHOSTUNREACH[TCP]
2141 * <pru_send>:ENETUNREACH[TCP]
2142 * <pru_send>:ENETDOWN[TCP]
2143 * <pru_send>:ENOMEM[TCP]
2144 * <pru_send>:ENOBUFS[TCP]
2145 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2146 * <pru_send>:EINVAL[AF_UNIX]
2147 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2148 * <pru_send>:EPIPE[AF_UNIX]
2149 * <pru_send>:ENOTCONN[AF_UNIX]
2150 * <pru_send>:EISCONN[AF_UNIX]
2151 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2152 * <sf_data_out>:??? [whatever a filter author chooses]
2153 *
2154 * Notes: Other <pru_send> returns depend on the protocol family; all
2155 * <sf_data_out> returns depend on what the filter author causes
2156 * their filter to return.
2157 */
2158 int
2159 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2160 struct mbuf *top, struct mbuf *control, int flags)
2161 {
2162 struct mbuf **mp;
2163 struct mbuf *m, *freelist = NULL;
2164 struct soflow_hash_entry *dgram_flow_entry = NULL;
2165 user_ssize_t space, len, resid, orig_resid;
2166 int clen = 0, error, dontroute, sendflags;
2167 int atomic = sosendallatonce(so) || top;
2168 int sblocked = 0;
2169 struct proc *p = current_proc();
2170 uint16_t headroom = 0;
2171 ssize_t mlen;
2172 boolean_t en_tracing = FALSE;
2173
2174 if (uio != NULL) {
2175 resid = uio_resid(uio);
2176 } else {
2177 resid = top->m_pkthdr.len;
2178 }
2179
2180 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2181 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2182
2183 socket_lock(so, 1);
2184
2185 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2186 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2187 }
2188
2189 /*
2190 * Trace only when tracing is enabled, and only for network
2191 * (vs. unix) sockets over non-loopback interfaces
2192 */
2193 if (ENTR_SHOULDTRACE &&
2194 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2195 struct inpcb *inp = sotoinpcb(so);
2196 if (inp->inp_last_outifp != NULL &&
2197 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2198 en_tracing = TRUE;
2199 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2200 VM_KERNEL_ADDRPERM(so),
2201 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2202 (int64_t)resid);
2203 orig_resid = resid;
2204 }
2205 }
2206
2207 /*
2208 * Re-injection should not affect process accounting
2209 */
2210 if ((flags & MSG_SKIPCFIL) == 0) {
2211 so_update_last_owner_locked(so, p);
2212 so_update_policy(so);
2213
2214 #if NECP
2215 so_update_necp_policy(so, NULL, addr);
2216 #endif /* NECP */
2217 }
2218
2219 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2220 error = EOPNOTSUPP;
2221 goto out_locked;
2222 }
2223
2224 /*
2225 * In theory resid should be unsigned.
2226 * However, space must be signed, as it might be less than 0
2227 * if we over-committed, and we must use a signed comparison
2228 * of space and resid. On the other hand, a negative resid
2229 * causes us to loop sending 0-length segments to the protocol.
2230 *
2231 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2232 *
2233 * Note: We limit resid to be a positive int value as we use
2234 * imin() to set bytes_to_copy -- radr://14558484
2235 */
2236 if (resid < 0 || resid > INT_MAX ||
2237 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2238 error = EINVAL;
2239 goto out_locked;
2240 }
2241
2242 dontroute = (flags & MSG_DONTROUTE) &&
2243 (so->so_options & SO_DONTROUTE) == 0 &&
2244 (so->so_proto->pr_flags & PR_ATOMIC);
2245 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2246
2247 if (control != NULL) {
2248 clen = control->m_len;
2249 }
2250
2251 if (soreserveheadroom != 0) {
2252 headroom = so->so_pktheadroom;
2253 }
2254
2255 do {
2256 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2257 &sblocked);
2258 if (error) {
2259 goto out_locked;
2260 }
2261
2262 mp = ⊤
2263 space = sbspace(&so->so_snd) - clen;
2264 space += ((flags & MSG_OOB) ? 1024 : 0);
2265
2266 do {
2267 if (uio == NULL) {
2268 /*
2269 * Data is prepackaged in "top".
2270 */
2271 resid = 0;
2272 if (flags & MSG_EOR) {
2273 top->m_flags |= M_EOR;
2274 }
2275 } else {
2276 int chainlength;
2277 int bytes_to_copy;
2278 boolean_t jumbocl;
2279 boolean_t bigcl;
2280 int bytes_to_alloc;
2281
2282 bytes_to_copy = imin((int)resid, (int)space);
2283
2284 bytes_to_alloc = bytes_to_copy;
2285 if (top == NULL) {
2286 bytes_to_alloc += headroom;
2287 }
2288
2289 if (sosendminchain > 0) {
2290 chainlength = 0;
2291 } else {
2292 chainlength = sosendmaxchain;
2293 }
2294
2295 /*
2296 * Use big 4 KB cluster when the outgoing interface
2297 * does not prefer 2 KB clusters
2298 */
2299 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2300 sosendbigcl_ignore_capab;
2301
2302 /*
2303 * Attempt to use larger than system page-size
2304 * clusters for large writes only if there is
2305 * a jumbo cluster pool and if the socket is
2306 * marked accordingly.
2307 */
2308 jumbocl = sosendjcl && njcl > 0 &&
2309 ((so->so_flags & SOF_MULTIPAGES) ||
2310 sosendjcl_ignore_capab) &&
2311 bigcl;
2312
2313 socket_unlock(so, 0);
2314
2315 do {
2316 int num_needed;
2317 int hdrs_needed = (top == NULL) ? 1 : 0;
2318
2319 /*
2320 * Try to maintain a local cache of mbuf
2321 * clusters needed to complete this
2322 * write. The list is further limited to
2323 * the number that are currently needed
2324 * to fill the socket. This mechanism
2325 * allows a large number of mbufs/
2326 * clusters to be grabbed under a single
2327 * mbuf lock... if we can't get any
2328 * clusters, then fall back to trying
2329 * for mbufs. If we fail early (or
2330 * miscalculate the number needed), make
2331 * sure to release any clusters we
2332 * haven't yet consumed.
2333 */
2334 if (freelist == NULL &&
2335 bytes_to_alloc > MBIGCLBYTES &&
2336 jumbocl) {
2337 num_needed =
2338 bytes_to_alloc / M16KCLBYTES;
2339
2340 if ((bytes_to_alloc -
2341 (num_needed * M16KCLBYTES))
2342 >= MINCLSIZE) {
2343 num_needed++;
2344 }
2345
2346 freelist =
2347 m_getpackets_internal(
2348 (unsigned int *)&num_needed,
2349 hdrs_needed, M_WAIT, 0,
2350 M16KCLBYTES);
2351 /*
2352 * Fall back to 4K cluster size
2353 * if allocation failed
2354 */
2355 }
2356
2357 if (freelist == NULL &&
2358 bytes_to_alloc > MCLBYTES &&
2359 bigcl) {
2360 num_needed =
2361 bytes_to_alloc / MBIGCLBYTES;
2362
2363 if ((bytes_to_alloc -
2364 (num_needed * MBIGCLBYTES)) >=
2365 MINCLSIZE) {
2366 num_needed++;
2367 }
2368
2369 freelist =
2370 m_getpackets_internal(
2371 (unsigned int *)&num_needed,
2372 hdrs_needed, M_WAIT, 0,
2373 MBIGCLBYTES);
2374 /*
2375 * Fall back to cluster size
2376 * if allocation failed
2377 */
2378 }
2379
2380 /*
2381 * Allocate a cluster as we want to
2382 * avoid splitting the data in more
2383 * than one segment; using MINCLSIZE
2384 * would lead us to allocate two mbufs
2385 */
2386 if (soreserveheadroom != 0 &&
2387 freelist == NULL &&
2388 ((top == NULL &&
2389 bytes_to_alloc > _MHLEN) ||
2390 bytes_to_alloc > _MLEN)) {
2391 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2392 MCLBYTES;
2393 freelist =
2394 m_getpackets_internal(
2395 (unsigned int *)&num_needed,
2396 hdrs_needed, M_WAIT, 0,
2397 MCLBYTES);
2398 /*
2399 * Fall back to a single mbuf
2400 * if allocation failed
2401 */
2402 } else if (freelist == NULL &&
2403 bytes_to_alloc > MINCLSIZE) {
2404 num_needed =
2405 bytes_to_alloc / MCLBYTES;
2406
2407 if ((bytes_to_alloc -
2408 (num_needed * MCLBYTES)) >=
2409 MINCLSIZE) {
2410 num_needed++;
2411 }
2412
2413 freelist =
2414 m_getpackets_internal(
2415 (unsigned int *)&num_needed,
2416 hdrs_needed, M_WAIT, 0,
2417 MCLBYTES);
2418 /*
2419 * Fall back to a single mbuf
2420 * if allocation failed
2421 */
2422 }
2423 /*
2424 * For datagram protocols, leave
2425 * headroom for protocol headers
2426 * in the first cluster of the chain
2427 */
2428 if (freelist != NULL && atomic &&
2429 top == NULL && headroom > 0) {
2430 freelist->m_data += headroom;
2431 }
2432
2433 /*
2434 * Fall back to regular mbufs without
2435 * reserving the socket headroom
2436 */
2437 if (freelist == NULL) {
2438 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2439 if (top == NULL) {
2440 MGETHDR(freelist,
2441 M_WAIT, MT_DATA);
2442 } else {
2443 MGET(freelist,
2444 M_WAIT, MT_DATA);
2445 }
2446 }
2447
2448 if (freelist == NULL) {
2449 error = ENOBUFS;
2450 socket_lock(so, 0);
2451 goto out_locked;
2452 }
2453 /*
2454 * For datagram protocols,
2455 * leave room for protocol
2456 * headers in first mbuf.
2457 */
2458 if (atomic && top == NULL &&
2459 bytes_to_copy > 0 &&
2460 bytes_to_copy < MHLEN) {
2461 MH_ALIGN(freelist,
2462 bytes_to_copy);
2463 }
2464 }
2465 m = freelist;
2466 freelist = m->m_next;
2467 m->m_next = NULL;
2468
2469 if ((m->m_flags & M_EXT)) {
2470 mlen = m->m_ext.ext_size -
2471 M_LEADINGSPACE(m);
2472 } else if ((m->m_flags & M_PKTHDR)) {
2473 mlen = MHLEN - M_LEADINGSPACE(m);
2474 m_add_crumb(m, PKT_CRUMB_SOSEND);
2475 } else {
2476 mlen = MLEN - M_LEADINGSPACE(m);
2477 }
2478 len = imin((int)mlen, bytes_to_copy);
2479
2480 chainlength += len;
2481
2482 space -= len;
2483
2484 error = uiomove(mtod(m, caddr_t),
2485 (int)len, uio);
2486
2487 resid = uio_resid(uio);
2488
2489 m->m_len = (int32_t)len;
2490 *mp = m;
2491 top->m_pkthdr.len += len;
2492 if (error) {
2493 break;
2494 }
2495 mp = &m->m_next;
2496 if (resid <= 0) {
2497 if (flags & MSG_EOR) {
2498 top->m_flags |= M_EOR;
2499 }
2500 break;
2501 }
2502 bytes_to_copy = imin((int)resid, (int)space);
2503 } while (space > 0 &&
2504 (chainlength < sosendmaxchain || atomic ||
2505 resid < MINCLSIZE));
2506
2507 socket_lock(so, 0);
2508
2509 if (error) {
2510 goto out_locked;
2511 }
2512 }
2513
2514 if (dontroute) {
2515 so->so_options |= SO_DONTROUTE;
2516 }
2517
2518 /*
2519 * Compute flags here, for pru_send and NKEs
2520 *
2521 * If the user set MSG_EOF, the protocol
2522 * understands this flag, and there is nothing
2523 * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
2524 */
2525 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2526 ((flags & MSG_EOF) &&
2527 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2528 (resid <= 0)) ? PRUS_EOF :
2529 /* If there is more to send set PRUS_MORETOCOME */
2530 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2531
2532 if ((flags & MSG_SKIPCFIL) == 0) {
2533 /*
2534 * Socket filter processing
2535 */
2536 error = sflt_data_out(so, addr, &top,
2537 &control, (sendflags & MSG_OOB) ?
2538 sock_data_filt_flag_oob : 0);
2539 if (error) {
2540 if (error == EJUSTRETURN) {
2541 error = 0;
2542 goto packet_consumed;
2543 }
2544 goto out_locked;
2545 }
2546 #if CONTENT_FILTER
2547 /*
2548 * Content filter processing
2549 */
2550 error = cfil_sock_data_out(so, addr, top,
2551 control, sendflags, dgram_flow_entry);
2552 if (error) {
2553 if (error == EJUSTRETURN) {
2554 error = 0;
2555 goto packet_consumed;
2556 }
2557 goto out_locked;
2558 }
2559 #endif /* CONTENT_FILTER */
2560 }
2561 error = (*so->so_proto->pr_usrreqs->pru_send)
2562 (so, sendflags, top, addr, control, p);
2563
2564 packet_consumed:
2565 if (dontroute) {
2566 so->so_options &= ~SO_DONTROUTE;
2567 }
2568
2569 clen = 0;
2570 control = NULL;
2571 top = NULL;
2572 mp = ⊤
2573 if (error) {
2574 goto out_locked;
2575 }
2576 } while (resid && space > 0);
2577 } while (resid);
2578
2579 out_locked:
2580 if (sblocked) {
2581 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2582 } else {
2583 socket_unlock(so, 1);
2584 }
2585 if (top != NULL) {
2586 m_freem(top);
2587 }
2588 if (control != NULL) {
2589 m_freem(control);
2590 }
2591 if (freelist != NULL) {
2592 m_freem_list(freelist);
2593 }
2594
2595 if (dgram_flow_entry != NULL) {
2596 soflow_free_flow(dgram_flow_entry);
2597 }
2598
2599 soclearfastopen(so);
2600
2601 if (en_tracing) {
2602 /* resid passed here is the bytes left in uio */
2603 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2604 VM_KERNEL_ADDRPERM(so),
2605 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2606 (int64_t)(orig_resid - resid));
2607 }
2608 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2609 so->so_snd.sb_cc, space, error);
2610
2611 return error;
2612 }
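
/*
 * Illustrative sketch (assumption: an in-kernel client; user-space I/O
 * reaches sosend() through the syscall layer instead):
 *
 *	struct mbuf *top;	// M_PKTHDR chain with m_pkthdr.len set
 *	...
 *	error = sosend(so, NULL, NULL, top, NULL, 0);
 *	// top and control are always consumed by sosend(), even on
 *	// error; the caller must not free or reuse them afterwards.
 */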
2613
2614 int
2615 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2616 {
2617 struct mbuf *m0 = NULL, *control_end = NULL;
2618
2619 socket_lock_assert_owned(so);
2620
2621 /*
2622 * top must point to an mbuf chain to be sent.
2623 * If control is not NULL, top must be a packet header.
2624 */
2625 VERIFY(top != NULL &&
2626 (control == NULL || top->m_flags & M_PKTHDR));
2627
2628 /*
2629 * If control is not passed in, see if we can get it
2630 * from top.
2631 */
2632 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2633 // Locate start of control if present and start of data
2634 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2635 if (m0->m_flags & M_PKTHDR) {
2636 top = m0;
2637 break;
2638 } else if (m0->m_type == MT_CONTROL) {
2639 if (control == NULL) {
2640 // Found start of control
2641 control = m0;
2642 }
2643 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2644 // Found end of control
2645 control_end = m0;
2646 }
2647 }
2648 }
2649 if (control_end != NULL) {
2650 control_end->m_next = NULL;
2651 }
2652 }
2653
2654 int error = (*so->so_proto->pr_usrreqs->pru_send)
2655 (so, sendflags, top, addr, control, current_proc());
2656
2657 return error;
2658 }
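
/*
 * Illustrative sketch: sosend_reinject() is the minimal re-injection
 * path. It must be called with the socket lock held, skips
 * sosendcheck() and the socket/content filter passes, and hands the
 * chain straight to the protocol:
 *
 *	socket_lock_assert_owned(so);
 *	error = sosend_reinject(so, NULL, top, NULL, 0);
 */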
2659
2660 /*
2661 * Supported only for connected sockets (no address) without ancillary
2662 * data (control mbuf), and only for atomic protocols
2663 */
2664 int
2665 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2666 {
2667 struct mbuf *m, *freelist = NULL;
2668 struct soflow_hash_entry *dgram_flow_entry = NULL;
2669 user_ssize_t len, resid;
2670 int error, dontroute;
2671 int atomic = sosendallatonce(so);
2672 int sblocked = 0;
2673 struct proc *p = current_proc();
2674 u_int uiofirst = 0;
2675 u_int uiolast = 0;
2676 struct mbuf *top = NULL;
2677 uint16_t headroom = 0;
2678 ssize_t mlen;
2679 boolean_t bigcl;
2680
2681 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2682 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2683
2684 if (so->so_type != SOCK_DGRAM) {
2685 error = EINVAL;
2686 goto out;
2687 }
2688 if (atomic == 0) {
2689 error = EINVAL;
2690 goto out;
2691 }
2692 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2693 error = EPROTONOSUPPORT;
2694 goto out;
2695 }
2696 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2697 error = EINVAL;
2698 goto out;
2699 }
2700 resid = uio_array_resid(uioarray, uiocnt);
2701
2702 /*
2703 * In theory resid should be unsigned.
2704 * However, space must be signed, as it might be less than 0
2705 * if we over-committed, and we must use a signed comparison
2706 * of space and resid. On the other hand, a negative resid
2707 * causes us to loop sending 0-length segments to the protocol.
2708 *
2709 * Note: We limit resid to be a positive int value as we use
2710 * imin() to set bytes_to_copy -- radr://14558484
2711 */
2712 if (resid < 0 || resid > INT_MAX) {
2713 error = EINVAL;
2714 goto out;
2715 }
2716
2717 socket_lock(so, 1);
2718 so_update_last_owner_locked(so, p);
2719 so_update_policy(so);
2720
2721 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2722 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2723 }
2724
2725 #if NECP
2726 so_update_necp_policy(so, NULL, NULL);
2727 #endif /* NECP */
2728
2729 dontroute = (flags & MSG_DONTROUTE) &&
2730 (so->so_options & SO_DONTROUTE) == 0 &&
2731 (so->so_proto->pr_flags & PR_ATOMIC);
2732 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2733
2734 error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2735 if (error) {
2736 goto release;
2737 }
2738
2739 /*
2740 * Use big 4 KB clusters when the outgoing interface does not prefer
2741 * 2 KB clusters
2742 */
2743 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2744
2745 if (soreserveheadroom != 0) {
2746 headroom = so->so_pktheadroom;
2747 }
2748
2749 do {
2750 int i;
2751 int num_needed = 0;
2752 int chainlength;
2753 size_t maxpktlen = 0;
2754 int bytes_to_alloc;
2755
2756 if (sosendminchain > 0) {
2757 chainlength = 0;
2758 } else {
2759 chainlength = sosendmaxchain;
2760 }
2761
2762 socket_unlock(so, 0);
2763
2764 /*
2765 * Find a set of uio that fit in a reasonable number
2766 * of mbuf packets
2767 */
2768 for (i = uiofirst; i < uiocnt; i++) {
2769 struct uio *auio = uioarray[i];
2770
2771 len = uio_resid(auio);
2772
2773 /* Do nothing for empty messages */
2774 if (len == 0) {
2775 continue;
2776 }
2777
2778 num_needed += 1;
2779 uiolast += 1;
2780
2781 if (len > maxpktlen) {
2782 maxpktlen = len;
2783 }
2784
2785 chainlength += len;
2786 if (chainlength > sosendmaxchain) {
2787 break;
2788 }
2789 }
2790 /*
2791 * Nothing left to send
2792 */
2793 if (num_needed == 0) {
2794 socket_lock(so, 0);
2795 break;
2796 }
2797 /*
2798 * Allocate buffer large enough to include headroom space for
2799 * network and link headers
2801 */
2802 bytes_to_alloc = (int) maxpktlen + headroom;
2803
2804 /*
2805 * Allocate a single contiguous buffer of the smallest available
2806 * size when possible
2807 */
2808 if (bytes_to_alloc > MCLBYTES &&
2809 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2810 freelist = m_getpackets_internal(
2811 (unsigned int *)&num_needed,
2812 num_needed, M_WAIT, 1,
2813 MBIGCLBYTES);
2814 } else if (bytes_to_alloc > _MHLEN &&
2815 bytes_to_alloc <= MCLBYTES) {
2816 freelist = m_getpackets_internal(
2817 (unsigned int *)&num_needed,
2818 num_needed, M_WAIT, 1,
2819 MCLBYTES);
2820 } else {
2821 freelist = m_allocpacket_internal(
2822 (unsigned int *)&num_needed,
2823 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2824 }
2825
2826 if (freelist == NULL) {
2827 socket_lock(so, 0);
2828 error = ENOMEM;
2829 goto release;
2830 }
2831 /*
2832 * Copy each uio of the set into its own mbuf packet
2833 */
2834 for (i = uiofirst, m = freelist;
2835 i < uiolast && m != NULL;
2836 i++) {
2837 int bytes_to_copy;
2838 struct mbuf *n;
2839 struct uio *auio = uioarray[i];
2840
2841 bytes_to_copy = (int)uio_resid(auio);
2842
2843 /* Do nothing for empty messages */
2844 if (bytes_to_copy == 0) {
2845 continue;
2846 }
2847 /*
2848 * Leave headroom for protocol headers
2849 * in the first mbuf of the chain
2850 */
2851 m->m_data += headroom;
2852
2853 for (n = m; n != NULL; n = n->m_next) {
2854 if ((m->m_flags & M_EXT)) {
2855 mlen = m->m_ext.ext_size -
2856 M_LEADINGSPACE(m);
2857 } else if ((m->m_flags & M_PKTHDR)) {
2858 mlen =
2859 MHLEN - M_LEADINGSPACE(m);
2860 } else {
2861 mlen = MLEN - M_LEADINGSPACE(m);
2862 }
2863 len = imin((int)mlen, bytes_to_copy);
2864
2865 /*
2866 * Note: uiomove() decrements the iovec
2867 * length
2868 */
2869 error = uiomove(mtod(n, caddr_t),
2870 (int)len, auio);
2871 if (error != 0) {
2872 break;
2873 }
2874 n->m_len = (int32_t)len;
2875 m->m_pkthdr.len += len;
2876
2877 VERIFY(m->m_pkthdr.len <= maxpktlen);
2878
2879 bytes_to_copy -= len;
2880 resid -= len;
2881 }
2882 if (m->m_pkthdr.len == 0) {
2883 printf(
2884 "%s:%d so %llx pkt %llx type %u len null\n",
2885 __func__, __LINE__,
2886 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2887 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2888 m->m_type);
2889 }
2890 if (error != 0) {
2891 break;
2892 }
2893 m = m->m_nextpkt;
2894 }
2895
2896 socket_lock(so, 0);
2897
2898 if (error) {
2899 goto release;
2900 }
2901 top = freelist;
2902 freelist = NULL;
2903
2904 if (dontroute) {
2905 so->so_options |= SO_DONTROUTE;
2906 }
2907
2908 if ((flags & MSG_SKIPCFIL) == 0) {
2909 struct mbuf **prevnextp = NULL;
2910
2911 for (i = uiofirst, m = top;
2912 i < uiolast && m != NULL;
2913 i++) {
2914 struct mbuf *nextpkt = m->m_nextpkt;
2915
2916 /*
2917 * Socket filter processing
2918 */
2919 error = sflt_data_out(so, NULL, &m,
2920 NULL, 0);
2921 if (error != 0 && error != EJUSTRETURN) {
2922 goto release;
2923 }
2924
2925 #if CONTENT_FILTER
2926 if (error == 0) {
2927 /*
2928 * Content filter processing
2929 */
2930 error = cfil_sock_data_out(so, NULL, m,
2931 NULL, 0, dgram_flow_entry);
2932 if (error != 0 && error != EJUSTRETURN) {
2933 goto release;
2934 }
2935 }
2936 #endif /* CONTENT_FILTER */
2937 /*
2938 * Remove packet from the list when
2939 * swallowed by a filter
2940 */
2941 if (error == EJUSTRETURN) {
2942 error = 0;
2943 if (prevnextp != NULL) {
2944 *prevnextp = nextpkt;
2945 } else {
2946 top = nextpkt;
2947 }
2948 }
2949
2950 m = nextpkt;
2951 if (m != NULL) {
2952 prevnextp = &m->m_nextpkt;
2953 }
2954 }
2955 }
2956 if (top != NULL) {
2957 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2958 (so, 0, top, NULL, NULL, p);
2959 }
2960
2961 if (dontroute) {
2962 so->so_options &= ~SO_DONTROUTE;
2963 }
2964
2965 top = NULL;
2966 uiofirst = uiolast;
2967 } while (resid > 0 && error == 0);
2968 release:
2969 if (sblocked) {
2970 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2971 } else {
2972 socket_unlock(so, 1);
2973 }
2974 out:
2975 if (top != NULL) {
2976 m_freem(top);
2977 }
2978 if (freelist != NULL) {
2979 m_freem_list(freelist);
2980 }
2981
2982 if (dgram_flow_entry != NULL) {
2983 soflow_free_flow(dgram_flow_entry);
2984 }
2985
2986 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2987 so->so_snd.sb_cc, 0, error);
2988
2989 return error;
2990 }
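
/*
 * Illustrative sketch (hypothetical caller): batching datagrams on a
 * connected SOCK_DGRAM socket with sosend_list(). Each uio becomes its
 * own packet; empty uios are skipped.
 *
 *	struct uio *uios[8];	// one uio per datagram payload
 *	...
 *	error = sosend_list(so, uios, 8, 0);
 *	// Only MSG_DONTWAIT and MSG_NBIO are accepted in flags.
 */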
2991
2992 /*
2993 * May return ERESTART when packet is dropped by MAC policy check
2994 */
2995 static int
2996 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2997 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2998 {
2999 int error = 0;
3000 struct mbuf *m = *mp;
3001 struct mbuf *nextrecord = *nextrecordp;
3002
3003 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3004 #if CONFIG_MACF_SOCKET_SUBSET
3005 /*
3006 * Call the MAC framework for policy checking if we're in
3007 * the user process context and the socket isn't connected.
3008 */
3009 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3010 struct mbuf *m0 = m;
3011 /*
3012 * Dequeue this record (temporarily) from the receive
3013 * list since we're about to drop the socket's lock
3014 * where a new record may arrive and be appended to
3015 * the list. Upon MAC policy failure, the record
3016 * will be freed. Otherwise, we'll add it back to
3017 * the head of the list. We cannot rely on SB_LOCK
3018 * because append operation uses the socket's lock.
3019 */
3020 do {
3021 m->m_nextpkt = NULL;
3022 sbfree(&so->so_rcv, m);
3023 m = m->m_next;
3024 } while (m != NULL);
3025 m = m0;
3026 so->so_rcv.sb_mb = nextrecord;
3027 SB_EMPTY_FIXUP(&so->so_rcv);
3028 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3029 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3030 socket_unlock(so, 0);
3031
3032 error = mac_socket_check_received(kauth_cred_get(), so,
3033 mtod(m, struct sockaddr *));
3034
3035 if (error != 0) {
3036 /*
3037 * MAC policy failure; free this record and
3038 * process the next record (or block until
3039 * one is available). We have adjusted sb_cc
3040 * and sb_mbcnt above so there is no need to
3041 * call sbfree() again.
3042 */
3043 m_freem(m);
3044 /*
3045 * Clear SB_LOCK but don't unlock the socket.
3046 * Process the next record or wait for one.
3047 */
3048 socket_lock(so, 0);
3049 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3050 error = ERESTART;
3051 goto done;
3052 }
3053 socket_lock(so, 0);
3054 /*
3055 * If the socket has been defunct'd, drop it.
3056 */
3057 if (so->so_flags & SOF_DEFUNCT) {
3058 m_freem(m);
3059 error = ENOTCONN;
3060 goto done;
3061 }
3062 /*
3063 * Re-adjust the socket receive list and re-enqueue
3064 * the record in front of any packets which may have
3065 * been appended while we dropped the lock.
3066 */
3067 for (m = m0; m->m_next != NULL; m = m->m_next) {
3068 sballoc(&so->so_rcv, m);
3069 }
3070 sballoc(&so->so_rcv, m);
3071 if (so->so_rcv.sb_mb == NULL) {
3072 so->so_rcv.sb_lastrecord = m0;
3073 so->so_rcv.sb_mbtail = m;
3074 }
3075 m = m0;
3076 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3077 so->so_rcv.sb_mb = m;
3078 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3079 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3080 }
3081 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3082 if (psa != NULL) {
3083 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3084 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3085 error = EWOULDBLOCK;
3086 goto done;
3087 }
3088 }
3089 if (flags & MSG_PEEK) {
3090 m = m->m_next;
3091 } else {
3092 sbfree(&so->so_rcv, m);
3093 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3094 panic("%s: about to create invalid socketbuf",
3095 __func__);
3096 /* NOTREACHED */
3097 }
3098 MFREE(m, so->so_rcv.sb_mb);
3099 m = so->so_rcv.sb_mb;
3100 if (m != NULL) {
3101 m->m_nextpkt = nextrecord;
3102 } else {
3103 so->so_rcv.sb_mb = nextrecord;
3104 SB_EMPTY_FIXUP(&so->so_rcv);
3105 }
3106 }
3107 done:
3108 *mp = m;
3109 *nextrecordp = nextrecord;
3110
3111 return error;
3112 }
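
/*
 * Illustrative note: ERESTART from soreceive_addr() means the record was
 * dropped by the MAC policy check; the caller retries from the top, as
 * soreceive() does below:
 *
 *	error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, canwait);
 *	if (error == ERESTART)
 *		goto restart;	// wait for / process the next record
 */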
3113
3114 /*
3115 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3116 * so clear the data portion in order not to leak the file pointers
3117 */
3118 static void
3119 sopeek_scm_rights(struct mbuf *rights)
3120 {
3121 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3122
3123 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3124 VERIFY(cm->cmsg_len <= rights->m_len);
3125 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3126 }
3127 }
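
/*
 * Layout of the control mbuf being sanitized above (illustrative):
 *
 *	+----------+------------+-----------+---------------------------+
 *	| cmsg_len | cmsg_level | cmsg_type | data: in-kernel file ptrs |
 *	+----------+------------+-----------+---------------------------+
 *	^ cm                                ^ cm + 1
 *
 * cmsg_len covers the header plus data, so the memset() clears exactly
 * cmsg_len - sizeof(*cm) bytes of data while leaving the header intact.
 */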
3128
3129 /*
3130 * Process one or more MT_CONTROL mbufs present before any data mbufs
3131 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3132 * just copy the data; if !MSG_PEEK, we call into the protocol to
3133 * perform externalization.
3134 */
3135 static int
3136 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3137 struct mbuf **mp, struct mbuf **nextrecordp)
3138 {
3139 int error = 0;
3140 struct mbuf *cm = NULL, *cmn;
3141 struct mbuf **cme = &cm;
3142 struct sockbuf *sb_rcv = &so->so_rcv;
3143 struct mbuf **msgpcm = NULL;
3144 struct mbuf *m = *mp;
3145 struct mbuf *nextrecord = *nextrecordp;
3146 struct protosw *pr = so->so_proto;
3147
3148 /*
3149 * Externalizing the control messages would require us to
3150 * drop the socket's lock below. Once we re-acquire the
3151 * lock, the mbuf chain might change. In order to preserve
3152 * consistency, we unlink all control messages from the
3153 * first mbuf chain in one shot and link them separately
3154 * onto a different chain.
3155 */
3156 do {
3157 if (flags & MSG_PEEK) {
3158 if (controlp != NULL) {
3159 if (*controlp == NULL) {
3160 msgpcm = controlp;
3161 }
3162 *controlp = m_copy(m, 0, m->m_len);
3163
3164 /*
3165 * If we failed to allocate an mbuf,
3166 * release any previously allocated
3167 * mbufs for control data. Return
3168 * an error. Keep the mbufs in the
3169 * socket as this is using
3170 * MSG_PEEK flag.
3171 */
3172 if (*controlp == NULL) {
3173 m_freem(*msgpcm);
3174 error = ENOBUFS;
3175 goto done;
3176 }
3177
3178 if (pr->pr_domain->dom_externalize != NULL) {
3179 sopeek_scm_rights(*controlp);
3180 }
3181
3182 controlp = &(*controlp)->m_next;
3183 }
3184 m = m->m_next;
3185 } else {
3186 m->m_nextpkt = NULL;
3187 sbfree(sb_rcv, m);
3188 sb_rcv->sb_mb = m->m_next;
3189 m->m_next = NULL;
3190 *cme = m;
3191 cme = &(*cme)->m_next;
3192 m = sb_rcv->sb_mb;
3193 }
3194 } while (m != NULL && m->m_type == MT_CONTROL);
3195
3196 if (!(flags & MSG_PEEK)) {
3197 if (sb_rcv->sb_mb != NULL) {
3198 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3199 } else {
3200 sb_rcv->sb_mb = nextrecord;
3201 SB_EMPTY_FIXUP(sb_rcv);
3202 }
3203 if (nextrecord == NULL) {
3204 sb_rcv->sb_lastrecord = m;
3205 }
3206 }
3207
3208 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3209 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3210
3211 while (cm != NULL) {
3212 int cmsg_level;
3213 int cmsg_type;
3214
3215 cmn = cm->m_next;
3216 cm->m_next = NULL;
3217 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3218 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3219
3220 /*
3221 * Call the protocol to externalize SCM_RIGHTS message
3222 * and return the modified message to the caller upon
3223 * success. Otherwise, all other control messages are
3224 * returned unmodified to the caller. Note that we
3225 * only get into this loop if MSG_PEEK is not set.
3226 */
3227 if (pr->pr_domain->dom_externalize != NULL &&
3228 cmsg_level == SOL_SOCKET &&
3229 cmsg_type == SCM_RIGHTS) {
3230 /*
3231 * Release socket lock: see 3903171. This
3232 * would also allow more records to be appended
3233 * to the socket buffer. We still have SB_LOCK
3234 * set on it, so we can be sure that the head
3235 * of the mbuf chain won't change.
3236 */
3237 socket_unlock(so, 0);
3238 error = (*pr->pr_domain->dom_externalize)(cm);
3239 socket_lock(so, 0);
3240 } else {
3241 error = 0;
3242 }
3243
3244 if (controlp != NULL && error == 0) {
3245 *controlp = cm;
3246 controlp = &(*controlp)->m_next;
3247 } else {
3248 (void) m_free(cm);
3249 }
3250 cm = cmn;
3251 }
3252 /*
3253 * Update the value of nextrecord in case we received new
3254 * records when the socket was unlocked above for
3255 * externalizing SCM_RIGHTS.
3256 */
3257 if (m != NULL) {
3258 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3259 } else {
3260 nextrecord = sb_rcv->sb_mb;
3261 }
3262
3263 done:
3264 *mp = m;
3265 *nextrecordp = nextrecord;
3266
3267 return error;
3268 }
3269
3270 /*
3271 * If we have less data than requested, block awaiting more
3272 * (subject to any timeout) if:
3273 * 1. the current count is less than the low water mark, or
3274 * 2. MSG_WAITALL is set, and it is possible to do the entire
3275 * receive operation at once if we block (resid <= hiwat).
3276 * 3. MSG_DONTWAIT is not set
3277 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3278 * we have to do the receive in sections, and thus risk returning
3279 * a short count if a timeout or signal occurs after we start.
3280 */
3281 static boolean_t
3282 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3283 {
3284 struct protosw *pr = so->so_proto;
3285
3286 /* No mbufs in the receive-queue? Wait! */
3287 if (m == NULL) {
3288 return true;
3289 }
3290
3291 /* Not enough data in the receive socket-buffer - we may have to wait */
3292 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3293 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3294 /*
3295 * Application did set the low-water mark, so we should wait for
3296 * this data to be present.
3297 */
3298 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3299 return true;
3300 }
3301
3302 /*
3303 * Application wants all the data - so let's try to do the
3304 * receive-operation at once by waiting for everything to
3305 * be there.
3306 */
3307 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3308 return true;
3309 }
3310 }
3311
3312 return false;
3313 }
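
/*
 * Illustrative example of the wait conditions above from the
 * application's point of view (hypothetical user-space snippet):
 *
 *	int lowat = 128;
 *	setsockopt(s, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *	n = recv(s, buf, 4096, 0);		// may sleep until >= 128
 *						// bytes are available
 *	n = recv(s, buf, 4096, MSG_WAITALL);	// may sleep until the full
 *						// 4096 bytes arrive, since
 *						// that fits in the receive
 *						// buffer (resid <= hiwat)
 */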
3314
3315 /*
3316 * Implement receive operations on a socket.
3317 * We depend on the way that records are added to the sockbuf
3318 * by sbappend*. In particular, each record (mbufs linked through m_next)
3319 * must begin with an address if the protocol so specifies,
3320 * followed by an optional mbuf or mbufs containing ancillary data,
3321 * and then zero or more mbufs of data.
3322 * In order to avoid blocking network interrupts for the entire time here,
3323 * we splx() while doing the actual copy to user space.
3324 * Although the sockbuf is locked, new data may still be appended,
3325 * and thus we must maintain consistency of the sockbuf during that time.
3326 *
3327 * The caller may receive the data as a single mbuf chain by supplying
3328 * an mbuf **mp0 for use in returning the chain. The uio is then used
3329 * only for the count in uio_resid.
3330 *
3331 * Returns: 0 Success
3332 * ENOBUFS
3333 * ENOTCONN
3334 * EWOULDBLOCK
3335 * uiomove:EFAULT
3336 * sblock:EWOULDBLOCK
3337 * sblock:EINTR
3338 * sbwait:EBADF
3339 * sbwait:EINTR
3340 * sodelayed_copy:EFAULT
3341 * <pru_rcvoob>:EINVAL[TCP]
3342 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3343 * <pru_rcvoob>:???
3344 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3345 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3346 * <pr_domain->dom_externalize>:???
3347 *
3348 * Notes: Additional return values from calls through <pru_rcvoob> and
3349 * <pr_domain->dom_externalize> depend on protocols other than
3350 * TCP or AF_UNIX, which are documented above.
3351 */
3352 int
3353 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3354 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3355 {
3356 struct mbuf *m, **mp, *ml = NULL;
3357 struct mbuf *nextrecord, *free_list;
3358 int flags, error, offset;
3359 user_ssize_t len;
3360 struct protosw *pr = so->so_proto;
3361 int moff, type = 0;
3362 user_ssize_t orig_resid = uio_resid(uio);
3363 user_ssize_t delayed_copy_len;
3364 int can_delay;
3365 struct proc *p = current_proc();
3366 boolean_t en_tracing = FALSE;
3367
3368 /*
3369 * Sanity check on the length passed by caller as we are making 'int'
3370 * comparisons
3371 */
3372 if (orig_resid < 0 || orig_resid > INT_MAX) {
3373 return EINVAL;
3374 }
3375
3376 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3377 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3378 so->so_rcv.sb_hiwat);
3379
3380 socket_lock(so, 1);
3381 so_update_last_owner_locked(so, p);
3382 so_update_policy(so);
3383
3384 #ifdef MORE_LOCKING_DEBUG
3385 if (so->so_usecount == 1) {
3386 panic("%s: so=%x no other reference on socket", __func__, so);
3387 /* NOTREACHED */
3388 }
3389 #endif
3390 mp = mp0;
3391 if (psa != NULL) {
3392 *psa = NULL;
3393 }
3394 if (controlp != NULL) {
3395 *controlp = NULL;
3396 }
3397 if (flagsp != NULL) {
3398 flags = *flagsp & ~MSG_EOR;
3399 } else {
3400 flags = 0;
3401 }
3402
3403 /*
3404 * If a recv attempt is made on a previously-accepted socket
3405 * that has been marked as inactive (disconnected), reject
3406 * the request.
3407 */
3408 if (so->so_flags & SOF_DEFUNCT) {
3409 struct sockbuf *sb = &so->so_rcv;
3410
3411 error = ENOTCONN;
3412 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3413 __func__, proc_pid(p), proc_best_name(p),
3414 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3415 SOCK_DOM(so), SOCK_TYPE(so), error);
3416 /*
3417 * This socket should have been disconnected and flushed
3418 * prior to being returned from sodefunct(); there should
3419 * be no data on its receive list, so panic otherwise.
3420 */
3421 if (so->so_state & SS_DEFUNCT) {
3422 sb_empty_assert(sb, __func__);
3423 }
3424 socket_unlock(so, 1);
3425 return error;
3426 }
3427
3428 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3429 pr->pr_usrreqs->pru_preconnect) {
3430 /*
3431 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3432 * call write() right after this. *If* the app calls a read,
3433 * we do not want to block this read indefinitely. Thus, we
3434 * trigger a connect so that the session gets initiated.
3435 */
3436 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3437
3438 if (error) {
3439 socket_unlock(so, 1);
3440 return error;
3441 }
3442 }
3443
3444 if (ENTR_SHOULDTRACE &&
3445 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3446 /*
3447 * enable energy tracing for inet sockets that go over
3448 * non-loopback interfaces only.
3449 */
3450 struct inpcb *inp = sotoinpcb(so);
3451 if (inp->inp_last_outifp != NULL &&
3452 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3453 en_tracing = TRUE;
3454 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3455 VM_KERNEL_ADDRPERM(so),
3456 ((so->so_state & SS_NBIO) ?
3457 kEnTrFlagNonBlocking : 0),
3458 (int64_t)orig_resid);
3459 }
3460 }
3461
3462 /*
3463 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3464 * regardless of the flags argument. Here is the case where
3465 * out-of-band data is not inline.
3466 */
3467 if ((flags & MSG_OOB) ||
3468 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3469 (so->so_options & SO_OOBINLINE) == 0 &&
3470 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3471 m = m_get(M_WAIT, MT_DATA);
3472 if (m == NULL) {
3473 socket_unlock(so, 1);
3474 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3475 ENOBUFS, 0, 0, 0, 0);
3476 return ENOBUFS;
3477 }
3478 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3479 if (error) {
3480 goto bad;
3481 }
3482 socket_unlock(so, 0);
3483 do {
3484 error = uiomove(mtod(m, caddr_t),
3485 imin((int)uio_resid(uio), m->m_len), uio);
3486 m = m_free(m);
3487 } while (uio_resid(uio) && error == 0 && m != NULL);
3488 socket_lock(so, 0);
3489 bad:
3490 if (m != NULL) {
3491 m_freem(m);
3492 }
3493
3494 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3495 if (error == EWOULDBLOCK || error == EINVAL) {
3496 /*
3497 * Let's try to get normal data:
3498 * EWOULDBLOCK: out-of-band data not
3499 * received yet. EINVAL: out-of-band data
3500 * already read.
3501 */
3502 error = 0;
3503 goto nooob;
3504 } else if (error == 0 && flagsp != NULL) {
3505 *flagsp |= MSG_OOB;
3506 }
3507 }
3508 socket_unlock(so, 1);
3509 if (en_tracing) {
3510 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3511 VM_KERNEL_ADDRPERM(so), 0,
3512 (int64_t)(orig_resid - uio_resid(uio)));
3513 }
3514 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3515 0, 0, 0, 0);
3516
3517 return error;
3518 }
3519 nooob:
3520 if (mp != NULL) {
3521 *mp = NULL;
3522 }
3523
3524 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3525 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3526 }
3527
3528 free_list = NULL;
3529 delayed_copy_len = 0;
3530 restart:
3531 #ifdef MORE_LOCKING_DEBUG
3532 if (so->so_usecount <= 1) {
3533 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3534 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3535 }
3536 #endif
3537 /*
3538 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3539 * and if so just return to the caller. This could happen when
3540 * soreceive() is called by a socket upcall function during the
3541 * time the socket is freed. The socket buffer would have been
3542 * locked across the upcall, therefore we cannot put this thread
3543 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3544 * we may livelock), because the lock on the socket buffer will
3545 * only be released when the upcall routine returns to its caller.
3546 * Because the socket has been officially closed, there can be
3547 * no further read on it.
3548 *
3549 * A multipath subflow socket would have its SS_NOFDREF set by
3550 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3551 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3552 */
3553 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3554 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3555 socket_unlock(so, 1);
3556 return 0;
3557 }
3558
3559 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3560 if (error) {
3561 socket_unlock(so, 1);
3562 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3563 0, 0, 0, 0);
3564 if (en_tracing) {
3565 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3566 VM_KERNEL_ADDRPERM(so), 0,
3567 (int64_t)(orig_resid - uio_resid(uio)));
3568 }
3569 return error;
3570 }
3571
3572 m = so->so_rcv.sb_mb;
3573 if (so_should_wait(so, uio, m, flags)) {
3574 /*
3575 * Panic if we notice inconsistencies in the socket's
3576 * receive list; both sb_mb and sb_cc should correctly
3577 * reflect the contents of the list, otherwise we may
3578 * end up with false positives during select() or poll()
3579 * which could put the application in a bad state.
3580 */
3581 SB_MB_CHECK(&so->so_rcv);
3582
3583 if (so->so_error) {
3584 if (m != NULL) {
3585 goto dontblock;
3586 }
3587 error = so->so_error;
3588 if ((flags & MSG_PEEK) == 0) {
3589 so->so_error = 0;
3590 }
3591 goto release;
3592 }
3593 if (so->so_state & SS_CANTRCVMORE) {
3594 #if CONTENT_FILTER
3595 /*
3596 * Deal with half-closed connections
3597 */
3598 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3599 cfil_sock_data_pending(&so->so_rcv) != 0) {
3600 CFIL_LOG(LOG_INFO,
3601 "so %llx ignore SS_CANTRCVMORE",
3602 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3603 } else
3604 #endif /* CONTENT_FILTER */
3605 if (m != NULL) {
3606 goto dontblock;
3607 } else {
3608 goto release;
3609 }
3610 }
3611 for (; m != NULL; m = m->m_next) {
3612 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3613 m = so->so_rcv.sb_mb;
3614 goto dontblock;
3615 }
3616 }
3617 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3618 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3619 error = ENOTCONN;
3620 goto release;
3621 }
3622 if (uio_resid(uio) == 0) {
3623 goto release;
3624 }
3625
3626 if ((so->so_state & SS_NBIO) ||
3627 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3628 error = EWOULDBLOCK;
3629 goto release;
3630 }
3631 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3632 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3633 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3634 #if EVEN_MORE_LOCKING_DEBUG
3635 if (socket_debug) {
3636 printf("Waiting for socket data\n");
3637 }
3638 #endif
3639
3640 /*
3641 * Depending on the protocol (e.g. TCP), the following
3642 * might cause the socket lock to be dropped and later
3643 * be reacquired, and more data could have arrived and
3644 * have been appended to the receive socket buffer by
3645 * the time it returns. Therefore, we only sleep in
3646 * sbwait() below if and only if the wait-condition is still
3647 * true.
3648 */
3649 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3650 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3651 }
3652
3653 error = 0;
3654 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3655 error = sbwait(&so->so_rcv);
3656 }
3657
3658 #if EVEN_MORE_LOCKING_DEBUG
3659 if (socket_debug) {
3660 printf("SORECEIVE - sbwait returned %d\n", error);
3661 }
3662 #endif
3663 if (so->so_usecount < 1) {
3664 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3665 __func__, so, so->so_usecount);
3666 /* NOTREACHED */
3667 }
3668 if (error) {
3669 socket_unlock(so, 1);
3670 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3671 0, 0, 0, 0);
3672 if (en_tracing) {
3673 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3674 VM_KERNEL_ADDRPERM(so), 0,
3675 (int64_t)(orig_resid - uio_resid(uio)));
3676 }
3677 return error;
3678 }
3679 goto restart;
3680 }
3681 dontblock:
3682 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3683 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3684 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3685 nextrecord = m->m_nextpkt;
3686
3687 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3688 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3689 mp0 == NULL);
3690 if (error == ERESTART) {
3691 goto restart;
3692 } else if (error != 0) {
3693 goto release;
3694 }
3695 orig_resid = 0;
3696 }
3697
3698 /*
3699 * Process one or more MT_CONTROL mbufs present before any data mbufs
3700 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3701 * just copy the data; if !MSG_PEEK, we call into the protocol to
3702 * perform externalization.
3703 */
3704 if (m != NULL && m->m_type == MT_CONTROL) {
3705 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3706 if (error != 0) {
3707 goto release;
3708 }
3709 orig_resid = 0;
3710 }
3711
3712 if (m != NULL) {
3713 if (!(flags & MSG_PEEK)) {
3714 /*
3715 * We get here because m points to an mbuf following
3716 * any MT_SONAME or MT_CONTROL mbufs which have been
3717 * processed above. In any case, m should be pointing
3718 * to the head of the mbuf chain, and the nextrecord
3719 * should be either NULL or equal to m->m_nextpkt.
3720 * See comments above about SB_LOCK.
3721 */
3722 if (m != so->so_rcv.sb_mb ||
3723 m->m_nextpkt != nextrecord) {
3724 panic("%s: post-control !sync so=%p m=%p "
3725 "nextrecord=%p\n", __func__, so, m,
3726 nextrecord);
3727 /* NOTREACHED */
3728 }
3729 if (nextrecord == NULL) {
3730 so->so_rcv.sb_lastrecord = m;
3731 }
3732 }
3733 type = m->m_type;
3734 if (type == MT_OOBDATA) {
3735 flags |= MSG_OOB;
3736 }
3737 } else {
3738 if (!(flags & MSG_PEEK)) {
3739 SB_EMPTY_FIXUP(&so->so_rcv);
3740 }
3741 }
3742 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3743 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3744
3745 moff = 0;
3746 offset = 0;
3747
3748 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3749 can_delay = 1;
3750 } else {
3751 can_delay = 0;
3752 }
3753
3754 while (m != NULL &&
3755 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3756 if (m->m_type == MT_OOBDATA) {
3757 if (type != MT_OOBDATA) {
3758 break;
3759 }
3760 } else if (type == MT_OOBDATA) {
3761 break;
3762 }
3763
3764 if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3765 m->m_type != MT_HEADER) {
3766 break;
3767 }
3768 /*
3769 * Make sure to always set the MSG_OOB flag when getting
3770 * out-of-band data inline.
3771 */
3772 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3773 (so->so_options & SO_OOBINLINE) != 0 &&
3774 (so->so_state & SS_RCVATMARK) != 0) {
3775 flags |= MSG_OOB;
3776 }
3777 so->so_state &= ~SS_RCVATMARK;
3778 len = uio_resid(uio) - delayed_copy_len;
3779 if (so->so_oobmark && len > so->so_oobmark - offset) {
3780 len = so->so_oobmark - offset;
3781 }
3782 if (len > m->m_len - moff) {
3783 len = m->m_len - moff;
3784 }
3785 /*
3786 * If mp is set, just pass back the mbufs.
3787 * Otherwise copy them out via the uio, then free.
3788 * Sockbuf must be consistent here (points to current mbuf,
3789 * it points to next record) when we drop priority;
3790 * we must note any additions to the sockbuf when we
3791 * block interrupts again.
3792 */
3793 if (mp == NULL) {
3794 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3795 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3796 if (can_delay && len == m->m_len) {
3797 /*
3798 * only delay the copy if we're consuming the
3799 * mbuf and we're NOT in MSG_PEEK mode
3800 * and we have enough data to make it worthwhile
3801 * to drop and retake the lock... can_delay
3802 * reflects the state of the 2 latter
3803 * constraints; moff should always be zero
3804 * in these cases
3805 */
3806 delayed_copy_len += len;
3807 } else {
3808 if (delayed_copy_len) {
3809 error = sodelayed_copy(so, uio,
3810 &free_list, &delayed_copy_len);
3811
3812 if (error) {
3813 goto release;
3814 }
3815 /*
3816				 * We can only get here if MSG_PEEK is not
3817				 * set; therefore, m should point at the
3818				 * head of the rcv queue. If it doesn't,
3819				 * something changed drastically
3820				 * while we were out from behind
3821				 * the lock in sodelayed_copy(), perhaps
3822				 * a RST on the stream. In any event,
3823				 * the stream has been interrupted; it's
3824				 * probably best just to return whatever
3825				 * data we've moved and let the caller
3826				 * sort it out.
3827 */
3828 if (m != so->so_rcv.sb_mb) {
3829 break;
3830 }
3831 }
3832 socket_unlock(so, 0);
3833 error = uiomove(mtod(m, caddr_t) + moff,
3834 (int)len, uio);
3835 socket_lock(so, 0);
3836
3837 if (error) {
3838 goto release;
3839 }
3840 }
3841 } else {
3842 uio_setresid(uio, (uio_resid(uio) - len));
3843 }
3844 if (len == m->m_len - moff) {
3845 if (m->m_flags & M_EOR) {
3846 flags |= MSG_EOR;
3847 }
3848 if (flags & MSG_PEEK) {
3849 m = m->m_next;
3850 moff = 0;
3851 } else {
3852 nextrecord = m->m_nextpkt;
3853 sbfree(&so->so_rcv, m);
3854 m->m_nextpkt = NULL;
3855
3856 if (mp != NULL) {
3857 *mp = m;
3858 mp = &m->m_next;
3859 so->so_rcv.sb_mb = m = m->m_next;
3860 *mp = NULL;
3861 } else {
3862 if (free_list == NULL) {
3863 free_list = m;
3864 } else {
3865 ml->m_next = m;
3866 }
3867 ml = m;
3868 so->so_rcv.sb_mb = m = m->m_next;
3869 ml->m_next = NULL;
3870 }
3871 if (m != NULL) {
3872 m->m_nextpkt = nextrecord;
3873 if (nextrecord == NULL) {
3874 so->so_rcv.sb_lastrecord = m;
3875 }
3876 } else {
3877 so->so_rcv.sb_mb = nextrecord;
3878 SB_EMPTY_FIXUP(&so->so_rcv);
3879 }
3880 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3881 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3882 }
3883 } else {
3884 if (flags & MSG_PEEK) {
3885 moff += len;
3886 } else {
3887 if (mp != NULL) {
3888 int copy_flag;
3889
3890 if (flags & MSG_DONTWAIT) {
3891 copy_flag = M_DONTWAIT;
3892 } else {
3893 copy_flag = M_WAIT;
3894 }
3895 *mp = m_copym(m, 0, (int)len, copy_flag);
3896 /*
3897 * Failed to allocate an mbuf?
3898 * Adjust uio_resid back, it was
3899 * adjusted down by len bytes which
3900 * we didn't copy over.
3901 */
3902 if (*mp == NULL) {
3903 uio_setresid(uio,
3904 (uio_resid(uio) + len));
3905 break;
3906 }
3907 }
3908 m->m_data += len;
3909 m->m_len -= len;
3910 so->so_rcv.sb_cc -= len;
3911 }
3912 }
3913 if (so->so_oobmark) {
3914 if ((flags & MSG_PEEK) == 0) {
3915 so->so_oobmark -= len;
3916 if (so->so_oobmark == 0) {
3917 so->so_state |= SS_RCVATMARK;
3918 break;
3919 }
3920 } else {
3921 offset += len;
3922 if (offset == so->so_oobmark) {
3923 break;
3924 }
3925 }
3926 }
3927 if (flags & MSG_EOR) {
3928 break;
3929 }
3930 /*
3931 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3932		 * (for a non-atomic socket), we must not quit until
3933		 * "uio->uio_resid == 0" or an error terminates the transfer.
3934 * If a signal/timeout occurs, return with a short
3935 * count but without error. Keep sockbuf locked
3936 * against other readers.
3937 */
3938 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3939 (uio_resid(uio) - delayed_copy_len) > 0 &&
3940 !sosendallatonce(so) && !nextrecord) {
3941 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3942 #if CONTENT_FILTER
3943 && cfil_sock_data_pending(&so->so_rcv) == 0
3944 #endif /* CONTENT_FILTER */
3945 )) {
3946 goto release;
3947 }
3948
3949 /*
3950 * Depending on the protocol (e.g. TCP), the following
3951 * might cause the socket lock to be dropped and later
3952 * be reacquired, and more data could have arrived and
3953 * have been appended to the receive socket buffer by
3954			 * the time it returns. Therefore, we sleep in
3955			 * sbwait() below if and only if the socket buffer is
3956 * empty, in order to avoid a false sleep.
3957 */
3958 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3959 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3960 }
3961
3962 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3963 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3964
3965 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3966 error = 0;
3967 goto release;
3968 }
3969 /*
3970			 * We have to wait until after we get back from the sbwait()
3971			 * to do the copy, because we would drop the lock if we
3972			 * have enough data that has been delayed. By dropping
3973			 * the lock we open up a window allowing the netisr
3974			 * thread to process the incoming packets and to change
3975			 * the state of this socket. We issue the sbwait()
3976			 * because the socket is empty and we're expecting the
3977			 * netisr thread to wake us up when more packets arrive;
3978			 * if we allowed that processing to happen before the sbwait(),
3979			 * we could stall forever with packets sitting in the
3980			 * socket if no further packets arrive from the remote
3981			 * side.
3982			 *
3983			 * We want to copy before we've collected all the data
3984			 * needed to satisfy this request, to allow the copy to
3985			 * overlap the incoming packet processing on an MP system.
3986 */
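			/*
			 * Flush what has accumulated once it exceeds both
			 * sorecvmincopy and half of the receive buffer's
			 * high-water mark.
			 */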
3987 if (delayed_copy_len > sorecvmincopy &&
3988 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3989 error = sodelayed_copy(so, uio,
3990 &free_list, &delayed_copy_len);
3991
3992 if (error) {
3993 goto release;
3994 }
3995 }
3996 m = so->so_rcv.sb_mb;
3997 if (m != NULL) {
3998 nextrecord = m->m_nextpkt;
3999 }
4000 SB_MB_CHECK(&so->so_rcv);
4001 }
4002 }
4003 #ifdef MORE_LOCKING_DEBUG
4004 if (so->so_usecount <= 1) {
4005 panic("%s: after big while so=%p ref=%d on socket",
4006 __func__, so, so->so_usecount);
4007 /* NOTREACHED */
4008 }
4009 #endif
4010
4011 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4012 if (so->so_options & SO_DONTTRUNC) {
4013 flags |= MSG_RCVMORE;
4014 } else {
4015 flags |= MSG_TRUNC;
4016 if ((flags & MSG_PEEK) == 0) {
4017 (void) sbdroprecord(&so->so_rcv);
4018 }
4019 }
4020 }
4021
4022 /*
4023 * pru_rcvd below (for TCP) may cause more data to be received
4024 * if the socket lock is dropped prior to sending the ACK; some
4025 * legacy OpenTransport applications don't handle this well
4026 * (if it receives less data than requested while MSG_HAVEMORE
4027 * is set), and so we set the flag now based on what we know
4028 * prior to calling pru_rcvd.
4029 */
4030 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4031 flags |= MSG_HAVEMORE;
4032 }
4033
4034 if ((flags & MSG_PEEK) == 0) {
4035 if (m == NULL) {
4036 so->so_rcv.sb_mb = nextrecord;
4037 /*
4038 * First part is an inline SB_EMPTY_FIXUP(). Second
4039 * part makes sure sb_lastrecord is up-to-date if
4040 * there is still data in the socket buffer.
4041 */
4042 if (so->so_rcv.sb_mb == NULL) {
4043 so->so_rcv.sb_mbtail = NULL;
4044 so->so_rcv.sb_lastrecord = NULL;
4045 } else if (nextrecord->m_nextpkt == NULL) {
4046 so->so_rcv.sb_lastrecord = nextrecord;
4047 }
4048 SB_MB_CHECK(&so->so_rcv);
4049 }
4050 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4051 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4052 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4053 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4054 }
4055 }
4056
4057 if (delayed_copy_len) {
4058 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4059 if (error) {
4060 goto release;
4061 }
4062 }
4063 if (free_list != NULL) {
4064 m_freem_list(free_list);
4065 free_list = NULL;
4066 }
4067
4068 if (orig_resid == uio_resid(uio) && orig_resid &&
4069 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4070 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4071 goto restart;
4072 }
4073
4074 if (flagsp != NULL) {
4075 *flagsp |= flags;
4076 }
4077 release:
4078 #ifdef MORE_LOCKING_DEBUG
4079 if (so->so_usecount <= 1) {
4080 panic("%s: release so=%p ref=%d on socket", __func__,
4081 so, so->so_usecount);
4082 /* NOTREACHED */
4083 }
4084 #endif
4085 if (delayed_copy_len) {
4086 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4087 }
4088
4089 if (free_list != NULL) {
4090 m_freem_list(free_list);
4091 }
4092
4093 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4094
4095 if (en_tracing) {
4096 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4097 VM_KERNEL_ADDRPERM(so),
4098 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4099 (int64_t)(orig_resid - uio_resid(uio)));
4100 }
4101 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4102 so->so_rcv.sb_cc, 0, error);
4103
4104 return error;
4105 }
4106
4107 /*
4108 * Returns: 0 Success
4109 * uiomove:EFAULT
4110 */
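/*
 * Drain a list of fully-consumed mbufs to user space with the socket
 * unlocked, so that the copy can overlap input processing. The mbufs
 * were already unlinked from the receive buffer by the caller, which
 * is what makes dropping the socket lock here safe.
 */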
4111 static int
4112 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4113 user_ssize_t *resid)
4114 {
4115 int error = 0;
4116 struct mbuf *m;
4117
4118 m = *free_list;
4119
4120 socket_unlock(so, 0);
4121
4122 while (m != NULL && error == 0) {
4123 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4124 m = m->m_next;
4125 }
4126 m_freem_list(*free_list);
4127
4128 *free_list = NULL;
4129 *resid = 0;
4130
4131 socket_lock(so, 0);
4132
4133 return error;
4134 }
4135
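/*
 * Counterpart of sodelayed_copy() for soreceive_list(): free_list holds
 * one mbuf chain per datagram (linked through m_nextpkt), and each chain
 * is copied out to the uio of the corresponding msgarray element.
 */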
4136 static int
4137 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4138 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4139 {
4140 #pragma unused(so)
4141 int error = 0;
4142 struct mbuf *ml, *m;
4143 int i = 0;
4144 struct uio *auio;
4145
4146 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4147 ml = ml->m_nextpkt, i++) {
4148 auio = msgarray[i].uio;
4149 for (m = ml; m != NULL; m = m->m_next) {
4150 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4151 if (error != 0) {
4152 goto out;
4153 }
4154 }
4155 }
4156 out:
4157 m_freem_list(*free_list);
4158
4159 *free_list = NULL;
4160 *resid = 0;
4161
4162 return error;
4163 }
4164
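/*
 * Receive one datagram per element of msgarray, up to uiocnt of them;
 * this is the datagram-only, packet-chain counterpart of soreceive()
 * (used by the recvmsg_x() path).
 */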
4165 int
4166 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4167 int *flagsp)
4168 {
4169 struct mbuf *m;
4170 struct mbuf *nextrecord;
4171 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4172 int error;
4173 user_ssize_t len, pktlen, delayed_copy_len = 0;
4174 struct protosw *pr = so->so_proto;
4175 user_ssize_t resid;
4176 struct proc *p = current_proc();
4177 struct uio *auio = NULL;
4178 int npkts = 0;
4179 int sblocked = 0;
4180 struct sockaddr **psa = NULL;
4181 struct mbuf **controlp = NULL;
4182 int can_delay;
4183 int flags;
4184 struct mbuf *free_others = NULL;
4185
4186 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4187 so, uiocnt,
4188 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4189
4190 /*
4191 * Sanity checks:
4192	 * - Only supports the don't-wait flags
4193	 * - Only supports datagram sockets (could be extended to raw)
4194	 * - Must be atomic
4195	 * - Protocol must support packet chains
4196	 * - The uio array must not be NULL (should we panic?)
4197 */
4198 if (flagsp != NULL) {
4199 flags = *flagsp;
4200 } else {
4201 flags = 0;
4202 }
4203 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4204 MSG_NBIO)) {
4205 printf("%s invalid flags 0x%x\n", __func__, flags);
4206 error = EINVAL;
4207 goto out;
4208 }
4209 if (so->so_type != SOCK_DGRAM) {
4210 error = EINVAL;
4211 goto out;
4212 }
4213 if (sosendallatonce(so) == 0) {
4214 error = EINVAL;
4215 goto out;
4216 }
4217 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4218 error = EPROTONOSUPPORT;
4219 goto out;
4220 }
4221 if (msgarray == NULL) {
4222 printf("%s uioarray is NULL\n", __func__);
4223 error = EINVAL;
4224 goto out;
4225 }
4226 if (uiocnt == 0) {
4227 printf("%s uiocnt is 0\n", __func__);
4228 error = EINVAL;
4229 goto out;
4230 }
4231 /*
4232 * Sanity check on the length passed by caller as we are making 'int'
4233 * comparisons
4234 */
4235 resid = recv_msg_array_resid(msgarray, uiocnt);
4236 if (resid < 0 || resid > INT_MAX) {
4237 error = EINVAL;
4238 goto out;
4239 }
4240
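	/*
	 * Copies may be deferred and batched only when consuming data
	 * (!MSG_PEEK); see sodelayed_copy_list() below.
	 */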
4241 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4242 can_delay = 1;
4243 } else {
4244 can_delay = 0;
4245 }
4246
4247 socket_lock(so, 1);
4248 so_update_last_owner_locked(so, p);
4249 so_update_policy(so);
4250
4251 #if NECP
4252 so_update_necp_policy(so, NULL, NULL);
4253 #endif /* NECP */
4254
4255 /*
4256 * If a recv attempt is made on a previously-accepted socket
4257 * that has been marked as inactive (disconnected), reject
4258 * the request.
4259 */
4260 if (so->so_flags & SOF_DEFUNCT) {
4261 struct sockbuf *sb = &so->so_rcv;
4262
4263 error = ENOTCONN;
4264 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4265 __func__, proc_pid(p), proc_best_name(p),
4266 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4267 SOCK_DOM(so), SOCK_TYPE(so), error);
4268 /*
4269 * This socket should have been disconnected and flushed
4270 * prior to being returned from sodefunct(); there should
4271 * be no data on its receive list, so panic otherwise.
4272 */
4273 if (so->so_state & SS_DEFUNCT) {
4274 sb_empty_assert(sb, __func__);
4275 }
4276 goto release;
4277 }
4278
4279 next:
4280 /*
4281	 * Stop once the uio array has been fully consumed
4282 */
4283 if (npkts >= uiocnt) {
4284 error = 0;
4285 goto release;
4286 }
4287 restart:
4288 /*
4289 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4290 * and if so just return to the caller. This could happen when
4291 * soreceive() is called by a socket upcall function during the
4292 * time the socket is freed. The socket buffer would have been
4293 * locked across the upcall, therefore we cannot put this thread
4294 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4295 * we may livelock), because the lock on the socket buffer will
4296 * only be released when the upcall routine returns to its caller.
4297 * Because the socket has been officially closed, there can be
4298 * no further read on it.
4299 */
4300 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4301 (SS_NOFDREF | SS_CANTRCVMORE)) {
4302 error = 0;
4303 goto release;
4304 }
4305
4306 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4307 if (error) {
4308 goto release;
4309 }
4310 sblocked = 1;
4311
4312 m = so->so_rcv.sb_mb;
4313 /*
4314 * Block awaiting more datagram if needed
4315 */
4316 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4317 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4318 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4319 /*
4320 * Panic if we notice inconsistencies in the socket's
4321 * receive list; both sb_mb and sb_cc should correctly
4322 * reflect the contents of the list, otherwise we may
4323 * end up with false positives during select() or poll()
4324 * which could put the application in a bad state.
4325 */
4326 SB_MB_CHECK(&so->so_rcv);
4327
4328 if (so->so_error) {
4329 error = so->so_error;
4330 if ((flags & MSG_PEEK) == 0) {
4331 so->so_error = 0;
4332 }
4333 goto release;
4334 }
4335 if (so->so_state & SS_CANTRCVMORE) {
4336 goto release;
4337 }
4338 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4339 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4340 error = ENOTCONN;
4341 goto release;
4342 }
4343 if ((so->so_state & SS_NBIO) ||
4344 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4345 error = EWOULDBLOCK;
4346 goto release;
4347 }
4348 /*
4349 * Do not block if we got some data
4350 */
4351 if (free_list != NULL) {
4352 error = 0;
4353 goto release;
4354 }
4355
4356 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4357 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4358
4359 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4360 sblocked = 0;
4361
4362 error = sbwait(&so->so_rcv);
4363 if (error) {
4364 goto release;
4365 }
4366 goto restart;
4367 }
4368
4369 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4370 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4371 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4372
4373 /*
4374 * Consume the current uio index as we have a datagram
4375 */
4376 auio = msgarray[npkts].uio;
4377 resid = uio_resid(auio);
4378 msgarray[npkts].which |= SOCK_MSG_DATA;
4379 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4380 &msgarray[npkts].psa : NULL;
4381 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4382 &msgarray[npkts].controlp : NULL;
4383 npkts += 1;
4384 nextrecord = m->m_nextpkt;
4385
4386 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4387 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4388 if (error == ERESTART) {
4389 goto restart;
4390 } else if (error != 0) {
4391 goto release;
4392 }
4393 }
4394
4395 if (m != NULL && m->m_type == MT_CONTROL) {
4396 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4397 if (error != 0) {
4398 goto release;
4399 }
4400 }
4401
4402 if (m->m_pkthdr.len == 0) {
4403 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4404 __func__, __LINE__,
4405 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4406 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4407 m->m_type);
4408 }
4409
4410 /*
4411 * Loop to copy the mbufs of the current record
4412 * Support zero length packets
4413 */
4414 ml = NULL;
4415 pktlen = 0;
4416 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4417 if (m->m_len == 0) {
4418 panic("%p m_len zero", m);
4419 }
4420 if (m->m_type == 0) {
4421 panic("%p m_type zero", m);
4422 }
4423 /*
4424 * Clip to the residual length
4425 */
4426 if (len > m->m_len) {
4427 len = m->m_len;
4428 }
4429 pktlen += len;
4430 /*
4431 * Copy the mbufs via the uio or delay the copy
4432		 * The sockbuf must be consistent here (sb_mb points to the
4433		 * current mbuf, m_nextpkt to the next record) whenever we
4434		 * drop the socket lock; we must note any additions to the
4435		 * sockbuf when we reacquire it.
4436 */
4437 if (len > 0 && can_delay == 0) {
4438 socket_unlock(so, 0);
4439 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4440 socket_lock(so, 0);
4441 if (error) {
4442 goto release;
4443 }
4444 } else {
4445 delayed_copy_len += len;
4446 }
4447
4448 if (len == m->m_len) {
4449 /*
4450 * m was entirely copied
4451 */
4452 sbfree(&so->so_rcv, m);
4453 nextrecord = m->m_nextpkt;
4454 m->m_nextpkt = NULL;
4455
4456 /*
4457 * Set the first packet to the head of the free list
4458 */
4459 if (free_list == NULL) {
4460 free_list = m;
4461 }
4462 /*
4463 * Link current packet to tail of free list
4464 */
4465 if (ml == NULL) {
4466 if (free_tail != NULL) {
4467 free_tail->m_nextpkt = m;
4468 }
4469 free_tail = m;
4470 }
4471 /*
4472 * Link current mbuf to last mbuf of current packet
4473 */
4474 if (ml != NULL) {
4475 ml->m_next = m;
4476 }
4477 ml = m;
4478
4479 /*
4480 * Move next buf to head of socket buffer
4481 */
4482 so->so_rcv.sb_mb = m = ml->m_next;
4483 ml->m_next = NULL;
4484
4485 if (m != NULL) {
4486 m->m_nextpkt = nextrecord;
4487 if (nextrecord == NULL) {
4488 so->so_rcv.sb_lastrecord = m;
4489 }
4490 } else {
4491 so->so_rcv.sb_mb = nextrecord;
4492 SB_EMPTY_FIXUP(&so->so_rcv);
4493 }
4494 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4495 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4496 } else {
4497 /*
4498 * Stop the loop on partial copy
4499 */
4500 break;
4501 }
4502 }
4503 #ifdef MORE_LOCKING_DEBUG
4504 if (so->so_usecount <= 1) {
4505 panic("%s: after big while so=%llx ref=%d on socket",
4506 __func__,
4507 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4508 /* NOTREACHED */
4509 }
4510 #endif
4511 /*
4512 * Tell the caller we made a partial copy
4513 */
4514 if (m != NULL) {
4515 if (so->so_options & SO_DONTTRUNC) {
4516 /*
4517			 * Copy out the free list first, then the partial mbuf
4518 */
4519 socket_unlock(so, 0);
4520 if (delayed_copy_len) {
4521 error = sodelayed_copy_list(so, msgarray,
4522 uiocnt, &free_list, &delayed_copy_len);
4523 }
4524
4525 if (error == 0) {
4526 error = uiomove(mtod(m, caddr_t), (int)len,
4527 auio);
4528 }
4529 socket_lock(so, 0);
4530 if (error) {
4531 goto release;
4532 }
4533
4534 m->m_data += len;
4535 m->m_len -= len;
4536 so->so_rcv.sb_cc -= len;
4537 flags |= MSG_RCVMORE;
4538 } else {
4539 (void) sbdroprecord(&so->so_rcv);
4540 nextrecord = so->so_rcv.sb_mb;
4541 m = NULL;
4542 flags |= MSG_TRUNC;
4543 }
4544 }
4545
4546 if (m == NULL) {
4547 so->so_rcv.sb_mb = nextrecord;
4548 /*
4549 * First part is an inline SB_EMPTY_FIXUP(). Second
4550 * part makes sure sb_lastrecord is up-to-date if
4551 * there is still data in the socket buffer.
4552 */
4553 if (so->so_rcv.sb_mb == NULL) {
4554 so->so_rcv.sb_mbtail = NULL;
4555 so->so_rcv.sb_lastrecord = NULL;
4556 } else if (nextrecord->m_nextpkt == NULL) {
4557 so->so_rcv.sb_lastrecord = nextrecord;
4558 }
4559 SB_MB_CHECK(&so->so_rcv);
4560 }
4561 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4562 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4563
4564 /*
4565 * We can continue to the next packet as long as:
4566 * - We haven't exhausted the uio array
4567 * - There was no error
4568 * - A packet was not truncated
4569 * - We can still receive more data
4570 */
4571 if (npkts < uiocnt && error == 0 &&
4572 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4573 (so->so_state & SS_CANTRCVMORE) == 0) {
4574 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4575 sblocked = 0;
4576
4577 goto next;
4578 }
4579 if (flagsp != NULL) {
4580 *flagsp |= flags;
4581 }
4582
4583 release:
4584 /*
4585 * pru_rcvd may cause more data to be received if the socket lock
4586 * is dropped so we set MSG_HAVEMORE now based on what we know.
4587 * That way the caller won't be surprised if it receives less data
4588 * than requested.
4589 */
4590 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4591 flags |= MSG_HAVEMORE;
4592 }
4593
4594 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4595 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4596 }
4597
4598 if (sblocked) {
4599 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4600 } else {
4601 socket_unlock(so, 1);
4602 }
4603
4604 if (delayed_copy_len) {
4605 error = sodelayed_copy_list(so, msgarray, uiocnt,
4606 &free_list, &delayed_copy_len);
4607 }
4608 out:
4609 /*
4610 * Amortize the cost of freeing the mbufs
4611 */
4612 if (free_list != NULL) {
4613 m_freem_list(free_list);
4614 }
4615 if (free_others != NULL) {
4616 m_freem_list(free_others);
4617 }
4618
4619 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4620 0, 0, 0, 0);
4621 return error;
4622 }
4623
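/*
 * Translate a SO_STATISTICS_EVENT option value supplied by user space
 * into the corresponding nstat (network statistics) event.
 */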
4624 static int
4625 so_statistics_event_to_nstat_event(int64_t *input_options,
4626 uint64_t *nstat_event)
4627 {
4628 int error = 0;
4629 switch (*input_options) {
4630 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4631 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4632 break;
4633 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4634 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4635 break;
4636 #if (DEBUG || DEVELOPMENT)
4637 case SO_STATISTICS_EVENT_RESERVED_1:
4638 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4639 break;
4640 case SO_STATISTICS_EVENT_RESERVED_2:
4641 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4642 break;
4643 #endif /* (DEBUG || DEVELOPMENT) */
4644 default:
4645 error = EINVAL;
4646 break;
4647 }
4648 return error;
4649 }
4650
4651 /*
4652 * Returns: 0 Success
4653 * EINVAL
4654 * ENOTCONN
4655 * <pru_shutdown>:EINVAL
4656 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4657 * <pru_shutdown>:ENOBUFS[TCP]
4658 * <pru_shutdown>:EMSGSIZE[TCP]
4659 * <pru_shutdown>:EHOSTUNREACH[TCP]
4660 * <pru_shutdown>:ENETUNREACH[TCP]
4661 * <pru_shutdown>:ENETDOWN[TCP]
4662 * <pru_shutdown>:ENOMEM[TCP]
4663 * <pru_shutdown>:EACCES[TCP]
4664 * <pru_shutdown>:EMSGSIZE[TCP]
4665 * <pru_shutdown>:ENOBUFS[TCP]
4666 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4667 * <pru_shutdown>:??? [other protocol families]
4668 */
4669 int
4670 soshutdown(struct socket *so, int how)
4671 {
4672 int error;
4673
4674 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4675
4676 switch (how) {
4677 case SHUT_RD:
4678 case SHUT_WR:
4679 case SHUT_RDWR:
4680 socket_lock(so, 1);
4681 if ((so->so_state &
4682 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4683 error = ENOTCONN;
4684 } else {
4685 error = soshutdownlock(so, how);
4686 }
4687 socket_unlock(so, 1);
4688 break;
4689 default:
4690 error = EINVAL;
4691 break;
4692 }
4693
4694 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4695
4696 return error;
4697 }
4698
4699 int
4700 soshutdownlock_final(struct socket *so, int how)
4701 {
4702 struct protosw *pr = so->so_proto;
4703 int error = 0;
4704
4705 sflt_notify(so, sock_evt_shutdown, &how);
4706
4707 if (how != SHUT_WR) {
4708 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4709 /* read already shut down */
4710 error = ENOTCONN;
4711 goto done;
4712 }
4713 sorflush(so);
4714 }
4715 if (how != SHUT_RD) {
4716 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4717 /* write already shut down */
4718 error = ENOTCONN;
4719 goto done;
4720 }
4721 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4722 }
4723 done:
4724 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4725 return error;
4726 }
4727
4728 int
4729 soshutdownlock(struct socket *so, int how)
4730 {
4731 int error = 0;
4732
4733 #if CONTENT_FILTER
4734 /*
4735 * A content filter may delay the actual shutdown until it
4736 * has processed the pending data
4737 */
4738 if (so->so_flags & SOF_CONTENT_FILTER) {
4739 error = cfil_sock_shutdown(so, &how);
4740 if (error == EJUSTRETURN) {
4741 error = 0;
4742 goto done;
4743 } else if (error != 0) {
4744 goto done;
4745 }
4746 }
4747 #endif /* CONTENT_FILTER */
4748
4749 error = soshutdownlock_final(so, how);
4750
4751 done:
4752 return error;
4753 }
4754
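/*
 * Flush the send side of a socket being torn down: mark the send buffer
 * as dropping, detach select/upcall state, and release its mbufs.
 */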
4755 void
4756 sowflush(struct socket *so)
4757 {
4758 struct sockbuf *sb = &so->so_snd;
4759
4760 /*
4761 * Obtain lock on the socket buffer (SB_LOCK). This is required
4762 * to prevent the socket buffer from being unexpectedly altered
4763 * while it is used by another thread in socket send/receive.
4764 *
4765 * sblock() must not fail here, hence the assertion.
4766 */
4767 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4768 VERIFY(sb->sb_flags & SB_LOCK);
4769
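	/*
	 * Setting SB_DROP acts as a barrier against further appends;
	 * clearing SB_SEL/SB_UPCALL keeps select wakeups and upcalls
	 * from referencing state that is about to go away.
	 */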
4770 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4771 sb->sb_flags |= SB_DROP;
4772 sb->sb_upcall = NULL;
4773 sb->sb_upcallarg = NULL;
4774
4775 sbunlock(sb, TRUE); /* keep socket locked */
4776
4777 selthreadclear(&sb->sb_sel);
4778 sbrelease(sb);
4779 }
4780
4781 void
4782 sorflush(struct socket *so)
4783 {
4784 struct sockbuf *sb = &so->so_rcv;
4785 struct protosw *pr = so->so_proto;
4786 struct sockbuf asb;
4787 #ifdef notyet
4788 lck_mtx_t *mutex_held;
4789 /*
4790 * XXX: This code is currently commented out, because we may get here
4791 * as part of sofreelastref(), and at that time, pr_getlock() may no
4792 * longer be able to return us the lock; this will be fixed in the future.
4793 */
4794 if (so->so_proto->pr_getlock != NULL) {
4795 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4796 } else {
4797 mutex_held = so->so_proto->pr_domain->dom_mtx;
4798 }
4799
4800 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4801 #endif /* notyet */
4802
4803 sflt_notify(so, sock_evt_flush_read, NULL);
4804
4805 socantrcvmore(so);
4806
4807 /*
4808 * Obtain lock on the socket buffer (SB_LOCK). This is required
4809 * to prevent the socket buffer from being unexpectedly altered
4810 * while it is used by another thread in socket send/receive.
4811 *
4812 * sblock() must not fail here, hence the assertion.
4813 */
4814 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4815 VERIFY(sb->sb_flags & SB_LOCK);
4816
4817 /*
4818 * Copy only the relevant fields from "sb" to "asb" which we
4819 * need for sbrelease() to function. In particular, skip
4820 * sb_sel as it contains the wait queue linkage, which would
4821 * wreak havoc if we were to issue selthreadclear() on "asb".
4822 * Make sure to not carry over SB_LOCK in "asb", as we need
4823 * to acquire it later as part of sbrelease().
4824 */
4825 bzero(&asb, sizeof(asb));
4826 asb.sb_cc = sb->sb_cc;
4827 asb.sb_hiwat = sb->sb_hiwat;
4828 asb.sb_mbcnt = sb->sb_mbcnt;
4829 asb.sb_mbmax = sb->sb_mbmax;
4830 asb.sb_ctl = sb->sb_ctl;
4831 asb.sb_lowat = sb->sb_lowat;
4832 asb.sb_mb = sb->sb_mb;
4833 asb.sb_mbtail = sb->sb_mbtail;
4834 asb.sb_lastrecord = sb->sb_lastrecord;
4835 asb.sb_so = sb->sb_so;
4836 asb.sb_flags = sb->sb_flags;
4837 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4838 asb.sb_flags |= SB_DROP;
4839
4840 /*
4841 * Ideally we'd bzero() these and preserve the ones we need;
4842 * but to do that we'd need to shuffle things around in the
4843 * sockbuf, and we can't do it now because there are KEXTS
4844 * that are directly referring to the socket structure.
4845 *
4846 * Setting SB_DROP acts as a barrier to prevent further appends.
4847 * Clearing SB_SEL is done for selthreadclear() below.
4848 */
4849 sb->sb_cc = 0;
4850 sb->sb_hiwat = 0;
4851 sb->sb_mbcnt = 0;
4852 sb->sb_mbmax = 0;
4853 sb->sb_ctl = 0;
4854 sb->sb_lowat = 0;
4855 sb->sb_mb = NULL;
4856 sb->sb_mbtail = NULL;
4857 sb->sb_lastrecord = NULL;
4858 sb->sb_timeo.tv_sec = 0;
4859 sb->sb_timeo.tv_usec = 0;
4860 sb->sb_upcall = NULL;
4861 sb->sb_upcallarg = NULL;
4862 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4863 sb->sb_flags |= SB_DROP;
4864
4865 sbunlock(sb, TRUE); /* keep socket locked */
4866
4867 /*
4868 * Note that selthreadclear() is called on the original "sb" and
4869 * not the local "asb" because of the way wait queue linkage is
4870 * implemented. Given that selwakeup() may be triggered, SB_SEL
4871 * should no longer be set (cleared above.)
4872 */
4873 selthreadclear(&sb->sb_sel);
4874
4875 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4876 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4877 }
4878
4879 sbrelease(&asb);
4880 }
4881
4882 /*
4883 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4884 * an additional variant to handle the case where the option value needs
4885 * to be some kind of integer, but not a specific size.
4886 * In addition to their use here, these functions are also called by the
4887 * protocol-level pr_ctloutput() routines.
4888 *
4889 * Returns: 0 Success
4890 * EINVAL
4891 * copyin:EFAULT
4892 */
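/*
 * Typical caller pattern (a sketch; mirrors the option handlers in
 * sosetoptlock() below):
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error != 0)
 *		goto out;
 */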
4893 int
4894 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4895 {
4896 size_t valsize;
4897
4898 /*
4899 * If the user gives us more than we wanted, we ignore it,
4900 * but if we don't get the minimum length the caller
4901 * wants, we return EINVAL. On success, sopt->sopt_valsize
4902 * is set to however much we actually retrieved.
4903 */
4904 if ((valsize = sopt->sopt_valsize) < minlen) {
4905 return EINVAL;
4906 }
4907 if (valsize > len) {
4908 sopt->sopt_valsize = valsize = len;
4909 }
4910
4911 if (sopt->sopt_p != kernproc) {
4912 return copyin(sopt->sopt_val, buf, valsize);
4913 }
4914
4915 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4916 return 0;
4917 }
4918
4919 /*
4920 * sooptcopyin_timeval
4921 * Copy a timeval value into tv_p, and take into account whether
4922 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4923 * code here so that we can verify the 64-bit tv_sec value before we lose
4924 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4925 */
4926 static int
4927 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4928 {
4929 int error;
4930
4931 if (proc_is64bit(sopt->sopt_p)) {
4932 struct user64_timeval tv64;
4933
4934 if (sopt->sopt_valsize < sizeof(tv64)) {
4935 return EINVAL;
4936 }
4937
4938 sopt->sopt_valsize = sizeof(tv64);
4939 if (sopt->sopt_p != kernproc) {
4940 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4941 if (error != 0) {
4942 return error;
4943 }
4944 } else {
4945 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4946 sizeof(tv64));
4947 }
4948 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4949 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4950 return EDOM;
4951 }
4952
4953 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4954 tv_p->tv_usec = tv64.tv_usec;
4955 } else {
4956 struct user32_timeval tv32;
4957
4958 if (sopt->sopt_valsize < sizeof(tv32)) {
4959 return EINVAL;
4960 }
4961
4962 sopt->sopt_valsize = sizeof(tv32);
4963 if (sopt->sopt_p != kernproc) {
4964 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4965 if (error != 0) {
4966 return error;
4967 }
4968 } else {
4969 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4970 sizeof(tv32));
4971 }
4972 #ifndef __LP64__
4973 /*
4974 * K64todo "comparison is always false due to
4975 * limited range of data type"
4976 */
4977 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4978 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4979 return EDOM;
4980 }
4981 #endif
4982 tv_p->tv_sec = tv32.tv_sec;
4983 tv_p->tv_usec = tv32.tv_usec;
4984 }
4985 return 0;
4986 }
4987
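/*
 * Check 'priv' against the socket's effective credential. If the socket
 * is delegated (SOF_DELEGATED) and ignore_delegate is false, the
 * delegate process's credential is used instead of the owner's; with
 * allow_root set, the check is skipped entirely for uid 0.
 */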
4988 int
4989 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4990 boolean_t ignore_delegate)
4991 {
4992 kauth_cred_t cred = NULL;
4993 proc_t ep = PROC_NULL;
4994 uid_t uid;
4995 int error = 0;
4996
4997 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4998 ep = proc_find(so->e_pid);
4999 if (ep) {
5000 cred = kauth_cred_proc_ref(ep);
5001 }
5002 }
5003
5004 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5005
5006 /* uid is 0 for root */
5007 if (uid != 0 || !allow_root) {
5008 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5009 }
5010 if (cred) {
5011 kauth_cred_unref(&cred);
5012 }
5013 if (ep != PROC_NULL) {
5014 proc_rele(ep);
5015 }
5016
5017 return error;
5018 }
5019
5020 /*
5021 * Returns: 0 Success
5022 * EINVAL
5023 * ENOPROTOOPT
5024 * ENOBUFS
5025 * EDOM
5026 * sooptcopyin:EINVAL
5027 * sooptcopyin:EFAULT
5028 * sooptcopyin_timeval:EINVAL
5029 * sooptcopyin_timeval:EFAULT
5030 * sooptcopyin_timeval:EDOM
5031 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5032 * <pr_ctloutput>:???
5033 * sflt_attach_private:??? [whatever a filter author chooses]
5034 * <sf_setoption>:??? [whatever a filter author chooses]
5035 *
5036 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
5037 * <sf_setoption> returns depend on what the filter author causes
5038 * their filter to return.
5039 */
5040 int
5041 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5042 {
5043 int error, optval;
5044 int64_t long_optval;
5045 struct linger l;
5046 struct timeval tv;
5047
5048 if (sopt->sopt_dir != SOPT_SET) {
5049 sopt->sopt_dir = SOPT_SET;
5050 }
5051
5052 if (dolock) {
5053 socket_lock(so, 1);
5054 }
5055
5056 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5057 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5058 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5059		/* the socket has been shut down, no more sockopts */
5060 error = EINVAL;
5061 goto out;
5062 }
5063
5064 error = sflt_setsockopt(so, sopt);
5065 if (error != 0) {
5066 if (error == EJUSTRETURN) {
5067 error = 0;
5068 }
5069 goto out;
5070 }
5071
5072 if (sopt->sopt_level != SOL_SOCKET) {
5073 if (so->so_proto != NULL &&
5074 so->so_proto->pr_ctloutput != NULL) {
5075 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5076 goto out;
5077 }
5078 error = ENOPROTOOPT;
5079 } else {
5080 /*
5081 * Allow socket-level (SOL_SOCKET) options to be filtered by
5082 * the protocol layer, if needed. A zero value returned from
5083 * the handler means use default socket-level processing as
5084 * done by the rest of this routine. Otherwise, any other
5085 * return value indicates that the option is unsupported.
5086 */
5087 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5088 pru_socheckopt(so, sopt)) != 0) {
5089 goto out;
5090 }
5091
5092 error = 0;
5093 switch (sopt->sopt_name) {
5094 case SO_LINGER:
5095 case SO_LINGER_SEC: {
5096 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5097 if (error != 0) {
5098 goto out;
5099 }
5100			/* so_linger is in ticks: SO_LINGER passes ticks, SO_LINGER_SEC converts seconds via hz */
5101 if (sopt->sopt_name == SO_LINGER) {
5102 so->so_linger = (short)l.l_linger;
5103 } else {
5104 so->so_linger = (short)((long)l.l_linger * hz);
5105 }
5106 if (l.l_onoff != 0) {
5107 so->so_options |= SO_LINGER;
5108 } else {
5109 so->so_options &= ~SO_LINGER;
5110 }
5111 break;
5112 }
5113 case SO_DEBUG:
5114 case SO_KEEPALIVE:
5115 case SO_DONTROUTE:
5116 case SO_USELOOPBACK:
5117 case SO_BROADCAST:
5118 case SO_REUSEADDR:
5119 case SO_REUSEPORT:
5120 case SO_OOBINLINE:
5121 case SO_TIMESTAMP:
5122 case SO_TIMESTAMP_MONOTONIC:
5123 case SO_TIMESTAMP_CONTINUOUS:
5124 case SO_DONTTRUNC:
5125 case SO_WANTMORE:
5126 case SO_WANTOOBFLAG:
5127 case SO_NOWAKEFROMSLEEP:
5128 case SO_NOAPNFALLBK:
5129 error = sooptcopyin(sopt, &optval, sizeof(optval),
5130 sizeof(optval));
5131 if (error != 0) {
5132 goto out;
5133 }
5134 if (optval) {
5135 so->so_options |= sopt->sopt_name;
5136 } else {
5137 so->so_options &= ~sopt->sopt_name;
5138 }
5139 #if SKYWALK
5140 inp_update_netns_flags(so);
5141 #endif /* SKYWALK */
5142 break;
5143
5144 case SO_SNDBUF:
5145 case SO_RCVBUF:
5146 case SO_SNDLOWAT:
5147 case SO_RCVLOWAT:
5148 error = sooptcopyin(sopt, &optval, sizeof(optval),
5149 sizeof(optval));
5150 if (error != 0) {
5151 goto out;
5152 }
5153
5154 /*
5155 * Values < 1 make no sense for any of these
5156 * options, so disallow them.
5157 */
5158 if (optval < 1) {
5159 error = EINVAL;
5160 goto out;
5161 }
5162
5163 switch (sopt->sopt_name) {
5164 case SO_SNDBUF:
5165 case SO_RCVBUF: {
5166 struct sockbuf *sb =
5167 (sopt->sopt_name == SO_SNDBUF) ?
5168 &so->so_snd : &so->so_rcv;
5169 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5170 error = ENOBUFS;
5171 goto out;
5172 }
5173 sb->sb_flags |= SB_USRSIZE;
5174 sb->sb_flags &= ~SB_AUTOSIZE;
5175 sb->sb_idealsize = (u_int32_t)optval;
5176 break;
5177 }
5178 /*
5179 * Make sure the low-water is never greater than
5180 * the high-water.
5181 */
5182 case SO_SNDLOWAT: {
5183 int space = sbspace(&so->so_snd);
5184 uint32_t hiwat = so->so_snd.sb_hiwat;
5185
5186 if (so->so_snd.sb_flags & SB_UNIX) {
5187 struct unpcb *unp =
5188 (struct unpcb *)(so->so_pcb);
5189 if (unp != NULL &&
5190 unp->unp_conn != NULL) {
5191 struct socket *so2 = unp->unp_conn->unp_socket;
5192 hiwat += unp->unp_conn->unp_cc;
5193 space = sbspace(&so2->so_rcv);
5194 }
5195 }
5196
5197 so->so_snd.sb_lowat =
5198 (optval > hiwat) ?
5199 hiwat : optval;
5200
5201 if (space >= so->so_snd.sb_lowat) {
5202 sowwakeup(so);
5203 }
5204 break;
5205 }
5206 case SO_RCVLOWAT: {
5207 int64_t data_len;
5208 so->so_rcv.sb_lowat =
5209 (optval > so->so_rcv.sb_hiwat) ?
5210 so->so_rcv.sb_hiwat : optval;
5211 if (so->so_rcv.sb_flags & SB_UNIX) {
5212 struct unpcb *unp =
5213 (struct unpcb *)(so->so_pcb);
5214 if (unp != NULL &&
5215 unp->unp_conn != NULL) {
5216 struct socket *so2 = unp->unp_conn->unp_socket;
5217 data_len = so2->so_snd.sb_cc
5218 - so2->so_snd.sb_ctl;
5219 } else {
5220 data_len = so->so_rcv.sb_cc
5221 - so->so_rcv.sb_ctl;
5222 }
5223 } else {
5224 data_len = so->so_rcv.sb_cc
5225 - so->so_rcv.sb_ctl;
5226 }
5227
5228 if (data_len >= so->so_rcv.sb_lowat) {
5229 sorwakeup(so);
5230 }
5231 break;
5232 }
5233 }
5234 break;
5235
5236 case SO_SNDTIMEO:
5237 case SO_RCVTIMEO:
5238 error = sooptcopyin_timeval(sopt, &tv);
5239 if (error != 0) {
5240 goto out;
5241 }
5242
5243 switch (sopt->sopt_name) {
5244 case SO_SNDTIMEO:
5245 so->so_snd.sb_timeo = tv;
5246 break;
5247 case SO_RCVTIMEO:
5248 so->so_rcv.sb_timeo = tv;
5249 break;
5250 }
5251 break;
5252
5253 case SO_NKE: {
5254 struct so_nke nke;
5255
5256 error = sooptcopyin(sopt, &nke, sizeof(nke),
5257 sizeof(nke));
5258 if (error != 0) {
5259 goto out;
5260 }
5261
5262 error = sflt_attach_internal(so, nke.nke_handle);
5263 break;
5264 }
5265
5266 case SO_NOSIGPIPE:
5267 error = sooptcopyin(sopt, &optval, sizeof(optval),
5268 sizeof(optval));
5269 if (error != 0) {
5270 goto out;
5271 }
5272 if (optval != 0) {
5273 so->so_flags |= SOF_NOSIGPIPE;
5274 } else {
5275 so->so_flags &= ~SOF_NOSIGPIPE;
5276 }
5277 break;
5278
5279 case SO_NOADDRERR:
5280 error = sooptcopyin(sopt, &optval, sizeof(optval),
5281 sizeof(optval));
5282 if (error != 0) {
5283 goto out;
5284 }
5285 if (optval != 0) {
5286 so->so_flags |= SOF_NOADDRAVAIL;
5287 } else {
5288 so->so_flags &= ~SOF_NOADDRAVAIL;
5289 }
5290 break;
5291
5292 case SO_REUSESHAREUID:
5293 error = sooptcopyin(sopt, &optval, sizeof(optval),
5294 sizeof(optval));
5295 if (error != 0) {
5296 goto out;
5297 }
5298 if (optval != 0) {
5299 so->so_flags |= SOF_REUSESHAREUID;
5300 } else {
5301 so->so_flags &= ~SOF_REUSESHAREUID;
5302 }
5303 break;
5304
5305 case SO_NOTIFYCONFLICT:
5306 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5307 error = EPERM;
5308 goto out;
5309 }
5310 error = sooptcopyin(sopt, &optval, sizeof(optval),
5311 sizeof(optval));
5312 if (error != 0) {
5313 goto out;
5314 }
5315 if (optval != 0) {
5316 so->so_flags |= SOF_NOTIFYCONFLICT;
5317 } else {
5318 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5319 }
5320 break;
5321
5322 case SO_RESTRICTIONS:
5323 error = sooptcopyin(sopt, &optval, sizeof(optval),
5324 sizeof(optval));
5325 if (error != 0) {
5326 goto out;
5327 }
5328
5329 error = so_set_restrictions(so, optval);
5330 break;
5331
5332 case SO_AWDL_UNRESTRICTED:
5333 if (SOCK_DOM(so) != PF_INET &&
5334 SOCK_DOM(so) != PF_INET6) {
5335 error = EOPNOTSUPP;
5336 goto out;
5337 }
5338 error = sooptcopyin(sopt, &optval, sizeof(optval),
5339 sizeof(optval));
5340 if (error != 0) {
5341 goto out;
5342 }
5343 if (optval != 0) {
5344 error = soopt_cred_check(so,
5345 PRIV_NET_RESTRICTED_AWDL, false, false);
5346 if (error == 0) {
5347 inp_set_awdl_unrestricted(
5348 sotoinpcb(so));
5349 }
5350 } else {
5351 inp_clear_awdl_unrestricted(sotoinpcb(so));
5352 }
5353 break;
5354 case SO_INTCOPROC_ALLOW:
5355 if (SOCK_DOM(so) != PF_INET6) {
5356 error = EOPNOTSUPP;
5357 goto out;
5358 }
5359 error = sooptcopyin(sopt, &optval, sizeof(optval),
5360 sizeof(optval));
5361 if (error != 0) {
5362 goto out;
5363 }
5364 if (optval != 0 &&
5365 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5366 error = soopt_cred_check(so,
5367 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5368 if (error == 0) {
5369 inp_set_intcoproc_allowed(
5370 sotoinpcb(so));
5371 }
5372 } else if (optval == 0) {
5373 inp_clear_intcoproc_allowed(sotoinpcb(so));
5374 }
5375 break;
5376
5377 case SO_LABEL:
5378 error = EOPNOTSUPP;
5379 break;
5380
5381 case SO_UPCALLCLOSEWAIT:
5382 error = sooptcopyin(sopt, &optval, sizeof(optval),
5383 sizeof(optval));
5384 if (error != 0) {
5385 goto out;
5386 }
5387 if (optval != 0) {
5388 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5389 } else {
5390 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5391 }
5392 break;
5393
5394 case SO_RANDOMPORT:
5395 error = sooptcopyin(sopt, &optval, sizeof(optval),
5396 sizeof(optval));
5397 if (error != 0) {
5398 goto out;
5399 }
5400 if (optval != 0) {
5401 so->so_flags |= SOF_BINDRANDOMPORT;
5402 } else {
5403 so->so_flags &= ~SOF_BINDRANDOMPORT;
5404 }
5405 break;
5406
5407 case SO_NP_EXTENSIONS: {
5408 struct so_np_extensions sonpx;
5409
5410 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5411 sizeof(sonpx));
5412 if (error != 0) {
5413 goto out;
5414 }
5415 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5416 error = EINVAL;
5417 goto out;
5418 }
5419 /*
5420 * Only one bit defined for now
5421 */
5422 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5423 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5424 so->so_flags |= SOF_NPX_SETOPTSHUT;
5425 } else {
5426 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5427 }
5428 }
5429 break;
5430 }
5431
5432 case SO_TRAFFIC_CLASS: {
5433 error = sooptcopyin(sopt, &optval, sizeof(optval),
5434 sizeof(optval));
5435 if (error != 0) {
5436 goto out;
5437 }
5438 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5439 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5440 error = so_set_net_service_type(so, netsvc);
5441 goto out;
5442 }
5443 error = so_set_traffic_class(so, optval);
5444 if (error != 0) {
5445 goto out;
5446 }
5447 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5448 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5449 break;
5450 }
5451
5452 case SO_RECV_TRAFFIC_CLASS: {
5453 error = sooptcopyin(sopt, &optval, sizeof(optval),
5454 sizeof(optval));
5455 if (error != 0) {
5456 goto out;
5457 }
5458 if (optval == 0) {
5459 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5460 } else {
5461 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5462 }
5463 break;
5464 }
5465
5466 #if (DEVELOPMENT || DEBUG)
5467 case SO_TRAFFIC_CLASS_DBG: {
5468 struct so_tcdbg so_tcdbg;
5469
5470 error = sooptcopyin(sopt, &so_tcdbg,
5471 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5472 if (error != 0) {
5473 goto out;
5474 }
5475 error = so_set_tcdbg(so, &so_tcdbg);
5476 if (error != 0) {
5477 goto out;
5478 }
5479 break;
5480 }
5481 #endif /* (DEVELOPMENT || DEBUG) */
5482
5483 case SO_PRIVILEGED_TRAFFIC_CLASS:
5484 error = priv_check_cred(kauth_cred_get(),
5485 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5486 if (error != 0) {
5487 goto out;
5488 }
5489 error = sooptcopyin(sopt, &optval, sizeof(optval),
5490 sizeof(optval));
5491 if (error != 0) {
5492 goto out;
5493 }
5494 if (optval == 0) {
5495 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5496 } else {
5497 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5498 }
5499 break;
5500
5501 #if (DEVELOPMENT || DEBUG)
5502 case SO_DEFUNCTIT:
5503 error = sosetdefunct(current_proc(), so, 0, FALSE);
5504 if (error == 0) {
5505 error = sodefunct(current_proc(), so, 0);
5506 }
5507
5508 break;
5509 #endif /* (DEVELOPMENT || DEBUG) */
5510
5511 case SO_DEFUNCTOK:
5512 error = sooptcopyin(sopt, &optval, sizeof(optval),
5513 sizeof(optval));
5514 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5515 if (error == 0) {
5516 error = EBADF;
5517 }
5518 goto out;
5519 }
5520 /*
5521 * Any process can set SO_DEFUNCTOK (clear
5522 * SOF_NODEFUNCT), but only root can clear
5523 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5524 */
5525 if (optval == 0 &&
5526 kauth_cred_issuser(kauth_cred_get()) == 0) {
5527 error = EPERM;
5528 goto out;
5529 }
5530 if (optval) {
5531 so->so_flags &= ~SOF_NODEFUNCT;
5532 } else {
5533 so->so_flags |= SOF_NODEFUNCT;
5534 }
5535
5536 if (SOCK_DOM(so) == PF_INET ||
5537 SOCK_DOM(so) == PF_INET6) {
5538 char s[MAX_IPv6_STR_LEN];
5539 char d[MAX_IPv6_STR_LEN];
5540 struct inpcb *inp = sotoinpcb(so);
5541
5542 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5543 "[%s %s:%d -> %s:%d] is now marked "
5544 "as %seligible for "
5545 "defunct\n", __func__, proc_selfpid(),
5546 proc_best_name(current_proc()),
5547 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5548 (SOCK_TYPE(so) == SOCK_STREAM) ?
5549 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5550 ((SOCK_DOM(so) == PF_INET) ?
5551 (void *)&inp->inp_laddr.s_addr :
5552 (void *)&inp->in6p_laddr), s, sizeof(s)),
5553 ntohs(inp->in6p_lport),
5554 inet_ntop(SOCK_DOM(so),
5555 (SOCK_DOM(so) == PF_INET) ?
5556 (void *)&inp->inp_faddr.s_addr :
5557 (void *)&inp->in6p_faddr, d, sizeof(d)),
5558 ntohs(inp->in6p_fport),
5559 (so->so_flags & SOF_NODEFUNCT) ?
5560 "not " : "");
5561 } else {
5562 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5563 "is now marked as %seligible for "
5564 "defunct\n",
5565 __func__, proc_selfpid(),
5566 proc_best_name(current_proc()),
5567 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5568 SOCK_DOM(so), SOCK_TYPE(so),
5569 (so->so_flags & SOF_NODEFUNCT) ?
5570 "not " : "");
5571 }
5572 break;
5573
5574 case SO_ISDEFUNCT:
5575 /* This option is not settable */
5576 error = EINVAL;
5577 break;
5578
5579 case SO_OPPORTUNISTIC:
5580 error = sooptcopyin(sopt, &optval, sizeof(optval),
5581 sizeof(optval));
5582 if (error == 0) {
5583 error = so_set_opportunistic(so, optval);
5584 }
5585 break;
5586
5587 case SO_FLUSH:
5588 /* This option is handled by lower layer(s) */
5589 error = 0;
5590 break;
5591
5592 case SO_RECV_ANYIF:
5593 error = sooptcopyin(sopt, &optval, sizeof(optval),
5594 sizeof(optval));
5595 if (error == 0) {
5596 error = so_set_recv_anyif(so, optval);
5597 }
5598 break;
5599
5600 case SO_TRAFFIC_MGT_BACKGROUND: {
5601 /* This option is handled by lower layer(s) */
5602 error = 0;
5603 break;
5604 }
5605
5606 #if FLOW_DIVERT
5607 case SO_FLOW_DIVERT_TOKEN:
5608 error = flow_divert_token_set(so, sopt);
5609 break;
5610 #endif /* FLOW_DIVERT */
5611
5612
5613 case SO_DELEGATED:
5614 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 sizeof(optval))) != 0) {
5616 break;
5617 }
5618
5619 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5620 break;
5621
5622 case SO_DELEGATED_UUID: {
5623 uuid_t euuid;
5624
5625 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5626 sizeof(euuid))) != 0) {
5627 break;
5628 }
5629
5630 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5631 break;
5632 }
5633
5634 #if NECP
5635 case SO_NECP_ATTRIBUTES:
5636 if (SOCK_DOM(so) == PF_MULTIPATH) {
5637 /* Handled by MPTCP itself */
5638 break;
5639 }
5640
5641 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5642 error = EINVAL;
5643 goto out;
5644 }
5645
5646 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5647 break;
5648
5649 case SO_NECP_CLIENTUUID: {
5650 if (SOCK_DOM(so) == PF_MULTIPATH) {
5651 /* Handled by MPTCP itself */
5652 break;
5653 }
5654
5655 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5656 error = EINVAL;
5657 goto out;
5658 }
5659
5660 struct inpcb *inp = sotoinpcb(so);
5661 if (!uuid_is_null(inp->necp_client_uuid)) {
5662 // Clear out the old client UUID if present
5663 necp_inpcb_remove_cb(inp);
5664 }
5665
5666 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5667 sizeof(uuid_t), sizeof(uuid_t));
5668 if (error != 0) {
5669 goto out;
5670 }
5671
5672 if (uuid_is_null(inp->necp_client_uuid)) {
5673 error = EINVAL;
5674 goto out;
5675 }
5676
5677 pid_t current_pid = proc_pid(current_proc());
5678 error = necp_client_register_socket_flow(current_pid,
5679 inp->necp_client_uuid, inp);
5680 if (error != 0) {
5681 uuid_clear(inp->necp_client_uuid);
5682 goto out;
5683 }
5684
5685 if (inp->inp_lport != 0) {
5686 // There is a bound local port, so this is not
5687 // a fresh socket. Assign to the client.
5688 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5689 }
5690
5691 break;
5692 }
5693 case SO_NECP_LISTENUUID: {
5694 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5695 error = EINVAL;
5696 goto out;
5697 }
5698
5699 struct inpcb *inp = sotoinpcb(so);
5700 if (!uuid_is_null(inp->necp_client_uuid)) {
5701 error = EINVAL;
5702 goto out;
5703 }
5704
5705 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5706 sizeof(uuid_t), sizeof(uuid_t));
5707 if (error != 0) {
5708 goto out;
5709 }
5710
5711 if (uuid_is_null(inp->necp_client_uuid)) {
5712 error = EINVAL;
5713 goto out;
5714 }
5715
5716 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5717 inp->necp_client_uuid, inp);
5718 if (error != 0) {
5719 uuid_clear(inp->necp_client_uuid);
5720 goto out;
5721 }
5722
5723 // Mark that the port registration is held by NECP
5724 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5725
5726 break;
5727 }
5728
5729 case SO_RESOLVER_SIGNATURE: {
5730 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5731 error = EINVAL;
5732 goto out;
5733 }
5734 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5735 break;
5736 }
5737 #endif /* NECP */
5738
5739 case SO_EXTENDED_BK_IDLE:
5740 error = sooptcopyin(sopt, &optval, sizeof(optval),
5741 sizeof(optval));
5742 if (error == 0) {
5743 error = so_set_extended_bk_idle(so, optval);
5744 }
5745 break;
5746
5747 case SO_MARK_CELLFALLBACK:
5748 error = sooptcopyin(sopt, &optval, sizeof(optval),
5749 sizeof(optval));
5750 if (error != 0) {
5751 goto out;
5752 }
5753 if (optval < 0) {
5754 error = EINVAL;
5755 goto out;
5756 }
5757 if (optval == 0) {
5758 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5759 } else {
5760 so->so_flags1 |= SOF1_CELLFALLBACK;
5761 }
5762 break;
5763
5764 case SO_MARK_CELLFALLBACK_UUID:
5765 {
5766 struct so_mark_cellfallback_uuid_args args;
5767
5768 error = sooptcopyin(sopt, &args, sizeof(args),
5769 sizeof(args));
5770 if (error != 0) {
5771 goto out;
5772 }
5773 error = nstat_userland_mark_rnf_override(args.flow_uuid,
5774 args.flow_cellfallback);
5775 break;
5776 }
5777
5778 case SO_FALLBACK_MODE:
5779 error = sooptcopyin(sopt, &optval, sizeof(optval),
5780 sizeof(optval));
5781 if (error != 0) {
5782 goto out;
5783 }
5784 if (optval < SO_FALLBACK_MODE_NONE ||
5785 optval > SO_FALLBACK_MODE_PREFER) {
5786 error = EINVAL;
5787 goto out;
5788 }
5789 so->so_fallback_mode = (u_int8_t)optval;
5790 break;
5791
5792 case SO_MARK_KNOWN_TRACKER: {
5793 error = sooptcopyin(sopt, &optval, sizeof(optval),
5794 sizeof(optval));
5795 if (error != 0) {
5796 goto out;
5797 }
5798 if (optval < 0) {
5799 error = EINVAL;
5800 goto out;
5801 }
5802 if (optval == 0) {
5803 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5804 } else {
5805 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5806 }
5807 break;
5808 }
5809
5810 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5811 error = sooptcopyin(sopt, &optval, sizeof(optval),
5812 sizeof(optval));
5813 if (error != 0) {
5814 goto out;
5815 }
5816 if (optval < 0) {
5817 error = EINVAL;
5818 goto out;
5819 }
5820 if (optval == 0) {
5821 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5822 } else {
5823 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5824 }
5825 break;
5826 }
5827
5828 case SO_MARK_APPROVED_APP_DOMAIN: {
5829 error = sooptcopyin(sopt, &optval, sizeof(optval),
5830 sizeof(optval));
5831 if (error != 0) {
5832 goto out;
5833 }
5834 if (optval < 0) {
5835 error = EINVAL;
5836 goto out;
5837 }
5838 if (optval == 0) {
5839 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5840 } else {
5841 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5842 }
5843 break;
5844 }
5845
5846 case SO_STATISTICS_EVENT:
5847 error = sooptcopyin(sopt, &long_optval,
5848 sizeof(long_optval), sizeof(long_optval));
5849 if (error != 0) {
5850 goto out;
5851 }
5852 u_int64_t nstat_event = 0;
5853 error = so_statistics_event_to_nstat_event(
5854 &long_optval, &nstat_event);
5855 if (error != 0) {
5856 goto out;
5857 }
5858 nstat_pcb_event(sotoinpcb(so), nstat_event);
5859 break;
5860
5861 case SO_NET_SERVICE_TYPE: {
5862 error = sooptcopyin(sopt, &optval, sizeof(optval),
5863 sizeof(optval));
5864 if (error != 0) {
5865 goto out;
5866 }
5867 error = so_set_net_service_type(so, optval);
5868 break;
5869 }
5870
5871 case SO_QOSMARKING_POLICY_OVERRIDE:
5872 error = priv_check_cred(kauth_cred_get(),
5873 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5874 if (error != 0) {
5875 goto out;
5876 }
5877 error = sooptcopyin(sopt, &optval, sizeof(optval),
5878 sizeof(optval));
5879 if (error != 0) {
5880 goto out;
5881 }
5882 if (optval == 0) {
5883 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5884 } else {
5885 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5886 }
5887 break;
5888
5889 case SO_MPKL_SEND_INFO: {
5890 struct so_mpkl_send_info so_mpkl_send_info;
5891
5892 error = sooptcopyin(sopt, &so_mpkl_send_info,
5893 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5894 if (error != 0) {
5895 goto out;
5896 }
5897 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5898 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5899
5900 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5901 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5902 } else {
5903 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5904 }
5905 break;
5906 }
5907 case SO_WANT_KEV_SOCKET_CLOSED: {
5908 error = sooptcopyin(sopt, &optval, sizeof(optval),
5909 sizeof(optval));
5910 if (error != 0) {
5911 goto out;
5912 }
5913 if (optval == 0) {
5914 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5915 } else {
5916 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5917 }
5918 break;
5919 }
5920 case SO_MARK_WAKE_PKT: {
5921 error = sooptcopyin(sopt, &optval, sizeof(optval),
5922 sizeof(optval));
5923 if (error != 0) {
5924 goto out;
5925 }
5926 if (optval == 0) {
5927 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5928 } else {
5929 so->so_flags |= SOF_MARK_WAKE_PKT;
5930 }
5931 break;
5932 }
5933 case SO_RECV_WAKE_PKT: {
5934 error = sooptcopyin(sopt, &optval, sizeof(optval),
5935 sizeof(optval));
5936 if (error != 0) {
5937 goto out;
5938 }
5939 if (optval == 0) {
5940 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5941 } else {
5942 so->so_flags |= SOF_RECV_WAKE_PKT;
5943 }
5944 break;
5945 }
5946 default:
5947 error = ENOPROTOOPT;
5948 break;
5949 }
5950 if (error == 0 && so->so_proto != NULL &&
5951 so->so_proto->pr_ctloutput != NULL) {
5952 (void) so->so_proto->pr_ctloutput(so, sopt);
5953 }
5954 }
5955 out:
5956 if (dolock) {
5957 socket_unlock(so, 1);
5958 }
5959 return error;
5960 }
5961
5962 /* Helper routines for getsockopt */
5963 int
5964 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5965 {
5966 int error;
5967 size_t valsize;
5968
5969 error = 0;
5970
5971 /*
5972 * Documented get behavior is that we always return a value,
5973 * possibly truncated to fit in the user's buffer.
5974 * Traditional behavior is that we always tell the user
5975 * precisely how much we copied, rather than something useful
5976 * like the total amount we had available for her.
5977 * Note that this interface is not idempotent; the entire answer must
5978 * be generated ahead of time.
5979 */
5980 valsize = MIN(len, sopt->sopt_valsize);
5981 sopt->sopt_valsize = valsize;
5982 if (sopt->sopt_val != USER_ADDR_NULL) {
5983 if (sopt->sopt_p != kernproc) {
5984 error = copyout(buf, sopt->sopt_val, valsize);
5985 } else {
5986 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5987 }
5988 }
5989 return error;
5990 }
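
/*
* Usage sketch: a socket-level getsockopt case typically computes an
* integer locally and hands it to sooptcopyout(), which truncates the
* copy to the caller's buffer size (the field read here is illustrative):
*
*	int optval = so->so_qlen;
*	error = sooptcopyout(sopt, &optval, sizeof(optval));
*/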
5991
5992 static int
5993 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5994 {
5995 int error;
5996 size_t len;
5997 struct user64_timeval tv64 = {};
5998 struct user32_timeval tv32 = {};
5999 const void * val;
6000 size_t valsize;
6001
6002 error = 0;
6003 if (proc_is64bit(sopt->sopt_p)) {
6004 len = sizeof(tv64);
6005 tv64.tv_sec = tv_p->tv_sec;
6006 tv64.tv_usec = tv_p->tv_usec;
6007 val = &tv64;
6008 } else {
6009 len = sizeof(tv32);
6010 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
6011 tv32.tv_usec = tv_p->tv_usec;
6012 val = &tv32;
6013 }
6014 valsize = MIN(len, sopt->sopt_valsize);
6015 sopt->sopt_valsize = valsize;
6016 if (sopt->sopt_val != USER_ADDR_NULL) {
6017 if (sopt->sopt_p != kernproc) {
6018 error = copyout(val, sopt->sopt_val, valsize);
6019 } else {
6020 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
6021 }
6022 }
6023 return error;
6024 }
6025
6026 /*
6027 * Return: 0 Success
6028 * ENOPROTOOPT
6029 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
6030 * <pr_ctloutput>:???
6031 * <sf_getoption>:???
6032 */
6033 int
6034 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
6035 {
6036 int error, optval;
6037 struct linger l;
6038 struct timeval tv;
6039
6040 if (sopt->sopt_dir != SOPT_GET) {
6041 sopt->sopt_dir = SOPT_GET;
6042 }
6043
6044 if (dolock) {
6045 socket_lock(so, 1);
6046 }
6047
6048 error = sflt_getsockopt(so, sopt);
6049 if (error != 0) {
6050 if (error == EJUSTRETURN) {
6051 error = 0;
6052 }
6053 goto out;
6054 }
6055
6056 if (sopt->sopt_level != SOL_SOCKET) {
6057 if (so->so_proto != NULL &&
6058 so->so_proto->pr_ctloutput != NULL) {
6059 error = (*so->so_proto->pr_ctloutput)(so, sopt);
6060 goto out;
6061 }
6062 error = ENOPROTOOPT;
6063 } else {
6064 /*
6065 * Allow socket-level (SOL_SOCKET) options to be filtered by
6066 * the protocol layer, if needed. A zero value returned from
6067 * the handler means use default socket-level processing as
6068 * done by the rest of this routine. Any other
6069 * return value indicates that the option is unsupported.
6070 */
6071 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6072 pru_socheckopt(so, sopt)) != 0) {
6073 goto out;
6074 }
6075
6076 error = 0;
6077 switch (sopt->sopt_name) {
6078 case SO_LINGER:
6079 case SO_LINGER_SEC:
6080 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6081 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6082 so->so_linger : so->so_linger / hz;
6083 error = sooptcopyout(sopt, &l, sizeof(l));
6084 break;
6085
6086 case SO_USELOOPBACK:
6087 case SO_DONTROUTE:
6088 case SO_DEBUG:
6089 case SO_KEEPALIVE:
6090 case SO_REUSEADDR:
6091 case SO_REUSEPORT:
6092 case SO_BROADCAST:
6093 case SO_OOBINLINE:
6094 case SO_TIMESTAMP:
6095 case SO_TIMESTAMP_MONOTONIC:
6096 case SO_TIMESTAMP_CONTINUOUS:
6097 case SO_DONTTRUNC:
6098 case SO_WANTMORE:
6099 case SO_WANTOOBFLAG:
6100 case SO_NOWAKEFROMSLEEP:
6101 case SO_NOAPNFALLBK:
6102 optval = so->so_options & sopt->sopt_name;
6103 integer:
6104 error = sooptcopyout(sopt, &optval, sizeof(optval));
6105 break;
6106
6107 case SO_TYPE:
6108 optval = so->so_type;
6109 goto integer;
6110
6111 case SO_NREAD:
6112 if (so->so_proto->pr_flags & PR_ATOMIC) {
6113 int pkt_total;
6114 struct mbuf *m1;
6115
6116 pkt_total = 0;
6117 m1 = so->so_rcv.sb_mb;
6118 while (m1 != NULL) {
6119 if (m1->m_type == MT_DATA ||
6120 m1->m_type == MT_HEADER ||
6121 m1->m_type == MT_OOBDATA) {
6122 pkt_total += m1->m_len;
6123 }
6124 m1 = m1->m_next;
6125 }
6126 optval = pkt_total;
6127 } else {
6128 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6129 }
6130 goto integer;
6131
6132 case SO_NUMRCVPKT:
6133 if (so->so_proto->pr_flags & PR_ATOMIC) {
6134 int cnt = 0;
6135 struct mbuf *m1;
6136
6137 m1 = so->so_rcv.sb_mb;
6138 while (m1 != NULL) {
6139 cnt += 1;
6140 m1 = m1->m_nextpkt;
6141 }
6142 optval = cnt;
6143 goto integer;
6144 } else {
6145 error = ENOPROTOOPT;
6146 break;
6147 }
6148
6149 case SO_NWRITE:
6150 optval = so->so_snd.sb_cc;
6151 goto integer;
6152
6153 case SO_ERROR:
6154 optval = so->so_error;
6155 so->so_error = 0;
6156 goto integer;
6157
6158 case SO_SNDBUF: {
6159 u_int32_t hiwat = so->so_snd.sb_hiwat;
6160
6161 if (so->so_snd.sb_flags & SB_UNIX) {
6162 struct unpcb *unp =
6163 (struct unpcb *)(so->so_pcb);
6164 if (unp != NULL && unp->unp_conn != NULL) {
6165 hiwat += unp->unp_conn->unp_cc;
6166 }
6167 }
6168
6169 optval = hiwat;
6170 goto integer;
6171 }
6172 case SO_RCVBUF:
6173 optval = so->so_rcv.sb_hiwat;
6174 goto integer;
6175
6176 case SO_SNDLOWAT:
6177 optval = so->so_snd.sb_lowat;
6178 goto integer;
6179
6180 case SO_RCVLOWAT:
6181 optval = so->so_rcv.sb_lowat;
6182 goto integer;
6183
6184 case SO_SNDTIMEO:
6185 case SO_RCVTIMEO:
6186 tv = (sopt->sopt_name == SO_SNDTIMEO ?
6187 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6188
6189 error = sooptcopyout_timeval(sopt, &tv);
6190 break;
6191
6192 case SO_NOSIGPIPE:
6193 optval = (so->so_flags & SOF_NOSIGPIPE);
6194 goto integer;
6195
6196 case SO_NOADDRERR:
6197 optval = (so->so_flags & SOF_NOADDRAVAIL);
6198 goto integer;
6199
6200 case SO_REUSESHAREUID:
6201 optval = (so->so_flags & SOF_REUSESHAREUID);
6202 goto integer;
6203
6204
6205 case SO_NOTIFYCONFLICT:
6206 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6207 goto integer;
6208
6209 case SO_RESTRICTIONS:
6210 optval = so_get_restrictions(so);
6211 goto integer;
6212
6213 case SO_AWDL_UNRESTRICTED:
6214 if (SOCK_DOM(so) == PF_INET ||
6215 SOCK_DOM(so) == PF_INET6) {
6216 optval = inp_get_awdl_unrestricted(
6217 sotoinpcb(so));
6218 goto integer;
6219 } else {
6220 error = EOPNOTSUPP;
6221 }
6222 break;
6223
6224 case SO_INTCOPROC_ALLOW:
6225 if (SOCK_DOM(so) == PF_INET6) {
6226 optval = inp_get_intcoproc_allowed(
6227 sotoinpcb(so));
6228 goto integer;
6229 } else {
6230 error = EOPNOTSUPP;
6231 }
6232 break;
6233
6234 case SO_LABEL:
6235 error = EOPNOTSUPP;
6236 break;
6237
6238 case SO_PEERLABEL:
6239 error = EOPNOTSUPP;
6240 break;
6241
6242 #ifdef __APPLE_API_PRIVATE
6243 case SO_UPCALLCLOSEWAIT:
6244 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6245 goto integer;
6246 #endif
6247 case SO_RANDOMPORT:
6248 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6249 goto integer;
6250
6251 case SO_NP_EXTENSIONS: {
6252 struct so_np_extensions sonpx = {};
6253
6254 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6255 SONPX_SETOPTSHUT : 0;
6256 sonpx.npx_mask = SONPX_MASK_VALID;
6257
6258 error = sooptcopyout(sopt, &sonpx,
6259 sizeof(struct so_np_extensions));
6260 break;
6261 }
6262
6263 case SO_TRAFFIC_CLASS:
6264 optval = so->so_traffic_class;
6265 goto integer;
6266
6267 case SO_RECV_TRAFFIC_CLASS:
6268 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6269 goto integer;
6270
6271 #if (DEVELOPMENT || DEBUG)
6272 case SO_TRAFFIC_CLASS_DBG:
6273 error = sogetopt_tcdbg(so, sopt);
6274 break;
6275 #endif /* (DEVELOPMENT || DEBUG) */
6276
6277 case SO_PRIVILEGED_TRAFFIC_CLASS:
6278 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6279 goto integer;
6280
6281 case SO_DEFUNCTOK:
6282 optval = !(so->so_flags & SOF_NODEFUNCT);
6283 goto integer;
6284
6285 case SO_ISDEFUNCT:
6286 optval = (so->so_flags & SOF_DEFUNCT);
6287 goto integer;
6288
6289 case SO_OPPORTUNISTIC:
6290 optval = so_get_opportunistic(so);
6291 goto integer;
6292
6293 case SO_FLUSH:
6294 /* This option is not gettable */
6295 error = EINVAL;
6296 break;
6297
6298 case SO_RECV_ANYIF:
6299 optval = so_get_recv_anyif(so);
6300 goto integer;
6301
6302 case SO_TRAFFIC_MGT_BACKGROUND:
6303 /* This option is handled by lower layer(s) */
6304 if (so->so_proto != NULL &&
6305 so->so_proto->pr_ctloutput != NULL) {
6306 (void) so->so_proto->pr_ctloutput(so, sopt);
6307 }
6308 break;
6309
6310 #if FLOW_DIVERT
6311 case SO_FLOW_DIVERT_TOKEN:
6312 error = flow_divert_token_get(so, sopt);
6313 break;
6314 #endif /* FLOW_DIVERT */
6315
6316 #if NECP
6317 case SO_NECP_ATTRIBUTES:
6318 if (SOCK_DOM(so) == PF_MULTIPATH) {
6319 /* Handled by MPTCP itself */
6320 break;
6321 }
6322
6323 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6324 error = EINVAL;
6325 goto out;
6326 }
6327
6328 error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6329 break;
6330
6331 case SO_NECP_CLIENTUUID: {
6332 uuid_t *ncu;
6333
6334 if (SOCK_DOM(so) == PF_MULTIPATH) {
6335 ncu = &mpsotomppcb(so)->necp_client_uuid;
6336 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6337 ncu = &sotoinpcb(so)->necp_client_uuid;
6338 } else {
6339 error = EINVAL;
6340 goto out;
6341 }
6342
6343 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6344 break;
6345 }
6346
6347 case SO_NECP_LISTENUUID: {
6348 uuid_t *nlu;
6349
6350 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6351 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6352 nlu = &sotoinpcb(so)->necp_client_uuid;
6353 } else {
6354 error = ENOENT;
6355 goto out;
6356 }
6357 } else {
6358 error = EINVAL;
6359 goto out;
6360 }
6361
6362 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6363 break;
6364 }
6365
6366 case SO_RESOLVER_SIGNATURE: {
6367 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6368 error = EINVAL;
6369 goto out;
6370 }
6371 error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6372 break;
6373 }
6374
6375 #endif /* NECP */
6376
6377 #if CONTENT_FILTER
6378 case SO_CFIL_SOCK_ID: {
6379 cfil_sock_id_t sock_id;
6380
6381 sock_id = cfil_sock_id_from_socket(so);
6382
6383 error = sooptcopyout(sopt, &sock_id,
6384 sizeof(cfil_sock_id_t));
6385 break;
6386 }
6387 #endif /* CONTENT_FILTER */
6388
6389 case SO_EXTENDED_BK_IDLE:
6390 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6391 goto integer;
6392 case SO_MARK_CELLFALLBACK:
6393 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6394 ? 1 : 0;
6395 goto integer;
6396 case SO_FALLBACK_MODE:
6397 optval = so->so_fallback_mode;
6398 goto integer;
6399 case SO_MARK_KNOWN_TRACKER: {
6400 optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6401 ? 1 : 0;
6402 goto integer;
6403 }
6404 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6405 optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6406 ? 1 : 0;
6407 goto integer;
6408 }
6409 case SO_MARK_APPROVED_APP_DOMAIN: {
6410 optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6411 ? 1 : 0;
6412 goto integer;
6413 }
6414 case SO_NET_SERVICE_TYPE: {
6415 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6416 optval = so->so_netsvctype;
6417 } else {
6418 optval = NET_SERVICE_TYPE_BE;
6419 }
6420 goto integer;
6421 }
6422 case SO_NETSVC_MARKING_LEVEL:
6423 optval = so_get_netsvc_marking_level(so);
6424 goto integer;
6425
6426 case SO_MPKL_SEND_INFO: {
6427 struct so_mpkl_send_info so_mpkl_send_info;
6428
6429 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6430 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6431 error = sooptcopyout(sopt, &so_mpkl_send_info,
6432 sizeof(struct so_mpkl_send_info));
6433 break;
6434 }
6435 case SO_MARK_WAKE_PKT:
6436 optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6437 goto integer;
6438 case SO_RECV_WAKE_PKT:
6439 optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6440 goto integer;
6441 default:
6442 error = ENOPROTOOPT;
6443 break;
6444 }
6445 }
6446 out:
6447 if (dolock) {
6448 socket_unlock(so, 1);
6449 }
6450 return error;
6451 }
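
/*
* Usage sketch from user space, assuming a datagram socket: the
* SO_NREAD case above reports the bytes of protocol data available
* for the next receive, excluding control bytes:
*
*	int nread = 0;
*	socklen_t len = sizeof(nread);
*	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
*		printf("%d bytes readable\n", nread);
*/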
6452
6453 /*
6454 * The size limit on our soopt_getm is different from that on FreeBSD.
6455 * We limit the size of options to MCLBYTES. This will have to change
6456 * if we need to define options that need more space than MCLBYTES.
6457 */
6458 int
6459 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6460 {
6461 struct mbuf *m, *m_prev;
6462 int sopt_size = (int)sopt->sopt_valsize;
6463 int how;
6464
6465 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6466 return EMSGSIZE;
6467 }
6468
6469 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6470 MGET(m, how, MT_DATA);
6471 if (m == NULL) {
6472 return ENOBUFS;
6473 }
6474 if (sopt_size > MLEN) {
6475 MCLGET(m, how);
6476 if ((m->m_flags & M_EXT) == 0) {
6477 m_free(m);
6478 return ENOBUFS;
6479 }
6480 m->m_len = min(MCLBYTES, sopt_size);
6481 } else {
6482 m->m_len = min(MLEN, sopt_size);
6483 }
6484 sopt_size -= m->m_len;
6485 *mp = m;
6486 m_prev = m;
6487
6488 while (sopt_size > 0) {
6489 MGET(m, how, MT_DATA);
6490 if (m == NULL) {
6491 m_freem(*mp);
6492 return ENOBUFS;
6493 }
6494 if (sopt_size > MLEN) {
6495 MCLGET(m, how);
6496 if ((m->m_flags & M_EXT) == 0) {
6497 m_freem(*mp);
6498 m_freem(m);
6499 return ENOBUFS;
6500 }
6501 m->m_len = min(MCLBYTES, sopt_size);
6502 } else {
6503 m->m_len = min(MLEN, sopt_size);
6504 }
6505 sopt_size -= m->m_len;
6506 m_prev->m_next = m;
6507 m_prev = m;
6508 }
6509 return 0;
6510 }
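
/*
* Usage sketch: callers such as the IPv6 option code pair soopt_getm()
* with soopt_mcopyin() below, first sizing an mbuf chain from
* sopt_valsize and then filling it with the user's option data:
*
*	struct mbuf *m = NULL;
*	error = soopt_getm(sopt, &m);
*	if (error == 0)
*		error = soopt_mcopyin(sopt, m);
*/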
6511
6512 /* copyin sopt data into mbuf chain */
6513 int
6514 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6515 {
6516 struct mbuf *m0 = m;
6517
6518 if (sopt->sopt_val == USER_ADDR_NULL) {
6519 return 0;
6520 }
6521 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6522 if (sopt->sopt_p != kernproc) {
6523 int error;
6524
6525 error = copyin(sopt->sopt_val, mtod(m, char *),
6526 m->m_len);
6527 if (error != 0) {
6528 m_freem(m0);
6529 return error;
6530 }
6531 } else {
6532 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6533 mtod(m, char *), m->m_len);
6534 }
6535 sopt->sopt_valsize -= m->m_len;
6536 sopt->sopt_val += m->m_len;
6537 m = m->m_next;
6538 }
6539 /* should have been allocated with enough space at ip6_sooptmcopyin() */
6540 if (m != NULL) {
6541 panic("soopt_mcopyin");
6542 /* NOTREACHED */
6543 }
6544 return 0;
6545 }
6546
6547 /* copyout mbuf chain data into soopt */
6548 int
6549 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6550 {
6551 struct mbuf *m0 = m;
6552 size_t valsize = 0;
6553
6554 if (sopt->sopt_val == USER_ADDR_NULL) {
6555 return 0;
6556 }
6557 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6558 if (sopt->sopt_p != kernproc) {
6559 int error;
6560
6561 error = copyout(mtod(m, char *), sopt->sopt_val,
6562 m->m_len);
6563 if (error != 0) {
6564 m_freem(m0);
6565 return error;
6566 }
6567 } else {
6568 bcopy(mtod(m, char *),
6569 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6570 }
6571 sopt->sopt_valsize -= m->m_len;
6572 sopt->sopt_val += m->m_len;
6573 valsize += m->m_len;
6574 m = m->m_next;
6575 }
6576 if (m != NULL) {
6577 /* a large enough sockopt buffer should be supplied from user-land */
6578 m_freem(m0);
6579 return EINVAL;
6580 }
6581 sopt->sopt_valsize = valsize;
6582 return 0;
6583 }
6584
6585 void
6586 sohasoutofband(struct socket *so)
6587 {
6588 if (so->so_pgid < 0) {
6589 gsignal(-so->so_pgid, SIGURG);
6590 } else if (so->so_pgid > 0) {
6591 proc_signal(so->so_pgid, SIGURG);
6592 }
6593 selwakeup(&so->so_rcv.sb_sel);
6594 if (so->so_rcv.sb_flags & SB_KNOTE) {
6595 KNOTE(&so->so_rcv.sb_sel.si_note,
6596 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6597 }
6598 }
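
/*
* Usage sketch from user space: so_pgid is established with F_SETOWN
* (or SIOCSPGRP), after which arrival of out-of-band data results in
* the SIGURG delivery performed above (handler name illustrative):
*
*	signal(SIGURG, handle_urgent);
*	fcntl(sock_fd, F_SETOWN, getpid());
*/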
6599
6600 int
6601 sopoll(struct socket *so, int events, kauth_cred_t cred, void *wql)
6602 {
6603 #pragma unused(cred)
6604 struct proc *p = current_proc();
6605 int revents = 0;
6606
6607 socket_lock(so, 1);
6608 so_update_last_owner_locked(so, PROC_NULL);
6609 so_update_policy(so);
6610
6611 if (events & (POLLIN | POLLRDNORM)) {
6612 if (soreadable(so)) {
6613 revents |= events & (POLLIN | POLLRDNORM);
6614 }
6615 }
6616
6617 if (events & (POLLOUT | POLLWRNORM)) {
6618 if (sowriteable(so)) {
6619 revents |= events & (POLLOUT | POLLWRNORM);
6620 }
6621 }
6622
6623 if (events & (POLLPRI | POLLRDBAND)) {
6624 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6625 revents |= events & (POLLPRI | POLLRDBAND);
6626 }
6627 }
6628
6629 if (revents == 0) {
6630 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6631 /*
6632 * Darwin sets the flag first,
6633 * BSD calls selrecord first
6634 */
6635 so->so_rcv.sb_flags |= SB_SEL;
6636 selrecord(p, &so->so_rcv.sb_sel, wql);
6637 }
6638
6639 if (events & (POLLOUT | POLLWRNORM)) {
6640 /*
6641 * Darwin sets the flag first,
6642 * BSD calls selrecord first
6643 */
6644 so->so_snd.sb_flags |= SB_SEL;
6645 selrecord(p, &so->so_snd.sb_sel, wql);
6646 }
6647 }
6648
6649 socket_unlock(so, 1);
6650 return revents;
6651 }
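
/*
* Usage sketch from user space: POLLPRI maps to the OOB-mark check
* above, so a poller interested in urgent TCP data might request:
*
*	struct pollfd pfd = { .fd = sock_fd, .events = POLLIN | POLLPRI };
*	int n = poll(&pfd, 1, -1);
*/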
6652
6653 int
6654 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6655 {
6656 struct socket *so = (struct socket *)fp_get_data(fp);
6657 int result;
6658
6659 socket_lock(so, 1);
6660 so_update_last_owner_locked(so, PROC_NULL);
6661 so_update_policy(so);
6662
6663 switch (kn->kn_filter) {
6664 case EVFILT_READ:
6665 kn->kn_filtid = EVFILTID_SOREAD;
6666 break;
6667 case EVFILT_WRITE:
6668 kn->kn_filtid = EVFILTID_SOWRITE;
6669 break;
6670 case EVFILT_SOCK:
6671 kn->kn_filtid = EVFILTID_SCK;
6672 break;
6673 case EVFILT_EXCEPT:
6674 kn->kn_filtid = EVFILTID_SOEXCEPT;
6675 break;
6676 default:
6677 socket_unlock(so, 1);
6678 knote_set_error(kn, EINVAL);
6679 return 0;
6680 }
6681
6682 /*
6683 * call the appropriate sub-filter attach
6684 * with the socket still locked
6685 */
6686 result = knote_fops(kn)->f_attach(kn, kev);
6687
6688 socket_unlock(so, 1);
6689
6690 return result;
6691 }
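
/*
* Usage sketch from user space: every kevent filter registered against
* a socket descriptor is routed through the dispatch above, e.g.:
*
*	int kq = kqueue();
*	struct kevent kev;
*	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
*	kevent(kq, &kev, 1, NULL, 0, NULL);
*/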
6692
6693 static int
6694 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6695 {
6696 int retval = 0;
6697 int64_t data = 0;
6698
6699 if (so->so_options & SO_ACCEPTCONN) {
6700 /*
6701 * Radar 6615193: handle the listen case dynamically
6702 * for the kqueue read filter. This allows listen() to be
6703 * called after registering the kqueue EVFILT_READ.
6704 */
6705
6706 retval = !TAILQ_EMPTY(&so->so_comp);
6707 data = so->so_qlen;
6708 goto out;
6709 }
6710
6711 /* socket isn't a listener */
6712 /*
6713 * NOTE_LOWAT specifies new low water mark in data, i.e.
6714 * the bytes of protocol data. We therefore exclude any
6715 * control bytes.
6716 */
6717 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6718
6719 if (kn->kn_sfflags & NOTE_OOB) {
6720 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6721 kn->kn_fflags |= NOTE_OOB;
6722 data -= so->so_oobmark;
6723 retval = 1;
6724 goto out;
6725 }
6726 }
6727
6728 if ((so->so_state & SS_CANTRCVMORE)
6729 #if CONTENT_FILTER
6730 && cfil_sock_data_pending(&so->so_rcv) == 0
6731 #endif /* CONTENT_FILTER */
6732 ) {
6733 kn->kn_flags |= EV_EOF;
6734 kn->kn_fflags = so->so_error;
6735 retval = 1;
6736 goto out;
6737 }
6738
6739 if (so->so_error) { /* temporary udp error */
6740 retval = 1;
6741 goto out;
6742 }
6743
6744 int64_t lowwat = so->so_rcv.sb_lowat;
6745 /*
6746 * Ensure that when NOTE_LOWAT is used, the derived
6747 * low water mark is bounded by socket's rcv buf's
6748 * high and low water mark values.
6749 */
6750 if (kn->kn_sfflags & NOTE_LOWAT) {
6751 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6752 lowwat = so->so_rcv.sb_hiwat;
6753 } else if (kn->kn_sdata > lowwat) {
6754 lowwat = kn->kn_sdata;
6755 }
6756 }
6757
6758 /*
6759 * While the `data` field is the amount of data to read,
6760 * 0-sized packets need to wake up the kqueue, see 58140856,
6761 * so we need to take control bytes into account too.
6762 */
6763 retval = (so->so_rcv.sb_cc >= lowwat);
6764
6765 out:
6766 if (retval && kev) {
6767 knote_fill_kevent(kn, kev, data);
6768 }
6769 return retval;
6770 }
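
/*
* Usage sketch from user space: NOTE_LOWAT, bounded above by the
* receive buffer's high and low water marks, raises the byte count at
* which the read filter fires:
*
*	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
*/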
6771
6772 static int
6773 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6774 {
6775 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6776
6777 /* socket locked */
6778
6779 /*
6780 * If the caller explicitly asked for OOB results (e.g. poll())
6781 * from EVFILT_READ, then save that off in the hookid field
6782 * and reserve the kn_flags EV_OOBAND bit for output only.
6783 */
6784 if (kn->kn_filter == EVFILT_READ &&
6785 kn->kn_flags & EV_OOBAND) {
6786 kn->kn_flags &= ~EV_OOBAND;
6787 kn->kn_hook32 = EV_OOBAND;
6788 } else {
6789 kn->kn_hook32 = 0;
6790 }
6791 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6792 so->so_rcv.sb_flags |= SB_KNOTE;
6793 }
6794
6795 /* indicate if event is already fired */
6796 return filt_soread_common(kn, NULL, so);
6797 }
6798
6799 static void
6800 filt_sordetach(struct knote *kn)
6801 {
6802 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6803
6804 socket_lock(so, 1);
6805 if (so->so_rcv.sb_flags & SB_KNOTE) {
6806 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6807 so->so_rcv.sb_flags &= ~SB_KNOTE;
6808 }
6809 }
6810 socket_unlock(so, 1);
6811 }
6812
6813 /*ARGSUSED*/
6814 static int
6815 filt_soread(struct knote *kn, long hint)
6816 {
6817 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6818 int retval;
6819
6820 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6821 socket_lock(so, 1);
6822 }
6823
6824 retval = filt_soread_common(kn, NULL, so);
6825
6826 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6827 socket_unlock(so, 1);
6828 }
6829
6830 return retval;
6831 }
6832
6833 static int
6834 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6835 {
6836 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6837 int retval;
6838
6839 socket_lock(so, 1);
6840
6841 /* save off the new input fflags and data */
6842 kn->kn_sfflags = kev->fflags;
6843 kn->kn_sdata = kev->data;
6844
6845 /* determine if changes result in fired events */
6846 retval = filt_soread_common(kn, NULL, so);
6847
6848 socket_unlock(so, 1);
6849
6850 return retval;
6851 }
6852
6853 static int
6854 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6855 {
6856 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6857 int retval;
6858
6859 socket_lock(so, 1);
6860 retval = filt_soread_common(kn, kev, so);
6861 socket_unlock(so, 1);
6862
6863 return retval;
6864 }
6865
6866 int
6867 so_wait_for_if_feedback(struct socket *so)
6868 {
6869 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6870 (so->so_state & SS_ISCONNECTED)) {
6871 struct inpcb *inp = sotoinpcb(so);
6872 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6873 return 1;
6874 }
6875 }
6876 return 0;
6877 }
6878
6879 static int
6880 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6881 {
6882 int ret = 0;
6883 int64_t data = sbspace(&so->so_snd);
6884
6885 if (so->so_state & SS_CANTSENDMORE) {
6886 kn->kn_flags |= EV_EOF;
6887 kn->kn_fflags = so->so_error;
6888 ret = 1;
6889 goto out;
6890 }
6891
6892 if (so->so_error) { /* temporary udp error */
6893 ret = 1;
6894 goto out;
6895 }
6896
6897 if (!socanwrite(so)) {
6898 ret = 0;
6899 goto out;
6900 }
6901
6902 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6903 ret = 1;
6904 goto out;
6905 }
6906
6907 int64_t lowwat = so->so_snd.sb_lowat;
6908 const int64_t hiwat = so->so_snd.sb_hiwat;
6909 /*
6910 * Deal with connected UNIX domain sockets which
6911 * rely on the fact that the sender's socket buffer is
6912 * actually the receiver's socket buffer.
6913 */
6914 if (SOCK_DOM(so) == PF_LOCAL) {
6915 struct unpcb *unp = sotounpcb(so);
6916 if (unp != NULL && unp->unp_conn != NULL &&
6917 unp->unp_conn->unp_socket != NULL) {
6918 struct socket *so2 = unp->unp_conn->unp_socket;
6919 /*
6920 * At this point we know that `so' is locked
6921 * and that `unp_conn` isn't going to change.
6922 * However, we don't lock `so2` because doing so
6923 * may require unlocking `so'
6924 * (see unp_get_locks_in_order()).
6925 *
6926 * Two cases can happen:
6927 *
6928 * 1) we return 1 and tell the application that
6929 * it can write. Meanwhile, another thread
6930 * fills up the socket buffer. This will either
6931 * lead to a blocking send or EWOULDBLOCK
6932 * which the application should deal with.
6933 * 2) we return 0 and tell the application that
6934 * the socket is not writable. Meanwhile,
6935 * another thread depletes the receive socket
6936 * buffer. In this case the application will
6937 * be woken up by sb_notify().
6938 *
6939 * MIN() is required because otherwise sosendcheck()
6940 * may return EWOULDBLOCK since it only considers
6941 * so->so_snd.
6942 */
6943 data = MIN(data, sbspace(&so2->so_rcv));
6944 }
6945 }
6946
6947 if (kn->kn_sfflags & NOTE_LOWAT) {
6948 if (kn->kn_sdata > hiwat) {
6949 lowwat = hiwat;
6950 } else if (kn->kn_sdata > lowwat) {
6951 lowwat = kn->kn_sdata;
6952 }
6953 }
6954
6955 if (data > 0 && data >= lowwat) {
6956 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6957 #if (DEBUG || DEVELOPMENT)
6958 && so_notsent_lowat_check == 1
6959 #endif /* DEBUG || DEVELOPMENT */
6960 ) {
6961 if ((SOCK_DOM(so) == PF_INET ||
6962 SOCK_DOM(so) == PF_INET6) &&
6963 so->so_type == SOCK_STREAM) {
6964 ret = tcp_notsent_lowat_check(so);
6965 }
6966 #if MPTCP
6967 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6968 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6969 ret = mptcp_notsent_lowat_check(so);
6970 }
6971 #endif
6972 else {
6973 ret = 1;
6974 goto out;
6975 }
6976 } else {
6977 ret = 1;
6978 }
6979 }
6980 if (so_wait_for_if_feedback(so)) {
6981 ret = 0;
6982 }
6983
6984 out:
6985 if (ret && kev) {
6986 knote_fill_kevent(kn, kev, data);
6987 }
6988 return ret;
6989 }
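
/*
* Usage sketch from user space: the same NOTE_LOWAT clamping applies
* to the write filter; this requests a wakeup only once at least 4 KB
* of send-buffer space is available:
*
*	EV_SET(&kev, sock_fd, EVFILT_WRITE, EV_ADD, NOTE_LOWAT, 4096, NULL);
*/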
6990
6991 static int
6992 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6993 {
6994 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6995
6996 /* socket locked */
6997 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6998 so->so_snd.sb_flags |= SB_KNOTE;
6999 }
7000
7001 /* determine if it's already fired */
7002 return filt_sowrite_common(kn, NULL, so);
7003 }
7004
7005 static void
7006 filt_sowdetach(struct knote *kn)
7007 {
7008 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7009 socket_lock(so, 1);
7010
7011 if (so->so_snd.sb_flags & SB_KNOTE) {
7012 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
7013 so->so_snd.sb_flags &= ~SB_KNOTE;
7014 }
7015 }
7016 socket_unlock(so, 1);
7017 }
7018
7019 /*ARGSUSED*/
7020 static int
7021 filt_sowrite(struct knote *kn, long hint)
7022 {
7023 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7024 int ret;
7025
7026 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7027 socket_lock(so, 1);
7028 }
7029
7030 ret = filt_sowrite_common(kn, NULL, so);
7031
7032 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7033 socket_unlock(so, 1);
7034 }
7035
7036 return ret;
7037 }
7038
7039 static int
7040 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
7041 {
7042 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7043 int ret;
7044
7045 socket_lock(so, 1);
7046
7047 /* save off the new input fflags and data */
7048 kn->kn_sfflags = kev->fflags;
7049 kn->kn_sdata = kev->data;
7050
7051 /* determine if these changes result in a triggered event */
7052 ret = filt_sowrite_common(kn, NULL, so);
7053
7054 socket_unlock(so, 1);
7055
7056 return ret;
7057 }
7058
7059 static int
7060 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
7061 {
7062 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7063 int ret;
7064
7065 socket_lock(so, 1);
7066 ret = filt_sowrite_common(kn, kev, so);
7067 socket_unlock(so, 1);
7068
7069 return ret;
7070 }
7071
7072 static int
7073 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
7074 struct socket *so, long ev_hint)
7075 {
7076 int ret = 0;
7077 int64_t data = 0;
7078 uint32_t level_trigger = 0;
7079
7080 if (ev_hint & SO_FILT_HINT_CONNRESET) {
7081 kn->kn_fflags |= NOTE_CONNRESET;
7082 }
7083 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
7084 kn->kn_fflags |= NOTE_TIMEOUT;
7085 }
7086 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
7087 kn->kn_fflags |= NOTE_NOSRCADDR;
7088 }
7089 if (ev_hint & SO_FILT_HINT_IFDENIED) {
7090 kn->kn_fflags |= NOTE_IFDENIED;
7091 }
7092 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
7093 kn->kn_fflags |= NOTE_KEEPALIVE;
7094 }
7095 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7096 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7097 }
7098 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7099 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7100 }
7101 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7102 (so->so_state & SS_ISCONNECTED)) {
7103 kn->kn_fflags |= NOTE_CONNECTED;
7104 level_trigger |= NOTE_CONNECTED;
7105 }
7106 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7107 (so->so_state & SS_ISDISCONNECTED)) {
7108 kn->kn_fflags |= NOTE_DISCONNECTED;
7109 level_trigger |= NOTE_DISCONNECTED;
7110 }
7111 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7112 if (so->so_proto != NULL &&
7113 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7114 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7115 }
7116 }
7117 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7118 tcp_notify_ack_active(so)) {
7119 kn->kn_fflags |= NOTE_NOTIFY_ACK;
7120 }
7121 if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
7122 kn->kn_fflags |= NOTE_WAKE_PKT;
7123 }
7124
7125 if ((so->so_state & SS_CANTRCVMORE)
7126 #if CONTENT_FILTER
7127 && cfil_sock_data_pending(&so->so_rcv) == 0
7128 #endif /* CONTENT_FILTER */
7129 ) {
7130 kn->kn_fflags |= NOTE_READCLOSED;
7131 level_trigger |= NOTE_READCLOSED;
7132 }
7133
7134 if (so->so_state & SS_CANTSENDMORE) {
7135 kn->kn_fflags |= NOTE_WRITECLOSED;
7136 level_trigger |= NOTE_WRITECLOSED;
7137 }
7138
7139 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7140 (so->so_flags & SOF_SUSPENDED)) {
7141 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7142
7143 /* If resume event was delivered before, reset it */
7144 kn->kn_hook32 &= ~NOTE_RESUME;
7145
7146 kn->kn_fflags |= NOTE_SUSPEND;
7147 level_trigger |= NOTE_SUSPEND;
7148 }
7149
7150 if ((ev_hint & SO_FILT_HINT_RESUME) ||
7151 (so->so_flags & SOF_SUSPENDED) == 0) {
7152 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7153
7154 /* If suspend event was delivered before, reset it */
7155 kn->kn_hook32 &= ~NOTE_SUSPEND;
7156
7157 kn->kn_fflags |= NOTE_RESUME;
7158 level_trigger |= NOTE_RESUME;
7159 }
7160
7161 if (so->so_error != 0) {
7162 ret = 1;
7163 data = so->so_error;
7164 kn->kn_flags |= EV_EOF;
7165 } else {
7166 u_int32_t data32 = 0;
7167 get_sockev_state(so, &data32);
7168 data = data32;
7169 }
7170
7171 /* Reset any events that are not requested on this knote */
7172 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7173 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7174
7175 /* Find the level-triggered events that have already been delivered */
7176 level_trigger &= kn->kn_hook32;
7177 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7178
7179 /* Do not deliver level-triggered events more than once */
7180 if ((kn->kn_fflags & ~level_trigger) != 0) {
7181 ret = 1;
7182 }
7183
7184 if (ret && kev) {
7185 /*
7186 * Store the state of the events being delivered. This
7187 * state can be used to deliver level-triggered events
7188 * at least once and still avoid waking up the application
7189 * multiple times as long as the event is active.
7190 */
7191 if (kn->kn_fflags != 0) {
7192 kn->kn_hook32 |= (kn->kn_fflags &
7193 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7194 }
7195
7196 /*
7197 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7198 * only one of them and remember which one was
7199 * delivered last.
7200 */
7201 if (kn->kn_fflags & NOTE_SUSPEND) {
7202 kn->kn_hook32 &= ~NOTE_RESUME;
7203 }
7204 if (kn->kn_fflags & NOTE_RESUME) {
7205 kn->kn_hook32 &= ~NOTE_SUSPEND;
7206 }
7207
7208 knote_fill_kevent(kn, kev, data);
7209 }
7210 return ret;
7211 }
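
/*
* Usage sketch, assuming access to the private EVFILT_SOCK filter and
* its NOTE_* bits: a watcher of connection state transitions might
* register with
*
*	EV_SET(&kev, sock_fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
*	    NOTE_CONNECTED | NOTE_DISCONNECTED, 0, NULL);
*
* and the level-trigger bookkeeping above ensures each state is
* delivered at least once without repeated wakeups.
*/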
7212
7213 static int
7214 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7215 {
7216 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7217
7218 /* socket locked */
7219 kn->kn_hook32 = 0;
7220 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7221 so->so_flags |= SOF_KNOTE;
7222 }
7223
7224 /* determine if event already fired */
7225 return filt_sockev_common(kn, NULL, so, 0);
7226 }
7227
7228 static void
7229 filt_sockdetach(struct knote *kn)
7230 {
7231 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7232 socket_lock(so, 1);
7233
7234 if ((so->so_flags & SOF_KNOTE) != 0) {
7235 if (KNOTE_DETACH(&so->so_klist, kn)) {
7236 so->so_flags &= ~SOF_KNOTE;
7237 }
7238 }
7239 socket_unlock(so, 1);
7240 }
7241
7242 static int
7243 filt_sockev(struct knote *kn, long hint)
7244 {
7245 int ret = 0, locked = 0;
7246 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7247 long ev_hint = (hint & SO_FILT_HINT_EV);
7248
7249 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7250 socket_lock(so, 1);
7251 locked = 1;
7252 }
7253
7254 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7255
7256 if (locked) {
7257 socket_unlock(so, 1);
7258 }
7259
7260 return ret;
7261 }
7262
7263
7264
7265 /*
7266 * filt_socktouch - update event state
7267 */
7268 static int
7269 filt_socktouch(
7270 struct knote *kn,
7271 struct kevent_qos_s *kev)
7272 {
7273 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7274 uint32_t changed_flags;
7275 int ret;
7276
7277 socket_lock(so, 1);
7278
7279 /* save off the [result] data and fflags */
7280 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7281
7282 /* save off the new input fflags and data */
7283 kn->kn_sfflags = kev->fflags;
7284 kn->kn_sdata = kev->data;
7285
7286 /* restrict the current results to the (smaller?) set of new interest */
7287 /*
7288 * For compatibility with previous implementations, we leave kn_fflags
7289 * as they were before.
7290 */
7291 //kn->kn_fflags &= kev->fflags;
7292
7293 /*
7294 * Since we keep track of events that are already
7295 * delivered, if any of those events are not requested
7296 * anymore the state related to them can be reset
7297 */
7298 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7299
7300 /* determine if we have events to deliver */
7301 ret = filt_sockev_common(kn, NULL, so, 0);
7302
7303 socket_unlock(so, 1);
7304
7305 return ret;
7306 }
7307
7308 /*
7309 * filt_sockprocess - query event fired state and return data
7310 */
7311 static int
7312 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7313 {
7314 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7315 int ret = 0;
7316
7317 socket_lock(so, 1);
7318
7319 ret = filt_sockev_common(kn, kev, so, 0);
7320
7321 socket_unlock(so, 1);
7322
7323 return ret;
7324 }
7325
7326 void
7327 get_sockev_state(struct socket *so, u_int32_t *statep)
7328 {
7329 u_int32_t state = *(statep);
7330
7331 /*
7332 * If the state variable was already set by a previous event,
7333 * leave it as is.
7334 */
7335 if (state != 0) {
7336 return;
7337 }
7338
7339 if (so->so_state & SS_ISCONNECTED) {
7340 state |= SOCKEV_CONNECTED;
7341 } else {
7342 state &= ~(SOCKEV_CONNECTED);
7343 }
7344 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7345 *(statep) = state;
7346 }
7347
7348 #define SO_LOCK_HISTORY_STR_LEN \
7349 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7350
7351 __private_extern__ const char *
7352 solockhistory_nr(struct socket *so)
7353 {
7354 size_t n = 0;
7355 int i;
7356 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7357
7358 bzero(lock_history_str, sizeof(lock_history_str));
7359 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7360 n += scnprintf(lock_history_str + n,
7361 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7362 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7363 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7364 }
7365 return lock_history_str;
7366 }
7367
7368 lck_mtx_t *
7369 socket_getlock(struct socket *so, int flags)
7370 {
7371 if (so->so_proto->pr_getlock != NULL) {
7372 return (*so->so_proto->pr_getlock)(so, flags);
7373 } else {
7374 return so->so_proto->pr_domain->dom_mtx;
7375 }
7376 }
7377
7378 void
7379 socket_lock(struct socket *so, int refcount)
7380 {
7381 void *lr_saved;
7382
7383 lr_saved = __builtin_return_address(0);
7384
7385 if (so->so_proto->pr_lock) {
7386 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7387 } else {
7388 #ifdef MORE_LOCKING_DEBUG
7389 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7390 LCK_MTX_ASSERT_NOTOWNED);
7391 #endif
7392 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7393 if (refcount) {
7394 so->so_usecount++;
7395 }
7396 so->lock_lr[so->next_lock_lr] = lr_saved;
7397 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7398 }
7399 }
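
/*
* Usage sketch: in-kernel callers pair socket_lock() and
* socket_unlock() with matching refcount arguments; a nonzero
* refcount takes a use count that keeps the socket from being
* freed while it is in use:
*
*	socket_lock(so, 1);
*	... operate on so ...
*	socket_unlock(so, 1);
*/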
7400
7401 void
7402 socket_lock_assert_owned(struct socket *so)
7403 {
7404 lck_mtx_t *mutex_held;
7405
7406 if (so->so_proto->pr_getlock != NULL) {
7407 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7408 } else {
7409 mutex_held = so->so_proto->pr_domain->dom_mtx;
7410 }
7411
7412 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7413 }
7414
7415 int
7416 socket_try_lock(struct socket *so)
7417 {
7418 lck_mtx_t *mtx;
7419
7420 if (so->so_proto->pr_getlock != NULL) {
7421 mtx = (*so->so_proto->pr_getlock)(so, 0);
7422 } else {
7423 mtx = so->so_proto->pr_domain->dom_mtx;
7424 }
7425
7426 return lck_mtx_try_lock(mtx);
7427 }
7428
7429 void
7430 socket_unlock(struct socket *so, int refcount)
7431 {
7432 void *lr_saved;
7433 lck_mtx_t *mutex_held;
7434
7435 lr_saved = __builtin_return_address(0);
7436
7437 if (so == NULL || so->so_proto == NULL) {
7438 panic("%s: null so_proto so=%p", __func__, so);
7439 /* NOTREACHED */
7440 }
7441
7442 if (so->so_proto->pr_unlock) {
7443 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7444 } else {
7445 mutex_held = so->so_proto->pr_domain->dom_mtx;
7446 #ifdef MORE_LOCKING_DEBUG
7447 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7448 #endif
7449 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7450 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7451
7452 if (refcount) {
7453 if (so->so_usecount <= 0) {
7454 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7455 "lrh=%s", __func__, so->so_usecount, so,
7456 SOCK_DOM(so), so->so_type,
7457 SOCK_PROTO(so), solockhistory_nr(so));
7458 /* NOTREACHED */
7459 }
7460
7461 so->so_usecount--;
7462 if (so->so_usecount == 0) {
7463 sofreelastref(so, 1);
7464 }
7465 }
7466 lck_mtx_unlock(mutex_held);
7467 }
7468 }
7469
7470 /* Called with socket locked, will unlock socket */
7471 void
7472 sofree(struct socket *so)
7473 {
7474 lck_mtx_t *mutex_held;
7475
7476 if (so->so_proto->pr_getlock != NULL) {
7477 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7478 } else {
7479 mutex_held = so->so_proto->pr_domain->dom_mtx;
7480 }
7481 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7482
7483 sofreelastref(so, 0);
7484 }
7485
7486 void
7487 soreference(struct socket *so)
7488 {
7489 socket_lock(so, 1); /* lock and take one reference on the socket */
7490 socket_unlock(so, 0); /* unlock only */
7491 }
7492
7493 void
7494 sodereference(struct socket *so)
7495 {
7496 socket_lock(so, 0);
7497 socket_unlock(so, 1);
7498 }
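
/*
* Usage sketch: a caller that must keep a socket alive across an
* unlocked window brackets it with the pair above:
*
*	soreference(so);
*	... so may be unlocked and relocked here ...
*	sodereference(so);
*/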
7499
7500 /*
7501 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7502 * possibility of using jumbo clusters. Caller must ensure to hold
7503 * the socket lock.
7504 */
7505 void
7506 somultipages(struct socket *so, boolean_t set)
7507 {
7508 if (set) {
7509 so->so_flags |= SOF_MULTIPAGES;
7510 } else {
7511 so->so_flags &= ~SOF_MULTIPAGES;
7512 }
7513 }
7514
7515 void
7516 soif2kcl(struct socket *so, boolean_t set)
7517 {
7518 if (set) {
7519 so->so_flags1 |= SOF1_IF_2KCL;
7520 } else {
7521 so->so_flags1 &= ~SOF1_IF_2KCL;
7522 }
7523 }
7524
7525 int
7526 so_isdstlocal(struct socket *so)
7527 {
7528 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7529
7530 if (SOCK_DOM(so) == PF_INET) {
7531 return inaddr_local(inp->inp_faddr);
7532 } else if (SOCK_DOM(so) == PF_INET6) {
7533 return in6addr_local(&inp->in6p_faddr);
7534 }
7535
7536 return 0;
7537 }
7538
7539 int
7540 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7541 {
7542 struct sockbuf *rcv, *snd;
7543 int err = 0, defunct;
7544
7545 rcv = &so->so_rcv;
7546 snd = &so->so_snd;
7547
7548 defunct = (so->so_flags & SOF_DEFUNCT);
7549 if (defunct) {
7550 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7551 panic("%s: SB_DROP not set", __func__);
7552 /* NOTREACHED */
7553 }
7554 goto done;
7555 }
7556
7557 if (so->so_flags & SOF_NODEFUNCT) {
7558 if (noforce) {
7559 err = EOPNOTSUPP;
7560 if (p != PROC_NULL) {
7561 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7562 "name %s level %d) so 0x%llx [%d,%d] "
7563 "is not eligible for defunct "
7564 "(%d)\n", __func__, proc_selfpid(),
7565 proc_best_name(current_proc()), proc_pid(p),
7566 proc_best_name(p), level,
7567 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7568 SOCK_DOM(so), SOCK_TYPE(so), err);
7569 }
7570 return err;
7571 }
7572 so->so_flags &= ~SOF_NODEFUNCT;
7573 if (p != PROC_NULL) {
7574 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7575 "name %s level %d) so 0x%llx [%d,%d] "
7576 "defunct by force "
7577 "(%d)\n", __func__, proc_selfpid(),
7578 proc_best_name(current_proc()), proc_pid(p),
7579 proc_best_name(p), level,
7580 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7581 SOCK_DOM(so), SOCK_TYPE(so), err);
7582 }
7583 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7584 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7585 struct ifnet *ifp = inp->inp_last_outifp;
7586
7587 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7588 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7589 } else if (so->so_flags & SOF_DELEGATED) {
7590 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7591 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7592 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7593 } else if (noforce && p != PROC_NULL) {
7594 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7595
7596 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7597 so->so_extended_bk_start = net_uptime();
7598 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7599
7600 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7601
7602 err = EOPNOTSUPP;
7603 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7604 "name %s level %d) so 0x%llx [%d,%d] "
7605 "extend bk idle "
7606 "(%d)\n", __func__, proc_selfpid(),
7607 proc_best_name(current_proc()), proc_pid(p),
7608 proc_best_name(p), level,
7609 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7610 SOCK_DOM(so), SOCK_TYPE(so), err);
7611 return err;
7612 } else {
7613 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7614 }
7615 }
7616
7617 so->so_flags |= SOF_DEFUNCT;
7618
7619 /* Prevent further data from being appended to the socket buffers */
7620 snd->sb_flags |= SB_DROP;
7621 rcv->sb_flags |= SB_DROP;
7622
7623 /* Flush any existing data in the socket buffers */
7624 if (rcv->sb_cc != 0) {
7625 rcv->sb_flags &= ~SB_SEL;
7626 selthreadclear(&rcv->sb_sel);
7627 sbrelease(rcv);
7628 }
7629 if (snd->sb_cc != 0) {
7630 snd->sb_flags &= ~SB_SEL;
7631 selthreadclear(&snd->sb_sel);
7632 sbrelease(snd);
7633 }
7634
7635 done:
7636 if (p != PROC_NULL) {
7637 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7638 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7639 proc_selfpid(), proc_best_name(current_proc()),
7640 proc_pid(p), proc_best_name(p), level,
7641 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7642 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7643 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7644 " extbkidle" : "");
7645 }
7646 return err;
7647 }
7648
7649 int
7650 sodefunct(struct proc *p, struct socket *so, int level)
7651 {
7652 struct sockbuf *rcv, *snd;
7653
7654 if (!(so->so_flags & SOF_DEFUNCT)) {
7655 panic("%s improperly called", __func__);
7656 /* NOTREACHED */
7657 }
7658 if (so->so_state & SS_DEFUNCT) {
7659 goto done;
7660 }
7661
7662 rcv = &so->so_rcv;
7663 snd = &so->so_snd;
7664
7665 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7666 char s[MAX_IPv6_STR_LEN];
7667 char d[MAX_IPv6_STR_LEN];
7668 struct inpcb *inp = sotoinpcb(so);
7669
7670 if (p != PROC_NULL) {
7671 SODEFUNCTLOG(
7672 "%s[%d, %s]: (target pid %d name %s level %d) "
7673 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7674 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7675 " snd_fl 0x%x]\n", __func__,
7676 proc_selfpid(), proc_best_name(current_proc()),
7677 proc_pid(p), proc_best_name(p), level,
7678 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7679 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7680 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7681 (void *)&inp->inp_laddr.s_addr :
7682 (void *)&inp->in6p_laddr),
7683 s, sizeof(s)), ntohs(inp->in6p_lport),
7684 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7685 (void *)&inp->inp_faddr.s_addr :
7686 (void *)&inp->in6p_faddr,
7687 d, sizeof(d)), ntohs(inp->in6p_fport),
7688 (uint32_t)rcv->sb_sel.si_flags,
7689 (uint32_t)snd->sb_sel.si_flags,
7690 rcv->sb_flags, snd->sb_flags);
7691 }
7692 } else if (p != PROC_NULL) {
7693 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7694 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7695 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7696 proc_selfpid(), proc_best_name(current_proc()),
7697 proc_pid(p), proc_best_name(p), level,
7698 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7699 SOCK_DOM(so), SOCK_TYPE(so),
7700 (uint32_t)rcv->sb_sel.si_flags,
7701 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7702 snd->sb_flags);
7703 }
7704
7705 /*
7706 * Unwedge threads blocked on sbwait() and sb_lock().
7707 */
7708 sbwakeup(rcv);
7709 sbwakeup(snd);
7710
7711 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7712 if (rcv->sb_flags & SB_LOCK) {
7713 sbunlock(rcv, TRUE); /* keep socket locked */
7714 }
7715 if (snd->sb_flags & SB_LOCK) {
7716 sbunlock(snd, TRUE); /* keep socket locked */
7717 }
7718 /*
7719 * Flush the buffers and disconnect. We explicitly call shutdown
7720 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7721 * states are set for the socket. This would also flush out data
7722 * hanging off the receive list of this socket.
7723 */
7724 (void) soshutdownlock_final(so, SHUT_RD);
7725 (void) soshutdownlock_final(so, SHUT_WR);
7726 (void) sodisconnectlocked(so);
7727
7728 /*
7729 * Explicitly handle connectionless-protocol disconnection
7730 * and release any remaining data in the socket buffers.
7731 */
7732 if (!(so->so_state & SS_ISDISCONNECTED)) {
7733 (void) soisdisconnected(so);
7734 }
7735
7736 if (so->so_error == 0) {
7737 so->so_error = EBADF;
7738 }
7739
7740 if (rcv->sb_cc != 0) {
7741 rcv->sb_flags &= ~SB_SEL;
7742 selthreadclear(&rcv->sb_sel);
7743 sbrelease(rcv);
7744 }
7745 if (snd->sb_cc != 0) {
7746 snd->sb_flags &= ~SB_SEL;
7747 selthreadclear(&snd->sb_sel);
7748 sbrelease(snd);
7749 }
7750 so->so_state |= SS_DEFUNCT;
7751 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7752
7753 done:
7754 return 0;
7755 }
7756
7757 int
7758 soresume(struct proc *p, struct socket *so, int locked)
7759 {
7760 if (locked == 0) {
7761 socket_lock(so, 1);
7762 }
7763
7764 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7765 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7766 "[%d,%d] resumed from bk idle\n",
7767 __func__, proc_selfpid(), proc_best_name(current_proc()),
7768 proc_pid(p), proc_best_name(p),
7769 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7770 SOCK_DOM(so), SOCK_TYPE(so));
7771
7772 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7773 so->so_extended_bk_start = 0;
7774 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7775
7776 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7777 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7778 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7779 }
7780 if (locked == 0) {
7781 socket_unlock(so, 1);
7782 }
7783
7784 return 0;
7785 }
7786
7787 /*
7788 * Does not attempt to account for sockets that are delegated from
7789 * the current process
7790 */
7791 int
7792 so_set_extended_bk_idle(struct socket *so, int optval)
7793 {
7794 int error = 0;
7795
7796 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7797 SOCK_PROTO(so) != IPPROTO_TCP) {
7798 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7799 error = EOPNOTSUPP;
7800 } else if (optval == 0) {
7801 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7802
7803 soresume(current_proc(), so, 1);
7804 } else {
7805 struct proc *p = current_proc();
7806 struct fileproc *fp;
7807 int count = 0;
7808
7809 /*
7810 * Unlock socket to avoid lock ordering issue with
7811 * the proc fd table lock
7812 */
7813 socket_unlock(so, 0);
7814
7815 proc_fdlock(p);
7816 fdt_foreach(fp, p) {
7817 struct socket *so2;
7818
7819 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7820 continue;
7821 }
7822
7823 so2 = (struct socket *)fp_get_data(fp);
7824 if (so != so2 &&
7825 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7826 count++;
7827 }
7828 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7829 break;
7830 }
7831 }
7832 proc_fdunlock(p);
7833
7834 socket_lock(so, 0);
7835
7836 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7837 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7838 error = EBUSY;
7839 } else if (so->so_flags & SOF_DELEGATED) {
7840 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7841 error = EBUSY;
7842 } else {
7843 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7844 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7845 }
7846 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7847 "%s marked for extended bk idle\n",
7848 __func__, proc_selfpid(), proc_best_name(current_proc()),
7849 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7850 SOCK_DOM(so), SOCK_TYPE(so),
7851 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7852 "is" : "not");
7853 }
7854
7855 return error;
7856 }
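
/*
* Usage sketch from user space, assuming access to the private
* SO_EXTENDED_BK_IDLE socket option that lands here:
*
*	int one = 1;
*	setsockopt(sock_fd, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
*	    &one, sizeof(one));
*/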
7857
7858 static void
7859 so_stop_extended_bk_idle(struct socket *so)
7860 {
7861 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7862 so->so_extended_bk_start = 0;
7863
7864 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7865 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7866 /*
7867 * Force defunct
7868 */
7869 sosetdefunct(current_proc(), so,
7870 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7871 if (so->so_flags & SOF_DEFUNCT) {
7872 sodefunct(current_proc(), so,
7873 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7874 }
7875 }
7876
7877 void
7878 so_drain_extended_bk_idle(struct socket *so)
7879 {
7880 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7881 /*
7882 * Only penalize sockets that have outstanding data
7883 */
7884 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7885 so_stop_extended_bk_idle(so);
7886
7887 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7888 }
7889 }
7890 }
7891
7892 /*
7893 * Return value tells if the socket is still in extended background idle
7894 */
7895 int
7896 so_check_extended_bk_idle_time(struct socket *so)
7897 {
7898 int ret = 1;
7899
7900 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7901 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7902 __func__, proc_selfpid(), proc_best_name(current_proc()),
7903 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7904 SOCK_DOM(so), SOCK_TYPE(so));
7905 if (net_uptime() - so->so_extended_bk_start >
7906 soextbkidlestat.so_xbkidle_time) {
7907 so_stop_extended_bk_idle(so);
7908
7909 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7910
7911 ret = 0;
7912 } else {
7913 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7914
7915 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7916 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7917 }
7918 }
7919
7920 return ret;
7921 }
7922
7923 void
7924 resume_proc_sockets(proc_t p)
7925 {
7926 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7927 struct fileproc *fp;
7928 struct socket *so;
7929
7930 proc_fdlock(p);
7931 fdt_foreach(fp, p) {
7932 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7933 continue;
7934 }
7935
7936 so = (struct socket *)fp_get_data(fp);
7937 (void) soresume(p, so, 0);
7938 }
7939 proc_fdunlock(p);
7940
7941 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7942 }
7943 }
7944
7945 __private_extern__ int
7946 so_set_recv_anyif(struct socket *so, int optval)
7947 {
7948 int ret = 0;
7949
7950 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7951 if (optval) {
7952 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7953 } else {
7954 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7955 }
7956 #if SKYWALK
7957 inp_update_netns_flags(so);
7958 #endif /* SKYWALK */
7959 }
7960
7961
7962 return ret;
7963 }
7964
7965 __private_extern__ int
7966 so_get_recv_anyif(struct socket *so)
7967 {
7968 int ret = 0;
7969
7970 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7971 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7972 }
7973
7974 return ret;
7975 }
7976
7977 int
7978 so_set_restrictions(struct socket *so, uint32_t vals)
7979 {
7980 int nocell_old, nocell_new;
7981 int noexpensive_old, noexpensive_new;
7982 int noconstrained_old, noconstrained_new;
7983
7984 /*
7985 * Deny-type restrictions are trapdoors; once set they cannot be
7986 * unset for the lifetime of the socket. This allows them to be
7987 * issued by a framework on behalf of the application without
7988 * having to worry that they can be undone.
7989 *
7990 * Note here that socket-level restrictions override any protocol
7991 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
7992 * socket restriction issued on the socket has a higher precedence
7993 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7994 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7995 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7996 */
7997 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7998 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7999 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
8000 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
8001 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
8002 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
8003 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
8004 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
8005 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
8006
8007 /* we can only set, not clear restrictions */
8008 if ((nocell_new - nocell_old) == 0 &&
8009 (noexpensive_new - noexpensive_old) == 0 &&
8010 (noconstrained_new - noconstrained_old) == 0) {
8011 return 0;
8012 }
8013 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
8014 if (nocell_new - nocell_old != 0) {
8015 /*
8016 * if deny cellular is now set, do what's needed
8017 * for INPCB
8018 */
8019 inp_set_nocellular(sotoinpcb(so));
8020 }
8021 if (noexpensive_new - noexpensive_old != 0) {
8022 inp_set_noexpensive(sotoinpcb(so));
8023 }
8024 if (noconstrained_new - noconstrained_old != 0) {
8025 inp_set_noconstrained(sotoinpcb(so));
8026 }
8027 }
8028
8029 if (SOCK_DOM(so) == PF_MULTIPATH) {
8030 mptcp_set_restrictions(so);
8031 }
8032
8033 return 0;
8034 }
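
/*
 * Illustrative, compiled-out sketch of the trapdoor property as seen from
 * userspace, assuming access to the private SO_RESTRICTIONS option:
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR;
 *	uint32_t cur = 0;
 *	socklen_t len = sizeof(cur);
 *
 *	setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *	deny = 0;
 *	// Attempting to clear is silently ignored; the bits only accrue.
 *	setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *	getsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &cur, &len);
 *	// cur still has SO_RESTRICT_DENY_CELLULAR set.
 */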

uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT |
	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
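
/*
 * Illustrative, compiled-out sketch of the userspace side, assuming the
 * private SO_DELEGATED option is available to a suitably entitled caller:
 *
 *	pid_t epid = 1234;	// hypothetical pid of the app being fronted
 *	// Attribute traffic on this socket to another process; requires
 *	// PRIV_NET_PRIVILEGED_SOCKET_DELEGATE unless epid matches the
 *	// caller's own pid (which clears the delegation instead).
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid)) == -1)
 *		perror("setsockopt(SO_DELEGATED)");
 */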

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the
	 * same as the real process.
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
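
/*
 * Illustrative, compiled-out sketch for the UUID flavor, assuming the
 * private SO_DELEGATED_UUID option is available to the caller:
 *
 *	uuid_t euuid;
 *	// Placeholder UUID of the delegated app's executable.
 *	uuid_parse("00000000-0000-0000-0000-000000000001", euuid);
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    euuid, sizeof(euuid)) == -1)
 *		perror("setsockopt(SO_DELEGATED_UUID)");
 */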

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
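
/*
 * Illustrative, compiled-out sketch of a caller.  Per the VERIFY() above,
 * a longer event structure that embeds netpolicy_event_data at its head
 * can be posted as a whole; the event code and structure name here are
 * assumptions following the pattern used elsewhere in the stack:
 *
 *	struct kev_netpolicy_ifdenied ev_ifdenied;
 *
 *	bzero(&ev_ifdenied, sizeof(ev_ifdenied));
 *	// ... fill in the event payload ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
 *	    sizeof(ev_ifdenied));
 */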

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}

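
/*
 * Illustrative, compiled-out sketch of a userspace consumer.  Once a
 * socket has been opted in (via the private option that sets
 * SOF1_WANT_KEV_SOCK_CLOSED), KEV_SOCKET_CLOSED can be observed on a
 * kernel event socket:
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code = KEV_VENDOR_APPLE,
 *		.kev_class = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	ioctl(fd, SIOCSKEVFILT, &req);	// only see socket-subclass events
 *	// read(2) now returns struct kern_event_msg records; event_code
 *	// KEV_SOCKET_CLOSED carries the kev_socket_event_data with the
 *	// local and peer sockaddrs captured above.
 */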