/*
 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <os/log.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
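/*
 * Note that ROUNDUP() assumes `b' is a power of two, e.g.
 * ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16.
 */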

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif
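/*
 * VM_KERNEL_ADDRPERM() permutes a kernel pointer before it is exposed in
 * logs, so RELEASE kernels do not disclose raw kernel addresses; DEBUG and
 * DEVELOPMENT kernels log the pointer as-is to ease debugging.
 */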

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;     /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
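/*
 * These tunables are exported under kern.ipc; for instance, the jumbo
 * cluster path can be disabled from userland (a sketch, using sysctl(8)):
 *
 *	sysctl -w kern.ipc.sosendjcl=0
 */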

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
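/*
 * The defaults above are, in order: a per-process socket count, an idle
 * time in seconds, and a receive high-water mark in bytes (128 KB), as
 * reflected by the sysctl descriptions below.
 */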

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum number of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/*
 * When SOTCDB_NO_DSCP is set, the networking stack is prevented from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.  Note that the default value
 * below leaves it clear.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}

static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */
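		/*
		 * The resulting single-block layout is roughly (padding
		 * depends on ALIGN()):
		 *
		 *   [ struct socket | pad | inpcb (so_saved_pcb) | pad | tcp pcb ]
		 *
		 * which is what so_cache_zone_element_size accounts for in
		 * socketinit() above.
		 */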

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(&so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(&so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(&so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(&so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to clean up */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(&so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, how);
	} else {
		so = zalloc_flags(socket_zone, how | Z_ZERO);
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per-pcb lock if necessary and
	 * increase the refcount for creation; make sure it's done before
	 * the socket is inserted in any list.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (cf. socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL);
}
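
/*
 * Example in-kernel usage (a sketch only): create a TCP socket with
 * socreate() and release it with soclose() when done.
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket ...
 *		soclose(so);
 *	}
 */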

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT	Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		zfree(socket_zone, so);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
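/*
 * Typical usage, with the listener locked (a sketch of the pattern used
 * by sofreelastref() and soclose_locked() below):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);
 *	... manipulate head->so_comp or head->so_incomp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */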
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * Skip sockets thrown away by tcp_dropdropablreq();
			 * they will get cleaned up by the garbage collection.
			 * Otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_incomp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			lck_mtx_t *mutex_held;

			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the timer fires;
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away but the socket is
		 * retained in the kernel, remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if the destination
		 * address is associated with a domain name and if it is a
		 * tracker domain. Mark the socket accordingly. Skip the
		 * lookup if the socket has already been marked a tracker.
		 */
1700 if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1701 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
1702 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1703 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1704 }
1705 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1706 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1707 }
1708 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1709 printf("connect() - failed necp_set_socket_domain_attributes");
1710 }
1711 }
1712 }
1713
1714 /*
1715 * Run connect filter before calling protocol:
1716 * - non-blocking connect returns before completion;
1717 */
1718 error = sflt_connectout(so, nam);
1719 if (error != 0) {
1720 if (error == EJUSTRETURN) {
1721 error = 0;
1722 }
1723 } else {
1724 error = (*so->so_proto->pr_usrreqs->pru_connect)
1725 (so, nam, p);
1726 if (error != 0) {
1727 so->so_state &= ~SS_ISCONNECTING;
1728 }
1729 }
1730 }
1731 if (dolock) {
1732 socket_unlock(so, 1);
1733 }
1734 return error;
1735 }
1736
1737 int
soconnect(struct socket * so,struct sockaddr * nam)1738 soconnect(struct socket *so, struct sockaddr *nam)
1739 {
1740 return soconnectlock(so, nam, 1);
1741 }
1742
1743 /*
1744 * Returns: 0 Success
1745 * <pru_connect2>:EINVAL[AF_UNIX]
1746 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1747 * <pru_connect2>:??? [other protocol families]
1748 *
1749 * Notes: <pru_connect2> is not supported by [TCP].
1750 */
1751 int
soconnect2(struct socket * so1,struct socket * so2)1752 soconnect2(struct socket *so1, struct socket *so2)
1753 {
1754 int error;
1755
1756 socket_lock(so1, 1);
1757 if (so2->so_proto->pr_lock) {
1758 socket_lock(so2, 1);
1759 }
1760
1761 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1762
1763 socket_unlock(so1, 1);
1764 if (so2->so_proto->pr_lock) {
1765 socket_unlock(so2, 1);
1766 }
1767 return error;
1768 }
1769
1770 int
soconnectxlocked(struct socket * so,struct sockaddr * src,struct sockaddr * dst,struct proc * p,uint32_t ifscope,sae_associd_t aid,sae_connid_t * pcid,uint32_t flags,void * arg,uint32_t arglen,uio_t auio,user_ssize_t * bytes_written)1771 soconnectxlocked(struct socket *so, struct sockaddr *src,
1772 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1773 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1774 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1775 {
1776 int error;
1777 tracker_metadata_t metadata = { };
1778
1779 so_update_last_owner_locked(so, p);
1780 so_update_policy(so);
1781
1782 /*
1783 * If this is a listening socket or if this is a previously-accepted
1784 * socket that has been marked as inactive, reject the connect request.
1785 */
1786 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1787 error = EOPNOTSUPP;
1788 if (so->so_flags & SOF_DEFUNCT) {
1789 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1790 "(%d)\n", __func__, proc_pid(p),
1791 proc_best_name(p),
1792 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1793 SOCK_DOM(so), SOCK_TYPE(so), error);
1794 }
1795 return error;
1796 }
1797
1798 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1799 return EPERM;
1800 }
1801
1802 /*
1803 * If protocol is connection-based, can only connect once
1804 * unless PR_MULTICONN is set. Otherwise, if connected,
1805 * try to disconnect first. This allows user to disconnect
1806 * by connecting to, e.g., a null address.
1807 */
1808 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1809 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1810 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1811 (error = sodisconnectlocked(so)) != 0)) {
1812 error = EISCONN;
1813 } else {
1814 /*
1815 * For TCP, check if destination address is a tracker and mark the socket accordingly
1816 * (only if it hasn't been marked yet).
1817 */
1818 if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1819 !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1820 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1821 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1822 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1823 }
1824 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1825 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1826 }
1827 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1828 printf("connectx() - failed necp_set_socket_domain_attributes");
1829 }
1830 }
1831 }
1832
1833 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1834 (flags & CONNECT_DATA_IDEMPOTENT)) {
1835 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1836
1837 if (flags & CONNECT_DATA_AUTHENTICATED) {
1838 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1839 }
1840 }
1841
1842 /*
1843 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1844 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1845 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1846 * Case 3 allows a user to combine a write with connect even if they
1847 * have no use for TFO (such as regular TCP or UDP).
1848 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1849 */
1850 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1851 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1852 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1853 }
1854
1855 /*
1856 * If a user sets data idempotent but does not pass a uio, or
1857 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1858 * SOF1_DATA_IDEMPOTENT.
1859 */
1860 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1861 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1862 /* Perhaps we should return EINVAL instead. */
1863 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1864 }
1865
1866 /*
1867 * Run connect filter before calling protocol:
1868 * - non-blocking connect returns before completion;
1869 */
1870 error = sflt_connectout(so, dst);
1871 if (error != 0) {
1872 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1873 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1874 if (error == EJUSTRETURN) {
1875 error = 0;
1876 }
1877 } else {
1878 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1879 (so, src, dst, p, ifscope, aid, pcid,
1880 flags, arg, arglen, auio, bytes_written);
1881 if (error != 0) {
1882 so->so_state &= ~SS_ISCONNECTING;
1883 if (error != EINPROGRESS) {
1884 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1885 }
1886 }
1887 }
1888 }
1889
1890 return error;
1891 }
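
/*
 * Illustrative sketch (not part of this file): the CONNECT_DATA_IDEMPOTENT
 * handling above is what a TCP Fast Open request through connectx(2) hits.
 * Names follow Darwin's <sys/socket.h>; treat exact flag availability on a
 * given SDK as an assumption.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

static int
tfo_connect_send(int fd, const struct sockaddr_in *sin, void *buf,
    size_t buflen, size_t *sent)
{
	sa_endpoints_t sae;
	struct iovec iov;

	memset(&sae, 0, sizeof(sae));
	sae.sae_dstaddr = (const struct sockaddr *)sin;
	sae.sae_dstaddrlen = sizeof(*sin);

	iov.iov_base = buf;
	iov.iov_len = buflen;

	/* idempotent data may ride on the SYN (SOF1_DATA_IDEMPOTENT above) */
	return connectx(fd, &sae, SAE_ASSOCID_ANY,
	    CONNECT_DATA_IDEMPOTENT, &iov, 1, sent, NULL);
}
#endif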
1892
1893 int
1894 sodisconnectlocked(struct socket *so)
1895 {
1896 int error;
1897
1898 if ((so->so_state & SS_ISCONNECTED) == 0) {
1899 error = ENOTCONN;
1900 goto bad;
1901 }
1902 if (so->so_state & SS_ISDISCONNECTING) {
1903 error = EALREADY;
1904 goto bad;
1905 }
1906
1907 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1908 if (error == 0) {
1909 sflt_notify(so, sock_evt_disconnected, NULL);
1910 }
1911
1912 bad:
1913 return error;
1914 }
1915
1916 /* Locking version */
1917 int
1918 sodisconnect(struct socket *so)
1919 {
1920 int error;
1921
1922 socket_lock(so, 1);
1923 error = sodisconnectlocked(so);
1924 socket_unlock(so, 1);
1925 return error;
1926 }
1927
1928 int
1929 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1930 {
1931 int error;
1932
1933 /*
1934 * Call the protocol disconnectx handler; let it handle all
1935 * matters related to the connection state of this session.
1936 */
1937 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1938 if (error == 0) {
1939 /*
1940 * The event applies only for the session, not for
1941 * the disconnection of individual subflows.
1942 */
1943 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1944 sflt_notify(so, sock_evt_disconnected, NULL);
1945 }
1946 }
1947 return error;
1948 }
1949
1950 int
1951 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1952 {
1953 int error;
1954
1955 socket_lock(so, 1);
1956 error = sodisconnectxlocked(so, aid, cid);
1957 socket_unlock(so, 1);
1958 return error;
1959 }
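
/*
 * Illustrative sketch (not part of this file): user space drives this path
 * with disconnectx(2).  For an ordinary single-association socket the
 * wildcard identifiers below are assumed to select the whole connection.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>

static int
drop_connection(int fd)
{
	return disconnectx(fd, SAE_ASSOCID_ANY, SAE_CONNID_ANY);
}
#endif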
1960
1961 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1962
1963 /*
1964 * sosendcheck will lock the socket buffer if it isn't locked and
1965 * verify that there is space for the data being inserted.
1966 *
1967 * Returns: 0 Success
1968 * EPIPE
1969 * sblock:EWOULDBLOCK
1970 * sblock:EINTR
1971 * sbwait:EBADF
1972 * sbwait:EINTR
1973 * [so_error]:???
1974 */
1975 int
1976 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1977 int32_t clen, int32_t atomic, int flags, int *sblocked)
1978 {
1979 int error = 0;
1980 int32_t space;
1981 int assumelock = 0;
1982
1983 restart:
1984 if (*sblocked == 0) {
1985 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1986 so->so_send_filt_thread != 0 &&
1987 so->so_send_filt_thread == current_thread()) {
1988 /*
1989 * We're being called recursively from a filter,
1990 * allow this to continue. Radar 4150520.
1991 * Don't set sblocked because we don't want
1992 * to perform an unlock later.
1993 */
1994 assumelock = 1;
1995 } else {
1996 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1997 if (error) {
1998 if (so->so_flags & SOF_DEFUNCT) {
1999 goto defunct;
2000 }
2001 return error;
2002 }
2003 *sblocked = 1;
2004 }
2005 }
2006
2007 /*
2008 * If a send attempt is made on a socket that has been marked
2009 * as inactive (disconnected), reject the request.
2010 */
2011 if (so->so_flags & SOF_DEFUNCT) {
2012 defunct:
2013 error = EPIPE;
2014 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2015 __func__, proc_selfpid(), proc_best_name(current_proc()),
2016 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2017 SOCK_DOM(so), SOCK_TYPE(so), error);
2018 return error;
2019 }
2020
2021 if (so->so_state & SS_CANTSENDMORE) {
2022 #if CONTENT_FILTER
2023 /*
2024 * Can re-inject data of half closed connections
2025 */
2026 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2027 so->so_snd.sb_cfil_thread == current_thread() &&
2028 cfil_sock_data_pending(&so->so_snd) != 0) {
2029 CFIL_LOG(LOG_INFO,
2030 "so %llx ignore SS_CANTSENDMORE",
2031 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2032 } else
2033 #endif /* CONTENT_FILTER */
2034 return EPIPE;
2035 }
2036 if (so->so_error) {
2037 error = so->so_error;
2038 so->so_error = 0;
2039 return error;
2040 }
2041
2042 if ((so->so_state & SS_ISCONNECTED) == 0) {
2043 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2044 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2045 (resid != 0 || clen == 0) &&
2046 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2047 return ENOTCONN;
2048 }
2049 } else if (addr == 0) {
2050 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2051 ENOTCONN : EDESTADDRREQ;
2052 }
2053 }
2054
2055 space = sbspace(&so->so_snd);
2056
2057 if (flags & MSG_OOB) {
2058 space += 1024;
2059 }
2060 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2061 clen > so->so_snd.sb_hiwat) {
2062 return EMSGSIZE;
2063 }
2064
2065 if ((space < resid + clen &&
2066 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2067 space < clen)) ||
2068 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2069 /*
2070 * don't block the connectx call when there's more data
2071 * than can be copied.
2072 */
2073 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2074 if (space == 0) {
2075 return EWOULDBLOCK;
2076 }
2077 if (space < (int32_t)so->so_snd.sb_lowat) {
2078 return 0;
2079 }
2080 }
2081 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2082 assumelock) {
2083 return EWOULDBLOCK;
2084 }
2085 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2086 *sblocked = 0;
2087 error = sbwait(&so->so_snd);
2088 if (error) {
2089 if (so->so_flags & SOF_DEFUNCT) {
2090 goto defunct;
2091 }
2092 return error;
2093 }
2094 goto restart;
2095 }
2096 return 0;
2097 }
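
/*
 * Illustrative sketch (not part of this file): one observable effect of the
 * checks above is that an atomic-protocol send larger than the send
 * buffer's high-water mark fails with EMSGSIZE instead of blocking.
 * Assumes SO_SNDBUF reflects sb_hiwat for the datagram socket.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>
#include <errno.h>
#include <stdlib.h>

static int
oversized_dgram(int udp_fd, const struct sockaddr *dst, socklen_t dstlen)
{
	int sndbuf = 0;
	socklen_t optlen = sizeof(sndbuf);
	char *big;
	ssize_t n;

	getsockopt(udp_fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, &optlen);
	big = calloc(1, (size_t)sndbuf + 1);
	/* sosendcheck: atomic && resid > sb_hiwat -> EMSGSIZE */
	n = sendto(udp_fd, big, (size_t)sndbuf + 1, 0, dst, dstlen);
	free(big);
	return (n < 0 && errno == EMSGSIZE) ? 0 : -1;
}
#endif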
2098
2099 /*
2100 * Send on a socket.
2101 * If send must go all at once and message is larger than
2102 * send buffering, then hard error.
2103 * Lock against other senders.
2104 * If must go all at once and not enough room now, then
2105 * inform user that this would block and do nothing.
2106 * Otherwise, if nonblocking, send as much as possible.
2107 * The data to be sent is described by "uio" if nonzero,
2108 * otherwise by the mbuf chain "top" (which must be null
2109 * if uio is not). Data provided in mbuf chain must be small
2110 * enough to send all at once.
2111 *
2112 * Returns nonzero on error, timeout or signal; callers
2113 * must check for short counts if EINTR/ERESTART are returned.
2114 * Data and control buffers are freed on return.
2115 *
2116 * Returns: 0 Success
2117 * EOPNOTSUPP
2118 * EINVAL
2119 * ENOBUFS
2120 * uiomove:EFAULT
2121 * sosendcheck:EPIPE
2122 * sosendcheck:EWOULDBLOCK
2123 * sosendcheck:EINTR
2124 * sosendcheck:EBADF
2125 * sosendcheck:EINTR
2126 * sosendcheck:??? [value from so_error]
2127 * <pru_send>:ECONNRESET[TCP]
2128 * <pru_send>:EINVAL[TCP]
2129 * <pru_send>:ENOBUFS[TCP]
2130 * <pru_send>:EADDRINUSE[TCP]
2131 * <pru_send>:EADDRNOTAVAIL[TCP]
2132 * <pru_send>:EAFNOSUPPORT[TCP]
2133 * <pru_send>:EACCES[TCP]
2134 * <pru_send>:EAGAIN[TCP]
2135 * <pru_send>:EPERM[TCP]
2136 * <pru_send>:EMSGSIZE[TCP]
2137 * <pru_send>:EHOSTUNREACH[TCP]
2138 * <pru_send>:ENETUNREACH[TCP]
2139 * <pru_send>:ENETDOWN[TCP]
2140 * <pru_send>:ENOMEM[TCP]
2141 * <pru_send>:ENOBUFS[TCP]
2142 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2143 * <pru_send>:EINVAL[AF_UNIX]
2144 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2145 * <pru_send>:EPIPE[AF_UNIX]
2146 * <pru_send>:ENOTCONN[AF_UNIX]
2147 * <pru_send>:EISCONN[AF_UNIX]
2148 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2149 * <sf_data_out>:??? [whatever a filter author chooses]
2150 *
2151 * Notes: Other <pru_send> returns depend on the protocol family; all
2152 * <sf_data_out> returns depend on what the filter author causes
2153 * their filter to return.
2154 */
2155 int
2156 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2157 struct mbuf *top, struct mbuf *control, int flags)
2158 {
2159 struct mbuf **mp;
2160 struct mbuf *m, *freelist = NULL;
2161 struct soflow_hash_entry *dgram_flow_entry = NULL;
2162 user_ssize_t space, len, resid, orig_resid;
2163 int clen = 0, error, dontroute, sendflags;
2164 int atomic = sosendallatonce(so) || top;
2165 int sblocked = 0;
2166 struct proc *p = current_proc();
2167 uint16_t headroom = 0;
2168 ssize_t mlen;
2169 boolean_t en_tracing = FALSE;
2170
2171 if (uio != NULL) {
2172 resid = uio_resid(uio);
2173 } else {
2174 resid = top->m_pkthdr.len;
2175 }
2176
2177 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2178 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2179
2180 socket_lock(so, 1);
2181
2182 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2183 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2184 }
2185
2186 /*
2187 * Trace only if tracing is enabled, this is a network
2188 * (vs. unix) socket, and it is non-loopback.
2189 */
2190 if (ENTR_SHOULDTRACE &&
2191 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2192 struct inpcb *inp = sotoinpcb(so);
2193 if (inp->inp_last_outifp != NULL &&
2194 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2195 en_tracing = TRUE;
2196 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2197 VM_KERNEL_ADDRPERM(so),
2198 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2199 (int64_t)resid);
2200 orig_resid = resid;
2201 }
2202 }
2203
2204 /*
2205 * Re-injection should not affect process accounting
2206 */
2207 if ((flags & MSG_SKIPCFIL) == 0) {
2208 so_update_last_owner_locked(so, p);
2209 so_update_policy(so);
2210
2211 #if NECP
2212 so_update_necp_policy(so, NULL, addr);
2213 #endif /* NECP */
2214 }
2215
2216 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2217 error = EOPNOTSUPP;
2218 goto out_locked;
2219 }
2220
2221 /*
2222 * In theory resid should be unsigned.
2223 * However, space must be signed, as it might be less than 0
2224 * if we over-committed, and we must use a signed comparison
2225 * of space and resid. On the other hand, a negative resid
2226 * causes us to loop sending 0-length segments to the protocol.
2227 *
2228 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2229 *
2230 * Note: We limit resid to be a positive int value as we use
2231 * imin() to set bytes_to_copy -- radr://14558484
2232 */
2233 if (resid < 0 || resid > INT_MAX ||
2234 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2235 error = EINVAL;
2236 goto out_locked;
2237 }
2238
2239 dontroute = (flags & MSG_DONTROUTE) &&
2240 (so->so_options & SO_DONTROUTE) == 0 &&
2241 (so->so_proto->pr_flags & PR_ATOMIC);
2242 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2243
2244 if (control != NULL) {
2245 clen = control->m_len;
2246 }
2247
2248 if (soreserveheadroom != 0) {
2249 headroom = so->so_pktheadroom;
2250 }
2251
2252 do {
2253 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2254 &sblocked);
2255 if (error) {
2256 goto out_locked;
2257 }
2258
2259 mp = ⊤
2260 space = sbspace(&so->so_snd) - clen;
2261 space += ((flags & MSG_OOB) ? 1024 : 0);
2262
2263 do {
2264 if (uio == NULL) {
2265 /*
2266 * Data is prepackaged in "top".
2267 */
2268 resid = 0;
2269 if (flags & MSG_EOR) {
2270 top->m_flags |= M_EOR;
2271 }
2272 } else {
2273 int chainlength;
2274 int bytes_to_copy;
2275 boolean_t jumbocl;
2276 boolean_t bigcl;
2277 int bytes_to_alloc;
2278
2279 bytes_to_copy = imin((int)resid, (int)space);
2280
2281 bytes_to_alloc = bytes_to_copy;
2282 if (top == NULL) {
2283 bytes_to_alloc += headroom;
2284 }
2285
2286 if (sosendminchain > 0) {
2287 chainlength = 0;
2288 } else {
2289 chainlength = sosendmaxchain;
2290 }
2291
2292 /*
2293 * Use big 4 KB clusters when the outgoing interface
2294 * does not prefer 2 KB clusters
2295 */
2296 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2297 sosendbigcl_ignore_capab;
2298
2299 /*
2300 * Attempt to use larger than system page-size
2301 * clusters for large writes only if there is
2302 * a jumbo cluster pool and if the socket is
2303 * marked accordingly.
2304 */
2305 jumbocl = sosendjcl && njcl > 0 &&
2306 ((so->so_flags & SOF_MULTIPAGES) ||
2307 sosendjcl_ignore_capab) &&
2308 bigcl;
2309
2310 socket_unlock(so, 0);
2311
2312 do {
2313 int num_needed;
2314 int hdrs_needed = (top == NULL) ? 1 : 0;
2315
2316 /*
2317 * Try to maintain a local cache of mbuf
2318 * clusters needed to complete this
2319 * write.  The list is further limited to
2320 * the number that are currently needed
2321 * to fill the socket.  This mechanism
2322 * allows a large number of mbufs/
2323 * clusters to be grabbed under a single
2324 * mbuf lock.  If we can't get any
2325 * clusters, then fall back to trying
2326 * for mbufs.  If we fail early (or
2327 * miscalculate the number needed), make
2328 * sure to release any clusters we
2329 * haven't yet consumed.
2330 */
2331 if (freelist == NULL &&
2332 bytes_to_alloc > MBIGCLBYTES &&
2333 jumbocl) {
2334 num_needed =
2335 bytes_to_alloc / M16KCLBYTES;
2336
2337 if ((bytes_to_alloc -
2338 (num_needed * M16KCLBYTES))
2339 >= MINCLSIZE) {
2340 num_needed++;
2341 }
2342
2343 freelist =
2344 m_getpackets_internal(
2345 (unsigned int *)&num_needed,
2346 hdrs_needed, M_WAIT, 0,
2347 M16KCLBYTES);
2348 /*
2349 * Fall back to 4K cluster size
2350 * if allocation failed
2351 */
2352 }
2353
2354 if (freelist == NULL &&
2355 bytes_to_alloc > MCLBYTES &&
2356 bigcl) {
2357 num_needed =
2358 bytes_to_alloc / MBIGCLBYTES;
2359
2360 if ((bytes_to_alloc -
2361 (num_needed * MBIGCLBYTES)) >=
2362 MINCLSIZE) {
2363 num_needed++;
2364 }
2365
2366 freelist =
2367 m_getpackets_internal(
2368 (unsigned int *)&num_needed,
2369 hdrs_needed, M_WAIT, 0,
2370 MBIGCLBYTES);
2371 /*
2372 * Fall back to cluster size
2373 * if allocation failed
2374 */
2375 }
2376
2377 /*
2378 * Allocate a cluster as we want to
2379 * avoid splitting the data into more
2380 * than one segment; using MINCLSIZE
2381 * would lead us to allocate two mbufs
2382 */
2383 if (soreserveheadroom != 0 &&
2384 freelist == NULL &&
2385 ((top == NULL &&
2386 bytes_to_alloc > _MHLEN) ||
2387 bytes_to_alloc > _MLEN)) {
2388 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2389 MCLBYTES;
2390 freelist =
2391 m_getpackets_internal(
2392 (unsigned int *)&num_needed,
2393 hdrs_needed, M_WAIT, 0,
2394 MCLBYTES);
2395 /*
2396 * Fall back to a single mbuf
2397 * if allocation failed
2398 */
2399 } else if (freelist == NULL &&
2400 bytes_to_alloc > MINCLSIZE) {
2401 num_needed =
2402 bytes_to_alloc / MCLBYTES;
2403
2404 if ((bytes_to_alloc -
2405 (num_needed * MCLBYTES)) >=
2406 MINCLSIZE) {
2407 num_needed++;
2408 }
2409
2410 freelist =
2411 m_getpackets_internal(
2412 (unsigned int *)&num_needed,
2413 hdrs_needed, M_WAIT, 0,
2414 MCLBYTES);
2415 /*
2416 * Fall back to a single mbuf
2417 * if allocation failed
2418 */
2419 }
2420 /*
2421 * For datagram protocols, leave
2422 * headroom for protocol headers
2423 * in the first cluster of the chain
2424 */
2425 if (freelist != NULL && atomic &&
2426 top == NULL && headroom > 0) {
2427 freelist->m_data += headroom;
2428 }
2429
2430 /*
2431 * Fall back to regular mbufs without
2432 * reserving the socket headroom
2433 */
2434 if (freelist == NULL) {
2435 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2436 if (top == NULL) {
2437 MGETHDR(freelist,
2438 M_WAIT, MT_DATA);
2439 } else {
2440 MGET(freelist,
2441 M_WAIT, MT_DATA);
2442 }
2443 }
2444
2445 if (freelist == NULL) {
2446 error = ENOBUFS;
2447 socket_lock(so, 0);
2448 goto out_locked;
2449 }
2450 /*
2451 * For datagram protocols,
2452 * leave room for protocol
2453 * headers in first mbuf.
2454 */
2455 if (atomic && top == NULL &&
2456 bytes_to_copy > 0 &&
2457 bytes_to_copy < MHLEN) {
2458 MH_ALIGN(freelist,
2459 bytes_to_copy);
2460 }
2461 }
2462 m = freelist;
2463 freelist = m->m_next;
2464 m->m_next = NULL;
2465
2466 if ((m->m_flags & M_EXT)) {
2467 mlen = m->m_ext.ext_size -
2468 M_LEADINGSPACE(m);
2469 } else if ((m->m_flags & M_PKTHDR)) {
2470 mlen = MHLEN - M_LEADINGSPACE(m);
2471 m_add_crumb(m, PKT_CRUMB_SOSEND);
2472 } else {
2473 mlen = MLEN - M_LEADINGSPACE(m);
2474 }
2475 len = imin((int)mlen, bytes_to_copy);
2476
2477 chainlength += len;
2478
2479 space -= len;
2480
2481 error = uiomove(mtod(m, caddr_t),
2482 (int)len, uio);
2483
2484 resid = uio_resid(uio);
2485
2486 m->m_len = (int32_t)len;
2487 *mp = m;
2488 top->m_pkthdr.len += len;
2489 if (error) {
2490 break;
2491 }
2492 mp = &m->m_next;
2493 if (resid <= 0) {
2494 if (flags & MSG_EOR) {
2495 top->m_flags |= M_EOR;
2496 }
2497 break;
2498 }
2499 bytes_to_copy = imin((int)resid, (int)space);
2500 } while (space > 0 &&
2501 (chainlength < sosendmaxchain || atomic ||
2502 resid < MINCLSIZE));
2503
2504 socket_lock(so, 0);
2505
2506 if (error) {
2507 goto out_locked;
2508 }
2509 }
2510
2511 if (dontroute) {
2512 so->so_options |= SO_DONTROUTE;
2513 }
2514
2515 /*
2516 * Compute flags here, for pru_send and NKEs
2517 *
2518 * If the user set MSG_EOF, the protocol
2519 * understands this flag, and there is nothing left
2520 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2521 */
2522 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2523 ((flags & MSG_EOF) &&
2524 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2525 (resid <= 0)) ? PRUS_EOF :
2526 /* If there is more to send set PRUS_MORETOCOME */
2527 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2528
2529 if ((flags & MSG_SKIPCFIL) == 0) {
2530 /*
2531 * Socket filter processing
2532 */
2533 error = sflt_data_out(so, addr, &top,
2534 &control, (sendflags & MSG_OOB) ?
2535 sock_data_filt_flag_oob : 0);
2536 if (error) {
2537 if (error == EJUSTRETURN) {
2538 error = 0;
2539 goto packet_consumed;
2540 }
2541 goto out_locked;
2542 }
2543 #if CONTENT_FILTER
2544 /*
2545 * Content filter processing
2546 */
2547 error = cfil_sock_data_out(so, addr, top,
2548 control, sendflags, dgram_flow_entry);
2549 if (error) {
2550 if (error == EJUSTRETURN) {
2551 error = 0;
2552 goto packet_consumed;
2553 }
2554 goto out_locked;
2555 }
2556 #endif /* CONTENT_FILTER */
2557 }
2558 error = (*so->so_proto->pr_usrreqs->pru_send)
2559 (so, sendflags, top, addr, control, p);
2560
2561 packet_consumed:
2562 if (dontroute) {
2563 so->so_options &= ~SO_DONTROUTE;
2564 }
2565
2566 clen = 0;
2567 control = NULL;
2568 top = NULL;
2569 mp = ⊤
2570 if (error) {
2571 goto out_locked;
2572 }
2573 } while (resid && space > 0);
2574 } while (resid);
2575
2576 out_locked:
2577 if (sblocked) {
2578 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2579 } else {
2580 socket_unlock(so, 1);
2581 }
2582 if (top != NULL) {
2583 m_freem(top);
2584 }
2585 if (control != NULL) {
2586 m_freem(control);
2587 }
2588 if (freelist != NULL) {
2589 m_freem_list(freelist);
2590 }
2591
2592 if (dgram_flow_entry != NULL) {
2593 soflow_free_flow(dgram_flow_entry);
2594 }
2595
2596 soclearfastopen(so);
2597
2598 if (en_tracing) {
2599 /* resid passed here is the bytes left in uio */
2600 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2601 VM_KERNEL_ADDRPERM(so),
2602 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2603 (int64_t)(orig_resid - resid));
2604 }
2605 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2606 so->so_snd.sb_cc, space, error);
2607
2608 return error;
2609 }
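
/*
 * Illustrative sketch (not part of this file): the PRUS_OOB branch of the
 * sendflags computation in sosend() corresponds to MSG_OOB at the sockets
 * API.  On TCP this sends one byte of urgent data, retrievable with
 * recv(..., MSG_OOB) while SO_OOBINLINE is off.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>

static void
send_urgent(int tcp_fd)
{
	/* maps to PRUS_OOB for pru_send */
	send(tcp_fd, "!", 1, MSG_OOB);
}

static int
recv_urgent(int tcp_fd)
{
	char c;
	/* EINVAL once the urgent byte has already been consumed */
	return (int)recv(tcp_fd, &c, 1, MSG_OOB);
}
#endif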
2610
2611 int
2612 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2613 {
2614 struct mbuf *m0 = NULL, *control_end = NULL;
2615
2616 socket_lock_assert_owned(so);
2617
2618 /*
2619 * top must point to the mbuf chain to be sent.
2620 * If control is not NULL, top must be a packet header.
2621 */
2622 VERIFY(top != NULL &&
2623 (control == NULL || top->m_flags & M_PKTHDR));
2624
2625 /*
2626 * If control is not passed in, see if we can get it
2627 * from top.
2628 */
2629 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2630 // Locate start of control if present and start of data
2631 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2632 if (m0->m_flags & M_PKTHDR) {
2633 top = m0;
2634 break;
2635 } else if (m0->m_type == MT_CONTROL) {
2636 if (control == NULL) {
2637 // Found start of control
2638 control = m0;
2639 }
2640 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2641 // Found end of control
2642 control_end = m0;
2643 }
2644 }
2645 }
2646 if (control_end != NULL) {
2647 control_end->m_next = NULL;
2648 }
2649 }
2650
2651 int error = (*so->so_proto->pr_usrreqs->pru_send)
2652 (so, sendflags, top, addr, control, current_proc());
2653
2654 return error;
2655 }
2656
2657 /*
2658 * Supported only for connected sockets (no address) without ancillary data
2659 * (control mbuf) for atomic protocols
2660 */
2661 int
2662 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2663 {
2664 struct mbuf *m, *freelist = NULL;
2665 struct soflow_hash_entry *dgram_flow_entry = NULL;
2666 user_ssize_t len, resid;
2667 int error, dontroute;
2668 int atomic = sosendallatonce(so);
2669 int sblocked = 0;
2670 struct proc *p = current_proc();
2671 u_int uiofirst = 0;
2672 u_int uiolast = 0;
2673 struct mbuf *top = NULL;
2674 uint16_t headroom = 0;
2675 ssize_t mlen;
2676 boolean_t bigcl;
2677
2678 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2679 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2680
2681 if (so->so_type != SOCK_DGRAM) {
2682 error = EINVAL;
2683 goto out;
2684 }
2685 if (atomic == 0) {
2686 error = EINVAL;
2687 goto out;
2688 }
2689 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2690 error = EPROTONOSUPPORT;
2691 goto out;
2692 }
2693 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2694 error = EINVAL;
2695 goto out;
2696 }
2697 resid = uio_array_resid(uioarray, uiocnt);
2698
2699 /*
2700 * In theory resid should be unsigned.
2701 * However, space must be signed, as it might be less than 0
2702 * if we over-committed, and we must use a signed comparison
2703 * of space and resid. On the other hand, a negative resid
2704 * causes us to loop sending 0-length segments to the protocol.
2705 *
2706 * Note: We limit resid to be a positive int value as we use
2707 * imin() to set bytes_to_copy -- radr://14558484
2708 */
2709 if (resid < 0 || resid > INT_MAX) {
2710 error = EINVAL;
2711 goto out;
2712 }
2713
2714 socket_lock(so, 1);
2715 so_update_last_owner_locked(so, p);
2716 so_update_policy(so);
2717
2718 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2719 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2720 }
2721
2722 #if NECP
2723 so_update_necp_policy(so, NULL, NULL);
2724 #endif /* NECP */
2725
2726 dontroute = (flags & MSG_DONTROUTE) &&
2727 (so->so_options & SO_DONTROUTE) == 0 &&
2728 (so->so_proto->pr_flags & PR_ATOMIC);
2729 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2730
2731 error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2732 if (error) {
2733 goto release;
2734 }
2735
2736 /*
2737 * Use big 4 KB clusters when the outgoing interface does not prefer
2738 * 2 KB clusters
2739 */
2740 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2741
2742 if (soreserveheadroom != 0) {
2743 headroom = so->so_pktheadroom;
2744 }
2745
2746 do {
2747 int i;
2748 int num_needed = 0;
2749 int chainlength;
2750 size_t maxpktlen = 0;
2751 int bytes_to_alloc;
2752
2753 if (sosendminchain > 0) {
2754 chainlength = 0;
2755 } else {
2756 chainlength = sosendmaxchain;
2757 }
2758
2759 socket_unlock(so, 0);
2760
2761 /*
2762 * Find a set of uios that fits in a reasonable number
2763 * of mbuf packets
2764 */
2765 for (i = uiofirst; i < uiocnt; i++) {
2766 struct uio *auio = uioarray[i];
2767
2768 len = uio_resid(auio);
2769
2770 /* Do nothing for empty messages */
2771 if (len == 0) {
2772 continue;
2773 }
2774
2775 num_needed += 1;
2776 uiolast += 1;
2777
2778 if (len > maxpktlen) {
2779 maxpktlen = len;
2780 }
2781
2782 chainlength += len;
2783 if (chainlength > sosendmaxchain) {
2784 break;
2785 }
2786 }
2787 /*
2788 * Nothing left to send
2789 */
2790 if (num_needed == 0) {
2791 socket_lock(so, 0);
2792 break;
2793 }
2794 /*
2795 * Allocate a buffer large enough to include headroom space for
2796 * the network and link headers
2798 */
2799 bytes_to_alloc = (int) maxpktlen + headroom;
2800
2801 /*
2802 * Allocate a single contiguous buffer of the smallest available
2803 * size when possible
2804 */
2805 if (bytes_to_alloc > MCLBYTES &&
2806 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2807 freelist = m_getpackets_internal(
2808 (unsigned int *)&num_needed,
2809 num_needed, M_WAIT, 1,
2810 MBIGCLBYTES);
2811 } else if (bytes_to_alloc > _MHLEN &&
2812 bytes_to_alloc <= MCLBYTES) {
2813 freelist = m_getpackets_internal(
2814 (unsigned int *)&num_needed,
2815 num_needed, M_WAIT, 1,
2816 MCLBYTES);
2817 } else {
2818 freelist = m_allocpacket_internal(
2819 (unsigned int *)&num_needed,
2820 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2821 }
2822
2823 if (freelist == NULL) {
2824 socket_lock(so, 0);
2825 error = ENOMEM;
2826 goto release;
2827 }
2828 /*
2829 * Copy each uio of the set into its own mbuf packet
2830 */
2831 for (i = uiofirst, m = freelist;
2832 i < uiolast && m != NULL;
2833 i++) {
2834 int bytes_to_copy;
2835 struct mbuf *n;
2836 struct uio *auio = uioarray[i];
2837
2838 bytes_to_copy = (int)uio_resid(auio);
2839
2840 /* Do nothing for empty messages */
2841 if (bytes_to_copy == 0) {
2842 continue;
2843 }
2844 /*
2845 * Leave headroom for protocol headers
2846 * in the first mbuf of the chain
2847 */
2848 m->m_data += headroom;
2849
2850 for (n = m; n != NULL; n = n->m_next) {
2851 if ((m->m_flags & M_EXT)) {
2852 mlen = m->m_ext.ext_size -
2853 M_LEADINGSPACE(m);
2854 } else if ((m->m_flags & M_PKTHDR)) {
2855 mlen =
2856 MHLEN - M_LEADINGSPACE(m);
2857 } else {
2858 mlen = MLEN - M_LEADINGSPACE(m);
2859 }
2860 len = imin((int)mlen, bytes_to_copy);
2861
2862 /*
2863 * Note: uiomove() decrements the iovec
2864 * length
2865 */
2866 error = uiomove(mtod(n, caddr_t),
2867 (int)len, auio);
2868 if (error != 0) {
2869 break;
2870 }
2871 n->m_len = (int32_t)len;
2872 m->m_pkthdr.len += len;
2873
2874 VERIFY(m->m_pkthdr.len <= maxpktlen);
2875
2876 bytes_to_copy -= len;
2877 resid -= len;
2878 }
2879 if (m->m_pkthdr.len == 0) {
2880 printf(
2881 "%s:%d so %llx pkt %llx type %u len null\n",
2882 __func__, __LINE__,
2883 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2884 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2885 m->m_type);
2886 }
2887 if (error != 0) {
2888 break;
2889 }
2890 m = m->m_nextpkt;
2891 }
2892
2893 socket_lock(so, 0);
2894
2895 if (error) {
2896 goto release;
2897 }
2898 top = freelist;
2899 freelist = NULL;
2900
2901 if (dontroute) {
2902 so->so_options |= SO_DONTROUTE;
2903 }
2904
2905 if ((flags & MSG_SKIPCFIL) == 0) {
2906 struct mbuf **prevnextp = NULL;
2907
2908 for (i = uiofirst, m = top;
2909 i < uiolast && m != NULL;
2910 i++) {
2911 struct mbuf *nextpkt = m->m_nextpkt;
2912
2913 /*
2914 * Socket filter processing
2915 */
2916 error = sflt_data_out(so, NULL, &m,
2917 NULL, 0);
2918 if (error != 0 && error != EJUSTRETURN) {
2919 goto release;
2920 }
2921
2922 #if CONTENT_FILTER
2923 if (error == 0) {
2924 /*
2925 * Content filter processing
2926 */
2927 error = cfil_sock_data_out(so, NULL, m,
2928 NULL, 0, dgram_flow_entry);
2929 if (error != 0 && error != EJUSTRETURN) {
2930 goto release;
2931 }
2932 }
2933 #endif /* CONTENT_FILTER */
2934 /*
2935 * Remove packet from the list when
2936 * swallowed by a filter
2937 */
2938 if (error == EJUSTRETURN) {
2939 error = 0;
2940 if (prevnextp != NULL) {
2941 *prevnextp = nextpkt;
2942 } else {
2943 top = nextpkt;
2944 }
2945 }
2946
2947 m = nextpkt;
2948 if (m != NULL) {
2949 prevnextp = &m->m_nextpkt;
2950 }
2951 }
2952 }
2953 if (top != NULL) {
2954 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2955 (so, 0, top, NULL, NULL, p);
2956 }
2957
2958 if (dontroute) {
2959 so->so_options &= ~SO_DONTROUTE;
2960 }
2961
2962 top = NULL;
2963 uiofirst = uiolast;
2964 } while (resid > 0 && error == 0);
2965 release:
2966 if (sblocked) {
2967 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2968 } else {
2969 socket_unlock(so, 1);
2970 }
2971 out:
2972 if (top != NULL) {
2973 m_freem(top);
2974 }
2975 if (freelist != NULL) {
2976 m_freem_list(freelist);
2977 }
2978
2979 if (dgram_flow_entry != NULL) {
2980 soflow_free_flow(dgram_flow_entry);
2981 }
2982
2983 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2984 so->so_snd.sb_cc, 0, error);
2985
2986 return error;
2987 }
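
/*
 * Illustrative sketch (not part of this file): sosend_list() backs the
 * batched-datagram path (sendmsg_x on Darwin, a private interface).  The
 * struct layout and call shape below are assumptions taken from XNU's
 * private headers, not a public API contract.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t
send_batch(int connected_udp_fd, struct iovec *iovs, u_int cnt)
{
	struct msghdr_x msgs[8];
	u_int i;

	if (cnt > 8) {
		cnt = 8;
	}
	memset(msgs, 0, sizeof(msgs));
	for (i = 0; i < cnt; i++) {
		/* one datagram per msghdr_x; no address, no control */
		msgs[i].msg_iov = &iovs[i];
		msgs[i].msg_iovlen = 1;
	}
	return sendmsg_x(connected_udp_fd, msgs, cnt, 0);
}
#endif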
2988
2989 /*
2990 * May return ERESTART when packet is dropped by MAC policy check
2991 */
2992 static int
2993 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2994 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2995 {
2996 int error = 0;
2997 struct mbuf *m = *mp;
2998 struct mbuf *nextrecord = *nextrecordp;
2999
3000 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3001 #if CONFIG_MACF_SOCKET_SUBSET
3002 /*
3003 * Call the MAC framework for policy checking if we're in
3004 * the user process context and the socket isn't connected.
3005 */
3006 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3007 struct mbuf *m0 = m;
3008 /*
3009 * Dequeue this record (temporarily) from the receive
3010 * list since we're about to drop the socket's lock
3011 * where a new record may arrive and be appended to
3012 * the list. Upon MAC policy failure, the record
3013 * will be freed. Otherwise, we'll add it back to
3014 * the head of the list. We cannot rely on SB_LOCK
3015 * because the append operation uses the socket's lock.
3016 */
3017 do {
3018 m->m_nextpkt = NULL;
3019 sbfree(&so->so_rcv, m);
3020 m = m->m_next;
3021 } while (m != NULL);
3022 m = m0;
3023 so->so_rcv.sb_mb = nextrecord;
3024 SB_EMPTY_FIXUP(&so->so_rcv);
3025 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3026 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3027 socket_unlock(so, 0);
3028
3029 error = mac_socket_check_received(kauth_cred_get(), so,
3030 mtod(m, struct sockaddr *));
3031
3032 if (error != 0) {
3033 /*
3034 * MAC policy failure; free this record and
3035 * process the next record (or block until
3036 * one is available). We have adjusted sb_cc
3037 * and sb_mbcnt above so there is no need to
3038 * call sbfree() again.
3039 */
3040 m_freem(m);
3041 /*
3042 * Clear SB_LOCK but don't unlock the socket.
3043 * Process the next record or wait for one.
3044 */
3045 socket_lock(so, 0);
3046 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3047 error = ERESTART;
3048 goto done;
3049 }
3050 socket_lock(so, 0);
3051 /*
3052 * If the socket has been defunct'd, drop it.
3053 */
3054 if (so->so_flags & SOF_DEFUNCT) {
3055 m_freem(m);
3056 error = ENOTCONN;
3057 goto done;
3058 }
3059 /*
3060 * Re-adjust the socket receive list and re-enqueue
3061 * the record in front of any packets which may have
3062 * been appended while we dropped the lock.
3063 */
3064 for (m = m0; m->m_next != NULL; m = m->m_next) {
3065 sballoc(&so->so_rcv, m);
3066 }
3067 sballoc(&so->so_rcv, m);
3068 if (so->so_rcv.sb_mb == NULL) {
3069 so->so_rcv.sb_lastrecord = m0;
3070 so->so_rcv.sb_mbtail = m;
3071 }
3072 m = m0;
3073 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3074 so->so_rcv.sb_mb = m;
3075 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3076 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3077 }
3078 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3079 if (psa != NULL) {
3080 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3081 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3082 error = EWOULDBLOCK;
3083 goto done;
3084 }
3085 }
3086 if (flags & MSG_PEEK) {
3087 m = m->m_next;
3088 } else {
3089 sbfree(&so->so_rcv, m);
3090 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3091 panic("%s: about to create invalid socketbuf",
3092 __func__);
3093 /* NOTREACHED */
3094 }
3095 MFREE(m, so->so_rcv.sb_mb);
3096 m = so->so_rcv.sb_mb;
3097 if (m != NULL) {
3098 m->m_nextpkt = nextrecord;
3099 } else {
3100 so->so_rcv.sb_mb = nextrecord;
3101 SB_EMPTY_FIXUP(&so->so_rcv);
3102 }
3103 }
3104 done:
3105 *mp = m;
3106 *nextrecordp = nextrecord;
3107
3108 return error;
3109 }
3110
3111 /*
3112 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3113 * so clear the data portion in order not to leak the file pointers
3114 */
3115 static void
3116 sopeek_scm_rights(struct mbuf *rights)
3117 {
3118 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3119
3120 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3121 VERIFY(cm->cmsg_len <= rights->m_len);
3122 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3123 }
3124 }
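
/*
 * Illustrative sketch (not part of this file): the user-visible effect of
 * sopeek_scm_rights() is that recvmsg(..., MSG_PEEK) on a message carrying
 * SCM_RIGHTS returns a control message whose descriptor payload reads as
 * zeros; real descriptors only materialize on the non-peek receive.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>
#include <string.h>

static void
peek_then_receive_rights(int unix_fd)
{
	char data[64];
	char cbuf[CMSG_SPACE(sizeof(int) * 4)];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	/* peek: any SCM_RIGHTS payload comes back zeroed */
	recvmsg(unix_fd, &msg, MSG_PEEK);

	/* real receive: the cmsg now carries newly created descriptors */
	msg.msg_controllen = sizeof(cbuf);
	recvmsg(unix_fd, &msg, 0);
}
#endif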
3125
3126 /*
3127 * Process one or more MT_CONTROL mbufs present before any data mbufs
3128 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3129 * just copy the data; if !MSG_PEEK, we call into the protocol to
3130 * perform externalization.
3131 */
3132 static int
3133 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3134 struct mbuf **mp, struct mbuf **nextrecordp)
3135 {
3136 int error = 0;
3137 struct mbuf *cm = NULL, *cmn;
3138 struct mbuf **cme = &cm;
3139 struct sockbuf *sb_rcv = &so->so_rcv;
3140 struct mbuf **msgpcm = NULL;
3141 struct mbuf *m = *mp;
3142 struct mbuf *nextrecord = *nextrecordp;
3143 struct protosw *pr = so->so_proto;
3144
3145 /*
3146 * Externalizing the control messages would require us to
3147 * drop the socket's lock below. Once we re-acquire the
3148 * lock, the mbuf chain might change. In order to preserve
3149 * consistency, we unlink all control messages from the
3150 * first mbuf chain in one shot and link them separately
3151 * onto a different chain.
3152 */
3153 do {
3154 if (flags & MSG_PEEK) {
3155 if (controlp != NULL) {
3156 if (*controlp == NULL) {
3157 msgpcm = controlp;
3158 }
3159 *controlp = m_copy(m, 0, m->m_len);
3160
3161 /*
3162 * If we failed to allocate an mbuf,
3163 * release any previously allocated
3164 * mbufs for control data. Return
3165 * an error. Keep the mbufs in the
3166 * socket as this is using the
3167 * MSG_PEEK flag.
3168 */
3169 if (*controlp == NULL) {
3170 m_freem(*msgpcm);
3171 error = ENOBUFS;
3172 goto done;
3173 }
3174
3175 if (pr->pr_domain->dom_externalize != NULL) {
3176 sopeek_scm_rights(*controlp);
3177 }
3178
3179 controlp = &(*controlp)->m_next;
3180 }
3181 m = m->m_next;
3182 } else {
3183 m->m_nextpkt = NULL;
3184 sbfree(sb_rcv, m);
3185 sb_rcv->sb_mb = m->m_next;
3186 m->m_next = NULL;
3187 *cme = m;
3188 cme = &(*cme)->m_next;
3189 m = sb_rcv->sb_mb;
3190 }
3191 } while (m != NULL && m->m_type == MT_CONTROL);
3192
3193 if (!(flags & MSG_PEEK)) {
3194 if (sb_rcv->sb_mb != NULL) {
3195 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3196 } else {
3197 sb_rcv->sb_mb = nextrecord;
3198 SB_EMPTY_FIXUP(sb_rcv);
3199 }
3200 if (nextrecord == NULL) {
3201 sb_rcv->sb_lastrecord = m;
3202 }
3203 }
3204
3205 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3206 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3207
3208 while (cm != NULL) {
3209 int cmsg_level;
3210 int cmsg_type;
3211
3212 cmn = cm->m_next;
3213 cm->m_next = NULL;
3214 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3215 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3216
3217 /*
3218 * Call the protocol to externalize SCM_RIGHTS message
3219 * and return the modified message to the caller upon
3220 * success. Otherwise, all other control messages are
3221 * returned unmodified to the caller. Note that we
3222 * only get into this loop if MSG_PEEK is not set.
3223 */
3224 if (pr->pr_domain->dom_externalize != NULL &&
3225 cmsg_level == SOL_SOCKET &&
3226 cmsg_type == SCM_RIGHTS) {
3227 /*
3228 * Release socket lock: see 3903171. This
3229 * would also allow more records to be appended
3230 * to the socket buffer. We still have SB_LOCK
3231 * set on it, so we can be sure that the head
3232 * of the mbuf chain won't change.
3233 */
3234 socket_unlock(so, 0);
3235 error = (*pr->pr_domain->dom_externalize)(cm);
3236 socket_lock(so, 0);
3237 } else {
3238 error = 0;
3239 }
3240
3241 if (controlp != NULL && error == 0) {
3242 *controlp = cm;
3243 controlp = &(*controlp)->m_next;
3244 } else {
3245 (void) m_free(cm);
3246 }
3247 cm = cmn;
3248 }
3249 /*
3250 * Update the value of nextrecord in case we received new
3251 * records when the socket was unlocked above for
3252 * externalizing SCM_RIGHTS.
3253 */
3254 if (m != NULL) {
3255 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3256 } else {
3257 nextrecord = sb_rcv->sb_mb;
3258 }
3259
3260 done:
3261 *mp = m;
3262 *nextrecordp = nextrecord;
3263
3264 return error;
3265 }
3266
3267 /*
3268 * If we have less data than requested, block awaiting more
3269 * (subject to any timeout) if:
3270 * 1. the current count is less than the low water mark, or
3271 * 2. MSG_WAITALL is set, and it is possible to do the entire
3272 * receive operation at once if we block (resid <= hiwat).
3273 * 3. MSG_DONTWAIT is not set
3274 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3275 * we have to do the receive in sections, and thus risk returning
3276 * a short count if a timeout or signal occurs after we start.
3277 */
3278 static boolean_t
3279 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3280 {
3281 struct protosw *pr = so->so_proto;
3282
3283 /* No mbufs in the receive-queue? Wait! */
3284 if (m == NULL) {
3285 return true;
3286 }
3287
3288 /* Not enough data in the receive socket-buffer - we may have to wait */
3289 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3290 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3291 /*
3292 * The application set the low-water mark, so we should wait
3293 * for that much data to be present.
3294 */
3295 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3296 return true;
3297 }
3298
3299 /*
3300 * Application wants all the data - so let's try to do the
3301 * receive-operation at once by waiting for everything to
3302 * be there.
3303 */
3304 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3305 return true;
3306 }
3307 }
3308
3309 return false;
3310 }
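
/*
 * Illustrative sketch (not part of this file): the sb_lowat test above is
 * what gives SO_RCVLOWAT its blocking semantics; a blocking read does not
 * wake until at least the low-water mark of data (or EOF/error) arrives.
 */
#if 0 /* example, not compiled */
#include <sys/socket.h>
#include <unistd.h>

static ssize_t
read_at_least(int tcp_fd, void *buf, size_t want)
{
	int lowat = (int)want;

	/* so_should_wait: sb_cc < sb_lowat keeps us sleeping in sbwait() */
	setsockopt(tcp_fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
	return read(tcp_fd, buf, want);
}
#endif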
3311
3312 /*
3313 * Implement receive operations on a socket.
3314 * We depend on the way that records are added to the sockbuf
3315 * by sbappend*. In particular, each record (mbufs linked through m_next)
3316 * must begin with an address if the protocol so specifies,
3317 * followed by an optional mbuf or mbufs containing ancillary data,
3318 * and then zero or more mbufs of data.
3319 * In order to avoid blocking network interrupts for the entire time here,
3320 * we splx() while doing the actual copy to user space.
3321 * Although the sockbuf is locked, new data may still be appended,
3322 * and thus we must maintain consistency of the sockbuf during that time.
3323 *
3324 * The caller may receive the data as a single mbuf chain by supplying
3325 * an mbuf **mp0 for use in returning the chain. The uio is then used
3326 * only for the count in uio_resid.
3327 *
3328 * Returns: 0 Success
3329 * ENOBUFS
3330 * ENOTCONN
3331 * EWOULDBLOCK
3332 * uiomove:EFAULT
3333 * sblock:EWOULDBLOCK
3334 * sblock:EINTR
3335 * sbwait:EBADF
3336 * sbwait:EINTR
3337 * sodelayed_copy:EFAULT
3338 * <pru_rcvoob>:EINVAL[TCP]
3339 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3340 * <pru_rcvoob>:???
3341 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3342 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3343 * <pr_domain->dom_externalize>:???
3344 *
3345 * Notes: Additional return values from calls through <pru_rcvoob> and
3346 * <pr_domain->dom_externalize> depend on protocols other than
3347 * TCP or AF_UNIX, which are documented above.
3348 */
3349 int
3350 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3351 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3352 {
3353 struct mbuf *m, **mp, *ml = NULL;
3354 struct mbuf *nextrecord, *free_list;
3355 int flags, error, offset;
3356 user_ssize_t len;
3357 struct protosw *pr = so->so_proto;
3358 int moff, type = 0;
3359 user_ssize_t orig_resid = uio_resid(uio);
3360 user_ssize_t delayed_copy_len;
3361 int can_delay;
3362 struct proc *p = current_proc();
3363 boolean_t en_tracing = FALSE;
3364
3365 /*
3366 * Sanity check on the length passed by caller as we are making 'int'
3367 * comparisons
3368 */
3369 if (orig_resid < 0 || orig_resid > INT_MAX) {
3370 return EINVAL;
3371 }
3372
3373 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3374 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3375 so->so_rcv.sb_hiwat);
3376
3377 socket_lock(so, 1);
3378 so_update_last_owner_locked(so, p);
3379 so_update_policy(so);
3380
3381 #ifdef MORE_LOCKING_DEBUG
3382 if (so->so_usecount == 1) {
3383 panic("%s: so=%x no other reference on socket", __func__, so);
3384 /* NOTREACHED */
3385 }
3386 #endif
3387 mp = mp0;
3388 if (psa != NULL) {
3389 *psa = NULL;
3390 }
3391 if (controlp != NULL) {
3392 *controlp = NULL;
3393 }
3394 if (flagsp != NULL) {
3395 flags = *flagsp & ~MSG_EOR;
3396 } else {
3397 flags = 0;
3398 }
3399
3400 /*
3401 * If a recv attempt is made on a previously-accepted socket
3402 * that has been marked as inactive (disconnected), reject
3403 * the request.
3404 */
3405 if (so->so_flags & SOF_DEFUNCT) {
3406 struct sockbuf *sb = &so->so_rcv;
3407
3408 error = ENOTCONN;
3409 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3410 __func__, proc_pid(p), proc_best_name(p),
3411 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3412 SOCK_DOM(so), SOCK_TYPE(so), error);
3413 /*
3414 * This socket should have been disconnected and flushed
3415 * prior to being returned from sodefunct(); there should
3416 * be no data on its receive list, so panic otherwise.
3417 */
3418 if (so->so_state & SS_DEFUNCT) {
3419 sb_empty_assert(sb, __func__);
3420 }
3421 socket_unlock(so, 1);
3422 return error;
3423 }
3424
3425 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3426 pr->pr_usrreqs->pru_preconnect) {
3427 /*
3428 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag without
3429 * calling write() right after it. *If* the app then calls read,
3430 * we do not want to block that read indefinitely. Thus,
3431 * we trigger a connect so that the session gets initiated.
3432 */
3433 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3434
3435 if (error) {
3436 socket_unlock(so, 1);
3437 return error;
3438 }
3439 }
3440
3441 if (ENTR_SHOULDTRACE &&
3442 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3443 /*
3444 * enable energy tracing for inet sockets that go over
3445 * non-loopback interfaces only.
3446 */
3447 struct inpcb *inp = sotoinpcb(so);
3448 if (inp->inp_last_outifp != NULL &&
3449 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3450 en_tracing = TRUE;
3451 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3452 VM_KERNEL_ADDRPERM(so),
3453 ((so->so_state & SS_NBIO) ?
3454 kEnTrFlagNonBlocking : 0),
3455 (int64_t)orig_resid);
3456 }
3457 }
3458
3459 /*
3460 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3461 * regardless of the flags argument. Here is the case where
3462 * out-of-band data is not inline.
3463 */
3464 if ((flags & MSG_OOB) ||
3465 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3466 (so->so_options & SO_OOBINLINE) == 0 &&
3467 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3468 m = m_get(M_WAIT, MT_DATA);
3469 if (m == NULL) {
3470 socket_unlock(so, 1);
3471 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3472 ENOBUFS, 0, 0, 0, 0);
3473 return ENOBUFS;
3474 }
3475 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3476 if (error) {
3477 goto bad;
3478 }
3479 socket_unlock(so, 0);
3480 do {
3481 error = uiomove(mtod(m, caddr_t),
3482 imin((int)uio_resid(uio), m->m_len), uio);
3483 m = m_free(m);
3484 } while (uio_resid(uio) && error == 0 && m != NULL);
3485 socket_lock(so, 0);
3486 bad:
3487 if (m != NULL) {
3488 m_freem(m);
3489 }
3490
3491 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3492 if (error == EWOULDBLOCK || error == EINVAL) {
3493 /*
3494 * Let's try to get normal data:
3495 * EWOULDBLOCK: out-of-band data not
3496 * received yet. EINVAL: out-of-band data
3497 * already read.
3498 */
3499 error = 0;
3500 goto nooob;
3501 } else if (error == 0 && flagsp != NULL) {
3502 *flagsp |= MSG_OOB;
3503 }
3504 }
3505 socket_unlock(so, 1);
3506 if (en_tracing) {
3507 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3508 VM_KERNEL_ADDRPERM(so), 0,
3509 (int64_t)(orig_resid - uio_resid(uio)));
3510 }
3511 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3512 0, 0, 0, 0);
3513
3514 return error;
3515 }
3516 nooob:
3517 if (mp != NULL) {
3518 *mp = NULL;
3519 }
3520
3521 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3522 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3523 }
3524
3525 free_list = NULL;
3526 delayed_copy_len = 0;
3527 restart:
3528 #ifdef MORE_LOCKING_DEBUG
3529 if (so->so_usecount <= 1) {
3530 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3531 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3532 }
3533 #endif
3534 /*
3535 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3536 * and if so just return to the caller. This could happen when
3537 * soreceive() is called by a socket upcall function during the
3538 * time the socket is freed. The socket buffer would have been
3539 * locked across the upcall, therefore we cannot put this thread
3540 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3541 * we may livelock), because the lock on the socket buffer will
3542 * only be released when the upcall routine returns to its caller.
3543 * Because the socket has been officially closed, there can be
3544 * no further read on it.
3545 *
3546 * A multipath subflow socket would have its SS_NOFDREF set by
3547 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3548 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3549 */
3550 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3551 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3552 socket_unlock(so, 1);
3553 return 0;
3554 }
3555
3556 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3557 if (error) {
3558 socket_unlock(so, 1);
3559 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3560 0, 0, 0, 0);
3561 if (en_tracing) {
3562 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3563 VM_KERNEL_ADDRPERM(so), 0,
3564 (int64_t)(orig_resid - uio_resid(uio)));
3565 }
3566 return error;
3567 }
3568
3569 m = so->so_rcv.sb_mb;
3570 if (so_should_wait(so, uio, m, flags)) {
3571 /*
3572 * Panic if we notice inconsistencies in the socket's
3573 * receive list; both sb_mb and sb_cc should correctly
3574 * reflect the contents of the list, otherwise we may
3575 * end up with false positives during select() or poll()
3576 * which could put the application in a bad state.
3577 */
3578 SB_MB_CHECK(&so->so_rcv);
3579
3580 if (so->so_error) {
3581 if (m != NULL) {
3582 goto dontblock;
3583 }
3584 error = so->so_error;
3585 if ((flags & MSG_PEEK) == 0) {
3586 so->so_error = 0;
3587 }
3588 goto release;
3589 }
3590 if (so->so_state & SS_CANTRCVMORE) {
3591 #if CONTENT_FILTER
3592 /*
3593 * Deal with half closed connections
3594 */
3595 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3596 cfil_sock_data_pending(&so->so_rcv) != 0) {
3597 CFIL_LOG(LOG_INFO,
3598 "so %llx ignore SS_CANTRCVMORE",
3599 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3600 } else
3601 #endif /* CONTENT_FILTER */
3602 if (m != NULL) {
3603 goto dontblock;
3604 } else {
3605 goto release;
3606 }
3607 }
3608 for (; m != NULL; m = m->m_next) {
3609 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3610 m = so->so_rcv.sb_mb;
3611 goto dontblock;
3612 }
3613 }
3614 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3615 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3616 error = ENOTCONN;
3617 goto release;
3618 }
3619 if (uio_resid(uio) == 0) {
3620 goto release;
3621 }
3622
3623 if ((so->so_state & SS_NBIO) ||
3624 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3625 error = EWOULDBLOCK;
3626 goto release;
3627 }
3628 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3629 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3630 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3631 #if EVEN_MORE_LOCKING_DEBUG
3632 if (socket_debug) {
3633 printf("Waiting for socket data\n");
3634 }
3635 #endif
3636
3637 /*
3638 * Depending on the protocol (e.g. TCP), the following
3639 * might cause the socket lock to be dropped and later
3640 * be reacquired, and more data could have arrived and
3641 * have been appended to the receive socket buffer by
3642 * the time it returns. Therefore, we sleep in
3643 * sbwait() below only if the wait condition is still
3644 * true.
3645 */
3646 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3647 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3648 }
3649
3650 error = 0;
3651 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3652 error = sbwait(&so->so_rcv);
3653 }
3654
3655 #if EVEN_MORE_LOCKING_DEBUG
3656 if (socket_debug) {
3657 printf("SORECEIVE - sbwait returned %d\n", error);
3658 }
3659 #endif
3660 if (so->so_usecount < 1) {
3661 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3662 __func__, so, so->so_usecount);
3663 /* NOTREACHED */
3664 }
3665 if (error) {
3666 socket_unlock(so, 1);
3667 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3668 0, 0, 0, 0);
3669 if (en_tracing) {
3670 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3671 VM_KERNEL_ADDRPERM(so), 0,
3672 (int64_t)(orig_resid - uio_resid(uio)));
3673 }
3674 return error;
3675 }
3676 goto restart;
3677 }
3678 dontblock:
3679 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3680 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3681 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3682 nextrecord = m->m_nextpkt;
3683
3684 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3685 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3686 mp0 == NULL);
3687 if (error == ERESTART) {
3688 goto restart;
3689 } else if (error != 0) {
3690 goto release;
3691 }
3692 orig_resid = 0;
3693 }
3694
3695 /*
3696 * Process one or more MT_CONTROL mbufs present before any data mbufs
3697 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3698 * just copy the data; if !MSG_PEEK, we call into the protocol to
3699 * perform externalization.
3700 */
3701 if (m != NULL && m->m_type == MT_CONTROL) {
3702 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3703 if (error != 0) {
3704 goto release;
3705 }
3706 orig_resid = 0;
3707 }
3708
3709 if (m != NULL) {
3710 if (!(flags & MSG_PEEK)) {
3711 /*
3712 * We get here because m points to an mbuf following
3713 * any MT_SONAME or MT_CONTROL mbufs which have been
3714 * processed above. In any case, m should be pointing
3715 * to the head of the mbuf chain, and the nextrecord
3716 * should be either NULL or equal to m->m_nextpkt.
3717 * See comments above about SB_LOCK.
3718 */
3719 if (m != so->so_rcv.sb_mb ||
3720 m->m_nextpkt != nextrecord) {
3721 panic("%s: post-control !sync so=%p m=%p "
3722 "nextrecord=%p\n", __func__, so, m,
3723 nextrecord);
3724 /* NOTREACHED */
3725 }
3726 if (nextrecord == NULL) {
3727 so->so_rcv.sb_lastrecord = m;
3728 }
3729 }
3730 type = m->m_type;
3731 if (type == MT_OOBDATA) {
3732 flags |= MSG_OOB;
3733 }
3734 } else {
3735 if (!(flags & MSG_PEEK)) {
3736 SB_EMPTY_FIXUP(&so->so_rcv);
3737 }
3738 }
3739 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3740 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3741
3742 moff = 0;
3743 offset = 0;
3744
3745 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3746 can_delay = 1;
3747 } else {
3748 can_delay = 0;
3749 }
3750
3751 while (m != NULL &&
3752 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3753 if (m->m_type == MT_OOBDATA) {
3754 if (type != MT_OOBDATA) {
3755 break;
3756 }
3757 } else if (type == MT_OOBDATA) {
3758 break;
3759 }
3760
3761 if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3762 m->m_type != MT_HEADER) {
3763 break;
3764 }
3765 /*
3766 * Make sure to always set MSG_OOB when receiving
3767 * out-of-band data inline.
3768 */
3769 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3770 (so->so_options & SO_OOBINLINE) != 0 &&
3771 (so->so_state & SS_RCVATMARK) != 0) {
3772 flags |= MSG_OOB;
3773 }
3774 so->so_state &= ~SS_RCVATMARK;
3775 len = uio_resid(uio) - delayed_copy_len;
3776 if (so->so_oobmark && len > so->so_oobmark - offset) {
3777 len = so->so_oobmark - offset;
3778 }
3779 if (len > m->m_len - moff) {
3780 len = m->m_len - moff;
3781 }
3782 /*
3783 * If mp is set, just pass back the mbufs.
3784 * Otherwise copy them out via the uio, then free.
3785 * Sockbuf must be consistent here (points to current mbuf,
3786 * it points to next record) when we drop priority;
3787 * we must note any additions to the sockbuf when we
3788 * block interrupts again.
3789 */
3790 if (mp == NULL) {
3791 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3792 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3793 if (can_delay && len == m->m_len) {
3794 /*
3795 * only delay the copy if we're consuming the
3796 * mbuf and we're NOT in MSG_PEEK mode
3797 * and we have enough data to make it worthwile
3798 * to drop and retake the lock... can_delay
3799 * reflects the state of the 2 latter
3800 * constraints moff should always be zero
3801 * in these cases
3802 */
3803 delayed_copy_len += len;
3804 } else {
3805 if (delayed_copy_len) {
3806 error = sodelayed_copy(so, uio,
3807 &free_list, &delayed_copy_len);
3808
3809 if (error) {
3810 goto release;
3811 }
3812					/*
3813					 * We can only get here if MSG_PEEK is
3814					 * not set; therefore, m should point at
3815					 * the head of the rcv queue. If it
3816					 * doesn't, something drastically
3817					 * changed while we were out from behind
3818					 * the lock in sodelayed_copy, perhaps
3819					 * due to a RST on the stream. In any
3820					 * event, the stream has been interrupted;
3821					 * it's probably best just to return
3822					 * whatever data we've moved and let the
3823					 * caller sort it out...
3824					 */
3825 if (m != so->so_rcv.sb_mb) {
3826 break;
3827 }
3828 }
3829 socket_unlock(so, 0);
3830 error = uiomove(mtod(m, caddr_t) + moff,
3831 (int)len, uio);
3832 socket_lock(so, 0);
3833
3834 if (error) {
3835 goto release;
3836 }
3837 }
3838 } else {
3839 uio_setresid(uio, (uio_resid(uio) - len));
3840 }
3841 if (len == m->m_len - moff) {
3842 if (m->m_flags & M_EOR) {
3843 flags |= MSG_EOR;
3844 }
3845 if (flags & MSG_PEEK) {
3846 m = m->m_next;
3847 moff = 0;
3848 } else {
3849 nextrecord = m->m_nextpkt;
3850 sbfree(&so->so_rcv, m);
3851 m->m_nextpkt = NULL;
3852
3853 if (mp != NULL) {
3854 *mp = m;
3855 mp = &m->m_next;
3856 so->so_rcv.sb_mb = m = m->m_next;
3857 *mp = NULL;
3858 } else {
3859 if (free_list == NULL) {
3860 free_list = m;
3861 } else {
3862 ml->m_next = m;
3863 }
3864 ml = m;
3865 so->so_rcv.sb_mb = m = m->m_next;
3866 ml->m_next = NULL;
3867 }
3868 if (m != NULL) {
3869 m->m_nextpkt = nextrecord;
3870 if (nextrecord == NULL) {
3871 so->so_rcv.sb_lastrecord = m;
3872 }
3873 } else {
3874 so->so_rcv.sb_mb = nextrecord;
3875 SB_EMPTY_FIXUP(&so->so_rcv);
3876 }
3877 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3878 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3879 }
3880 } else {
3881 if (flags & MSG_PEEK) {
3882 moff += len;
3883 } else {
3884 if (mp != NULL) {
3885 int copy_flag;
3886
3887 if (flags & MSG_DONTWAIT) {
3888 copy_flag = M_DONTWAIT;
3889 } else {
3890 copy_flag = M_WAIT;
3891 }
3892 *mp = m_copym(m, 0, (int)len, copy_flag);
3893 /*
3894 * Failed to allocate an mbuf?
3895 * Adjust uio_resid back, it was
3896 * adjusted down by len bytes which
3897 * we didn't copy over.
3898 */
3899 if (*mp == NULL) {
3900 uio_setresid(uio,
3901 (uio_resid(uio) + len));
3902 break;
3903 }
3904 }
3905 m->m_data += len;
3906 m->m_len -= len;
3907 so->so_rcv.sb_cc -= len;
3908 }
3909 }
3910 if (so->so_oobmark) {
3911 if ((flags & MSG_PEEK) == 0) {
3912 so->so_oobmark -= len;
3913 if (so->so_oobmark == 0) {
3914 so->so_state |= SS_RCVATMARK;
3915 break;
3916 }
3917 } else {
3918 offset += len;
3919 if (offset == so->so_oobmark) {
3920 break;
3921 }
3922 }
3923 }
3924 if (flags & MSG_EOR) {
3925 break;
3926 }
3927		/*
3928		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3929		 * (for a non-atomic socket), we must not quit until
3930		 * "uio->uio_resid == 0", or until an error terminates
3931		 * the transfer. If a signal/timeout occurs, return
3932		 * with a short count but without error. Keep the
3933		 * sockbuf locked against other readers.
3934		 */
3935 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3936 (uio_resid(uio) - delayed_copy_len) > 0 &&
3937 !sosendallatonce(so) && !nextrecord) {
3938 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3939 #if CONTENT_FILTER
3940 && cfil_sock_data_pending(&so->so_rcv) == 0
3941 #endif /* CONTENT_FILTER */
3942 )) {
3943 goto release;
3944 }
3945
3946 /*
3947 * Depending on the protocol (e.g. TCP), the following
3948 * might cause the socket lock to be dropped and later
3949 * be reacquired, and more data could have arrived and
3950 * have been appended to the receive socket buffer by
3951			 * the time it returns. Therefore, we sleep in
3952			 * sbwait() below only if the socket buffer is
3953			 * empty, in order to avoid a false sleep.
3954 */
3955 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3956 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3957 }
3958
3959 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3960 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3961
3962 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3963 error = 0;
3964 goto release;
3965 }
3966			/*
3967			 * We have to wait until after we get back from the
3968			 * sbwait to do the copy, because we will drop the
3969			 * lock if we have enough data that has been delayed...
3970			 * by dropping the lock we open up a window allowing
3971			 * the netisr thread to process the incoming packets
3972			 * and to change the state of this socket... we're
3973			 * issuing the sbwait because the socket is empty and
3974			 * we're expecting the netisr thread to wake us up when
3975			 * more packets arrive; if we allowed that processing
3976			 * to happen first and then called sbwait, we could
3977			 * stall forever with packets sitting in the socket if
3978			 * no further packets arrive from the remote side.
3979			 *
3980			 * We want to copy before we've collected all the data
3981			 * to satisfy this request, to allow the copy to overlap
3982			 * the incoming packet processing on an MP system.
3983			 */
3984 if (delayed_copy_len > sorecvmincopy &&
3985 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3986 error = sodelayed_copy(so, uio,
3987 &free_list, &delayed_copy_len);
3988
3989 if (error) {
3990 goto release;
3991 }
3992 }
3993 m = so->so_rcv.sb_mb;
3994 if (m != NULL) {
3995 nextrecord = m->m_nextpkt;
3996 }
3997 SB_MB_CHECK(&so->so_rcv);
3998 }
3999 }
4000 #ifdef MORE_LOCKING_DEBUG
4001 if (so->so_usecount <= 1) {
4002 panic("%s: after big while so=%p ref=%d on socket",
4003 __func__, so, so->so_usecount);
4004 /* NOTREACHED */
4005 }
4006 #endif
4007
4008 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4009 if (so->so_options & SO_DONTTRUNC) {
4010 flags |= MSG_RCVMORE;
4011 } else {
4012 flags |= MSG_TRUNC;
4013 if ((flags & MSG_PEEK) == 0) {
4014 (void) sbdroprecord(&so->so_rcv);
4015 }
4016 }
4017 }
4018
4019 /*
4020 * pru_rcvd below (for TCP) may cause more data to be received
4021 * if the socket lock is dropped prior to sending the ACK; some
4022 * legacy OpenTransport applications don't handle this well
4023 * (if it receives less data than requested while MSG_HAVEMORE
4024 * is set), and so we set the flag now based on what we know
4025 * prior to calling pru_rcvd.
4026 */
4027 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4028 flags |= MSG_HAVEMORE;
4029 }
4030
4031 if ((flags & MSG_PEEK) == 0) {
4032 if (m == NULL) {
4033 so->so_rcv.sb_mb = nextrecord;
4034 /*
4035 * First part is an inline SB_EMPTY_FIXUP(). Second
4036 * part makes sure sb_lastrecord is up-to-date if
4037 * there is still data in the socket buffer.
4038 */
4039 if (so->so_rcv.sb_mb == NULL) {
4040 so->so_rcv.sb_mbtail = NULL;
4041 so->so_rcv.sb_lastrecord = NULL;
4042 } else if (nextrecord->m_nextpkt == NULL) {
4043 so->so_rcv.sb_lastrecord = nextrecord;
4044 }
4045 SB_MB_CHECK(&so->so_rcv);
4046 }
4047 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4048 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4049 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4050 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4051 }
4052 }
4053
4054 if (delayed_copy_len) {
4055 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4056 if (error) {
4057 goto release;
4058 }
4059 }
4060 if (free_list != NULL) {
4061 m_freem_list(free_list);
4062 free_list = NULL;
4063 }
4064
4065 if (orig_resid == uio_resid(uio) && orig_resid &&
4066 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4067 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4068 goto restart;
4069 }
4070
4071 if (flagsp != NULL) {
4072 *flagsp |= flags;
4073 }
4074 release:
4075 #ifdef MORE_LOCKING_DEBUG
4076 if (so->so_usecount <= 1) {
4077 panic("%s: release so=%p ref=%d on socket", __func__,
4078 so, so->so_usecount);
4079 /* NOTREACHED */
4080 }
4081 #endif
4082 if (delayed_copy_len) {
4083 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4084 }
4085
4086 if (free_list != NULL) {
4087 m_freem_list(free_list);
4088 }
4089
4090 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4091
4092 if (en_tracing) {
4093 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4094 VM_KERNEL_ADDRPERM(so),
4095 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4096 (int64_t)(orig_resid - uio_resid(uio)));
4097 }
4098 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4099 so->so_rcv.sb_cc, 0, error);
4100
4101 return error;
4102 }
4103
4104 /*
4105 * Returns: 0 Success
4106 * uiomove:EFAULT
4107 */
4108 static int
4109 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4110 user_ssize_t *resid)
4111 {
4112 int error = 0;
4113 struct mbuf *m;
4114
4115 m = *free_list;
4116
4117 socket_unlock(so, 0);
4118
4119 while (m != NULL && error == 0) {
4120 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4121 m = m->m_next;
4122 }
4123 m_freem_list(*free_list);
4124
4125 *free_list = NULL;
4126 *resid = 0;
4127
4128 socket_lock(so, 0);
4129
4130 return error;
4131 }
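/*
 * Note that the socket lock is dropped for the duration of the copyout;
 * callers caching a pointer into the receive queue must re-validate it
 * against sb_mb afterwards (see the check following the sodelayed_copy()
 * call in soreceive() above).
 */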
4132
4133 static int
4134 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4135 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4136 {
4137 #pragma unused(so)
4138 int error = 0;
4139 struct mbuf *ml, *m;
4140 int i = 0;
4141 struct uio *auio;
4142
4143 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4144 ml = ml->m_nextpkt, i++) {
4145 auio = msgarray[i].uio;
4146 for (m = ml; m != NULL; m = m->m_next) {
4147 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4148 if (error != 0) {
4149 goto out;
4150 }
4151 }
4152 }
4153 out:
4154 m_freem_list(*free_list);
4155
4156 *free_list = NULL;
4157 *resid = 0;
4158
4159 return error;
4160 }
4161
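/*
 * soreceive_list: receive up to 'uiocnt' datagrams in a single call,
 * one per recv_msg_elem. As the sanity checks below enforce, this is
 * only supported for atomic SOCK_DGRAM sockets whose protocol provides
 * pru_send_list; each datagram gets MSG_TRUNC/MSG_RCVMORE semantics
 * mirroring soreceive().
 */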
4162 int
4163 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4164 int *flagsp)
4165 {
4166 struct mbuf *m;
4167 struct mbuf *nextrecord;
4168 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4169 int error;
4170 user_ssize_t len, pktlen, delayed_copy_len = 0;
4171 struct protosw *pr = so->so_proto;
4172 user_ssize_t resid;
4173 struct proc *p = current_proc();
4174 struct uio *auio = NULL;
4175 int npkts = 0;
4176 int sblocked = 0;
4177 struct sockaddr **psa = NULL;
4178 struct mbuf **controlp = NULL;
4179 int can_delay;
4180 int flags;
4181 struct mbuf *free_others = NULL;
4182
4183 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4184 so, uiocnt,
4185 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4186
4187	/*
4188	 * Sanity checks:
4189	 * - Only supports "don't wait" flags
4190	 * - Only supports datagram sockets (could be extended to raw)
4191	 * - Must be atomic
4192	 * - Protocol must support packet chains
4193	 * - The uio array must not be NULL
4194	 */
4195 if (flagsp != NULL) {
4196 flags = *flagsp;
4197 } else {
4198 flags = 0;
4199 }
4200 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4201 MSG_NBIO)) {
4202 printf("%s invalid flags 0x%x\n", __func__, flags);
4203 error = EINVAL;
4204 goto out;
4205 }
4206 if (so->so_type != SOCK_DGRAM) {
4207 error = EINVAL;
4208 goto out;
4209 }
4210 if (sosendallatonce(so) == 0) {
4211 error = EINVAL;
4212 goto out;
4213 }
4214 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4215 error = EPROTONOSUPPORT;
4216 goto out;
4217 }
4218 if (msgarray == NULL) {
4219 printf("%s uioarray is NULL\n", __func__);
4220 error = EINVAL;
4221 goto out;
4222 }
4223 if (uiocnt == 0) {
4224 printf("%s uiocnt is 0\n", __func__);
4225 error = EINVAL;
4226 goto out;
4227 }
4228 /*
4229 * Sanity check on the length passed by caller as we are making 'int'
4230 * comparisons
4231 */
4232 resid = recv_msg_array_resid(msgarray, uiocnt);
4233 if (resid < 0 || resid > INT_MAX) {
4234 error = EINVAL;
4235 goto out;
4236 }
4237
4238 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4239 can_delay = 1;
4240 } else {
4241 can_delay = 0;
4242 }
4243
4244 socket_lock(so, 1);
4245 so_update_last_owner_locked(so, p);
4246 so_update_policy(so);
4247
4248 #if NECP
4249 so_update_necp_policy(so, NULL, NULL);
4250 #endif /* NECP */
4251
4252 /*
4253 * If a recv attempt is made on a previously-accepted socket
4254 * that has been marked as inactive (disconnected), reject
4255 * the request.
4256 */
4257 if (so->so_flags & SOF_DEFUNCT) {
4258 struct sockbuf *sb = &so->so_rcv;
4259
4260 error = ENOTCONN;
4261 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4262 __func__, proc_pid(p), proc_best_name(p),
4263 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4264 SOCK_DOM(so), SOCK_TYPE(so), error);
4265 /*
4266 * This socket should have been disconnected and flushed
4267 * prior to being returned from sodefunct(); there should
4268 * be no data on its receive list, so panic otherwise.
4269 */
4270 if (so->so_state & SS_DEFUNCT) {
4271 sb_empty_assert(sb, __func__);
4272 }
4273 goto release;
4274 }
4275
4276 next:
4277	/*
4278	 * Stop once every uio in the array has been consumed
4279	 */
4280 if (npkts >= uiocnt) {
4281 error = 0;
4282 goto release;
4283 }
4284 restart:
4285 /*
4286 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4287 * and if so just return to the caller. This could happen when
4288 * soreceive() is called by a socket upcall function during the
4289 * time the socket is freed. The socket buffer would have been
4290 * locked across the upcall, therefore we cannot put this thread
4291 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4292 * we may livelock), because the lock on the socket buffer will
4293 * only be released when the upcall routine returns to its caller.
4294 * Because the socket has been officially closed, there can be
4295 * no further read on it.
4296 */
4297 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4298 (SS_NOFDREF | SS_CANTRCVMORE)) {
4299 error = 0;
4300 goto release;
4301 }
4302
4303 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4304 if (error) {
4305 goto release;
4306 }
4307 sblocked = 1;
4308
4309 m = so->so_rcv.sb_mb;
4310 /*
4311	 * Block awaiting more datagrams if needed
4312 */
4313 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4314 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4315 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4316 /*
4317 * Panic if we notice inconsistencies in the socket's
4318 * receive list; both sb_mb and sb_cc should correctly
4319 * reflect the contents of the list, otherwise we may
4320 * end up with false positives during select() or poll()
4321 * which could put the application in a bad state.
4322 */
4323 SB_MB_CHECK(&so->so_rcv);
4324
4325 if (so->so_error) {
4326 error = so->so_error;
4327 if ((flags & MSG_PEEK) == 0) {
4328 so->so_error = 0;
4329 }
4330 goto release;
4331 }
4332 if (so->so_state & SS_CANTRCVMORE) {
4333 goto release;
4334 }
4335 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4336 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4337 error = ENOTCONN;
4338 goto release;
4339 }
4340 if ((so->so_state & SS_NBIO) ||
4341 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4342 error = EWOULDBLOCK;
4343 goto release;
4344 }
4345 /*
4346 * Do not block if we got some data
4347 */
4348 if (free_list != NULL) {
4349 error = 0;
4350 goto release;
4351 }
4352
4353 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4354 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4355
4356 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4357 sblocked = 0;
4358
4359 error = sbwait(&so->so_rcv);
4360 if (error) {
4361 goto release;
4362 }
4363 goto restart;
4364 }
4365
4366 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4367 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4368 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4369
4370 /*
4371 * Consume the current uio index as we have a datagram
4372 */
4373 auio = msgarray[npkts].uio;
4374 resid = uio_resid(auio);
4375 msgarray[npkts].which |= SOCK_MSG_DATA;
4376 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4377 &msgarray[npkts].psa : NULL;
4378 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4379 &msgarray[npkts].controlp : NULL;
4380 npkts += 1;
4381 nextrecord = m->m_nextpkt;
4382
4383 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4384 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4385 if (error == ERESTART) {
4386 goto restart;
4387 } else if (error != 0) {
4388 goto release;
4389 }
4390 }
4391
4392 if (m != NULL && m->m_type == MT_CONTROL) {
4393 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4394 if (error != 0) {
4395 goto release;
4396 }
4397 }
4398
4399 if (m->m_pkthdr.len == 0) {
4400 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4401 __func__, __LINE__,
4402 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4403 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4404 m->m_type);
4405 }
4406
4407 /*
4408 * Loop to copy the mbufs of the current record
4409 * Support zero length packets
4410 */
4411 ml = NULL;
4412 pktlen = 0;
4413 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4414 if (m->m_len == 0) {
4415 panic("%p m_len zero", m);
4416 }
4417 if (m->m_type == 0) {
4418 panic("%p m_type zero", m);
4419 }
4420 /*
4421 * Clip to the residual length
4422 */
4423 if (len > m->m_len) {
4424 len = m->m_len;
4425 }
4426 pktlen += len;
4427		/*
4428		 * Copy the mbufs out via the uio, or delay the copy.
4429		 * The sockbuf must be consistent (sb_mb points to the
4430		 * current mbuf, m_nextpkt to the next record) when we
4431		 * drop priority; we must note any additions to the
4432		 * sockbuf when we block interrupts again.
4433		 */
4434 if (len > 0 && can_delay == 0) {
4435 socket_unlock(so, 0);
4436 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4437 socket_lock(so, 0);
4438 if (error) {
4439 goto release;
4440 }
4441 } else {
4442 delayed_copy_len += len;
4443 }
4444
4445 if (len == m->m_len) {
4446 /*
4447 * m was entirely copied
4448 */
4449 sbfree(&so->so_rcv, m);
4450 nextrecord = m->m_nextpkt;
4451 m->m_nextpkt = NULL;
4452
4453 /*
4454 * Set the first packet to the head of the free list
4455 */
4456 if (free_list == NULL) {
4457 free_list = m;
4458 }
4459 /*
4460 * Link current packet to tail of free list
4461 */
4462 if (ml == NULL) {
4463 if (free_tail != NULL) {
4464 free_tail->m_nextpkt = m;
4465 }
4466 free_tail = m;
4467 }
4468 /*
4469 * Link current mbuf to last mbuf of current packet
4470 */
4471 if (ml != NULL) {
4472 ml->m_next = m;
4473 }
4474 ml = m;
4475
4476 /*
4477 * Move next buf to head of socket buffer
4478 */
4479 so->so_rcv.sb_mb = m = ml->m_next;
4480 ml->m_next = NULL;
4481
4482 if (m != NULL) {
4483 m->m_nextpkt = nextrecord;
4484 if (nextrecord == NULL) {
4485 so->so_rcv.sb_lastrecord = m;
4486 }
4487 } else {
4488 so->so_rcv.sb_mb = nextrecord;
4489 SB_EMPTY_FIXUP(&so->so_rcv);
4490 }
4491 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4492 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4493 } else {
4494 /*
4495 * Stop the loop on partial copy
4496 */
4497 break;
4498 }
4499 }
4500 #ifdef MORE_LOCKING_DEBUG
4501 if (so->so_usecount <= 1) {
4502 panic("%s: after big while so=%llx ref=%d on socket",
4503 __func__,
4504 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4505 /* NOTREACHED */
4506 }
4507 #endif
4508 /*
4509 * Tell the caller we made a partial copy
4510 */
4511 if (m != NULL) {
4512 if (so->so_options & SO_DONTTRUNC) {
4513 /*
4514			 * Copy out the free list first, then the partial mbuf
4515 */
4516 socket_unlock(so, 0);
4517 if (delayed_copy_len) {
4518 error = sodelayed_copy_list(so, msgarray,
4519 uiocnt, &free_list, &delayed_copy_len);
4520 }
4521
4522 if (error == 0) {
4523 error = uiomove(mtod(m, caddr_t), (int)len,
4524 auio);
4525 }
4526 socket_lock(so, 0);
4527 if (error) {
4528 goto release;
4529 }
4530
4531 m->m_data += len;
4532 m->m_len -= len;
4533 so->so_rcv.sb_cc -= len;
4534 flags |= MSG_RCVMORE;
4535 } else {
4536 (void) sbdroprecord(&so->so_rcv);
4537 nextrecord = so->so_rcv.sb_mb;
4538 m = NULL;
4539 flags |= MSG_TRUNC;
4540 }
4541 }
4542
4543 if (m == NULL) {
4544 so->so_rcv.sb_mb = nextrecord;
4545 /*
4546 * First part is an inline SB_EMPTY_FIXUP(). Second
4547 * part makes sure sb_lastrecord is up-to-date if
4548 * there is still data in the socket buffer.
4549 */
4550 if (so->so_rcv.sb_mb == NULL) {
4551 so->so_rcv.sb_mbtail = NULL;
4552 so->so_rcv.sb_lastrecord = NULL;
4553 } else if (nextrecord->m_nextpkt == NULL) {
4554 so->so_rcv.sb_lastrecord = nextrecord;
4555 }
4556 SB_MB_CHECK(&so->so_rcv);
4557 }
4558 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4559 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4560
4561 /*
4562 * We can continue to the next packet as long as:
4563 * - We haven't exhausted the uio array
4564 * - There was no error
4565 * - A packet was not truncated
4566 * - We can still receive more data
4567 */
4568 if (npkts < uiocnt && error == 0 &&
4569 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4570 (so->so_state & SS_CANTRCVMORE) == 0) {
4571 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4572 sblocked = 0;
4573
4574 goto next;
4575 }
4576 if (flagsp != NULL) {
4577 *flagsp |= flags;
4578 }
4579
4580 release:
4581 /*
4582 * pru_rcvd may cause more data to be received if the socket lock
4583 * is dropped so we set MSG_HAVEMORE now based on what we know.
4584 * That way the caller won't be surprised if it receives less data
4585 * than requested.
4586 */
4587 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4588 flags |= MSG_HAVEMORE;
4589 }
4590
4591 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4592 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4593 }
4594
4595 if (sblocked) {
4596 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4597 } else {
4598 socket_unlock(so, 1);
4599 }
4600
4601 if (delayed_copy_len) {
4602 error = sodelayed_copy_list(so, msgarray, uiocnt,
4603 &free_list, &delayed_copy_len);
4604 }
4605 out:
4606 /*
4607 * Amortize the cost of freeing the mbufs
4608 */
4609 if (free_list != NULL) {
4610 m_freem_list(free_list);
4611 }
4612 if (free_others != NULL) {
4613 m_freem_list(free_others);
4614 }
4615
4616 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4617 0, 0, 0, 0);
4618 return error;
4619 }
4620
4621 static int
4622 so_statistics_event_to_nstat_event(int64_t *input_options,
4623 uint64_t *nstat_event)
4624 {
4625 int error = 0;
4626 switch (*input_options) {
4627 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4628 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4629 break;
4630 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4631 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4632 break;
4633 #if (DEBUG || DEVELOPMENT)
4634 case SO_STATISTICS_EVENT_RESERVED_1:
4635 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4636 break;
4637 case SO_STATISTICS_EVENT_RESERVED_2:
4638 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4639 break;
4640 #endif /* (DEBUG || DEVELOPMENT) */
4641 default:
4642 error = EINVAL;
4643 break;
4644 }
4645 return error;
4646 }
4647
4648 /*
4649 * Returns: 0 Success
4650 * EINVAL
4651 * ENOTCONN
4652 * <pru_shutdown>:EINVAL
4653 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4654 * <pru_shutdown>:ENOBUFS[TCP]
4655 * <pru_shutdown>:EMSGSIZE[TCP]
4656 * <pru_shutdown>:EHOSTUNREACH[TCP]
4657 * <pru_shutdown>:ENETUNREACH[TCP]
4658 * <pru_shutdown>:ENETDOWN[TCP]
4659 * <pru_shutdown>:ENOMEM[TCP]
4660 * <pru_shutdown>:EACCES[TCP]
4661 * <pru_shutdown>:EMSGSIZE[TCP]
4662 * <pru_shutdown>:ENOBUFS[TCP]
4663 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4664 * <pru_shutdown>:??? [other protocol families]
4665 */
4666 int
4667 soshutdown(struct socket *so, int how)
4668 {
4669 int error;
4670
4671 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4672
4673 switch (how) {
4674 case SHUT_RD:
4675 case SHUT_WR:
4676 case SHUT_RDWR:
4677 socket_lock(so, 1);
4678 if ((so->so_state &
4679 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4680 error = ENOTCONN;
4681 } else {
4682 error = soshutdownlock(so, how);
4683 }
4684 socket_unlock(so, 1);
4685 break;
4686 default:
4687 error = EINVAL;
4688 break;
4689 }
4690
4691 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4692
4693 return error;
4694 }
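/*
 * Userland sketch (standard sockets API, not specific to this file):
 *
 *	shutdown(s, SHUT_WR);
 *
 * lands here with how == SHUT_WR and fails with ENOTCONN unless the
 * socket is connected, connecting, or disconnecting; any other 'how'
 * value yields EINVAL.
 */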
4695
4696 int
4697 soshutdownlock_final(struct socket *so, int how)
4698 {
4699 struct protosw *pr = so->so_proto;
4700 int error = 0;
4701
4702 sflt_notify(so, sock_evt_shutdown, &how);
4703
4704 if (how != SHUT_WR) {
4705 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4706 /* read already shut down */
4707 error = ENOTCONN;
4708 goto done;
4709 }
4710 sorflush(so);
4711 }
4712 if (how != SHUT_RD) {
4713 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4714 /* write already shut down */
4715 error = ENOTCONN;
4716 goto done;
4717 }
4718 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4719 }
4720 done:
4721 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4722 return error;
4723 }
4724
4725 int
4726 soshutdownlock(struct socket *so, int how)
4727 {
4728 int error = 0;
4729
4730 #if CONTENT_FILTER
4731 /*
4732 * A content filter may delay the actual shutdown until it
4733 * has processed the pending data
4734 */
4735 if (so->so_flags & SOF_CONTENT_FILTER) {
4736 error = cfil_sock_shutdown(so, &how);
4737 if (error == EJUSTRETURN) {
4738 error = 0;
4739 goto done;
4740 } else if (error != 0) {
4741 goto done;
4742 }
4743 }
4744 #endif /* CONTENT_FILTER */
4745
4746 error = soshutdownlock_final(so, how);
4747
4748 done:
4749 return error;
4750 }
4751
4752 void
4753 sowflush(struct socket *so)
4754 {
4755 struct sockbuf *sb = &so->so_snd;
4756
4757 /*
4758 * Obtain lock on the socket buffer (SB_LOCK). This is required
4759 * to prevent the socket buffer from being unexpectedly altered
4760 * while it is used by another thread in socket send/receive.
4761 *
4762 * sblock() must not fail here, hence the assertion.
4763 */
4764 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4765 VERIFY(sb->sb_flags & SB_LOCK);
4766
4767 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4768 sb->sb_flags |= SB_DROP;
4769 sb->sb_upcall = NULL;
4770 sb->sb_upcallarg = NULL;
4771
4772 sbunlock(sb, TRUE); /* keep socket locked */
4773
4774 selthreadclear(&sb->sb_sel);
4775 sbrelease(sb);
4776 }
4777
4778 void
4779 sorflush(struct socket *so)
4780 {
4781 struct sockbuf *sb = &so->so_rcv;
4782 struct protosw *pr = so->so_proto;
4783 struct sockbuf asb;
4784 #ifdef notyet
4785 lck_mtx_t *mutex_held;
4786 /*
4787 * XXX: This code is currently commented out, because we may get here
4788 * as part of sofreelastref(), and at that time, pr_getlock() may no
4789 * longer be able to return us the lock; this will be fixed in future.
4790 */
4791 if (so->so_proto->pr_getlock != NULL) {
4792 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4793 } else {
4794 mutex_held = so->so_proto->pr_domain->dom_mtx;
4795 }
4796
4797 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4798 #endif /* notyet */
4799
4800 sflt_notify(so, sock_evt_flush_read, NULL);
4801
4802 socantrcvmore(so);
4803
4804 /*
4805 * Obtain lock on the socket buffer (SB_LOCK). This is required
4806 * to prevent the socket buffer from being unexpectedly altered
4807 * while it is used by another thread in socket send/receive.
4808 *
4809 * sblock() must not fail here, hence the assertion.
4810 */
4811 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4812 VERIFY(sb->sb_flags & SB_LOCK);
4813
4814 /*
4815 * Copy only the relevant fields from "sb" to "asb" which we
4816 * need for sbrelease() to function. In particular, skip
4817 * sb_sel as it contains the wait queue linkage, which would
4818 * wreak havoc if we were to issue selthreadclear() on "asb".
4819 * Make sure to not carry over SB_LOCK in "asb", as we need
4820 * to acquire it later as part of sbrelease().
4821 */
4822 bzero(&asb, sizeof(asb));
4823 asb.sb_cc = sb->sb_cc;
4824 asb.sb_hiwat = sb->sb_hiwat;
4825 asb.sb_mbcnt = sb->sb_mbcnt;
4826 asb.sb_mbmax = sb->sb_mbmax;
4827 asb.sb_ctl = sb->sb_ctl;
4828 asb.sb_lowat = sb->sb_lowat;
4829 asb.sb_mb = sb->sb_mb;
4830 asb.sb_mbtail = sb->sb_mbtail;
4831 asb.sb_lastrecord = sb->sb_lastrecord;
4832 asb.sb_so = sb->sb_so;
4833 asb.sb_flags = sb->sb_flags;
4834 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4835 asb.sb_flags |= SB_DROP;
4836
4837 /*
4838 * Ideally we'd bzero() these and preserve the ones we need;
4839 * but to do that we'd need to shuffle things around in the
4840 * sockbuf, and we can't do it now because there are KEXTS
4841 * that are directly referring to the socket structure.
4842 *
4843 * Setting SB_DROP acts as a barrier to prevent further appends.
4844 * Clearing SB_SEL is done for selthreadclear() below.
4845 */
4846 sb->sb_cc = 0;
4847 sb->sb_hiwat = 0;
4848 sb->sb_mbcnt = 0;
4849 sb->sb_mbmax = 0;
4850 sb->sb_ctl = 0;
4851 sb->sb_lowat = 0;
4852 sb->sb_mb = NULL;
4853 sb->sb_mbtail = NULL;
4854 sb->sb_lastrecord = NULL;
4855 sb->sb_timeo.tv_sec = 0;
4856 sb->sb_timeo.tv_usec = 0;
4857 sb->sb_upcall = NULL;
4858 sb->sb_upcallarg = NULL;
4859 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4860 sb->sb_flags |= SB_DROP;
4861
4862 sbunlock(sb, TRUE); /* keep socket locked */
4863
4864 /*
4865 * Note that selthreadclear() is called on the original "sb" and
4866 * not the local "asb" because of the way wait queue linkage is
4867 * implemented. Given that selwakeup() may be triggered, SB_SEL
4868 * should no longer be set (cleared above.)
4869 */
4870 selthreadclear(&sb->sb_sel);
4871
4872 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4873 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4874 }
4875
4876 sbrelease(&asb);
4877 }
4878
4879 /*
4880 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4881 * an additional variant to handle the case where the option value needs
4882 * to be some kind of integer, but not a specific size.
4883 * In addition to their use here, these functions are also called by the
4884 * protocol-level pr_ctloutput() routines.
4885 *
4886 * Returns: 0 Success
4887 * EINVAL
4888 * copyin:EFAULT
4889 */
4890 int
4891 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4892 {
4893 size_t valsize;
4894
4895 /*
4896 * If the user gives us more than we wanted, we ignore it,
4897 * but if we don't get the minimum length the caller
4898 * wants, we return EINVAL. On success, sopt->sopt_valsize
4899 * is set to however much we actually retrieved.
4900 */
4901 if ((valsize = sopt->sopt_valsize) < minlen) {
4902 return EINVAL;
4903 }
4904 if (valsize > len) {
4905 sopt->sopt_valsize = valsize = len;
4906 }
4907
4908 if (sopt->sopt_p != kernproc) {
4909 return copyin(sopt->sopt_val, buf, valsize);
4910 }
4911
4912 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4913 return 0;
4914 }
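/*
 * Example (a sketch of the common pattern used throughout this file):
 * a setsockopt handler pulls a fixed-size integer option with
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof(optval),
 *	    sizeof(optval));
 *
 * Passing len == minlen enforces an exact minimum size while silently
 * ignoring any excess bytes supplied by the caller.
 */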
4915
4916 /*
4917 * sooptcopyin_timeval
4918 * Copy in a timeval value into tv_p, and take into account whether
4919 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4920 * code here so that we can verify the 64-bit tv_sec value before we
4921 * lose the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
4922 */
4923 static int
4924 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4925 {
4926 int error;
4927
4928 if (proc_is64bit(sopt->sopt_p)) {
4929 struct user64_timeval tv64;
4930
4931 if (sopt->sopt_valsize < sizeof(tv64)) {
4932 return EINVAL;
4933 }
4934
4935 sopt->sopt_valsize = sizeof(tv64);
4936 if (sopt->sopt_p != kernproc) {
4937 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4938 if (error != 0) {
4939 return error;
4940 }
4941 } else {
4942 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4943 sizeof(tv64));
4944 }
4945 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4946 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4947 return EDOM;
4948 }
4949
4950 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4951 tv_p->tv_usec = tv64.tv_usec;
4952 } else {
4953 struct user32_timeval tv32;
4954
4955 if (sopt->sopt_valsize < sizeof(tv32)) {
4956 return EINVAL;
4957 }
4958
4959 sopt->sopt_valsize = sizeof(tv32);
4960 if (sopt->sopt_p != kernproc) {
4961 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4962 if (error != 0) {
4963 return error;
4964 }
4965 } else {
4966 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4967 sizeof(tv32));
4968 }
4969 #ifndef __LP64__
4970 /*
4971 * K64todo "comparison is always false due to
4972 * limited range of data type"
4973 */
4974 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4975 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4976 return EDOM;
4977 }
4978 #endif
4979 tv_p->tv_sec = tv32.tv_sec;
4980 tv_p->tv_usec = tv32.tv_usec;
4981 }
4982 return 0;
4983 }
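/*
 * From userland this backs SO_SNDTIMEO/SO_RCVTIMEO, e.g. (sketch):
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A negative tv_sec, a tv_sec beyond LONG_MAX, or a tv_usec outside
 * [0, 1000000) is rejected with EDOM before any truncation can occur.
 */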
4984
4985 int
4986 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4987 boolean_t ignore_delegate)
4988 {
4989 kauth_cred_t cred = NULL;
4990 proc_t ep = PROC_NULL;
4991 uid_t uid;
4992 int error = 0;
4993
4994 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4995 ep = proc_find(so->e_pid);
4996 if (ep) {
4997 cred = kauth_cred_proc_ref(ep);
4998 }
4999 }
5000
5001 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5002
5003 /* uid is 0 for root */
5004 if (uid != 0 || !allow_root) {
5005 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5006 }
5007 if (cred) {
5008 kauth_cred_unref(&cred);
5009 }
5010 if (ep != PROC_NULL) {
5011 proc_rele(ep);
5012 }
5013
5014 return error;
5015 }
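/*
 * Example (sketch): the SO_AWDL_UNRESTRICTED handler below calls
 *
 *	error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL,
 *	    false, false);
 *
 * so that, when SOF_DELEGATED is set (and ignore_delegate is false),
 * the privilege is evaluated against the credential of the process
 * the socket was delegated to rather than the caller's own.
 */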
5016
5017 /*
5018 * Returns: 0 Success
5019 * EINVAL
5020 * ENOPROTOOPT
5021 * ENOBUFS
5022 * EDOM
5023 * sooptcopyin:EINVAL
5024 * sooptcopyin:EFAULT
5025 * sooptcopyin_timeval:EINVAL
5026 * sooptcopyin_timeval:EFAULT
5027 * sooptcopyin_timeval:EDOM
5028 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5029 * <pr_ctloutput>:???
5030 * sflt_attach_private:??? [whatever a filter author chooses]
5031 * <sf_setoption>:??? [whatever a filter author chooses]
5032 *
5033 * Notes: Other <pru_listen> returns depend on the protocol family; all
5034 * <sf_listen> returns depend on what the filter author causes
5035 * their filter to return.
5036 */
5037 int
5038 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5039 {
5040 int error, optval;
5041 int64_t long_optval;
5042 struct linger l;
5043 struct timeval tv;
5044
5045 if (sopt->sopt_dir != SOPT_SET) {
5046 sopt->sopt_dir = SOPT_SET;
5047 }
5048
5049 if (dolock) {
5050 socket_lock(so, 1);
5051 }
5052
5053 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5054 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5055 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5056 /* the socket has been shutdown, no more sockopt's */
5057 error = EINVAL;
5058 goto out;
5059 }
5060
5061 error = sflt_setsockopt(so, sopt);
5062 if (error != 0) {
5063 if (error == EJUSTRETURN) {
5064 error = 0;
5065 }
5066 goto out;
5067 }
5068
5069 if (sopt->sopt_level != SOL_SOCKET) {
5070 if (so->so_proto != NULL &&
5071 so->so_proto->pr_ctloutput != NULL) {
5072 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5073 goto out;
5074 }
5075 error = ENOPROTOOPT;
5076 } else {
5077 /*
5078 * Allow socket-level (SOL_SOCKET) options to be filtered by
5079 * the protocol layer, if needed. A zero value returned from
5080 * the handler means use default socket-level processing as
5081 * done by the rest of this routine. Otherwise, any other
5082 * return value indicates that the option is unsupported.
5083 */
5084 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5085 pru_socheckopt(so, sopt)) != 0) {
5086 goto out;
5087 }
5088
5089 error = 0;
5090 switch (sopt->sopt_name) {
5091 case SO_LINGER:
5092 case SO_LINGER_SEC:
5093 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5094 if (error != 0) {
5095 goto out;
5096 }
5097
5098 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5099 (short)l.l_linger : (short)(l.l_linger * hz);
5100 if (l.l_onoff != 0) {
5101 so->so_options |= SO_LINGER;
5102 } else {
5103 so->so_options &= ~SO_LINGER;
5104 }
5105 break;
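		/*
		 * Note: as the conversion above shows, SO_LINGER stores
		 * l_linger as given, while SO_LINGER_SEC converts from
		 * seconds to clock ticks via hz (and converts back on
		 * the sogetoptlock() side). A hypothetical caller:
		 *
		 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
		 *	setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l,
		 *	    sizeof(l));
		 */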
5106
5107 case SO_DEBUG:
5108 case SO_KEEPALIVE:
5109 case SO_DONTROUTE:
5110 case SO_USELOOPBACK:
5111 case SO_BROADCAST:
5112 case SO_REUSEADDR:
5113 case SO_REUSEPORT:
5114 case SO_OOBINLINE:
5115 case SO_TIMESTAMP:
5116 case SO_TIMESTAMP_MONOTONIC:
5117 case SO_TIMESTAMP_CONTINUOUS:
5118 case SO_DONTTRUNC:
5119 case SO_WANTMORE:
5120 case SO_WANTOOBFLAG:
5121 case SO_NOWAKEFROMSLEEP:
5122 case SO_NOAPNFALLBK:
5123 error = sooptcopyin(sopt, &optval, sizeof(optval),
5124 sizeof(optval));
5125 if (error != 0) {
5126 goto out;
5127 }
5128 if (optval) {
5129 so->so_options |= sopt->sopt_name;
5130 } else {
5131 so->so_options &= ~sopt->sopt_name;
5132 }
5133 #if SKYWALK
5134 inp_update_netns_flags(so);
5135 #endif /* SKYWALK */
5136 break;
5137
5138 case SO_SNDBUF:
5139 case SO_RCVBUF:
5140 case SO_SNDLOWAT:
5141 case SO_RCVLOWAT:
5142 error = sooptcopyin(sopt, &optval, sizeof(optval),
5143 sizeof(optval));
5144 if (error != 0) {
5145 goto out;
5146 }
5147
5148 /*
5149 * Values < 1 make no sense for any of these
5150 * options, so disallow them.
5151 */
5152 if (optval < 1) {
5153 error = EINVAL;
5154 goto out;
5155 }
5156
5157 switch (sopt->sopt_name) {
5158 case SO_SNDBUF:
5159 case SO_RCVBUF: {
5160 struct sockbuf *sb =
5161 (sopt->sopt_name == SO_SNDBUF) ?
5162 &so->so_snd : &so->so_rcv;
5163 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5164 error = ENOBUFS;
5165 goto out;
5166 }
5167 sb->sb_flags |= SB_USRSIZE;
5168 sb->sb_flags &= ~SB_AUTOSIZE;
5169 sb->sb_idealsize = (u_int32_t)optval;
5170 break;
5171 }
5172 /*
5173 * Make sure the low-water is never greater than
5174 * the high-water.
5175 */
5176 case SO_SNDLOWAT: {
5177 int space = sbspace(&so->so_snd);
5178 u_int32_t hiwat = so->so_snd.sb_hiwat;
5179
5180 if (so->so_snd.sb_flags & SB_UNIX) {
5181 struct unpcb *unp =
5182 (struct unpcb *)(so->so_pcb);
5183 if (unp != NULL &&
5184 unp->unp_conn != NULL) {
5185 hiwat += unp->unp_conn->unp_cc;
5186 }
5187 }
5188
5189 so->so_snd.sb_lowat =
5190 (optval > hiwat) ?
5191 hiwat : optval;
5192
5193 if (space >= so->so_snd.sb_lowat) {
5194 sowwakeup(so);
5195 }
5196 break;
5197 }
5198 case SO_RCVLOWAT: {
5199 int64_t data_len;
5200 so->so_rcv.sb_lowat =
5201 (optval > so->so_rcv.sb_hiwat) ?
5202 so->so_rcv.sb_hiwat : optval;
5203 data_len = so->so_rcv.sb_cc
5204 - so->so_rcv.sb_ctl;
5205 if (data_len >= so->so_rcv.sb_lowat) {
5206 sorwakeup(so);
5207 }
5208 break;
5209 }
5210 }
5211 break;
5212
5213 case SO_SNDTIMEO:
5214 case SO_RCVTIMEO:
5215 error = sooptcopyin_timeval(sopt, &tv);
5216 if (error != 0) {
5217 goto out;
5218 }
5219
5220 switch (sopt->sopt_name) {
5221 case SO_SNDTIMEO:
5222 so->so_snd.sb_timeo = tv;
5223 break;
5224 case SO_RCVTIMEO:
5225 so->so_rcv.sb_timeo = tv;
5226 break;
5227 }
5228 break;
5229
5230 case SO_NKE: {
5231 struct so_nke nke;
5232
5233 error = sooptcopyin(sopt, &nke, sizeof(nke),
5234 sizeof(nke));
5235 if (error != 0) {
5236 goto out;
5237 }
5238
5239 error = sflt_attach_internal(so, nke.nke_handle);
5240 break;
5241 }
5242
5243 case SO_NOSIGPIPE:
5244 error = sooptcopyin(sopt, &optval, sizeof(optval),
5245 sizeof(optval));
5246 if (error != 0) {
5247 goto out;
5248 }
5249 if (optval != 0) {
5250 so->so_flags |= SOF_NOSIGPIPE;
5251 } else {
5252 so->so_flags &= ~SOF_NOSIGPIPE;
5253 }
5254 break;
5255
5256 case SO_NOADDRERR:
5257 error = sooptcopyin(sopt, &optval, sizeof(optval),
5258 sizeof(optval));
5259 if (error != 0) {
5260 goto out;
5261 }
5262 if (optval != 0) {
5263 so->so_flags |= SOF_NOADDRAVAIL;
5264 } else {
5265 so->so_flags &= ~SOF_NOADDRAVAIL;
5266 }
5267 break;
5268
5269 case SO_REUSESHAREUID:
5270 error = sooptcopyin(sopt, &optval, sizeof(optval),
5271 sizeof(optval));
5272 if (error != 0) {
5273 goto out;
5274 }
5275 if (optval != 0) {
5276 so->so_flags |= SOF_REUSESHAREUID;
5277 } else {
5278 so->so_flags &= ~SOF_REUSESHAREUID;
5279 }
5280 break;
5281
5282 case SO_NOTIFYCONFLICT:
5283 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5284 error = EPERM;
5285 goto out;
5286 }
5287 error = sooptcopyin(sopt, &optval, sizeof(optval),
5288 sizeof(optval));
5289 if (error != 0) {
5290 goto out;
5291 }
5292 if (optval != 0) {
5293 so->so_flags |= SOF_NOTIFYCONFLICT;
5294 } else {
5295 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5296 }
5297 break;
5298
5299 case SO_RESTRICTIONS:
5300 error = sooptcopyin(sopt, &optval, sizeof(optval),
5301 sizeof(optval));
5302 if (error != 0) {
5303 goto out;
5304 }
5305
5306 error = so_set_restrictions(so, optval);
5307 break;
5308
5309 case SO_AWDL_UNRESTRICTED:
5310 if (SOCK_DOM(so) != PF_INET &&
5311 SOCK_DOM(so) != PF_INET6) {
5312 error = EOPNOTSUPP;
5313 goto out;
5314 }
5315 error = sooptcopyin(sopt, &optval, sizeof(optval),
5316 sizeof(optval));
5317 if (error != 0) {
5318 goto out;
5319 }
5320 if (optval != 0) {
5321 error = soopt_cred_check(so,
5322 PRIV_NET_RESTRICTED_AWDL, false, false);
5323 if (error == 0) {
5324 inp_set_awdl_unrestricted(
5325 sotoinpcb(so));
5326 }
5327 } else {
5328 inp_clear_awdl_unrestricted(sotoinpcb(so));
5329 }
5330 break;
5331 case SO_INTCOPROC_ALLOW:
5332 if (SOCK_DOM(so) != PF_INET6) {
5333 error = EOPNOTSUPP;
5334 goto out;
5335 }
5336 error = sooptcopyin(sopt, &optval, sizeof(optval),
5337 sizeof(optval));
5338 if (error != 0) {
5339 goto out;
5340 }
5341 if (optval != 0 &&
5342 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5343 error = soopt_cred_check(so,
5344 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5345 if (error == 0) {
5346 inp_set_intcoproc_allowed(
5347 sotoinpcb(so));
5348 }
5349 } else if (optval == 0) {
5350 inp_clear_intcoproc_allowed(sotoinpcb(so));
5351 }
5352 break;
5353
5354 case SO_LABEL:
5355 error = EOPNOTSUPP;
5356 break;
5357
5358 case SO_UPCALLCLOSEWAIT:
5359 error = sooptcopyin(sopt, &optval, sizeof(optval),
5360 sizeof(optval));
5361 if (error != 0) {
5362 goto out;
5363 }
5364 if (optval != 0) {
5365 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5366 } else {
5367 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5368 }
5369 break;
5370
5371 case SO_RANDOMPORT:
5372 error = sooptcopyin(sopt, &optval, sizeof(optval),
5373 sizeof(optval));
5374 if (error != 0) {
5375 goto out;
5376 }
5377 if (optval != 0) {
5378 so->so_flags |= SOF_BINDRANDOMPORT;
5379 } else {
5380 so->so_flags &= ~SOF_BINDRANDOMPORT;
5381 }
5382 break;
5383
5384 case SO_NP_EXTENSIONS: {
5385 struct so_np_extensions sonpx;
5386
5387 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5388 sizeof(sonpx));
5389 if (error != 0) {
5390 goto out;
5391 }
5392 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5393 error = EINVAL;
5394 goto out;
5395 }
5396 /*
5397 * Only one bit defined for now
5398 */
5399 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5400 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5401 so->so_flags |= SOF_NPX_SETOPTSHUT;
5402 } else {
5403 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5404 }
5405 }
5406 break;
5407 }
5408
5409 case SO_TRAFFIC_CLASS: {
5410 error = sooptcopyin(sopt, &optval, sizeof(optval),
5411 sizeof(optval));
5412 if (error != 0) {
5413 goto out;
5414 }
5415 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5416 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5417 error = so_set_net_service_type(so, netsvc);
5418 goto out;
5419 }
5420 error = so_set_traffic_class(so, optval);
5421 if (error != 0) {
5422 goto out;
5423 }
5424 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5425 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5426 break;
5427 }
5428
5429 case SO_RECV_TRAFFIC_CLASS: {
5430 error = sooptcopyin(sopt, &optval, sizeof(optval),
5431 sizeof(optval));
5432 if (error != 0) {
5433 goto out;
5434 }
5435 if (optval == 0) {
5436 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5437 } else {
5438 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5439 }
5440 break;
5441 }
5442
5443 #if (DEVELOPMENT || DEBUG)
5444 case SO_TRAFFIC_CLASS_DBG: {
5445 struct so_tcdbg so_tcdbg;
5446
5447 error = sooptcopyin(sopt, &so_tcdbg,
5448 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5449 if (error != 0) {
5450 goto out;
5451 }
5452 error = so_set_tcdbg(so, &so_tcdbg);
5453 if (error != 0) {
5454 goto out;
5455 }
5456 break;
5457 }
5458 #endif /* (DEVELOPMENT || DEBUG) */
5459
5460 case SO_PRIVILEGED_TRAFFIC_CLASS:
5461 error = priv_check_cred(kauth_cred_get(),
5462 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5463 if (error != 0) {
5464 goto out;
5465 }
5466 error = sooptcopyin(sopt, &optval, sizeof(optval),
5467 sizeof(optval));
5468 if (error != 0) {
5469 goto out;
5470 }
5471 if (optval == 0) {
5472 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5473 } else {
5474 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5475 }
5476 break;
5477
5478 #if (DEVELOPMENT || DEBUG)
5479 case SO_DEFUNCTIT:
5480 error = sosetdefunct(current_proc(), so, 0, FALSE);
5481 if (error == 0) {
5482 error = sodefunct(current_proc(), so, 0);
5483 }
5484
5485 break;
5486 #endif /* (DEVELOPMENT || DEBUG) */
5487
5488 case SO_DEFUNCTOK:
5489 error = sooptcopyin(sopt, &optval, sizeof(optval),
5490 sizeof(optval));
5491 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5492 if (error == 0) {
5493 error = EBADF;
5494 }
5495 goto out;
5496 }
5497 /*
5498 * Any process can set SO_DEFUNCTOK (clear
5499 * SOF_NODEFUNCT), but only root can clear
5500 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5501 */
5502 if (optval == 0 &&
5503 kauth_cred_issuser(kauth_cred_get()) == 0) {
5504 error = EPERM;
5505 goto out;
5506 }
5507 if (optval) {
5508 so->so_flags &= ~SOF_NODEFUNCT;
5509 } else {
5510 so->so_flags |= SOF_NODEFUNCT;
5511 }
5512
5513 if (SOCK_DOM(so) == PF_INET ||
5514 SOCK_DOM(so) == PF_INET6) {
5515 char s[MAX_IPv6_STR_LEN];
5516 char d[MAX_IPv6_STR_LEN];
5517 struct inpcb *inp = sotoinpcb(so);
5518
5519 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5520 "[%s %s:%d -> %s:%d] is now marked "
5521 "as %seligible for "
5522 "defunct\n", __func__, proc_selfpid(),
5523 proc_best_name(current_proc()),
5524 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5525 (SOCK_TYPE(so) == SOCK_STREAM) ?
5526 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5527 ((SOCK_DOM(so) == PF_INET) ?
5528 (void *)&inp->inp_laddr.s_addr :
5529 (void *)&inp->in6p_laddr), s, sizeof(s)),
5530 ntohs(inp->in6p_lport),
5531 inet_ntop(SOCK_DOM(so),
5532 (SOCK_DOM(so) == PF_INET) ?
5533 (void *)&inp->inp_faddr.s_addr :
5534 (void *)&inp->in6p_faddr, d, sizeof(d)),
5535 ntohs(inp->in6p_fport),
5536 (so->so_flags & SOF_NODEFUNCT) ?
5537 "not " : "");
5538 } else {
5539 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5540 "is now marked as %seligible for "
5541 "defunct\n",
5542 __func__, proc_selfpid(),
5543 proc_best_name(current_proc()),
5544 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5545 SOCK_DOM(so), SOCK_TYPE(so),
5546 (so->so_flags & SOF_NODEFUNCT) ?
5547 "not " : "");
5548 }
5549 break;
5550
5551 case SO_ISDEFUNCT:
5552 /* This option is not settable */
5553 error = EINVAL;
5554 break;
5555
5556 case SO_OPPORTUNISTIC:
5557 error = sooptcopyin(sopt, &optval, sizeof(optval),
5558 sizeof(optval));
5559 if (error == 0) {
5560 error = so_set_opportunistic(so, optval);
5561 }
5562 break;
5563
5564 case SO_FLUSH:
5565 /* This option is handled by lower layer(s) */
5566 error = 0;
5567 break;
5568
5569 case SO_RECV_ANYIF:
5570 error = sooptcopyin(sopt, &optval, sizeof(optval),
5571 sizeof(optval));
5572 if (error == 0) {
5573 error = so_set_recv_anyif(so, optval);
5574 }
5575 break;
5576
5577 case SO_TRAFFIC_MGT_BACKGROUND: {
5578 /* This option is handled by lower layer(s) */
5579 error = 0;
5580 break;
5581 }
5582
5583 #if FLOW_DIVERT
5584 case SO_FLOW_DIVERT_TOKEN:
5585 error = flow_divert_token_set(so, sopt);
5586 break;
5587 #endif /* FLOW_DIVERT */
5588
5589
5590 case SO_DELEGATED:
5591 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5592 sizeof(optval))) != 0) {
5593 break;
5594 }
5595
5596 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5597 break;
5598
5599 case SO_DELEGATED_UUID: {
5600 uuid_t euuid;
5601
5602 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5603 sizeof(euuid))) != 0) {
5604 break;
5605 }
5606
5607 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5608 break;
5609 }
5610
5611 #if NECP
5612 case SO_NECP_ATTRIBUTES:
5613 if (SOCK_DOM(so) == PF_MULTIPATH) {
5614 /* Handled by MPTCP itself */
5615 break;
5616 }
5617
5618 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5619 error = EINVAL;
5620 goto out;
5621 }
5622
5623 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5624 break;
5625
5626 case SO_NECP_CLIENTUUID: {
5627 if (SOCK_DOM(so) == PF_MULTIPATH) {
5628 /* Handled by MPTCP itself */
5629 break;
5630 }
5631
5632 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5633 error = EINVAL;
5634 goto out;
5635 }
5636
5637 struct inpcb *inp = sotoinpcb(so);
5638 if (!uuid_is_null(inp->necp_client_uuid)) {
5639 // Clear out the old client UUID if present
5640 necp_inpcb_remove_cb(inp);
5641 }
5642
5643 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5644 sizeof(uuid_t), sizeof(uuid_t));
5645 if (error != 0) {
5646 goto out;
5647 }
5648
5649 if (uuid_is_null(inp->necp_client_uuid)) {
5650 error = EINVAL;
5651 goto out;
5652 }
5653
5654 pid_t current_pid = proc_pid(current_proc());
5655 error = necp_client_register_socket_flow(current_pid,
5656 inp->necp_client_uuid, inp);
5657 if (error != 0) {
5658 uuid_clear(inp->necp_client_uuid);
5659 goto out;
5660 }
5661
5662 if (inp->inp_lport != 0) {
5663 // There is a bound local port, so this is not
5664 // a fresh socket. Assign to the client.
5665 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5666 }
5667
5668 break;
5669 }
5670 case SO_NECP_LISTENUUID: {
5671 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5672 error = EINVAL;
5673 goto out;
5674 }
5675
5676 struct inpcb *inp = sotoinpcb(so);
5677 if (!uuid_is_null(inp->necp_client_uuid)) {
5678 error = EINVAL;
5679 goto out;
5680 }
5681
5682 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5683 sizeof(uuid_t), sizeof(uuid_t));
5684 if (error != 0) {
5685 goto out;
5686 }
5687
5688 if (uuid_is_null(inp->necp_client_uuid)) {
5689 error = EINVAL;
5690 goto out;
5691 }
5692
5693 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5694 inp->necp_client_uuid, inp);
5695 if (error != 0) {
5696 uuid_clear(inp->necp_client_uuid);
5697 goto out;
5698 }
5699
5700 // Mark that the port registration is held by NECP
5701 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5702
5703 break;
5704 }
5705 #endif /* NECP */
5706
5707 case SO_EXTENDED_BK_IDLE:
5708 error = sooptcopyin(sopt, &optval, sizeof(optval),
5709 sizeof(optval));
5710 if (error == 0) {
5711 error = so_set_extended_bk_idle(so, optval);
5712 }
5713 break;
5714
5715 case SO_MARK_CELLFALLBACK:
5716 error = sooptcopyin(sopt, &optval, sizeof(optval),
5717 sizeof(optval));
5718 if (error != 0) {
5719 goto out;
5720 }
5721 if (optval < 0) {
5722 error = EINVAL;
5723 goto out;
5724 }
5725 if (optval == 0) {
5726 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5727 } else {
5728 so->so_flags1 |= SOF1_CELLFALLBACK;
5729 }
5730 break;
5731
5732 case SO_FALLBACK_MODE:
5733 error = sooptcopyin(sopt, &optval, sizeof(optval),
5734 sizeof(optval));
5735 if (error != 0) {
5736 goto out;
5737 }
5738 if (optval < SO_FALLBACK_MODE_NONE ||
5739 optval > SO_FALLBACK_MODE_PREFER) {
5740 error = EINVAL;
5741 goto out;
5742 }
5743 so->so_fallback_mode = (u_int8_t)optval;
5744 break;
5745
5746 case SO_MARK_KNOWN_TRACKER: {
5747 error = sooptcopyin(sopt, &optval, sizeof(optval),
5748 sizeof(optval));
5749 if (error != 0) {
5750 goto out;
5751 }
5752 if (optval < 0) {
5753 error = EINVAL;
5754 goto out;
5755 }
5756 if (optval == 0) {
5757 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5758 } else {
5759 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5760 }
5761 break;
5762 }
5763
5764 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5765 error = sooptcopyin(sopt, &optval, sizeof(optval),
5766 sizeof(optval));
5767 if (error != 0) {
5768 goto out;
5769 }
5770 if (optval < 0) {
5771 error = EINVAL;
5772 goto out;
5773 }
5774 if (optval == 0) {
5775 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5776 } else {
5777 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5778 }
5779 break;
5780 }
5781
5782 case SO_MARK_APPROVED_APP_DOMAIN: {
5783 error = sooptcopyin(sopt, &optval, sizeof(optval),
5784 sizeof(optval));
5785 if (error != 0) {
5786 goto out;
5787 }
5788 if (optval < 0) {
5789 error = EINVAL;
5790 goto out;
5791 }
5792 if (optval == 0) {
5793 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5794 } else {
5795 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5796 }
5797 break;
5798 }
5799
5800 case SO_STATISTICS_EVENT:
5801 error = sooptcopyin(sopt, &long_optval,
5802 sizeof(long_optval), sizeof(long_optval));
5803 if (error != 0) {
5804 goto out;
5805 }
5806 u_int64_t nstat_event = 0;
5807 error = so_statistics_event_to_nstat_event(
5808 &long_optval, &nstat_event);
5809 if (error != 0) {
5810 goto out;
5811 }
5812 nstat_pcb_event(sotoinpcb(so), nstat_event);
5813 break;
5814
5815 case SO_NET_SERVICE_TYPE: {
5816 error = sooptcopyin(sopt, &optval, sizeof(optval),
5817 sizeof(optval));
5818 if (error != 0) {
5819 goto out;
5820 }
5821 error = so_set_net_service_type(so, optval);
5822 break;
5823 }
5824
5825 case SO_QOSMARKING_POLICY_OVERRIDE:
5826 error = priv_check_cred(kauth_cred_get(),
5827 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5828 if (error != 0) {
5829 goto out;
5830 }
5831 error = sooptcopyin(sopt, &optval, sizeof(optval),
5832 sizeof(optval));
5833 if (error != 0) {
5834 goto out;
5835 }
5836 if (optval == 0) {
5837 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5838 } else {
5839 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5840 }
5841 break;
5842
5843 case SO_MPKL_SEND_INFO: {
5844 struct so_mpkl_send_info so_mpkl_send_info;
5845
5846 error = sooptcopyin(sopt, &so_mpkl_send_info,
5847 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5848 if (error != 0) {
5849 goto out;
5850 }
5851 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5852 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5853
5854 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5855 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5856 } else {
5857 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5858 }
5859 break;
5860 }
5861 case SO_WANT_KEV_SOCKET_CLOSED: {
5862 error = sooptcopyin(sopt, &optval, sizeof(optval),
5863 sizeof(optval));
5864 if (error != 0) {
5865 goto out;
5866 }
5867 if (optval == 0) {
5868 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5869 } else {
5870 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5871 }
5872 break;
5873 }
5874 case SO_MARK_WAKE_PKT: {
5875 error = sooptcopyin(sopt, &optval, sizeof(optval),
5876 sizeof(optval));
5877 if (error != 0) {
5878 goto out;
5879 }
5880 if (optval == 0) {
5881 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5882 } else {
5883 so->so_flags |= SOF_MARK_WAKE_PKT;
5884 }
5885 break;
5886 }
5887 case SO_RECV_WAKE_PKT: {
5888 error = sooptcopyin(sopt, &optval, sizeof(optval),
5889 sizeof(optval));
5890 if (error != 0) {
5891 goto out;
5892 }
5893 if (optval == 0) {
5894 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5895 } else {
5896 so->so_flags |= SOF_RECV_WAKE_PKT;
5897 }
5898 break;
5899 }
5900 default:
5901 error = ENOPROTOOPT;
5902 break;
5903 }
5904 if (error == 0 && so->so_proto != NULL &&
5905 so->so_proto->pr_ctloutput != NULL) {
5906 (void) so->so_proto->pr_ctloutput(so, sopt);
5907 }
5908 }
5909 out:
5910 if (dolock) {
5911 socket_unlock(so, 1);
5912 }
5913 return error;
5914 }
5915
5916 /* Helper routines for getsockopt */
5917 int
5918 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5919 {
5920 int error;
5921 size_t valsize;
5922
5923 error = 0;
5924
5925 /*
5926 * Documented get behavior is that we always return a value,
5927 * possibly truncated to fit in the user's buffer.
5928 * Traditional behavior is that we always tell the user
5929 * precisely how much we copied, rather than something useful
5930 * like the total amount we had available for her.
5931 * Note that this interface is not idempotent; the entire answer must
5932 * be generated ahead of time.
5933 */
5934 valsize = MIN(len, sopt->sopt_valsize);
5935 sopt->sopt_valsize = valsize;
5936 if (sopt->sopt_val != USER_ADDR_NULL) {
5937 if (sopt->sopt_p != kernproc) {
5938 error = copyout(buf, sopt->sopt_val, valsize);
5939 } else {
5940 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5941 }
5942 }
5943 return error;
5944 }
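
/*
 * Hedged user-space sketch of the truncation behavior documented above:
 * a too-small buffer is not an error; getsockopt(2) succeeds and reports
 * only the number of bytes actually copied.
 *
 *	struct linger l;
 *	socklen_t len = 1;	// deliberately smaller than sizeof(l)
 *	if (getsockopt(fd, SOL_SOCKET, SO_LINGER, &l, &len) == 0)
 *		assert(len == 1);	// value truncated, call succeeded
 */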
5945
5946 static int
5947 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5948 {
5949 int error;
5950 size_t len;
5951 struct user64_timeval tv64 = {};
5952 struct user32_timeval tv32 = {};
5953 const void * val;
5954 size_t valsize;
5955
5956 error = 0;
5957 if (proc_is64bit(sopt->sopt_p)) {
5958 len = sizeof(tv64);
5959 tv64.tv_sec = tv_p->tv_sec;
5960 tv64.tv_usec = tv_p->tv_usec;
5961 val = &tv64;
5962 } else {
5963 len = sizeof(tv32);
5964 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5965 tv32.tv_usec = tv_p->tv_usec;
5966 val = &tv32;
5967 }
5968 valsize = MIN(len, sopt->sopt_valsize);
5969 sopt->sopt_valsize = valsize;
5970 if (sopt->sopt_val != USER_ADDR_NULL) {
5971 if (sopt->sopt_p != kernproc) {
5972 error = copyout(val, sopt->sopt_val, valsize);
5973 } else {
5974 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5975 }
5976 }
5977 return error;
5978 }
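
/*
 * Illustrative sketch (standard getsockopt(2) usage): the 32/64-bit
 * split above is invisible to user space, which reads a struct timeval
 * sized for its own ABI.
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0)
 *		printf("rcv timeout: %ld.%06d\n",
 *		    (long)tv.tv_sec, (int)tv.tv_usec);
 */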
5979
5980 /*
5981 * Return: 0 Success
5982 * ENOPROTOOPT
5983 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5984 * <pr_ctloutput>:???
5985 * <sf_getoption>:???
5986 */
5987 int
5988 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5989 {
5990 int error, optval;
5991 struct linger l;
5992 struct timeval tv;
5993
5994 if (sopt->sopt_dir != SOPT_GET) {
5995 sopt->sopt_dir = SOPT_GET;
5996 }
5997
5998 if (dolock) {
5999 socket_lock(so, 1);
6000 }
6001
6002 error = sflt_getsockopt(so, sopt);
6003 if (error != 0) {
6004 if (error == EJUSTRETURN) {
6005 error = 0;
6006 }
6007 goto out;
6008 }
6009
6010 if (sopt->sopt_level != SOL_SOCKET) {
6011 if (so->so_proto != NULL &&
6012 so->so_proto->pr_ctloutput != NULL) {
6013 error = (*so->so_proto->pr_ctloutput)(so, sopt);
6014 goto out;
6015 }
6016 error = ENOPROTOOPT;
6017 } else {
6018 /*
6019 * Allow socket-level (SOL_SOCKET) options to be filtered by
6020 * the protocol layer, if needed. A zero value returned from
6021 * the handler means use default socket-level processing as
6022 * done by the rest of this routine. Otherwise, any other
6023 * return value indicates that the option is unsupported.
6024 */
6025 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6026 pru_socheckopt(so, sopt)) != 0) {
6027 goto out;
6028 }
6029
6030 error = 0;
6031 switch (sopt->sopt_name) {
6032 case SO_LINGER:
6033 case SO_LINGER_SEC:
6034 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6035 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6036 so->so_linger : so->so_linger / hz;
6037 error = sooptcopyout(sopt, &l, sizeof(l));
6038 break;
6039
6040 case SO_USELOOPBACK:
6041 case SO_DONTROUTE:
6042 case SO_DEBUG:
6043 case SO_KEEPALIVE:
6044 case SO_REUSEADDR:
6045 case SO_REUSEPORT:
6046 case SO_BROADCAST:
6047 case SO_OOBINLINE:
6048 case SO_TIMESTAMP:
6049 case SO_TIMESTAMP_MONOTONIC:
6050 case SO_TIMESTAMP_CONTINUOUS:
6051 case SO_DONTTRUNC:
6052 case SO_WANTMORE:
6053 case SO_WANTOOBFLAG:
6054 case SO_NOWAKEFROMSLEEP:
6055 case SO_NOAPNFALLBK:
6056 optval = so->so_options & sopt->sopt_name;
6057 integer:
6058 error = sooptcopyout(sopt, &optval, sizeof(optval));
6059 break;
6060
6061 case SO_TYPE:
6062 optval = so->so_type;
6063 goto integer;
6064
6065 case SO_NREAD:
6066 if (so->so_proto->pr_flags & PR_ATOMIC) {
6067 int pkt_total;
6068 struct mbuf *m1;
6069
6070 pkt_total = 0;
6071 m1 = so->so_rcv.sb_mb;
6072 while (m1 != NULL) {
6073 if (m1->m_type == MT_DATA ||
6074 m1->m_type == MT_HEADER ||
6075 m1->m_type == MT_OOBDATA) {
6076 pkt_total += m1->m_len;
6077 }
6078 m1 = m1->m_next;
6079 }
6080 optval = pkt_total;
6081 } else {
6082 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6083 }
6084 goto integer;
6085
6086 case SO_NUMRCVPKT:
6087 if (so->so_proto->pr_flags & PR_ATOMIC) {
6088 int cnt = 0;
6089 struct mbuf *m1;
6090
6091 m1 = so->so_rcv.sb_mb;
6092 while (m1 != NULL) {
6093 cnt += 1;
6094 m1 = m1->m_nextpkt;
6095 }
6096 optval = cnt;
6097 goto integer;
6098 } else {
6099 error = ENOPROTOOPT;
6100 break;
6101 }
6102
6103 case SO_NWRITE:
6104 optval = so->so_snd.sb_cc;
6105 goto integer;
6106
6107 case SO_ERROR:
6108 optval = so->so_error;
6109 so->so_error = 0;
6110 goto integer;
6111
6112 case SO_SNDBUF: {
6113 u_int32_t hiwat = so->so_snd.sb_hiwat;
6114
6115 if (so->so_snd.sb_flags & SB_UNIX) {
6116 struct unpcb *unp =
6117 (struct unpcb *)(so->so_pcb);
6118 if (unp != NULL && unp->unp_conn != NULL) {
6119 hiwat += unp->unp_conn->unp_cc;
6120 }
6121 }
6122
6123 optval = hiwat;
6124 goto integer;
6125 }
6126 case SO_RCVBUF:
6127 optval = so->so_rcv.sb_hiwat;
6128 goto integer;
6129
6130 case SO_SNDLOWAT:
6131 optval = so->so_snd.sb_lowat;
6132 goto integer;
6133
6134 case SO_RCVLOWAT:
6135 optval = so->so_rcv.sb_lowat;
6136 goto integer;
6137
6138 case SO_SNDTIMEO:
6139 case SO_RCVTIMEO:
6140 tv = (sopt->sopt_name == SO_SNDTIMEO ?
6141 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6142
6143 error = sooptcopyout_timeval(sopt, &tv);
6144 break;
6145
6146 case SO_NOSIGPIPE:
6147 optval = (so->so_flags & SOF_NOSIGPIPE);
6148 goto integer;
6149
6150 case SO_NOADDRERR:
6151 optval = (so->so_flags & SOF_NOADDRAVAIL);
6152 goto integer;
6153
6154 case SO_REUSESHAREUID:
6155 optval = (so->so_flags & SOF_REUSESHAREUID);
6156 goto integer;
6157
6158
6159 case SO_NOTIFYCONFLICT:
6160 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6161 goto integer;
6162
6163 case SO_RESTRICTIONS:
6164 optval = so_get_restrictions(so);
6165 goto integer;
6166
6167 case SO_AWDL_UNRESTRICTED:
6168 if (SOCK_DOM(so) == PF_INET ||
6169 SOCK_DOM(so) == PF_INET6) {
6170 optval = inp_get_awdl_unrestricted(
6171 sotoinpcb(so));
6172 goto integer;
6173 } else {
6174 error = EOPNOTSUPP;
6175 }
6176 break;
6177
6178 case SO_INTCOPROC_ALLOW:
6179 if (SOCK_DOM(so) == PF_INET6) {
6180 optval = inp_get_intcoproc_allowed(
6181 sotoinpcb(so));
6182 goto integer;
6183 } else {
6184 error = EOPNOTSUPP;
6185 }
6186 break;
6187
6188 case SO_LABEL:
6189 error = EOPNOTSUPP;
6190 break;
6191
6192 case SO_PEERLABEL:
6193 error = EOPNOTSUPP;
6194 break;
6195
6196 #ifdef __APPLE_API_PRIVATE
6197 case SO_UPCALLCLOSEWAIT:
6198 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6199 goto integer;
6200 #endif
6201 case SO_RANDOMPORT:
6202 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6203 goto integer;
6204
6205 case SO_NP_EXTENSIONS: {
6206 struct so_np_extensions sonpx = {};
6207
6208 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6209 SONPX_SETOPTSHUT : 0;
6210 sonpx.npx_mask = SONPX_MASK_VALID;
6211
6212 error = sooptcopyout(sopt, &sonpx,
6213 sizeof(struct so_np_extensions));
6214 break;
6215 }
6216
6217 case SO_TRAFFIC_CLASS:
6218 optval = so->so_traffic_class;
6219 goto integer;
6220
6221 case SO_RECV_TRAFFIC_CLASS:
6222 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6223 goto integer;
6224
6225 #if (DEVELOPMENT || DEBUG)
6226 case SO_TRAFFIC_CLASS_DBG:
6227 error = sogetopt_tcdbg(so, sopt);
6228 break;
6229 #endif /* (DEVELOPMENT || DEBUG) */
6230
6231 case SO_PRIVILEGED_TRAFFIC_CLASS:
6232 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6233 goto integer;
6234
6235 case SO_DEFUNCTOK:
6236 optval = !(so->so_flags & SOF_NODEFUNCT);
6237 goto integer;
6238
6239 case SO_ISDEFUNCT:
6240 optval = (so->so_flags & SOF_DEFUNCT);
6241 goto integer;
6242
6243 case SO_OPPORTUNISTIC:
6244 optval = so_get_opportunistic(so);
6245 goto integer;
6246
6247 case SO_FLUSH:
6248 /* This option is not gettable */
6249 error = EINVAL;
6250 break;
6251
6252 case SO_RECV_ANYIF:
6253 optval = so_get_recv_anyif(so);
6254 goto integer;
6255
6256 case SO_TRAFFIC_MGT_BACKGROUND:
6257 /* This option is handled by lower layer(s) */
6258 if (so->so_proto != NULL &&
6259 so->so_proto->pr_ctloutput != NULL) {
6260 (void) so->so_proto->pr_ctloutput(so, sopt);
6261 }
6262 break;
6263
6264 #if FLOW_DIVERT
6265 case SO_FLOW_DIVERT_TOKEN:
6266 error = flow_divert_token_get(so, sopt);
6267 break;
6268 #endif /* FLOW_DIVERT */
6269
6270 #if NECP
6271 case SO_NECP_ATTRIBUTES:
6272 if (SOCK_DOM(so) == PF_MULTIPATH) {
6273 /* Handled by MPTCP itself */
6274 break;
6275 }
6276
6277 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6278 error = EINVAL;
6279 goto out;
6280 }
6281
6282 error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6283 break;
6284
6285 case SO_NECP_CLIENTUUID: {
6286 uuid_t *ncu;
6287
6288 if (SOCK_DOM(so) == PF_MULTIPATH) {
6289 ncu = &mpsotomppcb(so)->necp_client_uuid;
6290 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6291 ncu = &sotoinpcb(so)->necp_client_uuid;
6292 } else {
6293 error = EINVAL;
6294 goto out;
6295 }
6296
6297 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6298 break;
6299 }
6300
6301 case SO_NECP_LISTENUUID: {
6302 uuid_t *nlu;
6303
6304 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6305 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6306 nlu = &sotoinpcb(so)->necp_client_uuid;
6307 } else {
6308 error = ENOENT;
6309 goto out;
6310 }
6311 } else {
6312 error = EINVAL;
6313 goto out;
6314 }
6315
6316 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6317 break;
6318 }
6319 #endif /* NECP */
6320
6321 #if CONTENT_FILTER
6322 case SO_CFIL_SOCK_ID: {
6323 cfil_sock_id_t sock_id;
6324
6325 sock_id = cfil_sock_id_from_socket(so);
6326
6327 error = sooptcopyout(sopt, &sock_id,
6328 sizeof(cfil_sock_id_t));
6329 break;
6330 }
6331 #endif /* CONTENT_FILTER */
6332
6333 case SO_EXTENDED_BK_IDLE:
6334 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6335 goto integer;
6336 case SO_MARK_CELLFALLBACK:
6337 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6338 ? 1 : 0;
6339 goto integer;
6340 case SO_FALLBACK_MODE:
6341 optval = so->so_fallback_mode;
6342 goto integer;
6343 case SO_MARK_KNOWN_TRACKER: {
6344 optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6345 ? 1 : 0;
6346 goto integer;
6347 }
6348 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6349 optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6350 ? 1 : 0;
6351 goto integer;
6352 }
6353 case SO_MARK_APPROVED_APP_DOMAIN: {
6354 optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6355 ? 1 : 0;
6356 goto integer;
6357 }
6358 case SO_NET_SERVICE_TYPE: {
6359 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6360 optval = so->so_netsvctype;
6361 } else {
6362 optval = NET_SERVICE_TYPE_BE;
6363 }
6364 goto integer;
6365 }
6366 case SO_NETSVC_MARKING_LEVEL:
6367 optval = so_get_netsvc_marking_level(so);
6368 goto integer;
6369
6370 case SO_MPKL_SEND_INFO: {
6371 struct so_mpkl_send_info so_mpkl_send_info;
6372
6373 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6374 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6375 error = sooptcopyout(sopt, &so_mpkl_send_info,
6376 sizeof(struct so_mpkl_send_info));
6377 break;
6378 }
6379 case SO_MARK_WAKE_PKT:
6380 optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6381 goto integer;
6382 case SO_RECV_WAKE_PKT:
6383 optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6384 goto integer;
6385 default:
6386 error = ENOPROTOOPT;
6387 break;
6388 }
6389 }
6390 out:
6391 if (dolock) {
6392 socket_unlock(so, 1);
6393 }
6394 return error;
6395 }
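
/*
 * Illustrative user-space sketch for one of the Darwin-specific options
 * handled above: SO_NREAD reports the bytes available to read (for
 * atomic protocols, the payload bytes of the queued datagrams).
 *
 *	int avail = 0;
 *	socklen_t len = sizeof(avail);
 *	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &avail, &len) == 0)
 *		printf("%d bytes readable\n", avail);
 */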
6396
6397 /*
6398 * The size limit on our soopt_getm() differs from FreeBSD's.
6399 * We limit the size of options to MCLBYTES. This will have to change
6400 * if we need to define options that need more space than MCLBYTES.
6401 */
6402 int
6403 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6404 {
6405 struct mbuf *m, *m_prev;
6406 int sopt_size = (int)sopt->sopt_valsize;
6407 int how;
6408
6409 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6410 return EMSGSIZE;
6411 }
6412
6413 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6414 MGET(m, how, MT_DATA);
6415 if (m == NULL) {
6416 return ENOBUFS;
6417 }
6418 if (sopt_size > MLEN) {
6419 MCLGET(m, how);
6420 if ((m->m_flags & M_EXT) == 0) {
6421 m_free(m);
6422 return ENOBUFS;
6423 }
6424 m->m_len = min(MCLBYTES, sopt_size);
6425 } else {
6426 m->m_len = min(MLEN, sopt_size);
6427 }
6428 sopt_size -= m->m_len;
6429 *mp = m;
6430 m_prev = m;
6431
6432 while (sopt_size > 0) {
6433 MGET(m, how, MT_DATA);
6434 if (m == NULL) {
6435 m_freem(*mp);
6436 return ENOBUFS;
6437 }
6438 if (sopt_size > MLEN) {
6439 MCLGET(m, how);
6440 if ((m->m_flags & M_EXT) == 0) {
6441 m_freem(*mp);
6442 m_freem(m);
6443 return ENOBUFS;
6444 }
6445 m->m_len = min(MCLBYTES, sopt_size);
6446 } else {
6447 m->m_len = min(MLEN, sopt_size);
6448 }
6449 sopt_size -= m->m_len;
6450 m_prev->m_next = m;
6451 m_prev = m;
6452 }
6453 return 0;
6454 }
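
/*
 * In-kernel usage sketch (hedged; mirrors how the IPv6 option code is
 * expected to drive these helpers): size an mbuf chain to the option,
 * then copy the caller's option data into it.
 *
 *	struct mbuf *m = NULL;
 *	int error = soopt_getm(sopt, &m);	// total capped at MCLBYTES
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	// frees the chain on error
 */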
6455
6456 /* copyin sopt data into mbuf chain */
6457 int
6458 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6459 {
6460 struct mbuf *m0 = m;
6461
6462 if (sopt->sopt_val == USER_ADDR_NULL) {
6463 return 0;
6464 }
6465 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6466 if (sopt->sopt_p != kernproc) {
6467 int error;
6468
6469 error = copyin(sopt->sopt_val, mtod(m, char *),
6470 m->m_len);
6471 if (error != 0) {
6472 m_freem(m0);
6473 return error;
6474 }
6475 } else {
6476 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6477 mtod(m, char *), m->m_len);
6478 }
6479 sopt->sopt_valsize -= m->m_len;
6480 sopt->sopt_val += m->m_len;
6481 m = m->m_next;
6482 }
6483 /* the chain should have been allocated large enough by ip6_sooptmcopyin() */
6484 if (m != NULL) {
6485 panic("soopt_mcopyin");
6486 /* NOTREACHED */
6487 }
6488 return 0;
6489 }
6490
6491 /* copyout mbuf chain data into soopt */
6492 int
6493 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6494 {
6495 struct mbuf *m0 = m;
6496 size_t valsize = 0;
6497
6498 if (sopt->sopt_val == USER_ADDR_NULL) {
6499 return 0;
6500 }
6501 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6502 if (sopt->sopt_p != kernproc) {
6503 int error;
6504
6505 error = copyout(mtod(m, char *), sopt->sopt_val,
6506 m->m_len);
6507 if (error != 0) {
6508 m_freem(m0);
6509 return error;
6510 }
6511 } else {
6512 bcopy(mtod(m, char *),
6513 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6514 }
6515 sopt->sopt_valsize -= m->m_len;
6516 sopt->sopt_val += m->m_len;
6517 valsize += m->m_len;
6518 m = m->m_next;
6519 }
6520 if (m != NULL) {
6521 /* user-land should have supplied a large enough soopt buffer */
6522 m_freem(m0);
6523 return EINVAL;
6524 }
6525 sopt->sopt_valsize = valsize;
6526 return 0;
6527 }
6528
6529 void
6530 sohasoutofband(struct socket *so)
6531 {
6532 if (so->so_pgid < 0) {
6533 gsignal(-so->so_pgid, SIGURG);
6534 } else if (so->so_pgid > 0) {
6535 proc_signal(so->so_pgid, SIGURG);
6536 }
6537 selwakeup(&so->so_rcv.sb_sel);
6538 if (so->so_rcv.sb_flags & SB_KNOTE) {
6539 KNOTE(&so->so_rcv.sb_sel.si_note,
6540 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6541 }
6542 }
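
/*
 * Illustrative user-space counterpart (standard BSD interfaces): a
 * process opts in to the SIGURG delivered above by claiming ownership
 * of the socket; urg_handler below is hypothetical.
 *
 *	signal(SIGURG, urg_handler);
 *	fcntl(fd, F_SETOWN, getpid());	// route SIGURG to this process
 *	// later: recv(fd, &c, 1, MSG_OOB) reads the out-of-band byte
 */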
6543
6544 int
6545 sopoll(struct socket *so, int events, kauth_cred_t cred, void *wql)
6546 {
6547 #pragma unused(cred)
6548 struct proc *p = current_proc();
6549 int revents = 0;
6550
6551 socket_lock(so, 1);
6552 so_update_last_owner_locked(so, PROC_NULL);
6553 so_update_policy(so);
6554
6555 if (events & (POLLIN | POLLRDNORM)) {
6556 if (soreadable(so)) {
6557 revents |= events & (POLLIN | POLLRDNORM);
6558 }
6559 }
6560
6561 if (events & (POLLOUT | POLLWRNORM)) {
6562 if (sowriteable(so)) {
6563 revents |= events & (POLLOUT | POLLWRNORM);
6564 }
6565 }
6566
6567 if (events & (POLLPRI | POLLRDBAND)) {
6568 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6569 revents |= events & (POLLPRI | POLLRDBAND);
6570 }
6571 }
6572
6573 if (revents == 0) {
6574 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6575 /*
6576 * Darwin sets the flag first,
6577 * BSD calls selrecord first
6578 */
6579 so->so_rcv.sb_flags |= SB_SEL;
6580 selrecord(p, &so->so_rcv.sb_sel, wql);
6581 }
6582
6583 if (events & (POLLOUT | POLLWRNORM)) {
6584 /*
6585 * Darwin sets the flag first,
6586 * BSD calls selrecord first
6587 */
6588 so->so_snd.sb_flags |= SB_SEL;
6589 selrecord(p, &so->so_snd.sb_sel, wql);
6590 }
6591 }
6592
6593 socket_unlock(so, 1);
6594 return revents;
6595 }
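
/*
 * Illustrative user-space counterpart (standard poll(2)); POLLPRI maps
 * to the out-of-band mark condition checked above.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLPRI))
 *		;	// OOB data is at or before the mark
 */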
6596
6597 int
6598 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6599 {
6600 struct socket *so = (struct socket *)fp_get_data(fp);
6601 int result;
6602
6603 socket_lock(so, 1);
6604 so_update_last_owner_locked(so, PROC_NULL);
6605 so_update_policy(so);
6606
6607 switch (kn->kn_filter) {
6608 case EVFILT_READ:
6609 kn->kn_filtid = EVFILTID_SOREAD;
6610 break;
6611 case EVFILT_WRITE:
6612 kn->kn_filtid = EVFILTID_SOWRITE;
6613 break;
6614 case EVFILT_SOCK:
6615 kn->kn_filtid = EVFILTID_SCK;
6616 break;
6617 case EVFILT_EXCEPT:
6618 kn->kn_filtid = EVFILTID_SOEXCEPT;
6619 break;
6620 default:
6621 socket_unlock(so, 1);
6622 knote_set_error(kn, EINVAL);
6623 return 0;
6624 }
6625
6626 /*
6627 * call the appropriate sub-filter attach
6628 * with the socket still locked
6629 */
6630 result = knote_fops(kn)->f_attach(kn, kev);
6631
6632 socket_unlock(so, 1);
6633
6634 return result;
6635 }
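
/*
 * Illustrative user-space sketch of the filters dispatched above
 * (standard kqueue API):
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register only
 */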
6636
6637 static int
6638 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6639 {
6640 int retval = 0;
6641 int64_t data = 0;
6642
6643 if (so->so_options & SO_ACCEPTCONN) {
6644 /*
6645 * Radar 6615193: handle the listen case dynamically
6646 * for the kqueue read filter. This allows listen() to be
6647 * called after registering the kqueue EVFILT_READ filter.
6648 */
6649
6650 retval = !TAILQ_EMPTY(&so->so_comp);
6651 data = so->so_qlen;
6652 goto out;
6653 }
6654
6655 /* socket isn't a listener */
6656 /*
6657 * NOTE_LOWAT specifies new low water mark in data, i.e.
6658 * the bytes of protocol data. We therefore exclude any
6659 * control bytes.
6660 */
6661 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6662
6663 if (kn->kn_sfflags & NOTE_OOB) {
6664 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6665 kn->kn_fflags |= NOTE_OOB;
6666 data -= so->so_oobmark;
6667 retval = 1;
6668 goto out;
6669 }
6670 }
6671
6672 if ((so->so_state & SS_CANTRCVMORE)
6673 #if CONTENT_FILTER
6674 && cfil_sock_data_pending(&so->so_rcv) == 0
6675 #endif /* CONTENT_FILTER */
6676 ) {
6677 kn->kn_flags |= EV_EOF;
6678 kn->kn_fflags = so->so_error;
6679 retval = 1;
6680 goto out;
6681 }
6682
6683 if (so->so_error) { /* temporary udp error */
6684 retval = 1;
6685 goto out;
6686 }
6687
6688 int64_t lowwat = so->so_rcv.sb_lowat;
6689 /*
6690 * Ensure that when NOTE_LOWAT is used, the derived
6691 * low water mark is bounded by socket's rcv buf's
6692 * high and low water mark values.
6693 */
6694 if (kn->kn_sfflags & NOTE_LOWAT) {
6695 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6696 lowwat = so->so_rcv.sb_hiwat;
6697 } else if (kn->kn_sdata > lowwat) {
6698 lowwat = kn->kn_sdata;
6699 }
6700 }
6701
6702 /*
6703 * While the `data` field is the amount of data to read,
6704 * 0-sized packets need to wake up the kqueue, see 58140856,
6705 * so we need to take control bytes into account too.
6706 */
6707 retval = (so->so_rcv.sb_cc >= lowwat);
6708
6709 out:
6710 if (retval && kev) {
6711 knote_fill_kevent(kn, kev, data);
6712 }
6713 return retval;
6714 }
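
/*
 * Hedged sketch of NOTE_LOWAT as consumed above: fflags carries the
 * flag and the kevent data field carries the requested low water mark,
 * which this routine clamps to the receive buffer's limits.
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	// fires once at least 4096 bytes (clamped to sb_hiwat) are queued
 */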
6715
6716 static int
6717 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6718 {
6719 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6720
6721 /* socket locked */
6722
6723 /*
6724 * If the caller explicitly asked for OOB results (e.g. poll())
6725 * from EVFILT_READ, then save that off in the hookid field
6726 * and reserve the kn_flags EV_OOBAND bit for output only.
6727 */
6728 if (kn->kn_filter == EVFILT_READ &&
6729 kn->kn_flags & EV_OOBAND) {
6730 kn->kn_flags &= ~EV_OOBAND;
6731 kn->kn_hook32 = EV_OOBAND;
6732 } else {
6733 kn->kn_hook32 = 0;
6734 }
6735 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6736 so->so_rcv.sb_flags |= SB_KNOTE;
6737 }
6738
6739 /* indicate if event is already fired */
6740 return filt_soread_common(kn, NULL, so);
6741 }
6742
6743 static void
6744 filt_sordetach(struct knote *kn)
6745 {
6746 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6747
6748 socket_lock(so, 1);
6749 if (so->so_rcv.sb_flags & SB_KNOTE) {
6750 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6751 so->so_rcv.sb_flags &= ~SB_KNOTE;
6752 }
6753 }
6754 socket_unlock(so, 1);
6755 }
6756
6757 /*ARGSUSED*/
6758 static int
6759 filt_soread(struct knote *kn, long hint)
6760 {
6761 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6762 int retval;
6763
6764 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6765 socket_lock(so, 1);
6766 }
6767
6768 retval = filt_soread_common(kn, NULL, so);
6769
6770 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6771 socket_unlock(so, 1);
6772 }
6773
6774 return retval;
6775 }
6776
6777 static int
6778 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6779 {
6780 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6781 int retval;
6782
6783 socket_lock(so, 1);
6784
6785 /* save off the new input fflags and data */
6786 kn->kn_sfflags = kev->fflags;
6787 kn->kn_sdata = kev->data;
6788
6789 /* determine if changes result in fired events */
6790 retval = filt_soread_common(kn, NULL, so);
6791
6792 socket_unlock(so, 1);
6793
6794 return retval;
6795 }
6796
6797 static int
6798 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6799 {
6800 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6801 int retval;
6802
6803 socket_lock(so, 1);
6804 retval = filt_soread_common(kn, kev, so);
6805 socket_unlock(so, 1);
6806
6807 return retval;
6808 }
6809
6810 int
6811 so_wait_for_if_feedback(struct socket *so)
6812 {
6813 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6814 (so->so_state & SS_ISCONNECTED)) {
6815 struct inpcb *inp = sotoinpcb(so);
6816 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6817 return 1;
6818 }
6819 }
6820 return 0;
6821 }
6822
6823 static int
6824 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6825 {
6826 int ret = 0;
6827 int64_t data = sbspace(&so->so_snd);
6828
6829 if (so->so_state & SS_CANTSENDMORE) {
6830 kn->kn_flags |= EV_EOF;
6831 kn->kn_fflags = so->so_error;
6832 ret = 1;
6833 goto out;
6834 }
6835
6836 if (so->so_error) { /* temporary udp error */
6837 ret = 1;
6838 goto out;
6839 }
6840
6841 if (!socanwrite(so)) {
6842 ret = 0;
6843 goto out;
6844 }
6845
6846 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6847 ret = 1;
6848 goto out;
6849 }
6850
6851 int64_t lowwat = so->so_snd.sb_lowat;
6852
6853 if (kn->kn_sfflags & NOTE_LOWAT) {
6854 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6855 lowwat = so->so_snd.sb_hiwat;
6856 } else if (kn->kn_sdata > lowwat) {
6857 lowwat = kn->kn_sdata;
6858 }
6859 }
6860
6861 if (data >= lowwat) {
6862 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6863 #if (DEBUG || DEVELOPMENT)
6864 && so_notsent_lowat_check == 1
6865 #endif /* DEBUG || DEVELOPMENT */
6866 ) {
6867 if ((SOCK_DOM(so) == PF_INET ||
6868 SOCK_DOM(so) == PF_INET6) &&
6869 so->so_type == SOCK_STREAM) {
6870 ret = tcp_notsent_lowat_check(so);
6871 }
6872 #if MPTCP
6873 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6874 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6875 ret = mptcp_notsent_lowat_check(so);
6876 }
6877 #endif
6878 else {
6879 ret = 1;
6880 goto out;
6881 }
6882 } else {
6883 ret = 1;
6884 }
6885 }
6886 if (so_wait_for_if_feedback(so)) {
6887 ret = 0;
6888 }
6889
6890 out:
6891 if (ret && kev) {
6892 knote_fill_kevent(kn, kev, data);
6893 }
6894 return ret;
6895 }
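
/*
 * Illustrative sketch of the SOF_NOTSENT_LOWAT path above: a TCP socket
 * opts in with SO_NOTSENT_LOWAT from <sys/socket.h>, after which write
 * readiness also requires the unsent backlog to drop below the mark.
 *
 *	int lowat = 16 * 1024;
 *	setsockopt(fd, SOL_SOCKET, SO_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *	EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
 */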
6896
6897 static int
6898 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6899 {
6900 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6901
6902 /* socket locked */
6903 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6904 so->so_snd.sb_flags |= SB_KNOTE;
6905 }
6906
6907 /* determine if it's already fired */
6908 return filt_sowrite_common(kn, NULL, so);
6909 }
6910
6911 static void
6912 filt_sowdetach(struct knote *kn)
6913 {
6914 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6915 socket_lock(so, 1);
6916
6917 if (so->so_snd.sb_flags & SB_KNOTE) {
6918 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6919 so->so_snd.sb_flags &= ~SB_KNOTE;
6920 }
6921 }
6922 socket_unlock(so, 1);
6923 }
6924
6925 /*ARGSUSED*/
6926 static int
6927 filt_sowrite(struct knote *kn, long hint)
6928 {
6929 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6930 int ret;
6931
6932 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6933 socket_lock(so, 1);
6934 }
6935
6936 ret = filt_sowrite_common(kn, NULL, so);
6937
6938 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6939 socket_unlock(so, 1);
6940 }
6941
6942 return ret;
6943 }
6944
6945 static int
6946 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6947 {
6948 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6949 int ret;
6950
6951 socket_lock(so, 1);
6952
6953 /* save off the new input fflags and data */
6954 kn->kn_sfflags = kev->fflags;
6955 kn->kn_sdata = kev->data;
6956
6957 /* determine if these changes result in a triggered event */
6958 ret = filt_sowrite_common(kn, NULL, so);
6959
6960 socket_unlock(so, 1);
6961
6962 return ret;
6963 }
6964
6965 static int
6966 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6967 {
6968 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6969 int ret;
6970
6971 socket_lock(so, 1);
6972 ret = filt_sowrite_common(kn, kev, so);
6973 socket_unlock(so, 1);
6974
6975 return ret;
6976 }
6977
6978 static int
6979 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6980 struct socket *so, long ev_hint)
6981 {
6982 int ret = 0;
6983 int64_t data = 0;
6984 uint32_t level_trigger = 0;
6985
6986 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6987 kn->kn_fflags |= NOTE_CONNRESET;
6988 }
6989 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6990 kn->kn_fflags |= NOTE_TIMEOUT;
6991 }
6992 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6993 kn->kn_fflags |= NOTE_NOSRCADDR;
6994 }
6995 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6996 kn->kn_fflags |= NOTE_IFDENIED;
6997 }
6998 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6999 kn->kn_fflags |= NOTE_KEEPALIVE;
7000 }
7001 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7002 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7003 }
7004 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7005 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7006 }
7007 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7008 (so->so_state & SS_ISCONNECTED)) {
7009 kn->kn_fflags |= NOTE_CONNECTED;
7010 level_trigger |= NOTE_CONNECTED;
7011 }
7012 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7013 (so->so_state & SS_ISDISCONNECTED)) {
7014 kn->kn_fflags |= NOTE_DISCONNECTED;
7015 level_trigger |= NOTE_DISCONNECTED;
7016 }
7017 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7018 if (so->so_proto != NULL &&
7019 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7020 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7021 }
7022 }
7023
7024 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7025 tcp_notify_ack_active(so)) {
7026 kn->kn_fflags |= NOTE_NOTIFY_ACK;
7027 }
7028
7029 if ((so->so_state & SS_CANTRCVMORE)
7030 #if CONTENT_FILTER
7031 && cfil_sock_data_pending(&so->so_rcv) == 0
7032 #endif /* CONTENT_FILTER */
7033 ) {
7034 kn->kn_fflags |= NOTE_READCLOSED;
7035 level_trigger |= NOTE_READCLOSED;
7036 }
7037
7038 if (so->so_state & SS_CANTSENDMORE) {
7039 kn->kn_fflags |= NOTE_WRITECLOSED;
7040 level_trigger |= NOTE_WRITECLOSED;
7041 }
7042
7043 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7044 (so->so_flags & SOF_SUSPENDED)) {
7045 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7046
7047 /* If resume event was delivered before, reset it */
7048 kn->kn_hook32 &= ~NOTE_RESUME;
7049
7050 kn->kn_fflags |= NOTE_SUSPEND;
7051 level_trigger |= NOTE_SUSPEND;
7052 }
7053
7054 if ((ev_hint & SO_FILT_HINT_RESUME) ||
7055 (so->so_flags & SOF_SUSPENDED) == 0) {
7056 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7057
7058 /* If suspend event was delivered before, reset it */
7059 kn->kn_hook32 &= ~NOTE_SUSPEND;
7060
7061 kn->kn_fflags |= NOTE_RESUME;
7062 level_trigger |= NOTE_RESUME;
7063 }
7064
7065 if (so->so_error != 0) {
7066 ret = 1;
7067 data = so->so_error;
7068 kn->kn_flags |= EV_EOF;
7069 } else {
7070 u_int32_t data32 = 0;
7071 get_sockev_state(so, &data32);
7072 data = data32;
7073 }
7074
7075 /* Reset any events that are not requested on this knote */
7076 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7077 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7078
7079 /* Find the level-triggered events that are already delivered */
7080 level_trigger &= kn->kn_hook32;
7081 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7082
7083 /* Do not deliver level-triggered events more than once */
7084 if ((kn->kn_fflags & ~level_trigger) != 0) {
7085 ret = 1;
7086 }
7087
7088 if (ret && kev) {
7089 /*
7090 * Store the state of the events being delivered. This
7091 * state can be used to deliver level triggered events
7092 * at least once and still avoid waking up the application
7093 * multiple times as long as the event is active.
7094 */
7095 if (kn->kn_fflags != 0) {
7096 kn->kn_hook32 |= (kn->kn_fflags &
7097 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7098 }
7099
7100 /*
7101 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7102 * only one of them, and remember which one was
7103 * delivered last.
7104 */
7105 if (kn->kn_fflags & NOTE_SUSPEND) {
7106 kn->kn_hook32 &= ~NOTE_RESUME;
7107 }
7108 if (kn->kn_fflags & NOTE_RESUME) {
7109 kn->kn_hook32 &= ~NOTE_SUSPEND;
7110 }
7111
7112 knote_fill_kevent(kn, kev, data);
7113 }
7114 return ret;
7115 }
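
/*
 * Hedged sketch (EVFILT_SOCK and its NOTE_* flags are private API, so
 * this is illustrative only): a watcher subscribes to state transitions
 * and, per the bookkeeping above, receives each level-triggered note
 * such as NOTE_CONNECTED at most once while the condition holds.
 *
 *	EV_SET(&kev, fd, EVFILT_SOCK, EV_ADD,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED, 0, NULL);
 */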
7116
7117 static int
7118 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7119 {
7120 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7121
7122 /* socket locked */
7123 kn->kn_hook32 = 0;
7124 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7125 so->so_flags |= SOF_KNOTE;
7126 }
7127
7128 /* determine if event already fired */
7129 return filt_sockev_common(kn, NULL, so, 0);
7130 }
7131
7132 static void
7133 filt_sockdetach(struct knote *kn)
7134 {
7135 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7136 socket_lock(so, 1);
7137
7138 if ((so->so_flags & SOF_KNOTE) != 0) {
7139 if (KNOTE_DETACH(&so->so_klist, kn)) {
7140 so->so_flags &= ~SOF_KNOTE;
7141 }
7142 }
7143 socket_unlock(so, 1);
7144 }
7145
7146 static int
7147 filt_sockev(struct knote *kn, long hint)
7148 {
7149 int ret = 0, locked = 0;
7150 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7151 long ev_hint = (hint & SO_FILT_HINT_EV);
7152
7153 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7154 socket_lock(so, 1);
7155 locked = 1;
7156 }
7157
7158 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7159
7160 if (locked) {
7161 socket_unlock(so, 1);
7162 }
7163
7164 return ret;
7165 }
7166
7167
7168
7169 /*
7170 * filt_socktouch - update event state
7171 */
7172 static int
7173 filt_socktouch(
7174 struct knote *kn,
7175 struct kevent_qos_s *kev)
7176 {
7177 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7178 uint32_t changed_flags;
7179 int ret;
7180
7181 socket_lock(so, 1);
7182
7183 /* save off the [result] data and fflags */
7184 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7185
7186 /* save off the new input fflags and data */
7187 kn->kn_sfflags = kev->fflags;
7188 kn->kn_sdata = kev->data;
7189
7190 /* restrict the current results to the (smaller?) set of new interest */
7191 /*
7192 * For compatibility with previous implementations, we leave kn_fflags
7193 * as they were before.
7194 */
7195 //kn->kn_fflags &= kev->fflags;
7196
7197 /*
7198 * Since we keep track of events that are already
7199 * delivered, if any of those events are not requested
7200 * anymore the state related to them can be reset
7201 */
7202 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7203
7204 /* determine if we have events to deliver */
7205 ret = filt_sockev_common(kn, NULL, so, 0);
7206
7207 socket_unlock(so, 1);
7208
7209 return ret;
7210 }
7211
7212 /*
7213 * filt_sockprocess - query event fired state and return data
7214 */
7215 static int
7216 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7217 {
7218 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7219 int ret = 0;
7220
7221 socket_lock(so, 1);
7222
7223 ret = filt_sockev_common(kn, kev, so, 0);
7224
7225 socket_unlock(so, 1);
7226
7227 return ret;
7228 }
7229
7230 void
7231 get_sockev_state(struct socket *so, u_int32_t *statep)
7232 {
7233 u_int32_t state = *(statep);
7234
7235 /*
7236 * If the state variable was already set by a previous event,
7237 * leave it as-is.
7238 */
7239 if (state != 0) {
7240 return;
7241 }
7242
7243 if (so->so_state & SS_ISCONNECTED) {
7244 state |= SOCKEV_CONNECTED;
7245 } else {
7246 state &= ~(SOCKEV_CONNECTED);
7247 }
7248 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7249 *(statep) = state;
7250 }
7251
7252 #define SO_LOCK_HISTORY_STR_LEN \
7253 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7254
7255 __private_extern__ const char *
7256 solockhistory_nr(struct socket *so)
7257 {
7258 size_t n = 0;
7259 int i;
7260 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7261
7262 bzero(lock_history_str, sizeof(lock_history_str));
7263 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7264 n += scnprintf(lock_history_str + n,
7265 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7266 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7267 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7268 }
7269 return lock_history_str;
7270 }
7271
7272 lck_mtx_t *
7273 socket_getlock(struct socket *so, int flags)
7274 {
7275 if (so->so_proto->pr_getlock != NULL) {
7276 return (*so->so_proto->pr_getlock)(so, flags);
7277 } else {
7278 return so->so_proto->pr_domain->dom_mtx;
7279 }
7280 }
7281
7282 void
7283 socket_lock(struct socket *so, int refcount)
7284 {
7285 void *lr_saved;
7286
7287 lr_saved = __builtin_return_address(0);
7288
7289 if (so->so_proto->pr_lock) {
7290 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7291 } else {
7292 #ifdef MORE_LOCKING_DEBUG
7293 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7294 LCK_MTX_ASSERT_NOTOWNED);
7295 #endif
7296 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7297 if (refcount) {
7298 so->so_usecount++;
7299 }
7300 so->lock_lr[so->next_lock_lr] = lr_saved;
7301 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7302 }
7303 }
7304
7305 void
7306 socket_lock_assert_owned(struct socket *so)
7307 {
7308 lck_mtx_t *mutex_held;
7309
7310 if (so->so_proto->pr_getlock != NULL) {
7311 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7312 } else {
7313 mutex_held = so->so_proto->pr_domain->dom_mtx;
7314 }
7315
7316 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7317 }
7318
7319 int
7320 socket_try_lock(struct socket *so)
7321 {
7322 lck_mtx_t *mtx;
7323
7324 if (so->so_proto->pr_getlock != NULL) {
7325 mtx = (*so->so_proto->pr_getlock)(so, 0);
7326 } else {
7327 mtx = so->so_proto->pr_domain->dom_mtx;
7328 }
7329
7330 return lck_mtx_try_lock(mtx);
7331 }
7332
7333 void
7334 socket_unlock(struct socket *so, int refcount)
7335 {
7336 void *lr_saved;
7337 lck_mtx_t *mutex_held;
7338
7339 lr_saved = __builtin_return_address(0);
7340
7341 if (so == NULL || so->so_proto == NULL) {
7342 panic("%s: null so_proto so=%p", __func__, so);
7343 /* NOTREACHED */
7344 }
7345
7346 if (so->so_proto->pr_unlock) {
7347 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7348 } else {
7349 mutex_held = so->so_proto->pr_domain->dom_mtx;
7350 #ifdef MORE_LOCKING_DEBUG
7351 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7352 #endif
7353 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7354 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7355
7356 if (refcount) {
7357 if (so->so_usecount <= 0) {
7358 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7359 "lrh=%s", __func__, so->so_usecount, so,
7360 SOCK_DOM(so), so->so_type,
7361 SOCK_PROTO(so), solockhistory_nr(so));
7362 /* NOTREACHED */
7363 }
7364
7365 so->so_usecount--;
7366 if (so->so_usecount == 0) {
7367 sofreelastref(so, 1);
7368 }
7369 }
7370 lck_mtx_unlock(mutex_held);
7371 }
7372 }
7373
7374 /* Called with socket locked, will unlock socket */
7375 void
7376 sofree(struct socket *so)
7377 {
7378 lck_mtx_t *mutex_held;
7379
7380 if (so->so_proto->pr_getlock != NULL) {
7381 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7382 } else {
7383 mutex_held = so->so_proto->pr_domain->dom_mtx;
7384 }
7385 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7386
7387 sofreelastref(so, 0);
7388 }
7389
7390 void
7391 soreference(struct socket *so)
7392 {
7393 socket_lock(so, 1); /* lock the socket and take one reference */
7394 socket_unlock(so, 0); /* unlock only */
7395 }
7396
7397 void
7398 sodereference(struct socket *so)
7399 {
7400 socket_lock(so, 0);
7401 socket_unlock(so, 1);
7402 }
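
/*
 * Usage sketch for the pair above: code that must work without the
 * socket lock while keeping the socket alive takes a reference first.
 *
 *	soreference(so);	// lock, so_usecount++, unlock
 *	// ... work with the lock dropped; the reference pins the socket
 *	sodereference(so);	// lock, so_usecount--, unlock (may free)
 */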
7403
7404 /*
7405 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7406 * possibility of using jumbo clusters. The caller must hold
7407 * the socket lock.
7408 */
7409 void
7410 somultipages(struct socket *so, boolean_t set)
7411 {
7412 if (set) {
7413 so->so_flags |= SOF_MULTIPAGES;
7414 } else {
7415 so->so_flags &= ~SOF_MULTIPAGES;
7416 }
7417 }
7418
7419 void
7420 soif2kcl(struct socket *so, boolean_t set)
7421 {
7422 if (set) {
7423 so->so_flags1 |= SOF1_IF_2KCL;
7424 } else {
7425 so->so_flags1 &= ~SOF1_IF_2KCL;
7426 }
7427 }
7428
7429 int
7430 so_isdstlocal(struct socket *so)
7431 {
7432 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7433
7434 if (SOCK_DOM(so) == PF_INET) {
7435 return inaddr_local(inp->inp_faddr);
7436 } else if (SOCK_DOM(so) == PF_INET6) {
7437 return in6addr_local(&inp->in6p_faddr);
7438 }
7439
7440 return 0;
7441 }
7442
7443 int
7444 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7445 {
7446 struct sockbuf *rcv, *snd;
7447 int err = 0, defunct;
7448
7449 rcv = &so->so_rcv;
7450 snd = &so->so_snd;
7451
7452 defunct = (so->so_flags & SOF_DEFUNCT);
7453 if (defunct) {
7454 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7455 panic("%s: SB_DROP not set", __func__);
7456 /* NOTREACHED */
7457 }
7458 goto done;
7459 }
7460
7461 if (so->so_flags & SOF_NODEFUNCT) {
7462 if (noforce) {
7463 err = EOPNOTSUPP;
7464 if (p != PROC_NULL) {
7465 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7466 "name %s level %d) so 0x%llx [%d,%d] "
7467 "is not eligible for defunct "
7468 "(%d)\n", __func__, proc_selfpid(),
7469 proc_best_name(current_proc()), proc_pid(p),
7470 proc_best_name(p), level,
7471 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7472 SOCK_DOM(so), SOCK_TYPE(so), err);
7473 }
7474 return err;
7475 }
7476 so->so_flags &= ~SOF_NODEFUNCT;
7477 if (p != PROC_NULL) {
7478 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7479 "name %s level %d) so 0x%llx [%d,%d] "
7480 "defunct by force "
7481 "(%d)\n", __func__, proc_selfpid(),
7482 proc_best_name(current_proc()), proc_pid(p),
7483 proc_best_name(p), level,
7484 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7485 SOCK_DOM(so), SOCK_TYPE(so), err);
7486 }
7487 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7488 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7489 struct ifnet *ifp = inp->inp_last_outifp;
7490
7491 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7492 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7493 } else if (so->so_flags & SOF_DELEGATED) {
7494 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7495 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7496 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7497 } else if (noforce && p != PROC_NULL) {
7498 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7499
7500 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7501 so->so_extended_bk_start = net_uptime();
7502 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7503
7504 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7505
7506 err = EOPNOTSUPP;
7507 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7508 "name %s level %d) so 0x%llx [%d,%d] "
7509 "extend bk idle "
7510 "(%d)\n", __func__, proc_selfpid(),
7511 proc_best_name(current_proc()), proc_pid(p),
7512 proc_best_name(p), level,
7513 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7514 SOCK_DOM(so), SOCK_TYPE(so), err);
7515 return err;
7516 } else {
7517 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7518 }
7519 }
7520
7521 so->so_flags |= SOF_DEFUNCT;
7522
7523 /* Prevent further data from being appended to the socket buffers */
7524 snd->sb_flags |= SB_DROP;
7525 rcv->sb_flags |= SB_DROP;
7526
7527 /* Flush any existing data in the socket buffers */
7528 if (rcv->sb_cc != 0) {
7529 rcv->sb_flags &= ~SB_SEL;
7530 selthreadclear(&rcv->sb_sel);
7531 sbrelease(rcv);
7532 }
7533 if (snd->sb_cc != 0) {
7534 snd->sb_flags &= ~SB_SEL;
7535 selthreadclear(&snd->sb_sel);
7536 sbrelease(snd);
7537 }
7538
7539 done:
7540 if (p != PROC_NULL) {
7541 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7542 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7543 proc_selfpid(), proc_best_name(current_proc()),
7544 proc_pid(p), proc_best_name(p), level,
7545 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7546 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7547 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7548 " extbkidle" : "");
7549 }
7550 return err;
7551 }
7552
7553 int
7554 sodefunct(struct proc *p, struct socket *so, int level)
7555 {
7556 struct sockbuf *rcv, *snd;
7557
7558 if (!(so->so_flags & SOF_DEFUNCT)) {
7559 panic("%s improperly called", __func__);
7560 /* NOTREACHED */
7561 }
7562 if (so->so_state & SS_DEFUNCT) {
7563 goto done;
7564 }
7565
7566 rcv = &so->so_rcv;
7567 snd = &so->so_snd;
7568
7569 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7570 char s[MAX_IPv6_STR_LEN];
7571 char d[MAX_IPv6_STR_LEN];
7572 struct inpcb *inp = sotoinpcb(so);
7573
7574 if (p != PROC_NULL) {
7575 SODEFUNCTLOG(
7576 "%s[%d, %s]: (target pid %d name %s level %d) "
7577 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7578 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7579 " snd_fl 0x%x]\n", __func__,
7580 proc_selfpid(), proc_best_name(current_proc()),
7581 proc_pid(p), proc_best_name(p), level,
7582 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7583 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7584 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7585 (void *)&inp->inp_laddr.s_addr :
7586 (void *)&inp->in6p_laddr),
7587 s, sizeof(s)), ntohs(inp->in6p_lport),
7588 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7589 (void *)&inp->inp_faddr.s_addr :
7590 (void *)&inp->in6p_faddr,
7591 d, sizeof(d)), ntohs(inp->in6p_fport),
7592 (uint32_t)rcv->sb_sel.si_flags,
7593 (uint32_t)snd->sb_sel.si_flags,
7594 rcv->sb_flags, snd->sb_flags);
7595 }
7596 } else if (p != PROC_NULL) {
7597 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7598 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7599 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7600 proc_selfpid(), proc_best_name(current_proc()),
7601 proc_pid(p), proc_best_name(p), level,
7602 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7603 SOCK_DOM(so), SOCK_TYPE(so),
7604 (uint32_t)rcv->sb_sel.si_flags,
7605 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7606 snd->sb_flags);
7607 }
7608
7609 /*
7610 * Unwedge threads blocked on sbwait() and sb_lock().
7611 */
7612 sbwakeup(rcv);
7613 sbwakeup(snd);
7614
7615 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7616 if (rcv->sb_flags & SB_LOCK) {
7617 sbunlock(rcv, TRUE); /* keep socket locked */
7618 }
7619 if (snd->sb_flags & SB_LOCK) {
7620 sbunlock(snd, TRUE); /* keep socket locked */
7621 }
7622 /*
7623 * Flush the buffers and disconnect. We explicitly call shutdown
7624 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7625 * states are set for the socket. This would also flush out data
7626 * hanging off the receive list of this socket.
7627 */
7628 (void) soshutdownlock_final(so, SHUT_RD);
7629 (void) soshutdownlock_final(so, SHUT_WR);
7630 (void) sodisconnectlocked(so);
7631
7632 /*
7633 * Explicitly handle connectionless-protocol disconnection
7634 * and release any remaining data in the socket buffers.
7635 */
7636 if (!(so->so_state & SS_ISDISCONNECTED)) {
7637 (void) soisdisconnected(so);
7638 }
7639
7640 if (so->so_error == 0) {
7641 so->so_error = EBADF;
7642 }
7643
7644 if (rcv->sb_cc != 0) {
7645 rcv->sb_flags &= ~SB_SEL;
7646 selthreadclear(&rcv->sb_sel);
7647 sbrelease(rcv);
7648 }
7649 if (snd->sb_cc != 0) {
7650 snd->sb_flags &= ~SB_SEL;
7651 selthreadclear(&snd->sb_sel);
7652 sbrelease(snd);
7653 }
7654 so->so_state |= SS_DEFUNCT;
7655 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7656
7657 done:
7658 return 0;
7659 }
7660
7661 int
7662 soresume(struct proc *p, struct socket *so, int locked)
7663 {
7664 if (locked == 0) {
7665 socket_lock(so, 1);
7666 }
7667
7668 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7669 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7670 "[%d,%d] resumed from bk idle\n",
7671 __func__, proc_selfpid(), proc_best_name(current_proc()),
7672 proc_pid(p), proc_best_name(p),
7673 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7674 SOCK_DOM(so), SOCK_TYPE(so));
7675
7676 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7677 so->so_extended_bk_start = 0;
7678 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7679
7680 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7681 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7682 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7683 }
7684 if (locked == 0) {
7685 socket_unlock(so, 1);
7686 }
7687
7688 return 0;
7689 }
7690
7691 /*
7692 * Does not attempt to account for sockets that are delegated from
7693 * the current process.
7694 */
7695 int
7696 so_set_extended_bk_idle(struct socket *so, int optval)
7697 {
7698 int error = 0;
7699
7700 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7701 SOCK_PROTO(so) != IPPROTO_TCP) {
7702 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7703 error = EOPNOTSUPP;
7704 } else if (optval == 0) {
7705 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7706
7707 soresume(current_proc(), so, 1);
7708 } else {
7709 struct proc *p = current_proc();
7710 struct fileproc *fp;
7711 int count = 0;
7712
7713 /*
7714 * Unlock socket to avoid lock ordering issue with
7715 * the proc fd table lock
7716 */
7717 socket_unlock(so, 0);
7718
7719 proc_fdlock(p);
7720 fdt_foreach(fp, p) {
7721 struct socket *so2;
7722
7723 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7724 continue;
7725 }
7726
7727 so2 = (struct socket *)fp_get_data(fp);
7728 if (so != so2 &&
7729 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7730 count++;
7731 }
7732 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7733 break;
7734 }
7735 }
7736 proc_fdunlock(p);
7737
7738 socket_lock(so, 0);
7739
7740 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7741 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7742 error = EBUSY;
7743 } else if (so->so_flags & SOF_DELEGATED) {
7744 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7745 error = EBUSY;
7746 } else {
7747 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7748 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7749 }
7750 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7751 "%s marked for extended bk idle\n",
7752 __func__, proc_selfpid(), proc_best_name(current_proc()),
7753 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7754 SOCK_DOM(so), SOCK_TYPE(so),
7755 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7756 "is" : "not");
7757 }
7758
7759 return error;
7760 }
7761
7762 static void
7763 so_stop_extended_bk_idle(struct socket *so)
7764 {
7765 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7766 so->so_extended_bk_start = 0;
7767
7768 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7769 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7770 /*
7771 * Force defunct
7772 */
7773 sosetdefunct(current_proc(), so,
7774 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7775 if (so->so_flags & SOF_DEFUNCT) {
7776 sodefunct(current_proc(), so,
7777 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7778 }
7779 }
7780
7781 void
7782 so_drain_extended_bk_idle(struct socket *so)
7783 {
7784 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7785 /*
7786 * Only penalize sockets that have outstanding data
7787 */
7788 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7789 so_stop_extended_bk_idle(so);
7790
7791 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7792 }
7793 }
7794 }
7795
7796 /*
7797 * Return value tells whether the socket is still in extended background idle.
7798 */
7799 int
7800 so_check_extended_bk_idle_time(struct socket *so)
7801 {
7802 int ret = 1;
7803
7804 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7805 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7806 __func__, proc_selfpid(), proc_best_name(current_proc()),
7807 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7808 SOCK_DOM(so), SOCK_TYPE(so));
7809 if (net_uptime() - so->so_extended_bk_start >
7810 soextbkidlestat.so_xbkidle_time) {
7811 so_stop_extended_bk_idle(so);
7812
7813 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7814
7815 ret = 0;
7816 } else {
7817 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7818
7819 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7820 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7821 }
7822 }
7823
7824 return ret;
7825 }
7826
7827 void
7828 resume_proc_sockets(proc_t p)
7829 {
7830 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7831 struct fileproc *fp;
7832 struct socket *so;
7833
7834 proc_fdlock(p);
7835 fdt_foreach(fp, p) {
7836 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7837 continue;
7838 }
7839
7840 so = (struct socket *)fp_get_data(fp);
7841 (void) soresume(p, so, 0);
7842 }
7843 proc_fdunlock(p);
7844
7845 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7846 }
7847 }
7848
7849 __private_extern__ int
7850 so_set_recv_anyif(struct socket *so, int optval)
7851 {
7852 int ret = 0;
7853
7854 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7855 if (optval) {
7856 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7857 } else {
7858 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7859 }
7860 #if SKYWALK
7861 inp_update_netns_flags(so);
7862 #endif /* SKYWALK */
7863 }
7864
7865
7866 return ret;
7867 }
7868
7869 __private_extern__ int
7870 so_get_recv_anyif(struct socket *so)
7871 {
7872 int ret = 0;
7873
7874 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7875 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7876 }
7877
7878 return ret;
7879 }
7880
7881 int
7882 so_set_restrictions(struct socket *so, uint32_t vals)
7883 {
7884 int nocell_old, nocell_new;
7885 int noexpensive_old, noexpensive_new;
7886 int noconstrained_old, noconstrained_new;
7887
7888 /*
7889 * Deny-type restrictions are trapdoors; once set they cannot be
7890 * unset for the lifetime of the socket. This allows them to be
7891 * issued by a framework on behalf of the application without
7892 * having to worry that they can be undone.
7893 *
7894 * Note here that socket-level restrictions overrides any protocol
7895 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7896 * restriction issued on the socket takes precedence over
7897 * INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7898 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7899 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7900 */
7901 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7902 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7903 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7904 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7905 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7906 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7907 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7908 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7909 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7910
7911 /* we can only set, not clear restrictions */
7912 if ((nocell_new - nocell_old) == 0 &&
7913 (noexpensive_new - noexpensive_old) == 0 &&
7914 (noconstrained_new - noconstrained_old) == 0) {
7915 return 0;
7916 }
7917 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7918 if (nocell_new - nocell_old != 0) {
7919 /*
7920 * if deny cellular is now set, do what's needed
7921 * for INPCB
7922 */
7923 inp_set_nocellular(sotoinpcb(so));
7924 }
7925 if (noexpensive_new - noexpensive_old != 0) {
7926 inp_set_noexpensive(sotoinpcb(so));
7927 }
7928 if (noconstrained_new - noconstrained_old != 0) {
7929 inp_set_noconstrained(sotoinpcb(so));
7930 }
7931 }
7932
7933 if (SOCK_DOM(so) == PF_MULTIPATH) {
7934 mptcp_set_restrictions(so);
7935 }
7936
7937 return 0;
7938 }
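
/*
 * Illustrative sketch (SO_RESTRICTIONS is private API; shown only to
 * ground the trapdoor semantics above): once a deny flag is set it
 * sticks for the socket's lifetime, so a later call cannot clear it.
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR;
 *	setsockopt(fd, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 */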
7939
7940 uint32_t
7941 so_get_restrictions(struct socket *so)
7942 {
7943 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7944 SO_RESTRICT_DENY_OUT |
7945 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7946 }
7947
7948 int
7949 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7950 {
7951 struct proc *ep = PROC_NULL;
7952 int error = 0;
7953
7954 /* pid 0 is reserved for kernel */
7955 if (epid == 0) {
7956 error = EINVAL;
7957 goto done;
7958 }
7959
7960 /*
7961 * If this is an in-kernel socket, prevent its delegate
7962 * association from changing unless the socket option is
7963 * coming from within the kernel itself.
7964 */
7965 if (so->last_pid == 0 && p != kernproc) {
7966 error = EACCES;
7967 goto done;
7968 }
7969
7970 /*
7971 * If this is issued by a process that's recorded as the
7972 * real owner of the socket, or if the pid is the same as
7973 * the process's own pid, then proceed. Otherwise ensure
7974 * that the issuing process has the necessary privileges.
7975 */
7976 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7977 if ((error = priv_check_cred(kauth_cred_get(),
7978 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7979 error = EACCES;
7980 goto done;
7981 }
7982 }
7983
7984 /* Find the process that corresponds to the effective pid */
7985 if ((ep = proc_find(epid)) == PROC_NULL) {
7986 error = ESRCH;
7987 goto done;
7988 }
7989
7990 /*
7991 * If a process tries to delegate the socket to itself, then
7992 * there's really nothing to do; treat it as a way for the
7993 * delegate association to be cleared. Note that we check
7994 * the passed-in proc rather than calling proc_selfpid(),
7995 * as we need to check the process issuing the socket option
7996 * which could be kernproc. Given that we don't allow 0 for
7997 * effective pid, it means that a delegated in-kernel socket
7998 * stays delegated during its lifetime (which is probably OK.)
7999 */
8000 if (epid == proc_pid(p)) {
8001 so->so_flags &= ~SOF_DELEGATED;
8002 so->e_upid = 0;
8003 so->e_pid = 0;
8004 uuid_clear(so->e_uuid);
8005 } else {
8006 so->so_flags |= SOF_DELEGATED;
8007 so->e_upid = proc_uniqueid(ep);
8008 so->e_pid = proc_pid(ep);
8009 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
8010
8011 #if defined(XNU_TARGET_OS_OSX)
8012 if (ep->p_responsible_pid != so->e_pid) {
8013 proc_t rp = proc_find(ep->p_responsible_pid);
8014 if (rp != PROC_NULL) {
8015 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
8016 so->so_rpid = ep->p_responsible_pid;
8017 proc_rele(rp);
8018 } else {
8019 uuid_clear(so->so_ruuid);
8020 so->so_rpid = -1;
8021 }
8022 }
8023 #endif
8024 }
8025 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8026 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
8027 }
8028 done:
8029 if (error == 0 && net_io_policy_log) {
8030 uuid_string_t buf;
8031
8032 uuid_unparse(so->e_uuid, buf);
8033 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8034 "euuid %s%s\n", __func__, proc_name_address(p),
8035 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8036 SOCK_DOM(so), SOCK_TYPE(so),
8037 so->e_pid, proc_name_address(ep), buf,
8038 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8039 } else if (error != 0 && net_io_policy_log) {
8040 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8041 "ERROR (%d)\n", __func__, proc_name_address(p),
8042 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8043 SOCK_DOM(so), SOCK_TYPE(so),
8044 epid, (ep == PROC_NULL) ? "PROC_NULL" :
8045 proc_name_address(ep), error);
8046 }
8047
8048 /* Update this socket's policy upon success */
8049 if (error == 0) {
8050 so->so_policy_gencnt *= -1;
8051 so_update_policy(so);
8052 #if NECP
8053 so_update_necp_policy(so, NULL, NULL);
8054 #endif /* NECP */
8055 }
8056
8057 if (ep != PROC_NULL) {
8058 proc_rele(ep);
8059 }
8060
8061 return error;
8062 }
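
/*
 * Illustrative sketch only (userspace, not part of this file): the
 * usual path into so_set_effective_pid() is the private SO_DELEGATED
 * socket option, issued by a proxy on behalf of the process whose pid
 * it passes; callers lacking PRIV_NET_PRIVILEGED_SOCKET_DELEGATE fail
 * the check above. client_pid below is a placeholder.
 *
 *	pid_t epid = client_pid;
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof(epid)) == -1) {
 *		perror("SO_DELEGATED");
 *	}
 */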

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed. Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared. Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself. Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known. Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name, as it's
	 * now the same as the real process name.
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
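
/*
 * Illustrative sketch only (userspace, not part of this file): when a
 * proxy knows only the delegate's executable UUID rather than its pid,
 * the private SO_DELEGATED_UUID socket option reaches
 * so_set_effective_uuid() instead. client_uuid below is a placeholder;
 * uuid_t is an array type, so it is passed directly.
 *
 *	uuid_t euuid;
 *	uuid_copy(euuid, client_uuid);
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    euuid, sizeof(euuid)) == -1) {
 *		perror("SO_DELEGATED_UUID");
 *	}
 */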

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
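
/*
 * Illustrative sketch only (hypothetical types, not part of this
 * file): since the payload merely has to begin with a
 * netpolicy_event_data, a caller with a subclass-specific event can
 * embed one at the head of a larger structure and post the whole
 * thing, as the VERIFY() above allows.
 *
 *	struct my_netpolicy_event {
 *		struct netpolicy_event_data nev;  <- must come first
 *		uint32_t extra;                   <- event-specific field
 *	} ev;
 *
 *	... fill in ev.nev and ev.extra, then ...
 *	netpolicy_post_msg(ev_code, &ev.nev, sizeof(ev));
 */

/*
 * Post a KEV_SOCKET_SUBCLASS kernel event with the given event code
 * and subclass-specific payload.
 */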
void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
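
/*
 * If the socket opted in via SOF1_WANT_KEV_SOCK_CLOSED, post a
 * KEV_SOCKET_CLOSED event carrying the socket's local and peer
 * addresses (each truncated to the size of its event buffer).
 */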
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
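
/*
 * Illustrative sketch only (userspace, not part of this file): a
 * process can request KEV_SOCKET_CLOSED for one of its sockets with
 * the private SO_WANT_KEV_SOCKET_CLOSED option and then watch the
 * kernel event socket, filtered to KEV_SOCKET_SUBCLASS. Names are SPI;
 * details may vary by release.
 *
 *	int on = 1;
 *	(void)setsockopt(s, SOL_SOCKET, SO_WANT_KEV_SOCKET_CLOSED,
 *	    &on, sizeof(on));
 *
 *	int evfd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_VENDOR_APPLE,
 *		.kev_class    = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	(void)ioctl(evfd, SIOCSKEVFILT, &req);
 *	... read(evfd, ...) then yields the posted events ...
 */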