/*
 * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *      The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <os/log.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
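/*
 * For example, ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16; note that
 * 'b' must be a power of two for the mask arithmetic to hold.
 */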

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sorattach,
    .f_detach = filt_sordetach,
    .f_event = filt_soread,
    .f_touch = filt_sortouch,
    .f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sowattach,
    .f_detach = filt_sowdetach,
    .f_event = filt_sowrite,
    .f_touch = filt_sowtouch,
    .f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sockattach,
    .f_detach = filt_sockdetach,
    .f_event = filt_sockev,
    .f_touch = filt_socktouch,
    .f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sorattach,
    .f_detach = filt_sordetach,
    .f_event = filt_soread,
    .f_touch = filt_sortouch,
    .f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DECLARE(socket_zone, "socket", sizeof(struct socket), ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;     /* generation count for sockets */

MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
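/* With MCLBYTES typically 2 KB, this works out to roughly 256 KB. */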

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should sosendmaxchain have an upper bound as well? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
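/*
 * These knobs surface under the kern.ipc sysctl namespace, so (for
 * testing only) they can be toggled from user space, e.g.:
 *
 *      sysctl -w kern.ipc.sosendjcl=0
 */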

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending packets that cross physical pages to broken
 * drivers (those that falsely assume that the physical pages are
 * contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption. When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
 */

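/*
 * Units: SO_IDLE_BK_IDLE_TIME is in seconds and
 * SO_IDLE_BK_IDLE_RCV_HIWAT is in bytes.
 */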
#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum number of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP prevents the networking stack from setting the DSCP
 * code on the packet based on the service class; see
 * <rdar://problem/11277343> for details. Note that sotcdb defaults to
 * 0 below, so SOTCDB_NO_DSCP is not currently set by default.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
    _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
    VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
    _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
    _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

    if (socketinit_done) {
        printf("socketinit: already called...\n");
        return;
    }
    socketinit_done = 1;

    PE_parse_boot_argn("socket_debug", &socket_debug,
        sizeof(socket_debug));

    STAILQ_INIT(&so_cache_head);

    so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
        + get_inpcb_str_size() + 4 + get_tcp_str_size());

    so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
        ZC_ZFREE_CLEARMEM);

    bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
    soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
    soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
    soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

    in_pcbinit();
}

static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
    caddr_t temp;
    uintptr_t offset;

    lck_mtx_lock(&so_cache_mtx);

    if (!STAILQ_EMPTY(&so_cache_head)) {
        VERIFY(cached_sock_count > 0);

        *so = STAILQ_FIRST(&so_cache_head);
        STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
        STAILQ_NEXT((*so), so_cache_ent) = NULL;

        cached_sock_count--;
        lck_mtx_unlock(&so_cache_mtx);

        temp = (*so)->so_saved_pcb;
        bzero((caddr_t)*so, sizeof(struct socket));

        (*so)->so_saved_pcb = temp;
    } else {
        lck_mtx_unlock(&so_cache_mtx);

        *so = zalloc_flags(so_cache_zone, how | Z_ZERO);

        /*
         * Define offsets for extra structures into our
         * single block of memory. Align extra structures
         * on longword boundaries.
         */

        offset = (uintptr_t)*so;
        offset += sizeof(struct socket);

        offset = ALIGN(offset);

        (*so)->so_saved_pcb = (caddr_t)offset;
        offset += get_inpcb_str_size();

        offset = ALIGN(offset);

        ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
            (caddr_t)offset;
    }

    OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{
    lck_mtx_lock(&so_cache_mtx);

    so_cache_time = net_uptime();
    if (++cached_sock_count > max_cached_sock_count) {
        --cached_sock_count;
        lck_mtx_unlock(&so_cache_mtx);
        zfree(so_cache_zone, so);
    } else {
        if (so_cache_hw < cached_sock_count) {
            so_cache_hw = cached_sock_count;
        }

        STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

        so->cache_timestamp = so_cache_time;
        lck_mtx_unlock(&so_cache_mtx);
    }
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
    if (so->last_pid != 0) {
        /*
         * last_pid and last_upid should remain zero for sockets
         * created using sock_socket; the check above achieves that.
         */
        if (self == PROC_NULL) {
            self = current_proc();
        }

        if (so->last_upid != proc_uniqueid(self) ||
            so->last_pid != proc_pid(self)) {
            so->last_upid = proc_uniqueid(self);
            so->last_pid = proc_pid(self);
            proc_getexecutableuuid(self, so->last_uuid,
                sizeof(so->last_uuid));
            if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
                (*so->so_proto->pr_update_last_owner)(so, self, NULL);
            }
        }
        proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
    }
}

void
so_update_policy(struct socket *so)
{
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        (void) inp_update_policy(sotoinpcb(so));
    }
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        inp_update_necp_policy(sotoinpcb(so), override_local_addr,
            override_remote_addr, 0);
    }
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
    struct socket *p;
    int n_freed = 0;
    boolean_t rc = FALSE;

    lck_mtx_lock(&so_cache_mtx);
    so_cache_timeouts++;
    so_cache_time = net_uptime();

    while (!STAILQ_EMPTY(&so_cache_head)) {
        VERIFY(cached_sock_count > 0);
        p = STAILQ_FIRST(&so_cache_head);
        if ((so_cache_time - p->cache_timestamp) <
            SO_CACHE_TIME_LIMIT) {
            break;
        }

        STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
        --cached_sock_count;

        zfree(so_cache_zone, p);

        if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
            so_cache_max_freed++;
            break;
        }
    }

    /* Schedule again if there is more to clean up */
    if (!STAILQ_EMPTY(&so_cache_head)) {
        rc = TRUE;
    }

    lck_mtx_unlock(&so_cache_mtx);
    return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
    zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
    struct socket *so;

    if ((dom == PF_INET) && (type == SOCK_STREAM)) {
        cached_sock_alloc(&so, how);
    } else {
        so = zalloc_flags(socket_zone, how | Z_ZERO);
    }
    if (so != NULL) {
        so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

        /*
         * Increment the socket allocation statistics
         */
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
    }

    return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
    struct protosw *prp;
    struct socket *so;
    int error = 0;
#if defined(XNU_TARGET_OS_OSX)
    pid_t rpid = -1;
#endif

#if TCPDEBUG
    extern int tcpconsdebug;
#endif

    VERIFY(aso != NULL);
    *aso = NULL;

    if (proto != 0) {
        prp = pffindproto(dom, proto, type);
    } else {
        prp = pffindtype(dom, type);
    }

    if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
        if (pffinddomain(dom) == NULL) {
            return EAFNOSUPPORT;
        }
        if (proto != 0) {
            if (pffindprotonotype(dom, proto) != NULL) {
                return EPROTOTYPE;
            }
        }
        return EPROTONOSUPPORT;
    }
    if (prp->pr_type != type) {
        return EPROTOTYPE;
    }
    so = soalloc(1, dom, type);
    if (so == NULL) {
        return ENOBUFS;
    }

    switch (dom) {
    case PF_LOCAL:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
        break;
    case PF_INET:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
        if (type == SOCK_STREAM) {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
        } else {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
        }
        break;
    case PF_ROUTE:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
        break;
    case PF_NDRV:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
        break;
    case PF_KEY:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
        break;
    case PF_INET6:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
        if (type == SOCK_STREAM) {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
        } else {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
        }
        break;
    case PF_SYSTEM:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
        break;
    case PF_MULTIPATH:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
        break;
    default:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
        break;
    }

    if (flags & SOCF_MPTCP) {
        so->so_state |= SS_NBIO;
    }

    TAILQ_INIT(&so->so_incomp);
    TAILQ_INIT(&so->so_comp);
    so->so_type = (short)type;
    so->last_upid = proc_uniqueid(p);
    so->last_pid = proc_pid(p);
    proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
    proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

    if (ep != PROC_NULL && ep != p) {
        so->e_upid = proc_uniqueid(ep);
        so->e_pid = proc_pid(ep);
        proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
        so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
        if (ep->p_responsible_pid != so->e_pid) {
            rpid = ep->p_responsible_pid;
        }
#endif
    }

#if defined(XNU_TARGET_OS_OSX)
    if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
        rpid = p->p_responsible_pid;
    }

    so->so_rpid = -1;
    uuid_clear(so->so_ruuid);
    if (rpid >= 0) {
        proc_t rp = proc_find(rpid);
        if (rp != PROC_NULL) {
            proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
            so->so_rpid = rpid;
            proc_rele(rp);
        }
    }
#endif

    so->so_cred = kauth_cred_proc_ref(p);
    if (!suser(kauth_cred_get(), NULL)) {
        so->so_state |= SS_PRIV;
    }

    so->so_proto = prp;
    so->so_rcv.sb_flags |= SB_RECV;
    so->so_rcv.sb_so = so->so_snd.sb_so = so;
    so->next_lock_lr = 0;
    so->next_unlock_lr = 0;

    /*
     * Attachment will create the per pcb lock if necessary and
     * increase refcount for creation, make sure it's done before
     * socket is inserted in lists.
     */
    so->so_usecount++;

    error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
    if (error != 0) {
        /*
         * Warning:
         * If so_pcb is not zero, the socket will be leaked,
         * so the protocol attachment handler must be coded carefully.
         */
        if (so->so_pcb != NULL) {
            os_log_error(OS_LOG_DEFAULT,
                "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
                error, dom, proto, type);
        }
        /*
         * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free
         * the socket.
         */
        so->so_state |= SS_NOFDREF;
        so->so_flags |= SOF_PCBCLEARING;
        VERIFY(so->so_usecount > 0);
        so->so_usecount--;
        sofreelastref(so, 1);   /* will deallocate the socket */
        return error;
    }

    /*
     * Note: needs so_pcb to be set after pru_attach.
     */
    if (prp->pr_update_last_owner != NULL) {
        (*prp->pr_update_last_owner)(so, p, ep);
    }

    atomic_add_32(&prp->pr_domain->dom_refs, 1);

    /* Attach socket filters for this protocol */
    sflt_initsock(so);
#if TCPDEBUG
    if (tcpconsdebug == 2) {
        so->so_options |= SO_DEBUG;
    }
#endif
    so_set_default_traffic_class(so);

    /*
     * If this thread or task is marked to create backgrounded sockets,
     * mark the socket as background.
     */
    if (!(flags & SOCF_MPTCP) &&
        proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
        socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
        so->so_background_thread = current_thread();
    }

    switch (dom) {
    /*
     * Don't mark Unix domain or system sockets as
     * eligible for defunct by default.
     */
    case PF_LOCAL:
    case PF_SYSTEM:
        so->so_flags |= SOF_NODEFUNCT;
        break;
    default:
        break;
    }

    /*
     * Entitlements can't be checked at socket creation time except if the
     * application requested a feature guarded by a privilege (cf. socket
     * delegation).
     * The priv(9) and the Sandboxing APIs are designed with the idea that
     * a privilege check should only be triggered by a userland request.
     * A privilege check at socket creation time is time consuming and
     * could trigger many authorization error messages from the security
     * APIs.
     */

    *aso = so;

    return 0;
}

/*
 * Returns:     0                       Success
 *              EAFNOSUPPORT
 *              EPROTOTYPE
 *              EPROTONOSUPPORT
 *              ENOBUFS
 *      <pru_attach>:ENOBUFS[AF_UNIX]
 *      <pru_attach>:ENOBUFS[TCP]
 *      <pru_attach>:ENOMEM[TCP]
 *      <pru_attach>:???                [other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
    return socreate_internal(dom, aso, type, proto, current_proc(), 0,
        PROC_NULL);
}
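
/*
 * Illustrative in-kernel usage of socreate()/soclose() (a minimal sketch,
 * not code from this file):
 *
 *      struct socket *so = NULL;
 *      int error;
 *
 *      error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *      if (error == 0) {
 *              ... use the socket ...
 *              soclose(so);
 *      }
 */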

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
    int error = 0;
    struct proc *ep = PROC_NULL;

    if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
        error = ESRCH;
        goto done;
    }

    error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

    /*
     * It might not be wise to hold the proc reference when calling
     * socreate_internal since it calls soalloc with M_WAITOK.
     */
done:
    if (ep != PROC_NULL) {
        proc_rele(ep);
    }

    return error;
}

/*
 * Returns:     0                       Success
 *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 *      <pru_bind>:EINVAL               Invalid argument
 *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 *      <pru_bind>:EACCES               Permission denied
 *      <pru_bind>:EADDRINUSE           Address in use
 *      <pru_bind>:EAGAIN               Resource unavailable, try again
 *      <pru_bind>:EPERM                Operation not permitted
 *      <pru_bind>:???
 *      <sf_bind>:???
 *
 * Notes:       It's not possible to fully enumerate the return codes above,
 *              since socket filter authors and protocol family authors may
 *              not choose to limit their error returns to those listed, even
 *              though this may result in some software operating incorrectly.
 *
 *              The error codes which are enumerated above are those known to
 *              be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
    struct proc *p = current_proc();
    int error = 0;

    if (dolock) {
        socket_lock(so, 1);
    }

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#if NECP
    so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

    /*
     * If this is a bind request on a socket that has been marked
     * as inactive, reject it now before we go any further.
     */
    if (so->so_flags & SOF_DEFUNCT) {
        error = EINVAL;
        SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), proc_best_name(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error);
        goto out;
    }

    /* Socket filter */
    error = sflt_bind(so, nam);

    if (error == 0) {
        error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
    }
out:
    if (dolock) {
        socket_unlock(so, 1);
    }

    if (error == EJUSTRETURN) {
        error = 0;
    }

    return error;
}
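
/*
 * Illustrative caller of sobindlock() (a minimal sketch; the port below
 * is arbitrary, not a value used by this file):
 *
 *      struct sockaddr_in sin = {
 *              .sin_len = sizeof(sin),
 *              .sin_family = AF_INET,
 *              .sin_port = htons(8080),
 *              .sin_addr = { .s_addr = htonl(INADDR_ANY) },
 *      };
 *
 *      error = sobindlock(so, (struct sockaddr *)&sin, 1);
 *
 * dolock is 1 because the caller does not already hold the socket lock.
 */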

void
sodealloc(struct socket *so)
{
    kauth_cred_unref(&so->so_cred);

    /* Remove any filters */
    sflt_termsock(so);

#if CONTENT_FILTER
    cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

    if (NEED_DGRAM_FLOW_TRACKING(so)) {
        soflow_detach(so);
    }

    so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

    if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
        cached_sock_free(so);
    } else {
        zfree(socket_zone, so);
    }
}

/*
 * Returns:     0                       Success
 *              EINVAL
 *              EOPNOTSUPP
 *      <pru_listen>:EINVAL[AF_UNIX]
 *      <pru_listen>:EINVAL[TCP]
 *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 *      <pru_listen>:EINVAL[TCP]        Invalid argument
 *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 *      <pru_listen>:EACCES[TCP]        Permission denied
 *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 *      <pru_listen>:EPERM[TCP]         Operation not permitted
 *      <sf_listen>:???
 *
 * Notes:       Other <pru_listen> returns depend on the protocol family; all
 *              <sf_listen> returns depend on what the filter author causes
 *              their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
    struct proc *p = current_proc();
    int error = 0;

    socket_lock(so, 1);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#if NECP
    so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

    if (so->so_proto == NULL) {
        error = EINVAL;
        goto out;
    }
    if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
        error = EOPNOTSUPP;
        goto out;
    }

    /*
     * If the listen request is made on a socket that is not fully
     * disconnected, or on a socket that has been marked as inactive,
     * reject the request now.
     */
    if ((so->so_state &
        (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
        (so->so_flags & SOF_DEFUNCT)) {
        error = EINVAL;
        if (so->so_flags & SOF_DEFUNCT) {
            SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
                "(%d)\n", __func__, proc_pid(p),
                proc_best_name(p),
                (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), error);
        }
        goto out;
    }

    if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
        error = EPERM;
        goto out;
    }

    error = sflt_listen(so);
    if (error == 0) {
        error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
    }

    if (error) {
        if (error == EJUSTRETURN) {
            error = 0;
        }
        goto out;
    }

    if (TAILQ_EMPTY(&so->so_comp)) {
        so->so_options |= SO_ACCEPTCONN;
    }
    /*
     * POSIX: The implementation may have an upper limit on the length of
     * the listen queue, either global or per accepting socket. If backlog
     * exceeds this limit, the length of the listen queue is set to the
     * limit.
     *
     * If listen() is called with a backlog argument value that is less
     * than 0, the function behaves as if it had been called with a backlog
     * argument value of 0.
     *
     * A backlog argument of 0 may allow the socket to accept connections,
     * in which case the length of the listen queue may be set to an
     * implementation-defined minimum value.
     */
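    /*
     * E.g. with somaxconn at its default of 128, backlog values of -1, 0
     * and 500 all end up clamped to a queue limit of 128 here.
     */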
    if (backlog <= 0 || backlog > somaxconn) {
        backlog = somaxconn;
    }

    so->so_qlimit = (short)backlog;
out:
    socket_unlock(so, 1);
    return error;
}

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering requires
 * that a client socket be locked before its listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
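/*
 * Typical pattern (a sketch of how these helpers pair up, mirroring the
 * callers later in this file):
 *
 *      socket_lock(head, 1);
 *      so_acquire_accept_list(head, so);
 *      ... manipulate head->so_comp / head->so_incomp ...
 *      so_release_accept_list(head);
 *      socket_unlock(head, 1);
 */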

void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
    lck_mtx_t *mutex_held;

    if (head->so_proto->pr_getlock == NULL) {
        return;
    }
    mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
        head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
        return;
    }
    if (so != NULL) {
        socket_unlock(so, 0);
    }
    while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
        so_accept_list_waits += 1;
        msleep((caddr_t)&head->so_incomp, mutex_held,
            PSOCK | PCATCH, __func__, NULL);
    }
    head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
    if (so != NULL) {
        socket_unlock(head, 0);
        socket_lock(so, 0);
        socket_lock(head, 0);
    }
}

void
so_release_accept_list(struct socket *head)
{
    if (head->so_proto->pr_getlock != NULL) {
        lck_mtx_t *mutex_held;

        mutex_held = (*head->so_proto->pr_getlock)(head, 0);
        LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

        head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
        wakeup((caddr_t)&head->so_incomp);
    }
}

void
sofreelastref(struct socket *so, int dealloc)
{
    struct socket *head = so->so_head;

    /* Assume socket is locked */

#if FLOW_DIVERT
    if (so->so_flags & SOF_FLOW_DIVERT) {
        flow_divert_detach(so);
    }
#endif /* FLOW_DIVERT */

    if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
        selthreadclear(&so->so_snd.sb_sel);
        selthreadclear(&so->so_rcv.sb_sel);
        so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
        so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
        so->so_event = sonullevent;
        return;
    }
    if (head != NULL) {
        /*
         * Need to lock the listener when the protocol has
         * per-socket locks.
         */
        if (head->so_proto->pr_getlock != NULL) {
            socket_lock(head, 1);
            so_acquire_accept_list(head, so);
        }
        if (so->so_state & SS_INCOMP) {
            so->so_state &= ~SS_INCOMP;
            TAILQ_REMOVE(&head->so_incomp, so, so_list);
            head->so_incqlen--;
            head->so_qlen--;
            so->so_head = NULL;

            if (head->so_proto->pr_getlock != NULL) {
                so_release_accept_list(head);
                socket_unlock(head, 1);
            }
        } else if (so->so_state & SS_COMP) {
            if (head->so_proto->pr_getlock != NULL) {
                so_release_accept_list(head);
                socket_unlock(head, 1);
            }
            /*
             * We must not decommission a socket that's
             * on the accept(2) queue. If we do, then
             * accept(2) may hang after select(2) indicated
             * that the listening socket was ready.
             */
            selthreadclear(&so->so_snd.sb_sel);
            selthreadclear(&so->so_rcv.sb_sel);
            so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
            so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
            so->so_event = sonullevent;
            return;
        } else {
            if (head->so_proto->pr_getlock != NULL) {
                so_release_accept_list(head);
                socket_unlock(head, 1);
            }
            printf("sofree: not queued\n");
        }
    }
    sowflush(so);
    sorflush(so);

    /* 3932268: disable upcall */
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
    so->so_event = sonullevent;

    if (dealloc) {
        sodealloc(so);
    }
}

void
soclose_wait_locked(struct socket *so)
{
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL) {
        mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
    } else {
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    }
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    /*
     * Double check here and return if there's no outstanding upcall;
     * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
     */
    if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
        return;
    }
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~SB_UPCALL;
    so->so_flags |= SOF_CLOSEWAIT;

    (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
        "soclose_wait_locked", NULL);
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
    so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
    int error = 0;
    struct timespec ts;

    if (so->so_usecount == 0) {
        panic("soclose: so=%p refcount=0", so);
        /* NOTREACHED */
    }

    sflt_notify(so, sock_evt_closing, NULL);

    if (so->so_upcallusecount) {
        soclose_wait_locked(so);
    }

#if CONTENT_FILTER
    /*
     * We have to wait until the content filters are done
     */
    if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
        cfil_sock_close_wait(so);
        cfil_sock_is_closed(so);
        cfil_sock_detach(so);
    }
#endif /* CONTENT_FILTER */

    if (NEED_DGRAM_FLOW_TRACKING(so)) {
        soflow_detach(so);
    }

    if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
        soresume(current_proc(), so, 1);
        so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
    }

    if ((so->so_options & SO_ACCEPTCONN)) {
        struct socket *sp, *sonext;
        int persocklock = 0;
        int incomp_overflow_only;

        /*
         * We do not want new connections to be added
         * to the connection queues.
         */
        so->so_options &= ~SO_ACCEPTCONN;

        /*
         * We can drop the lock on the listener once
         * we've acquired the incoming list.
         */
        if (so->so_proto->pr_getlock != NULL) {
            persocklock = 1;
            so_acquire_accept_list(so, NULL);
            socket_unlock(so, 0);
        }
again:
        incomp_overflow_only = 1;

        TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
            /*
             * Radar 5350314
             * Skip sockets thrown away by tcp_dropdropablreq();
             * they will get cleaned up by the garbage collection.
             * Otherwise, remove the incomp socket from the queue
             * and let soabort trigger the appropriate cleanup.
             */
            if (sp->so_flags & SOF_OVERFLOW) {
                continue;
            }

            if (persocklock != 0) {
                socket_lock(sp, 1);
            }

            /*
             * Radar 27945981
             * The extra reference for the list ensures the
             * validity of the socket pointer when we perform the
             * unlock of the head above.
             */
            if (sp->so_state & SS_INCOMP) {
                sp->so_state &= ~SS_INCOMP;
                sp->so_head = NULL;
                TAILQ_REMOVE(&so->so_incomp, sp, so_list);
                so->so_incqlen--;
                so->so_qlen--;

                (void) soabort(sp);
            } else {
                panic("%s sp %p in so_incomp but !SS_INCOMP",
                    __func__, sp);
            }

            if (persocklock != 0) {
                socket_unlock(sp, 1);
            }
        }

        TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
            /* Dequeue from so_comp since sofree() won't do it */
            if (persocklock != 0) {
                socket_lock(sp, 1);
            }

            if (sp->so_state & SS_COMP) {
                sp->so_state &= ~SS_COMP;
                sp->so_head = NULL;
                TAILQ_REMOVE(&so->so_comp, sp, so_list);
                so->so_qlen--;

                (void) soabort(sp);
            } else {
                panic("%s sp %p in so_comp but !SS_COMP",
                    __func__, sp);
            }

            if (persocklock) {
                socket_unlock(sp, 1);
            }
        }

        if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
            panic("%s head %p so_incomp not empty", __func__, so);
#endif /* (DEBUG | DEVELOPMENT) */

            goto again;
        }

        if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
            panic("%s head %p so_comp not empty", __func__, so);
#endif /* (DEBUG | DEVELOPMENT) */

            goto again;
        }

        if (persocklock) {
            socket_lock(so, 0);
            so_release_accept_list(so);
        }
    }
    if (so->so_pcb == NULL) {
        /* 3915887: mark the socket as ready for dealloc */
        so->so_flags |= SOF_PCBCLEARING;
        goto discard;
    }
    if (so->so_state & SS_ISCONNECTED) {
        if ((so->so_state & SS_ISDISCONNECTING) == 0) {
            error = sodisconnectlocked(so);
            if (error) {
                goto drop;
            }
        }
        if (so->so_options & SO_LINGER) {
            lck_mtx_t *mutex_held;

            if ((so->so_state & SS_ISDISCONNECTING) &&
                (so->so_state & SS_NBIO)) {
                goto drop;
            }
            if (so->so_proto->pr_getlock != NULL) {
                mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
            } else {
                mutex_held = so->so_proto->pr_domain->dom_mtx;
            }
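            /*
             * so_linger is kept in hundredths of a second (clock
             * ticks, hz == 100), so split it into whole seconds
             * plus a nanosecond remainder for the msleep() timeout:
             * one tick is 10 ms, i.e. 10 * 1000 * NSEC_PER_USEC ns.
             */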
            while (so->so_state & SS_ISCONNECTED) {
                ts.tv_sec = (so->so_linger / 100);
                ts.tv_nsec = (so->so_linger % 100) *
                    NSEC_PER_USEC * 1000 * 10;
                error = msleep((caddr_t)&so->so_timeo,
                    mutex_held, PSOCK | PCATCH, "soclose", &ts);
                if (error) {
                    /*
                     * It's OK if the timer fires;
                     * don't report an error.
                     */
                    if (error == EWOULDBLOCK) {
                        error = 0;
                    }
                    break;
                }
            }
        }
    }
drop:
    if (so->so_usecount == 0) {
        panic("soclose: usecount is zero so=%p", so);
        /* NOTREACHED */
    }
    if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
        int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
        if (error == 0) {
            error = error2;
        }
    }
    if (so->so_usecount <= 0) {
        panic("soclose: usecount is zero so=%p", so);
        /* NOTREACHED */
    }
discard:
    if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
        (so->so_state & SS_NOFDREF)) {
        panic("soclose: NOFDREF");
        /* NOTREACHED */
    }
    so->so_state |= SS_NOFDREF;

    if ((so->so_flags & SOF_KNOTE) != 0) {
        KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
    }

    atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

    VERIFY(so->so_usecount > 0);
    so->so_usecount--;
    sofree(so);
    return error;
}

int
soclose(struct socket *so)
{
    int error = 0;
    socket_lock(so, 1);

    if (so->so_retaincnt == 0) {
        error = soclose_locked(so);
    } else {
        /*
         * If the FD is going away but the socket is
         * retained in the kernel, remove its reference.
         */
        so->so_usecount--;
        if (so->so_usecount < 2) {
            panic("soclose: retaincnt non null and so=%p "
                "usecount=%d\n", so, so->so_usecount);
        }
    }
    socket_unlock(so, 1);
    return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
    int error;

#ifdef MORE_LOCKING_DEBUG
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL) {
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
    } else {
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    }
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

    if ((so->so_flags & SOF_ABORTED) == 0) {
        so->so_flags |= SOF_ABORTED;
        error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
        if (error) {
            sofree(so);
            return error;
        }
    }
    return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
    int error;

    if (dolock) {
        socket_lock(so, 1);
    }

    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);
#if NECP
    so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

    if ((so->so_state & SS_NOFDREF) == 0) {
        panic("soaccept: !NOFDREF");
    }
    so->so_state &= ~SS_NOFDREF;
    error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

    if (dolock) {
        socket_unlock(so, 1);
    }
    return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
    return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
    struct sockaddr *local = NULL, *remote = NULL;
    int error = 0;

    /*
     * Hold the lock even if this socket has not been made visible
     * to the filter(s). For sockets with global locks, this protects
     * against the head or peer going away.
     */
    socket_lock(so, 1);
    if (sogetaddr_locked(so, &remote, 1) != 0 ||
        sogetaddr_locked(so, &local, 0) != 0) {
        so->so_state &= ~SS_NOFDREF;
        socket_unlock(so, 1);
        soclose(so);
        /* Out of resources; try it again next time */
        error = ECONNABORTED;
        goto done;
    }

    error = sflt_accept(head, so, local, remote);

    /*
     * If we get EJUSTRETURN from one of the filters, mark this socket
     * as inactive and return it anyway. This newly accepted socket
     * will be disconnected later before we hand it off to the caller.
     */
    if (error == EJUSTRETURN) {
        error = 0;
        (void) sosetdefunct(current_proc(), so,
            SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
    }

    if (error != 0) {
        /*
         * This may seem like a duplication to the above error
         * handling part when we return ECONNABORTED, except
         * the following is done while holding the lock since
         * the socket has been exposed to the filter(s) earlier.
         */
        so->so_state &= ~SS_NOFDREF;
        socket_unlock(so, 1);
        soclose(so);
        /* Propagate socket filter's error code to the caller */
    } else {
        socket_unlock(so, 1);
    }
done:
    /* Callee checks for NULL pointer */
    sock_freeaddr(remote);
    sock_freeaddr(local);
    return error;
}

/*
 * Returns:     0                       Success
 *              EOPNOTSUPP              Operation not supported on socket
 *              EISCONN                 Socket is connected
 *      <pru_connect>:EADDRNOTAVAIL     Address not available.
 *      <pru_connect>:EINVAL            Invalid argument
 *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
 *      <pru_connect>:EACCES            Permission denied
 *      <pru_connect>:EADDRINUSE        Address in use
 *      <pru_connect>:EAGAIN            Resource unavailable, try again
 *      <pru_connect>:EPERM             Operation not permitted
 *      <sf_connect_out>:???            [anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
    int error;
    struct proc *p = current_proc();
    tracker_metadata_t metadata = { };

    if (dolock) {
        socket_lock(so, 1);
    }

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#if NECP
    so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

    /*
     * If this is a listening socket or if this is a previously-accepted
     * socket that has been marked as inactive, reject the connect request.
     */
    if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
        error = EOPNOTSUPP;
        if (so->so_flags & SOF_DEFUNCT) {
            SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
                "(%d)\n", __func__, proc_pid(p),
                proc_best_name(p),
                (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), error);
        }
        if (dolock) {
            socket_unlock(so, 1);
        }
        return error;
    }

    if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
        if (dolock) {
            socket_unlock(so, 1);
        }
        return EPERM;
    }

    /*
     * If protocol is connection-based, can only connect once.
     * Otherwise, if connected, try to disconnect first.
     * This allows user to disconnect by connecting to, e.g.,
     * a null address.
     */
    if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
        ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
        (error = sodisconnectlocked(so)))) {
        error = EISCONN;
    } else {
        /*
         * For connected v4/v6 sockets, check if the destination
         * address associates with a domain name and if it is a
         * tracker domain. Mark the socket accordingly. Skip the
         * lookup if the socket has already been marked a tracker.
         */
        if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
            if (tracker_lookup(so->so_flags & SOF_DELEGATED ?
                so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
                if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
                    so->so_flags1 |= SOF1_KNOWN_TRACKER;
                }
                if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
                    so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
                }
                if (necp_set_socket_domain_attributes(so,
                    metadata.domain, metadata.domain_owner)) {
                    printf("connect() - failed necp_set_socket_domain_attributes");
                }
            }
        }

        /*
         * Run connect filter before calling protocol:
         * - non-blocking connect returns before completion;
         */
        error = sflt_connectout(so, nam);
        if (error != 0) {
            if (error == EJUSTRETURN) {
                error = 0;
            }
        } else {
            error = (*so->so_proto->pr_usrreqs->pru_connect)
                (so, nam, p);
            if (error != 0) {
                so->so_state &= ~SS_ISCONNECTING;
            }
        }
    }
    if (dolock) {
        socket_unlock(so, 1);
    }
    return error;
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
    return soconnectlock(so, nam, 1);
}

/*
 * Returns:     0                       Success
 *      <pru_connect2>:EINVAL[AF_UNIX]
 *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
 *      <pru_connect2>:???              [other protocol families]
 *
 * Notes:       <pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
    int error;

    socket_lock(so1, 1);
    if (so2->so_proto->pr_lock) {
        socket_lock(so2, 1);
    }

    error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

    socket_unlock(so1, 1);
    if (so2->so_proto->pr_lock) {
        socket_unlock(so2, 1);
    }
    return error;
}
1769
1770 int
soconnectxlocked(struct socket * so,struct sockaddr * src,struct sockaddr * dst,struct proc * p,uint32_t ifscope,sae_associd_t aid,sae_connid_t * pcid,uint32_t flags,void * arg,uint32_t arglen,uio_t auio,user_ssize_t * bytes_written)1771 soconnectxlocked(struct socket *so, struct sockaddr *src,
1772 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1773 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1774 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1775 {
1776 int error;
1777 tracker_metadata_t metadata = { };
1778
1779 so_update_last_owner_locked(so, p);
1780 so_update_policy(so);
1781
1782 /*
1783 * If this is a listening socket or if this is a previously-accepted
1784 * socket that has been marked as inactive, reject the connect request.
1785 */
1786 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1787 error = EOPNOTSUPP;
1788 if (so->so_flags & SOF_DEFUNCT) {
1789 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1790 "(%d)\n", __func__, proc_pid(p),
1791 proc_best_name(p),
1792 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1793 SOCK_DOM(so), SOCK_TYPE(so), error);
1794 }
1795 return error;
1796 }
1797
1798 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1799 return EPERM;
1800 }
1801
1802 /*
1803 * If protocol is connection-based, can only connect once
1804 * unless PR_MULTICONN is set. Otherwise, if connected,
1805 * try to disconnect first. This allows user to disconnect
1806 * by connecting to, e.g., a null address.
1807 */
1808 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1809 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1810 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1811 (error = sodisconnectlocked(so)) != 0)) {
1812 error = EISCONN;
1813 } else {
1814 /*
1815 * For TCP, check if destination address is a tracker and mark the socket accordingly
1816 * (only if it hasn't been marked yet).
1817 */
1818 if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
1819 !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1820 if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
1821 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1822 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1823 }
1824 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1825 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1826 }
1827 if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
1828 				printf("connectx() - failed necp_set_socket_domain_attributes\n");
1829 }
1830 }
1831 }
1832
1833 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1834 (flags & CONNECT_DATA_IDEMPOTENT)) {
1835 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1836
1837 if (flags & CONNECT_DATA_AUTHENTICATED) {
1838 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1839 }
1840 }
1841
1842 /*
1843 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1844 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1845 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1846 * Case 3 allows user to combine write with connect even if they have
1847 		 * no use for TFO (such as regular TCP or UDP).
1848 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1849 */
1850 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1851 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1852 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1853 }
1854
1855 /*
1856 		 * If a user sets data idempotent but neither passes an uio nor
1857 		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1858 		 * SOF1_DATA_IDEMPOTENT.
1859 */
1860 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1861 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1862 /* We should return EINVAL instead perhaps. */
1863 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1864 }
1865
1866 /*
1867 * Run connect filter before calling protocol:
1868 * - non-blocking connect returns before completion;
1869 */
1870 error = sflt_connectout(so, dst);
1871 if (error != 0) {
1872 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1873 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1874 if (error == EJUSTRETURN) {
1875 error = 0;
1876 }
1877 } else {
1878 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1879 (so, src, dst, p, ifscope, aid, pcid,
1880 flags, arg, arglen, auio, bytes_written);
1881 if (error != 0) {
1882 so->so_state &= ~SS_ISCONNECTING;
1883 if (error != EINPROGRESS) {
1884 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1885 }
1886 }
1887 }
1888 }
1889
1890 return error;
1891 }
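
/*
 * Illustrative userspace sketch (not part of this build; assumes
 * Darwin's connectx(2) interface): CONNECT_RESUME_ON_READ_WRITE
 * defers the handshake until the first read/write, which is how
 * SOF1_PRECONNECT_DATA ends up set above without any data passed
 * here. "sin" is assumed to be a filled-in sockaddr_in.
 *
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = (struct sockaddr *)&sin,
 *		.sae_dstaddrlen = sizeof(sin),
 *	};
 *	sae_connid_t cid;
 *	int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
 *	if (connectx(s, &sae, SAE_ASSOCID_ANY,
 *	    CONNECT_RESUME_ON_READ_WRITE | CONNECT_DATA_IDEMPOTENT,
 *	    NULL, 0, NULL, &cid) == 0) {
 *		// the first send() may ride the SYN (TFO) when possible
 *		send(s, "GET /\r\n", 7, 0);
 *	}
 */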
1892
1893 int
1894 sodisconnectlocked(struct socket *so)
1895 {
1896 int error;
1897
1898 if ((so->so_state & SS_ISCONNECTED) == 0) {
1899 error = ENOTCONN;
1900 goto bad;
1901 }
1902 if (so->so_state & SS_ISDISCONNECTING) {
1903 error = EALREADY;
1904 goto bad;
1905 }
1906
1907 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1908 if (error == 0) {
1909 sflt_notify(so, sock_evt_disconnected, NULL);
1910 }
1911
1912 bad:
1913 return error;
1914 }
1915
1916 /* Locking version */
1917 int
1918 sodisconnect(struct socket *so)
1919 {
1920 int error;
1921
1922 socket_lock(so, 1);
1923 error = sodisconnectlocked(so);
1924 socket_unlock(so, 1);
1925 return error;
1926 }
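
/*
 * Illustrative userspace sketch (not part of this build): the "null
 * address" disconnect mentioned in the comments above is the classic
 * AF_UNSPEC trick on datagram sockets, dissolving the association
 * instead of failing the next connect(2) with EISCONN; "s" is
 * assumed to be a connected UDP socket.
 *
 *	struct sockaddr sa = { .sa_len = sizeof(sa),
 *	    .sa_family = AF_UNSPEC };
 *	(void) connect(s, &sa, sizeof(sa));
 */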
1927
1928 int
1929 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1930 {
1931 int error;
1932
1933 /*
1934 * Call the protocol disconnectx handler; let it handle all
1935 * matters related to the connection state of this session.
1936 */
1937 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1938 if (error == 0) {
1939 /*
1940 * The event applies only for the session, not for
1941 * the disconnection of individual subflows.
1942 */
1943 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1944 sflt_notify(so, sock_evt_disconnected, NULL);
1945 }
1946 }
1947 return error;
1948 }
1949
1950 int
1951 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1952 {
1953 int error;
1954
1955 socket_lock(so, 1);
1956 error = sodisconnectxlocked(so, aid, cid);
1957 socket_unlock(so, 1);
1958 return error;
1959 }
1960
1961 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1962
1963 /*
1964 * sosendcheck will lock the socket buffer if it isn't locked and
1965 * verify that there is space for the data being inserted.
1966 *
1967 * Returns: 0 Success
1968 * EPIPE
1969 * sblock:EWOULDBLOCK
1970 * sblock:EINTR
1971 * sbwait:EBADF
1972 * sbwait:EINTR
1973 * [so_error]:???
1974 */
1975 int
1976 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1977 int32_t clen, int32_t atomic, int flags, int *sblocked)
1978 {
1979 int error = 0;
1980 int32_t space;
1981 int assumelock = 0;
1982
1983 restart:
1984 if (*sblocked == 0) {
1985 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1986 so->so_send_filt_thread != 0 &&
1987 so->so_send_filt_thread == current_thread()) {
1988 /*
1989 * We're being called recursively from a filter,
1990 * allow this to continue. Radar 4150520.
1991 * Don't set sblocked because we don't want
1992 * to perform an unlock later.
1993 */
1994 assumelock = 1;
1995 } else {
1996 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1997 if (error) {
1998 if (so->so_flags & SOF_DEFUNCT) {
1999 goto defunct;
2000 }
2001 return error;
2002 }
2003 *sblocked = 1;
2004 }
2005 }
2006
2007 /*
2008 * If a send attempt is made on a socket that has been marked
2009 * as inactive (disconnected), reject the request.
2010 */
2011 if (so->so_flags & SOF_DEFUNCT) {
2012 defunct:
2013 error = EPIPE;
2014 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2015 __func__, proc_selfpid(), proc_best_name(current_proc()),
2016 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2017 SOCK_DOM(so), SOCK_TYPE(so), error);
2018 return error;
2019 }
2020
2021 if (so->so_state & SS_CANTSENDMORE) {
2022 #if CONTENT_FILTER
2023 /*
2024 * Can re-inject data of half closed connections
2025 */
2026 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2027 so->so_snd.sb_cfil_thread == current_thread() &&
2028 cfil_sock_data_pending(&so->so_snd) != 0) {
2029 CFIL_LOG(LOG_INFO,
2030 "so %llx ignore SS_CANTSENDMORE",
2031 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2032 } else
2033 #endif /* CONTENT_FILTER */
2034 return EPIPE;
2035 }
2036 if (so->so_error) {
2037 error = so->so_error;
2038 so->so_error = 0;
2039 return error;
2040 }
2041
2042 if ((so->so_state & SS_ISCONNECTED) == 0) {
2043 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2044 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2045 (resid != 0 || clen == 0) &&
2046 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2047 return ENOTCONN;
2048 }
2049 } else if (addr == 0) {
2050 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2051 ENOTCONN : EDESTADDRREQ;
2052 }
2053 }
2054
2055 space = sbspace(&so->so_snd);
2056
2057 if (flags & MSG_OOB) {
2058 space += 1024;
2059 }
2060 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2061 clen > so->so_snd.sb_hiwat) {
2062 return EMSGSIZE;
2063 }
2064
2065 if ((space < resid + clen &&
2066 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2067 space < clen)) ||
2068 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2069 /*
2070 * don't block the connectx call when there's more data
2071 * than can be copied.
2072 */
2073 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2074 if (space == 0) {
2075 return EWOULDBLOCK;
2076 }
2077 if (space < (int32_t)so->so_snd.sb_lowat) {
2078 return 0;
2079 }
2080 }
2081 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2082 assumelock) {
2083 return EWOULDBLOCK;
2084 }
2085 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2086 *sblocked = 0;
2087 error = sbwait(&so->so_snd);
2088 if (error) {
2089 if (so->so_flags & SOF_DEFUNCT) {
2090 goto defunct;
2091 }
2092 return error;
2093 }
2094 goto restart;
2095 }
2096 return 0;
2097 }
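
/*
 * Illustrative userspace sketch (not part of this build): the
 * EWOULDBLOCK path above is what a non-blocking sender sees when
 * there is not enough room in the send buffer; the usual response
 * is to wait for writability and retry.
 *
 *	ssize_t n = send(s, buf, len, MSG_DONTWAIT);
 *	if (n == -1 && (errno == EWOULDBLOCK || errno == EAGAIN)) {
 *		struct pollfd pfd = { .fd = s, .events = POLLOUT };
 *		(void) poll(&pfd, 1, -1);
 *		n = send(s, buf, len, MSG_DONTWAIT);	// retry once writable
 *	}
 */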
2098
2099 /*
2100 * Send on a socket.
2101 * If send must go all at once and message is larger than
2102 * send buffering, then hard error.
2103 * Lock against other senders.
2104 * If must go all at once and not enough room now, then
2105 * inform user that this would block and do nothing.
2106 * Otherwise, if nonblocking, send as much as possible.
2107 * The data to be sent is described by "uio" if nonzero,
2108 * otherwise by the mbuf chain "top" (which must be null
2109 * if uio is not). Data provided in mbuf chain must be small
2110 * enough to send all at once.
2111 *
2112 * Returns nonzero on error, timeout or signal; callers
2113 * must check for short counts if EINTR/ERESTART are returned.
2114 * Data and control buffers are freed on return.
2115 *
2116 * Returns: 0 Success
2117 * EOPNOTSUPP
2118 * EINVAL
2119 * ENOBUFS
2120 * uiomove:EFAULT
2121 * sosendcheck:EPIPE
2122 * sosendcheck:EWOULDBLOCK
2123 * sosendcheck:EINTR
2124 * sosendcheck:EBADF
2125 * sosendcheck:EINTR
2126 * sosendcheck:??? [value from so_error]
2127 * <pru_send>:ECONNRESET[TCP]
2128 * <pru_send>:EINVAL[TCP]
2129 * <pru_send>:ENOBUFS[TCP]
2130 * <pru_send>:EADDRINUSE[TCP]
2131 * <pru_send>:EADDRNOTAVAIL[TCP]
2132 * <pru_send>:EAFNOSUPPORT[TCP]
2133 * <pru_send>:EACCES[TCP]
2134 * <pru_send>:EAGAIN[TCP]
2135 * <pru_send>:EPERM[TCP]
2136 * <pru_send>:EMSGSIZE[TCP]
2137 * <pru_send>:EHOSTUNREACH[TCP]
2138 * <pru_send>:ENETUNREACH[TCP]
2139 * <pru_send>:ENETDOWN[TCP]
2140 * <pru_send>:ENOMEM[TCP]
2141 * <pru_send>:ENOBUFS[TCP]
2142 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2143 * <pru_send>:EINVAL[AF_UNIX]
2144 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2145 * <pru_send>:EPIPE[AF_UNIX]
2146 * <pru_send>:ENOTCONN[AF_UNIX]
2147 * <pru_send>:EISCONN[AF_UNIX]
2148 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2149 * <sf_data_out>:??? [whatever a filter author chooses]
2150 *
2151 * Notes: Other <pru_send> returns depend on the protocol family; all
2152 * <sf_data_out> returns depend on what the filter author causes
2153 * their filter to return.
2154 */
2155 int
2156 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2157 struct mbuf *top, struct mbuf *control, int flags)
2158 {
2159 struct mbuf **mp;
2160 struct mbuf *m, *freelist = NULL;
2161 struct soflow_hash_entry *dgram_flow_entry = NULL;
2162 user_ssize_t space, len, resid, orig_resid;
2163 int clen = 0, error, dontroute, sendflags;
2164 int atomic = sosendallatonce(so) || top;
2165 int sblocked = 0;
2166 struct proc *p = current_proc();
2167 uint16_t headroom = 0;
2168 ssize_t mlen;
2169 boolean_t en_tracing = FALSE;
2170
2171 if (uio != NULL) {
2172 resid = uio_resid(uio);
2173 } else {
2174 resid = top->m_pkthdr.len;
2175 }
2176
2177 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2178 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2179
2180 socket_lock(so, 1);
2181
2182 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2183 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2184 }
2185
2186 /*
2187 	 * trace if tracing is enabled, this is a network (vs. unix)
2188 	 * socket, and it is non-loopback
2189 */
2190 if (ENTR_SHOULDTRACE &&
2191 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2192 struct inpcb *inp = sotoinpcb(so);
2193 if (inp->inp_last_outifp != NULL &&
2194 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2195 en_tracing = TRUE;
2196 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2197 VM_KERNEL_ADDRPERM(so),
2198 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2199 (int64_t)resid);
2200 orig_resid = resid;
2201 }
2202 }
2203
2204 /*
2205 * Re-injection should not affect process accounting
2206 */
2207 if ((flags & MSG_SKIPCFIL) == 0) {
2208 so_update_last_owner_locked(so, p);
2209 so_update_policy(so);
2210
2211 #if NECP
2212 so_update_necp_policy(so, NULL, addr);
2213 #endif /* NECP */
2214 }
2215
2216 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2217 error = EOPNOTSUPP;
2218 goto out_locked;
2219 }
2220
2221 /*
2222 * In theory resid should be unsigned.
2223 * However, space must be signed, as it might be less than 0
2224 * if we over-committed, and we must use a signed comparison
2225 * of space and resid. On the other hand, a negative resid
2226 * causes us to loop sending 0-length segments to the protocol.
2227 *
2228 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2229 *
2230 * Note: We limit resid to be a positive int value as we use
2231 * imin() to set bytes_to_copy -- radr://14558484
2232 */
2233 if (resid < 0 || resid > INT_MAX ||
2234 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2235 error = EINVAL;
2236 goto out_locked;
2237 }
2238
2239 dontroute = (flags & MSG_DONTROUTE) &&
2240 (so->so_options & SO_DONTROUTE) == 0 &&
2241 (so->so_proto->pr_flags & PR_ATOMIC);
2242 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2243
2244 if (control != NULL) {
2245 clen = control->m_len;
2246 }
2247
2248 if (soreserveheadroom != 0) {
2249 headroom = so->so_pktheadroom;
2250 }
2251
2252 do {
2253 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2254 &sblocked);
2255 if (error) {
2256 goto out_locked;
2257 }
2258
2259 mp = ⊤
2260 space = sbspace(&so->so_snd) - clen;
2261 space += ((flags & MSG_OOB) ? 1024 : 0);
2262
2263 do {
2264 if (uio == NULL) {
2265 /*
2266 * Data is prepackaged in "top".
2267 */
2268 resid = 0;
2269 if (flags & MSG_EOR) {
2270 top->m_flags |= M_EOR;
2271 }
2272 } else {
2273 int chainlength;
2274 int bytes_to_copy;
2275 boolean_t jumbocl;
2276 boolean_t bigcl;
2277 int bytes_to_alloc;
2278
2279 bytes_to_copy = imin((int)resid, (int)space);
2280
2281 bytes_to_alloc = bytes_to_copy;
2282 if (top == NULL) {
2283 bytes_to_alloc += headroom;
2284 }
2285
2286 if (sosendminchain > 0) {
2287 chainlength = 0;
2288 } else {
2289 chainlength = sosendmaxchain;
2290 }
2291
2292 /*
2293 			 * Use big 4 KB clusters when the outgoing interface
2294 * does not prefer 2 KB clusters
2295 */
2296 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2297 sosendbigcl_ignore_capab;
2298
2299 /*
2300 * Attempt to use larger than system page-size
2301 * clusters for large writes only if there is
2302 * a jumbo cluster pool and if the socket is
2303 * marked accordingly.
2304 */
2305 jumbocl = sosendjcl && njcl > 0 &&
2306 ((so->so_flags & SOF_MULTIPAGES) ||
2307 sosendjcl_ignore_capab) &&
2308 bigcl;
2309
2310 socket_unlock(so, 0);
2311
2312 do {
2313 int num_needed;
2314 int hdrs_needed = (top == NULL) ? 1 : 0;
2315
2316 /*
2317 				 * Try to maintain a local cache of mbuf
2318 				 * clusters needed to complete this
2319 				 * write. The list is further limited to
2320 				 * the number that are currently needed
2321 				 * to fill the socket. This mechanism
2322 				 * allows a large number of mbufs/
2323 				 * clusters to be grabbed under a single
2324 				 * mbuf lock. If we can't get any
2325 				 * clusters, then fall back to trying
2326 				 * for mbufs. If we fail early (or
2327 				 * miscalculate the number needed), make
2328 				 * sure to release any clusters we
2329 				 * haven't yet consumed.
2330 */
2331 if (freelist == NULL &&
2332 bytes_to_alloc > MBIGCLBYTES &&
2333 jumbocl) {
2334 num_needed =
2335 bytes_to_alloc / M16KCLBYTES;
2336
2337 if ((bytes_to_alloc -
2338 (num_needed * M16KCLBYTES))
2339 >= MINCLSIZE) {
2340 num_needed++;
2341 }
2342
2343 freelist =
2344 m_getpackets_internal(
2345 (unsigned int *)&num_needed,
2346 hdrs_needed, M_WAIT, 0,
2347 M16KCLBYTES);
2348 /*
2349 * Fall back to 4K cluster size
2350 * if allocation failed
2351 */
2352 }
2353
2354 if (freelist == NULL &&
2355 bytes_to_alloc > MCLBYTES &&
2356 bigcl) {
2357 num_needed =
2358 bytes_to_alloc / MBIGCLBYTES;
2359
2360 if ((bytes_to_alloc -
2361 (num_needed * MBIGCLBYTES)) >=
2362 MINCLSIZE) {
2363 num_needed++;
2364 }
2365
2366 freelist =
2367 m_getpackets_internal(
2368 (unsigned int *)&num_needed,
2369 hdrs_needed, M_WAIT, 0,
2370 MBIGCLBYTES);
2371 /*
2372 * Fall back to cluster size
2373 * if allocation failed
2374 */
2375 }
2376
2377 /*
2378 				 * Allocate a cluster as we want to
2379 				 * avoid splitting the data across more
2380 				 * than one segment; using MINCLSIZE
2381 				 * would lead us to allocate two mbufs
2382 */
2383 if (soreserveheadroom != 0 &&
2384 freelist == NULL &&
2385 ((top == NULL &&
2386 bytes_to_alloc > _MHLEN) ||
2387 bytes_to_alloc > _MLEN)) {
2388 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2389 MCLBYTES;
2390 freelist =
2391 m_getpackets_internal(
2392 (unsigned int *)&num_needed,
2393 hdrs_needed, M_WAIT, 0,
2394 MCLBYTES);
2395 /*
2396 * Fall back to a single mbuf
2397 * if allocation failed
2398 */
2399 } else if (freelist == NULL &&
2400 bytes_to_alloc > MINCLSIZE) {
2401 num_needed =
2402 bytes_to_alloc / MCLBYTES;
2403
2404 if ((bytes_to_alloc -
2405 (num_needed * MCLBYTES)) >=
2406 MINCLSIZE) {
2407 num_needed++;
2408 }
2409
2410 freelist =
2411 m_getpackets_internal(
2412 (unsigned int *)&num_needed,
2413 hdrs_needed, M_WAIT, 0,
2414 MCLBYTES);
2415 /*
2416 * Fall back to a single mbuf
2417 * if allocation failed
2418 */
2419 }
2420 /*
2421 * For datagram protocols, leave
2422 * headroom for protocol headers
2423 * in the first cluster of the chain
2424 */
2425 if (freelist != NULL && atomic &&
2426 top == NULL && headroom > 0) {
2427 freelist->m_data += headroom;
2428 }
2429
2430 /*
2431 * Fall back to regular mbufs without
2432 * reserving the socket headroom
2433 */
2434 if (freelist == NULL) {
2435 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2436 if (top == NULL) {
2437 MGETHDR(freelist,
2438 M_WAIT, MT_DATA);
2439 } else {
2440 MGET(freelist,
2441 M_WAIT, MT_DATA);
2442 }
2443 }
2444
2445 if (freelist == NULL) {
2446 error = ENOBUFS;
2447 socket_lock(so, 0);
2448 goto out_locked;
2449 }
2450 /*
2451 * For datagram protocols,
2452 * leave room for protocol
2453 * headers in first mbuf.
2454 */
2455 if (atomic && top == NULL &&
2456 bytes_to_copy < MHLEN) {
2457 MH_ALIGN(freelist,
2458 bytes_to_copy);
2459 }
2460 }
2461 m = freelist;
2462 freelist = m->m_next;
2463 m->m_next = NULL;
2464
2465 if ((m->m_flags & M_EXT)) {
2466 mlen = m->m_ext.ext_size -
2467 M_LEADINGSPACE(m);
2468 } else if ((m->m_flags & M_PKTHDR)) {
2469 mlen = MHLEN - M_LEADINGSPACE(m);
2470 m_add_crumb(m, PKT_CRUMB_SOSEND);
2471 } else {
2472 mlen = MLEN - M_LEADINGSPACE(m);
2473 }
2474 len = imin((int)mlen, bytes_to_copy);
2475
2476 chainlength += len;
2477
2478 space -= len;
2479
2480 error = uiomove(mtod(m, caddr_t),
2481 (int)len, uio);
2482
2483 resid = uio_resid(uio);
2484
2485 m->m_len = (int32_t)len;
2486 *mp = m;
2487 top->m_pkthdr.len += len;
2488 if (error) {
2489 break;
2490 }
2491 mp = &m->m_next;
2492 if (resid <= 0) {
2493 if (flags & MSG_EOR) {
2494 top->m_flags |= M_EOR;
2495 }
2496 break;
2497 }
2498 bytes_to_copy = imin((int)resid, (int)space);
2499 } while (space > 0 &&
2500 (chainlength < sosendmaxchain || atomic ||
2501 resid < MINCLSIZE));
2502
2503 socket_lock(so, 0);
2504
2505 if (error) {
2506 goto out_locked;
2507 }
2508 }
2509
2510 if (dontroute) {
2511 so->so_options |= SO_DONTROUTE;
2512 }
2513
2514 /*
2515 * Compute flags here, for pru_send and NKEs
2516 *
2517 			 * If the user set MSG_EOF, the protocol
2518 			 * understands this flag, and there is nothing left
2519 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2520 */
2521 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2522 ((flags & MSG_EOF) &&
2523 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2524 (resid <= 0)) ? PRUS_EOF :
2525 /* If there is more to send set PRUS_MORETOCOME */
2526 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2527
2528 if ((flags & MSG_SKIPCFIL) == 0) {
2529 /*
2530 * Socket filter processing
2531 */
2532 error = sflt_data_out(so, addr, &top,
2533 &control, (sendflags & MSG_OOB) ?
2534 sock_data_filt_flag_oob : 0);
2535 if (error) {
2536 if (error == EJUSTRETURN) {
2537 error = 0;
2538 goto packet_consumed;
2539 }
2540 goto out_locked;
2541 }
2542 #if CONTENT_FILTER
2543 /*
2544 * Content filter processing
2545 */
2546 error = cfil_sock_data_out(so, addr, top,
2547 control, sendflags, dgram_flow_entry);
2548 if (error) {
2549 if (error == EJUSTRETURN) {
2550 error = 0;
2551 goto packet_consumed;
2552 }
2553 goto out_locked;
2554 }
2555 #endif /* CONTENT_FILTER */
2556 }
2557 error = (*so->so_proto->pr_usrreqs->pru_send)
2558 (so, sendflags, top, addr, control, p);
2559
2560 packet_consumed:
2561 if (dontroute) {
2562 so->so_options &= ~SO_DONTROUTE;
2563 }
2564
2565 clen = 0;
2566 control = NULL;
2567 top = NULL;
2568 mp = ⊤
2569 if (error) {
2570 goto out_locked;
2571 }
2572 } while (resid && space > 0);
2573 } while (resid);
2574
2575 out_locked:
2576 if (sblocked) {
2577 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2578 } else {
2579 socket_unlock(so, 1);
2580 }
2581 if (top != NULL) {
2582 m_freem(top);
2583 }
2584 if (control != NULL) {
2585 m_freem(control);
2586 }
2587 if (freelist != NULL) {
2588 m_freem_list(freelist);
2589 }
2590
2591 if (dgram_flow_entry != NULL) {
2592 soflow_free_flow(dgram_flow_entry);
2593 }
2594
2595 soclearfastopen(so);
2596
2597 if (en_tracing) {
2598 /* resid passed here is the bytes left in uio */
2599 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2600 VM_KERNEL_ADDRPERM(so),
2601 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2602 (int64_t)(orig_resid - resid));
2603 }
2604 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2605 so->so_snd.sb_cc, space, error);
2606
2607 return error;
2608 }
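
/*
 * Illustrative userspace sketch (not part of this build): per the
 * notes above sosend(), a send can be interrupted after moving some
 * data, so robust callers loop on short counts.
 *
 *	size_t off = 0;
 *	while (off < len) {
 *		ssize_t n = send(s, buf + off, len - off, 0);
 *		if (n == -1) {
 *			if (errno == EINTR)
 *				continue;	// retry after a signal
 *			break;			// real error
 *		}
 *		off += (size_t)n;
 *	}
 */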
2609
2610 int
2611 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2612 {
2613 struct mbuf *m0 = NULL, *control_end = NULL;
2614
2615 socket_lock_assert_owned(so);
2616
2617 /*
2618 	 * top must point to the mbuf chain to be sent.
2619 	 * If control is not NULL, top must be a packet header.
2620 */
2621 VERIFY(top != NULL &&
2622 (control == NULL || top->m_flags & M_PKTHDR));
2623
2624 /*
2625 * If control is not passed in, see if we can get it
2626 * from top.
2627 */
2628 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2629 // Locate start of control if present and start of data
2630 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2631 if (m0->m_flags & M_PKTHDR) {
2632 top = m0;
2633 break;
2634 } else if (m0->m_type == MT_CONTROL) {
2635 if (control == NULL) {
2636 // Found start of control
2637 control = m0;
2638 }
2639 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2640 // Found end of control
2641 control_end = m0;
2642 }
2643 }
2644 }
2645 if (control_end != NULL) {
2646 control_end->m_next = NULL;
2647 }
2648 }
2649
2650 int error = (*so->so_proto->pr_usrreqs->pru_send)
2651 (so, sendflags, top, addr, control, current_proc());
2652
2653 return error;
2654 }
2655
2656 /*
2657  * Supported only for connected sockets (no address) without ancillary
2658  * data (control mbuf), and only for atomic protocols
2659 */
2660 int
2661 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2662 {
2663 struct mbuf *m, *freelist = NULL;
2664 struct soflow_hash_entry *dgram_flow_entry = NULL;
2665 user_ssize_t len, resid;
2666 int error, dontroute;
2667 int atomic = sosendallatonce(so);
2668 int sblocked = 0;
2669 struct proc *p = current_proc();
2670 u_int uiofirst = 0;
2671 u_int uiolast = 0;
2672 struct mbuf *top = NULL;
2673 uint16_t headroom = 0;
2674 ssize_t mlen;
2675 boolean_t bigcl;
2676
2677 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2678 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2679
2680 if (so->so_type != SOCK_DGRAM) {
2681 error = EINVAL;
2682 goto out;
2683 }
2684 if (atomic == 0) {
2685 error = EINVAL;
2686 goto out;
2687 }
2688 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2689 error = EPROTONOSUPPORT;
2690 goto out;
2691 }
2692 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2693 error = EINVAL;
2694 goto out;
2695 }
2696 resid = uio_array_resid(uioarray, uiocnt);
2697
2698 /*
2699 * In theory resid should be unsigned.
2700 * However, space must be signed, as it might be less than 0
2701 * if we over-committed, and we must use a signed comparison
2702 * of space and resid. On the other hand, a negative resid
2703 * causes us to loop sending 0-length segments to the protocol.
2704 *
2705 * Note: We limit resid to be a positive int value as we use
2706 * imin() to set bytes_to_copy -- radr://14558484
2707 */
2708 if (resid < 0 || resid > INT_MAX) {
2709 error = EINVAL;
2710 goto out;
2711 }
2712
2713 socket_lock(so, 1);
2714 so_update_last_owner_locked(so, p);
2715 so_update_policy(so);
2716
2717 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2718 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
2719 }
2720
2721 #if NECP
2722 so_update_necp_policy(so, NULL, NULL);
2723 #endif /* NECP */
2724
2725 dontroute = (flags & MSG_DONTROUTE) &&
2726 (so->so_options & SO_DONTROUTE) == 0 &&
2727 (so->so_proto->pr_flags & PR_ATOMIC);
2728 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2729
2730 error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2731 if (error) {
2732 goto release;
2733 }
2734
2735 /*
2736 * Use big 4 KB clusters when the outgoing interface does not prefer
2737 * 2 KB clusters
2738 */
2739 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2740
2741 if (soreserveheadroom != 0) {
2742 headroom = so->so_pktheadroom;
2743 }
2744
2745 do {
2746 int i;
2747 int num_needed = 0;
2748 int chainlength;
2749 size_t maxpktlen = 0;
2750 int bytes_to_alloc;
2751
2752 if (sosendminchain > 0) {
2753 chainlength = 0;
2754 } else {
2755 chainlength = sosendmaxchain;
2756 }
2757
2758 socket_unlock(so, 0);
2759
2760 /*
2761 		 * Find a set of uios that fits in a reasonable number
2762 * of mbuf packets
2763 */
2764 for (i = uiofirst; i < uiocnt; i++) {
2765 struct uio *auio = uioarray[i];
2766
2767 len = uio_resid(auio);
2768
2769 /* Do nothing for empty messages */
2770 if (len == 0) {
2771 continue;
2772 }
2773
2774 num_needed += 1;
2775 uiolast += 1;
2776
2777 if (len > maxpktlen) {
2778 maxpktlen = len;
2779 }
2780
2781 chainlength += len;
2782 if (chainlength > sosendmaxchain) {
2783 break;
2784 }
2785 }
2786 /*
2787 * Nothing left to send
2788 */
2789 if (num_needed == 0) {
2790 socket_lock(so, 0);
2791 break;
2792 }
2793 /*
2794 * Allocate buffer large enough to include headroom space for
2795 * network and link header
2796 *
2797 */
2798 bytes_to_alloc = (int) maxpktlen + headroom;
2799
2800 /*
2801 * Allocate a single contiguous buffer of the smallest available
2802 * size when possible
2803 */
2804 if (bytes_to_alloc > MCLBYTES &&
2805 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2806 freelist = m_getpackets_internal(
2807 (unsigned int *)&num_needed,
2808 num_needed, M_WAIT, 1,
2809 MBIGCLBYTES);
2810 } else if (bytes_to_alloc > _MHLEN &&
2811 bytes_to_alloc <= MCLBYTES) {
2812 freelist = m_getpackets_internal(
2813 (unsigned int *)&num_needed,
2814 num_needed, M_WAIT, 1,
2815 MCLBYTES);
2816 } else {
2817 freelist = m_allocpacket_internal(
2818 (unsigned int *)&num_needed,
2819 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2820 }
2821
2822 if (freelist == NULL) {
2823 socket_lock(so, 0);
2824 error = ENOMEM;
2825 goto release;
2826 }
2827 /*
2828 * Copy each uio of the set into its own mbuf packet
2829 */
2830 for (i = uiofirst, m = freelist;
2831 i < uiolast && m != NULL;
2832 i++) {
2833 int bytes_to_copy;
2834 struct mbuf *n;
2835 struct uio *auio = uioarray[i];
2836
2837 bytes_to_copy = (int)uio_resid(auio);
2838
2839 /* Do nothing for empty messages */
2840 if (bytes_to_copy == 0) {
2841 continue;
2842 }
2843 /*
2844 * Leave headroom for protocol headers
2845 * in the first mbuf of the chain
2846 */
2847 m->m_data += headroom;
2848
2849 for (n = m; n != NULL; n = n->m_next) {
2850 if ((m->m_flags & M_EXT)) {
2851 mlen = m->m_ext.ext_size -
2852 M_LEADINGSPACE(m);
2853 } else if ((m->m_flags & M_PKTHDR)) {
2854 mlen =
2855 MHLEN - M_LEADINGSPACE(m);
2856 } else {
2857 mlen = MLEN - M_LEADINGSPACE(m);
2858 }
2859 len = imin((int)mlen, bytes_to_copy);
2860
2861 /*
2862 * Note: uiomove() decrements the iovec
2863 * length
2864 */
2865 error = uiomove(mtod(n, caddr_t),
2866 (int)len, auio);
2867 if (error != 0) {
2868 break;
2869 }
2870 n->m_len = (int32_t)len;
2871 m->m_pkthdr.len += len;
2872
2873 VERIFY(m->m_pkthdr.len <= maxpktlen);
2874
2875 bytes_to_copy -= len;
2876 resid -= len;
2877 }
2878 if (m->m_pkthdr.len == 0) {
2879 printf(
2880 "%s:%d so %llx pkt %llx type %u len null\n",
2881 __func__, __LINE__,
2882 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2883 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2884 m->m_type);
2885 }
2886 if (error != 0) {
2887 break;
2888 }
2889 m = m->m_nextpkt;
2890 }
2891
2892 socket_lock(so, 0);
2893
2894 if (error) {
2895 goto release;
2896 }
2897 top = freelist;
2898 freelist = NULL;
2899
2900 if (dontroute) {
2901 so->so_options |= SO_DONTROUTE;
2902 }
2903
2904 if ((flags & MSG_SKIPCFIL) == 0) {
2905 struct mbuf **prevnextp = NULL;
2906
2907 for (i = uiofirst, m = top;
2908 i < uiolast && m != NULL;
2909 i++) {
2910 struct mbuf *nextpkt = m->m_nextpkt;
2911
2912 /*
2913 * Socket filter processing
2914 */
2915 error = sflt_data_out(so, NULL, &m,
2916 NULL, 0);
2917 if (error != 0 && error != EJUSTRETURN) {
2918 goto release;
2919 }
2920
2921 #if CONTENT_FILTER
2922 if (error == 0) {
2923 /*
2924 * Content filter processing
2925 */
2926 error = cfil_sock_data_out(so, NULL, m,
2927 NULL, 0, dgram_flow_entry);
2928 if (error != 0 && error != EJUSTRETURN) {
2929 goto release;
2930 }
2931 }
2932 #endif /* CONTENT_FILTER */
2933 /*
2934 * Remove packet from the list when
2935 * swallowed by a filter
2936 */
2937 if (error == EJUSTRETURN) {
2938 error = 0;
2939 if (prevnextp != NULL) {
2940 *prevnextp = nextpkt;
2941 } else {
2942 top = nextpkt;
2943 }
2944 }
2945
2946 m = nextpkt;
2947 if (m != NULL) {
2948 prevnextp = &m->m_nextpkt;
2949 }
2950 }
2951 }
2952 if (top != NULL) {
2953 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2954 (so, 0, top, NULL, NULL, p);
2955 }
2956
2957 if (dontroute) {
2958 so->so_options &= ~SO_DONTROUTE;
2959 }
2960
2961 top = NULL;
2962 uiofirst = uiolast;
2963 } while (resid > 0 && error == 0);
2964 release:
2965 if (sblocked) {
2966 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2967 } else {
2968 socket_unlock(so, 1);
2969 }
2970 out:
2971 if (top != NULL) {
2972 m_freem(top);
2973 }
2974 if (freelist != NULL) {
2975 m_freem_list(freelist);
2976 }
2977
2978 if (dgram_flow_entry != NULL) {
2979 soflow_free_flow(dgram_flow_entry);
2980 }
2981
2982 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2983 so->so_snd.sb_cc, 0, error);
2984
2985 return error;
2986 }
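
/*
 * Illustrative sketch (not part of this build): sosend_list() backs
 * the batched-datagram send path; sendmsg_x() and struct msghdr_x
 * are private XNU SPI, and their shapes below are assumptions from
 * the private headers. Each element carries data only -- no address
 * and no control -- on an already-connected SOCK_DGRAM socket.
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = pkt0, .iov_len = len0 },
 *		{ .iov_base = pkt1, .iov_len = len1 },
 *	};
 *	struct msghdr_x msgs[2] = {
 *		{ .msg_iov = &iov[0], .msg_iovlen = 1 },
 *		{ .msg_iov = &iov[1], .msg_iovlen = 1 },
 *	};
 *	// one datagram per msghdr_x; returns how many were sent
 *	ssize_t nsent = sendmsg_x(s, msgs, 2, 0);
 */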
2987
2988 /*
2989 * May return ERESTART when packet is dropped by MAC policy check
2990 */
2991 static int
2992 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2993 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2994 {
2995 int error = 0;
2996 struct mbuf *m = *mp;
2997 struct mbuf *nextrecord = *nextrecordp;
2998
2999 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3000 #if CONFIG_MACF_SOCKET_SUBSET
3001 /*
3002 * Call the MAC framework for policy checking if we're in
3003 * the user process context and the socket isn't connected.
3004 */
3005 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3006 struct mbuf *m0 = m;
3007 /*
3008 * Dequeue this record (temporarily) from the receive
3009 * list since we're about to drop the socket's lock
3010 * where a new record may arrive and be appended to
3011 * the list. Upon MAC policy failure, the record
3012 * will be freed. Otherwise, we'll add it back to
3013 * the head of the list. We cannot rely on SB_LOCK
3014 * because append operation uses the socket's lock.
3015 */
3016 do {
3017 m->m_nextpkt = NULL;
3018 sbfree(&so->so_rcv, m);
3019 m = m->m_next;
3020 } while (m != NULL);
3021 m = m0;
3022 so->so_rcv.sb_mb = nextrecord;
3023 SB_EMPTY_FIXUP(&so->so_rcv);
3024 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3025 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3026 socket_unlock(so, 0);
3027
3028 error = mac_socket_check_received(kauth_cred_get(), so,
3029 mtod(m, struct sockaddr *));
3030
3031 if (error != 0) {
3032 /*
3033 * MAC policy failure; free this record and
3034 * process the next record (or block until
3035 * one is available). We have adjusted sb_cc
3036 * and sb_mbcnt above so there is no need to
3037 * call sbfree() again.
3038 */
3039 m_freem(m);
3040 /*
3041 * Clear SB_LOCK but don't unlock the socket.
3042 * Process the next record or wait for one.
3043 */
3044 socket_lock(so, 0);
3045 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3046 error = ERESTART;
3047 goto done;
3048 }
3049 socket_lock(so, 0);
3050 /*
3051 * If the socket has been defunct'd, drop it.
3052 */
3053 if (so->so_flags & SOF_DEFUNCT) {
3054 m_freem(m);
3055 error = ENOTCONN;
3056 goto done;
3057 }
3058 /*
3059 * Re-adjust the socket receive list and re-enqueue
3060 * the record in front of any packets which may have
3061 * been appended while we dropped the lock.
3062 */
3063 for (m = m0; m->m_next != NULL; m = m->m_next) {
3064 sballoc(&so->so_rcv, m);
3065 }
3066 sballoc(&so->so_rcv, m);
3067 if (so->so_rcv.sb_mb == NULL) {
3068 so->so_rcv.sb_lastrecord = m0;
3069 so->so_rcv.sb_mbtail = m;
3070 }
3071 m = m0;
3072 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3073 so->so_rcv.sb_mb = m;
3074 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3075 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3076 }
3077 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3078 if (psa != NULL) {
3079 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3080 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3081 error = EWOULDBLOCK;
3082 goto done;
3083 }
3084 }
3085 if (flags & MSG_PEEK) {
3086 m = m->m_next;
3087 } else {
3088 sbfree(&so->so_rcv, m);
3089 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3090 panic("%s: about to create invalid socketbuf",
3091 __func__);
3092 /* NOTREACHED */
3093 }
3094 MFREE(m, so->so_rcv.sb_mb);
3095 m = so->so_rcv.sb_mb;
3096 if (m != NULL) {
3097 m->m_nextpkt = nextrecord;
3098 } else {
3099 so->so_rcv.sb_mb = nextrecord;
3100 SB_EMPTY_FIXUP(&so->so_rcv);
3101 }
3102 }
3103 done:
3104 *mp = m;
3105 *nextrecordp = nextrecord;
3106
3107 return error;
3108 }
3109
3110 /*
3111  * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3112  * so clear the data portion in order not to leak the file pointers
3113 */
3114 static void
3115 sopeek_scm_rights(struct mbuf *rights)
3116 {
3117 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3118
3119 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3120 VERIFY(cm->cmsg_len <= rights->m_len);
3121 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3122 }
3123 }
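
/*
 * Illustrative userspace sketch (not part of this build): peeking an
 * SCM_RIGHTS message returns the cmsg header with, per the above, a
 * zeroed payload -- descriptors are only installed by the real
 * (non-peek) receive; "s" is assumed to be an AF_UNIX socket.
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	char data[1];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	(void) recvmsg(s, &msg, MSG_PEEK);	// fd slot reads back as 0
 *	(void) recvmsg(s, &msg, 0);		// now the fd is installed
 */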
3124
3125 /*
3126 * Process one or more MT_CONTROL mbufs present before any data mbufs
3127 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3128 * just copy the data; if !MSG_PEEK, we call into the protocol to
3129 * perform externalization.
3130 */
3131 static int
3132 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3133 struct mbuf **mp, struct mbuf **nextrecordp)
3134 {
3135 int error = 0;
3136 struct mbuf *cm = NULL, *cmn;
3137 struct mbuf **cme = &cm;
3138 struct sockbuf *sb_rcv = &so->so_rcv;
3139 struct mbuf **msgpcm = NULL;
3140 struct mbuf *m = *mp;
3141 struct mbuf *nextrecord = *nextrecordp;
3142 struct protosw *pr = so->so_proto;
3143
3144 /*
3145 * Externalizing the control messages would require us to
3146 * drop the socket's lock below. Once we re-acquire the
3147 * lock, the mbuf chain might change. In order to preserve
3148 * consistency, we unlink all control messages from the
3149 * first mbuf chain in one shot and link them separately
3150 * onto a different chain.
3151 */
3152 do {
3153 if (flags & MSG_PEEK) {
3154 if (controlp != NULL) {
3155 if (*controlp == NULL) {
3156 msgpcm = controlp;
3157 }
3158 *controlp = m_copy(m, 0, m->m_len);
3159
3160 /*
3161 * If we failed to allocate an mbuf,
3162 * release any previously allocated
3163 * mbufs for control data. Return
3164 * an error. Keep the mbufs in the
3165 * socket as this is using
3166 * MSG_PEEK flag.
3167 */
3168 if (*controlp == NULL) {
3169 m_freem(*msgpcm);
3170 error = ENOBUFS;
3171 goto done;
3172 }
3173
3174 if (pr->pr_domain->dom_externalize != NULL) {
3175 sopeek_scm_rights(*controlp);
3176 }
3177
3178 controlp = &(*controlp)->m_next;
3179 }
3180 m = m->m_next;
3181 } else {
3182 m->m_nextpkt = NULL;
3183 sbfree(sb_rcv, m);
3184 sb_rcv->sb_mb = m->m_next;
3185 m->m_next = NULL;
3186 *cme = m;
3187 cme = &(*cme)->m_next;
3188 m = sb_rcv->sb_mb;
3189 }
3190 } while (m != NULL && m->m_type == MT_CONTROL);
3191
3192 if (!(flags & MSG_PEEK)) {
3193 if (sb_rcv->sb_mb != NULL) {
3194 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3195 } else {
3196 sb_rcv->sb_mb = nextrecord;
3197 SB_EMPTY_FIXUP(sb_rcv);
3198 }
3199 if (nextrecord == NULL) {
3200 sb_rcv->sb_lastrecord = m;
3201 }
3202 }
3203
3204 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3205 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3206
3207 while (cm != NULL) {
3208 int cmsg_level;
3209 int cmsg_type;
3210
3211 cmn = cm->m_next;
3212 cm->m_next = NULL;
3213 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3214 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3215
3216 /*
3217 * Call the protocol to externalize SCM_RIGHTS message
3218 * and return the modified message to the caller upon
3219 * success. Otherwise, all other control messages are
3220 * returned unmodified to the caller. Note that we
3221 * only get into this loop if MSG_PEEK is not set.
3222 */
3223 if (pr->pr_domain->dom_externalize != NULL &&
3224 cmsg_level == SOL_SOCKET &&
3225 cmsg_type == SCM_RIGHTS) {
3226 /*
3227 * Release socket lock: see 3903171. This
3228 * would also allow more records to be appended
3229 * to the socket buffer. We still have SB_LOCK
3230 * set on it, so we can be sure that the head
3231 * of the mbuf chain won't change.
3232 */
3233 socket_unlock(so, 0);
3234 error = (*pr->pr_domain->dom_externalize)(cm);
3235 socket_lock(so, 0);
3236 } else {
3237 error = 0;
3238 }
3239
3240 if (controlp != NULL && error == 0) {
3241 *controlp = cm;
3242 controlp = &(*controlp)->m_next;
3243 } else {
3244 (void) m_free(cm);
3245 }
3246 cm = cmn;
3247 }
3248 /*
3249 * Update the value of nextrecord in case we received new
3250 * records when the socket was unlocked above for
3251 * externalizing SCM_RIGHTS.
3252 */
3253 if (m != NULL) {
3254 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3255 } else {
3256 nextrecord = sb_rcv->sb_mb;
3257 }
3258
3259 done:
3260 *mp = m;
3261 *nextrecordp = nextrecord;
3262
3263 return error;
3264 }
3265
3266 /*
3267 * If we have less data than requested, block awaiting more
3268 * (subject to any timeout) if:
3269 * 1. the current count is less than the low water mark, or
3270 * 2. MSG_WAITALL is set, and it is possible to do the entire
3271 * receive operation at once if we block (resid <= hiwat).
3272 * 3. MSG_DONTWAIT is not set
3273 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3274 * we have to do the receive in sections, and thus risk returning
3275 * a short count if a timeout or signal occurs after we start.
3276 */
3277 static boolean_t
3278 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3279 {
3280 struct protosw *pr = so->so_proto;
3281
3282 /* No mbufs in the receive-queue? Wait! */
3283 if (m == NULL) {
3284 return true;
3285 }
3286
3287 /* Not enough data in the receive socket-buffer - we may have to wait */
3288 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3289 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3290 /*
3291 		 * Application did set the low-water mark, so we should wait for
3292 * this data to be present.
3293 */
3294 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3295 return true;
3296 }
3297
3298 /*
3299 * Application wants all the data - so let's try to do the
3300 * receive-operation at once by waiting for everything to
3301 * be there.
3302 */
3303 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3304 return true;
3305 }
3306 }
3307
3308 return false;
3309 }
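
/*
 * Illustrative userspace sketch (not part of this build): both wait
 * conditions above are visible from userspace -- a raised
 * SO_RCVLOWAT keeps recv(2) blocked until enough bytes have queued,
 * and MSG_WAITALL asks for the full request in one call (both still
 * subject to signals, errors and EOF).
 *
 *	int lowat = 512;
 *	setsockopt(s, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *	char buf[1024];
 *	ssize_t n = recv(s, buf, sizeof(buf), 0);	// waits for >= 512
 *	n = recv(s, buf, sizeof(buf), MSG_WAITALL);	// aims for 1024
 */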
3310
3311 /*
3312 * Implement receive operations on a socket.
3313 * We depend on the way that records are added to the sockbuf
3314 * by sbappend*. In particular, each record (mbufs linked through m_next)
3315 * must begin with an address if the protocol so specifies,
3316 * followed by an optional mbuf or mbufs containing ancillary data,
3317 * and then zero or more mbufs of data.
3318 * In order to avoid blocking network interrupts for the entire time here,
3319 * we splx() while doing the actual copy to user space.
3320 * Although the sockbuf is locked, new data may still be appended,
3321 * and thus we must maintain consistency of the sockbuf during that time.
3322 *
3323 * The caller may receive the data as a single mbuf chain by supplying
3324 * an mbuf **mp0 for use in returning the chain. The uio is then used
3325 * only for the count in uio_resid.
3326 *
3327 * Returns: 0 Success
3328 * ENOBUFS
3329 * ENOTCONN
3330 * EWOULDBLOCK
3331 * uiomove:EFAULT
3332 * sblock:EWOULDBLOCK
3333 * sblock:EINTR
3334 * sbwait:EBADF
3335 * sbwait:EINTR
3336 * sodelayed_copy:EFAULT
3337 * <pru_rcvoob>:EINVAL[TCP]
3338 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3339 * <pru_rcvoob>:???
3340 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3341 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3342 * <pr_domain->dom_externalize>:???
3343 *
3344 * Notes: Additional return values from calls through <pru_rcvoob> and
3345 * <pr_domain->dom_externalize> depend on protocols other than
3346 * TCP or AF_UNIX, which are documented above.
3347 */
3348 int
3349 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3350 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3351 {
3352 struct mbuf *m, **mp, *ml = NULL;
3353 struct mbuf *nextrecord, *free_list;
3354 int flags, error, offset;
3355 user_ssize_t len;
3356 struct protosw *pr = so->so_proto;
3357 int moff, type = 0;
3358 user_ssize_t orig_resid = uio_resid(uio);
3359 user_ssize_t delayed_copy_len;
3360 int can_delay;
3361 struct proc *p = current_proc();
3362 boolean_t en_tracing = FALSE;
3363
3364 /*
3365 * Sanity check on the length passed by caller as we are making 'int'
3366 * comparisons
3367 */
3368 if (orig_resid < 0 || orig_resid > INT_MAX) {
3369 return EINVAL;
3370 }
3371
3372 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3373 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3374 so->so_rcv.sb_hiwat);
3375
3376 socket_lock(so, 1);
3377 so_update_last_owner_locked(so, p);
3378 so_update_policy(so);
3379
3380 #ifdef MORE_LOCKING_DEBUG
3381 if (so->so_usecount == 1) {
3382 		panic("%s: so=%p no other reference on socket", __func__, so);
3383 /* NOTREACHED */
3384 }
3385 #endif
3386 mp = mp0;
3387 if (psa != NULL) {
3388 *psa = NULL;
3389 }
3390 if (controlp != NULL) {
3391 *controlp = NULL;
3392 }
3393 if (flagsp != NULL) {
3394 flags = *flagsp & ~MSG_EOR;
3395 } else {
3396 flags = 0;
3397 }
3398
3399 /*
3400 * If a recv attempt is made on a previously-accepted socket
3401 * that has been marked as inactive (disconnected), reject
3402 * the request.
3403 */
3404 if (so->so_flags & SOF_DEFUNCT) {
3405 struct sockbuf *sb = &so->so_rcv;
3406
3407 error = ENOTCONN;
3408 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3409 __func__, proc_pid(p), proc_best_name(p),
3410 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3411 SOCK_DOM(so), SOCK_TYPE(so), error);
3412 /*
3413 * This socket should have been disconnected and flushed
3414 * prior to being returned from sodefunct(); there should
3415 * be no data on its receive list, so panic otherwise.
3416 */
3417 if (so->so_state & SS_DEFUNCT) {
3418 sb_empty_assert(sb, __func__);
3419 }
3420 socket_unlock(so, 1);
3421 return error;
3422 }
3423
3424 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3425 pr->pr_usrreqs->pru_preconnect) {
3426 /*
3427 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3428 		 * call write() right after this. *If* the app then calls a read,
3429 		 * we do not want to block that read indefinitely. Thus, we
3430 		 * trigger a connect so that the session gets initiated.
3431 */
3432 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3433
3434 if (error) {
3435 socket_unlock(so, 1);
3436 return error;
3437 }
3438 }
3439
3440 if (ENTR_SHOULDTRACE &&
3441 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3442 /*
3443 * enable energy tracing for inet sockets that go over
3444 * non-loopback interfaces only.
3445 */
3446 struct inpcb *inp = sotoinpcb(so);
3447 if (inp->inp_last_outifp != NULL &&
3448 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3449 en_tracing = TRUE;
3450 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3451 VM_KERNEL_ADDRPERM(so),
3452 ((so->so_state & SS_NBIO) ?
3453 kEnTrFlagNonBlocking : 0),
3454 (int64_t)orig_resid);
3455 }
3456 }
3457
3458 /*
3459 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3460 	 * regardless of the flags argument. Here is the case where
3461 * out-of-band data is not inline.
3462 */
3463 if ((flags & MSG_OOB) ||
3464 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3465 (so->so_options & SO_OOBINLINE) == 0 &&
3466 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3467 m = m_get(M_WAIT, MT_DATA);
3468 if (m == NULL) {
3469 socket_unlock(so, 1);
3470 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3471 ENOBUFS, 0, 0, 0, 0);
3472 return ENOBUFS;
3473 }
3474 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3475 if (error) {
3476 goto bad;
3477 }
3478 socket_unlock(so, 0);
3479 do {
3480 error = uiomove(mtod(m, caddr_t),
3481 imin((int)uio_resid(uio), m->m_len), uio);
3482 m = m_free(m);
3483 } while (uio_resid(uio) && error == 0 && m != NULL);
3484 socket_lock(so, 0);
3485 bad:
3486 if (m != NULL) {
3487 m_freem(m);
3488 }
3489
3490 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3491 if (error == EWOULDBLOCK || error == EINVAL) {
3492 /*
3493 * Let's try to get normal data:
3494 * EWOULDBLOCK: out-of-band data not
3495 				 * received yet. EINVAL: out-of-band data
3496 * already read.
3497 */
3498 error = 0;
3499 goto nooob;
3500 } else if (error == 0 && flagsp != NULL) {
3501 *flagsp |= MSG_OOB;
3502 }
3503 }
3504 socket_unlock(so, 1);
3505 if (en_tracing) {
3506 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3507 VM_KERNEL_ADDRPERM(so), 0,
3508 (int64_t)(orig_resid - uio_resid(uio)));
3509 }
3510 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3511 0, 0, 0, 0);
3512
3513 return error;
3514 }
3515 nooob:
3516 if (mp != NULL) {
3517 *mp = NULL;
3518 }
3519
3520 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3521 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3522 }
3523
3524 free_list = NULL;
3525 delayed_copy_len = 0;
3526 restart:
3527 #ifdef MORE_LOCKING_DEBUG
3528 if (so->so_usecount <= 1) {
3529 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3530 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3531 }
3532 #endif
3533 /*
3534 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3535 * and if so just return to the caller. This could happen when
3536 * soreceive() is called by a socket upcall function during the
3537 * time the socket is freed. The socket buffer would have been
3538 * locked across the upcall, therefore we cannot put this thread
3539 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3540 * we may livelock), because the lock on the socket buffer will
3541 * only be released when the upcall routine returns to its caller.
3542 * Because the socket has been officially closed, there can be
3543 * no further read on it.
3544 *
3545 * A multipath subflow socket would have its SS_NOFDREF set by
3546 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3547 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3548 */
3549 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3550 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3551 socket_unlock(so, 1);
3552 return 0;
3553 }
3554
3555 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3556 if (error) {
3557 socket_unlock(so, 1);
3558 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3559 0, 0, 0, 0);
3560 if (en_tracing) {
3561 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3562 VM_KERNEL_ADDRPERM(so), 0,
3563 (int64_t)(orig_resid - uio_resid(uio)));
3564 }
3565 return error;
3566 }
3567
3568 m = so->so_rcv.sb_mb;
3569 if (so_should_wait(so, uio, m, flags)) {
3570 /*
3571 * Panic if we notice inconsistencies in the socket's
3572 * receive list; both sb_mb and sb_cc should correctly
3573 * reflect the contents of the list, otherwise we may
3574 * end up with false positives during select() or poll()
3575 * which could put the application in a bad state.
3576 */
3577 SB_MB_CHECK(&so->so_rcv);
3578
3579 if (so->so_error) {
3580 if (m != NULL) {
3581 goto dontblock;
3582 }
3583 error = so->so_error;
3584 if ((flags & MSG_PEEK) == 0) {
3585 so->so_error = 0;
3586 }
3587 goto release;
3588 }
3589 if (so->so_state & SS_CANTRCVMORE) {
3590 #if CONTENT_FILTER
3591 /*
3592 * Deal with half closed connections
3593 */
3594 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3595 cfil_sock_data_pending(&so->so_rcv) != 0) {
3596 CFIL_LOG(LOG_INFO,
3597 "so %llx ignore SS_CANTRCVMORE",
3598 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3599 } else
3600 #endif /* CONTENT_FILTER */
3601 if (m != NULL) {
3602 goto dontblock;
3603 } else {
3604 goto release;
3605 }
3606 }
3607 for (; m != NULL; m = m->m_next) {
3608 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3609 m = so->so_rcv.sb_mb;
3610 goto dontblock;
3611 }
3612 }
3613 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3614 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3615 error = ENOTCONN;
3616 goto release;
3617 }
3618 if (uio_resid(uio) == 0) {
3619 goto release;
3620 }
3621
3622 if ((so->so_state & SS_NBIO) ||
3623 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3624 error = EWOULDBLOCK;
3625 goto release;
3626 }
3627 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3628 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3629 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3630 #if EVEN_MORE_LOCKING_DEBUG
3631 if (socket_debug) {
3632 printf("Waiting for socket data\n");
3633 }
3634 #endif
3635
3636 /*
3637 * Depending on the protocol (e.g. TCP), the following
3638 * might cause the socket lock to be dropped and later
3639 * be reacquired, and more data could have arrived and
3640 * have been appended to the receive socket buffer by
3641 		 * the time it returns. Therefore, we sleep in
3642 		 * sbwait() below only if the wait-condition is still
3643 * true.
3644 */
3645 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3646 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3647 }
3648
3649 error = 0;
3650 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3651 error = sbwait(&so->so_rcv);
3652 }
3653
3654 #if EVEN_MORE_LOCKING_DEBUG
3655 if (socket_debug) {
3656 printf("SORECEIVE - sbwait returned %d\n", error);
3657 }
3658 #endif
3659 if (so->so_usecount < 1) {
3660 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3661 __func__, so, so->so_usecount);
3662 /* NOTREACHED */
3663 }
3664 if (error) {
3665 socket_unlock(so, 1);
3666 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3667 0, 0, 0, 0);
3668 if (en_tracing) {
3669 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3670 VM_KERNEL_ADDRPERM(so), 0,
3671 (int64_t)(orig_resid - uio_resid(uio)));
3672 }
3673 return error;
3674 }
3675 goto restart;
3676 }
3677 dontblock:
3678 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3679 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3680 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3681 nextrecord = m->m_nextpkt;
3682
3683 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3684 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3685 mp0 == NULL);
3686 if (error == ERESTART) {
3687 goto restart;
3688 } else if (error != 0) {
3689 goto release;
3690 }
3691 orig_resid = 0;
3692 }
3693
3694 /*
3695 * Process one or more MT_CONTROL mbufs present before any data mbufs
3696 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3697 * just copy the data; if !MSG_PEEK, we call into the protocol to
3698 * perform externalization.
3699 */
3700 if (m != NULL && m->m_type == MT_CONTROL) {
3701 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3702 if (error != 0) {
3703 goto release;
3704 }
3705 orig_resid = 0;
3706 }
3707
3708 if (m != NULL) {
3709 if (!(flags & MSG_PEEK)) {
3710 /*
3711 * We get here because m points to an mbuf following
3712 * any MT_SONAME or MT_CONTROL mbufs which have been
3713 * processed above. In any case, m should be pointing
3714 * to the head of the mbuf chain, and the nextrecord
3715 * should be either NULL or equal to m->m_nextpkt.
3716 * See comments above about SB_LOCK.
3717 */
3718 if (m != so->so_rcv.sb_mb ||
3719 m->m_nextpkt != nextrecord) {
3720 panic("%s: post-control !sync so=%p m=%p "
3721 "nextrecord=%p\n", __func__, so, m,
3722 nextrecord);
3723 /* NOTREACHED */
3724 }
3725 if (nextrecord == NULL) {
3726 so->so_rcv.sb_lastrecord = m;
3727 }
3728 }
3729 type = m->m_type;
3730 if (type == MT_OOBDATA) {
3731 flags |= MSG_OOB;
3732 }
3733 } else {
3734 if (!(flags & MSG_PEEK)) {
3735 SB_EMPTY_FIXUP(&so->so_rcv);
3736 }
3737 }
3738 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3739 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3740
3741 moff = 0;
3742 offset = 0;
3743
3744 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3745 can_delay = 1;
3746 } else {
3747 can_delay = 0;
3748 }
3749
3750 while (m != NULL &&
3751 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3752 if (m->m_type == MT_OOBDATA) {
3753 if (type != MT_OOBDATA) {
3754 break;
3755 }
3756 } else if (type == MT_OOBDATA) {
3757 break;
3758 }
3759
3760 if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3761 m->m_type != MT_HEADER) {
3762 break;
3763 }
3764 /*
3765 		 * Make sure to always set the MSG_OOB flag when getting
3766 		 * out-of-band data inline.
3767 */
3768 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3769 (so->so_options & SO_OOBINLINE) != 0 &&
3770 (so->so_state & SS_RCVATMARK) != 0) {
3771 flags |= MSG_OOB;
3772 }
3773 so->so_state &= ~SS_RCVATMARK;
3774 len = uio_resid(uio) - delayed_copy_len;
3775 if (so->so_oobmark && len > so->so_oobmark - offset) {
3776 len = so->so_oobmark - offset;
3777 }
3778 if (len > m->m_len - moff) {
3779 len = m->m_len - moff;
3780 }
3781 /*
3782 * If mp is set, just pass back the mbufs.
3783 * Otherwise copy them out via the uio, then free.
3784 * Sockbuf must be consistent here (points to current mbuf,
3785 * it points to next record) when we drop priority;
3786 * we must note any additions to the sockbuf when we
3787 * block interrupts again.
3788 */
3789 if (mp == NULL) {
3790 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3791 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3792 if (can_delay && len == m->m_len) {
3793 /*
3794 			 * Only delay the copy if we're consuming the
3795 			 * mbuf and we're NOT in MSG_PEEK mode,
3796 			 * and we have enough data to make it worthwhile
3797 			 * to drop and retake the lock; can_delay
3798 			 * reflects the latter two
3799 			 * constraints. moff should always be zero
3800 			 * in these cases.
3801 */
3802 delayed_copy_len += len;
3803 } else {
3804 if (delayed_copy_len) {
3805 error = sodelayed_copy(so, uio,
3806 &free_list, &delayed_copy_len);
3807
3808 if (error) {
3809 goto release;
3810 }
3811 /*
3812 * We can only get here if MSG_PEEK is not
3813 * set; therefore, m should point at the
3814 * head of the rcv queue. If it doesn't,
3815 * it means something drastically
3816 * changed while we were out from behind
3817 * the lock in sodelayed_copy; perhaps
3818 * a RST on the stream. In any event,
3819 * the stream has been interrupted. It's
3820 * probably best just to return whatever
3821 * data we've moved and let the caller
3822 * sort it out...
3823 */
3824 if (m != so->so_rcv.sb_mb) {
3825 break;
3826 }
3827 }
3828 socket_unlock(so, 0);
3829 error = uiomove(mtod(m, caddr_t) + moff,
3830 (int)len, uio);
3831 socket_lock(so, 0);
3832
3833 if (error) {
3834 goto release;
3835 }
3836 }
3837 } else {
3838 uio_setresid(uio, (uio_resid(uio) - len));
3839 }
3840 if (len == m->m_len - moff) {
3841 if (m->m_flags & M_EOR) {
3842 flags |= MSG_EOR;
3843 }
3844 if (flags & MSG_PEEK) {
3845 m = m->m_next;
3846 moff = 0;
3847 } else {
3848 nextrecord = m->m_nextpkt;
3849 sbfree(&so->so_rcv, m);
3850 m->m_nextpkt = NULL;
3851
3852 if (mp != NULL) {
3853 *mp = m;
3854 mp = &m->m_next;
3855 so->so_rcv.sb_mb = m = m->m_next;
3856 *mp = NULL;
3857 } else {
3858 if (free_list == NULL) {
3859 free_list = m;
3860 } else {
3861 ml->m_next = m;
3862 }
3863 ml = m;
3864 so->so_rcv.sb_mb = m = m->m_next;
3865 ml->m_next = NULL;
3866 }
3867 if (m != NULL) {
3868 m->m_nextpkt = nextrecord;
3869 if (nextrecord == NULL) {
3870 so->so_rcv.sb_lastrecord = m;
3871 }
3872 } else {
3873 so->so_rcv.sb_mb = nextrecord;
3874 SB_EMPTY_FIXUP(&so->so_rcv);
3875 }
3876 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3877 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3878 }
3879 } else {
3880 if (flags & MSG_PEEK) {
3881 moff += len;
3882 } else {
3883 if (mp != NULL) {
3884 int copy_flag;
3885
3886 if (flags & MSG_DONTWAIT) {
3887 copy_flag = M_DONTWAIT;
3888 } else {
3889 copy_flag = M_WAIT;
3890 }
3891 *mp = m_copym(m, 0, (int)len, copy_flag);
3892 /*
3893 * Failed to allocate an mbuf?
3894 * Adjust uio_resid back, it was
3895 * adjusted down by len bytes which
3896 * we didn't copy over.
3897 */
3898 if (*mp == NULL) {
3899 uio_setresid(uio,
3900 (uio_resid(uio) + len));
3901 break;
3902 }
3903 }
3904 m->m_data += len;
3905 m->m_len -= len;
3906 so->so_rcv.sb_cc -= len;
3907 }
3908 }
3909 if (so->so_oobmark) {
3910 if ((flags & MSG_PEEK) == 0) {
3911 so->so_oobmark -= len;
3912 if (so->so_oobmark == 0) {
3913 so->so_state |= SS_RCVATMARK;
3914 break;
3915 }
3916 } else {
3917 offset += len;
3918 if (offset == so->so_oobmark) {
3919 break;
3920 }
3921 }
3922 }
3923 if (flags & MSG_EOR) {
3924 break;
3925 }
3926 /*
3927 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3928 * (for a non-atomic socket), we must not quit until
3929 * "uio->uio_resid == 0" or an error terminates the
3930 * transfer. If a signal/timeout occurs, return with
3931 * a short count but without error. Keep the sockbuf
3932 * locked against other readers.
3933 */
3934 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3935 (uio_resid(uio) - delayed_copy_len) > 0 &&
3936 !sosendallatonce(so) && !nextrecord) {
3937 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3938 #if CONTENT_FILTER
3939 && cfil_sock_data_pending(&so->so_rcv) == 0
3940 #endif /* CONTENT_FILTER */
3941 )) {
3942 goto release;
3943 }
3944
3945 /*
3946 * Depending on the protocol (e.g. TCP), the following
3947 * might cause the socket lock to be dropped and later
3948 * be reacquired, and more data could have arrived and
3949 * have been appended to the receive socket buffer by
3950 * the time it returns. Therefore, we sleep in
3951 * sbwait() below only if the socket buffer is
3952 * empty, in order to avoid a false sleep.
3953 */
3954 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3955 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3956 }
3957
3958 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3959 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3960
3961 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3962 error = 0;
3963 goto release;
3964 }
3965 /*
3966 * We have to wait until after we get back from sbwait()
3967 * to do the copy, because we will drop the lock if we
3968 * have enough data that has been delayed. By dropping
3969 * the lock we open up a window allowing the netisr
3970 * thread to process the incoming packets and to change
3971 * the state of this socket. We're issuing the sbwait
3972 * because the socket is empty and we're expecting the
3973 * netisr thread to wake us up when more packets arrive;
3974 * if we allowed that processing to happen and then did
3975 * the sbwait, we could stall forever with packets sitting
3976 * in the socket if no further packets arrive from the
3977 * remote side.
3978 *
3979 * We want to copy before we've collected all the data
3980 * to satisfy this request, to allow the copy to overlap
3981 * the incoming packet processing on an MP system.
3982 */
3983 if (delayed_copy_len > sorecvmincopy &&
3984 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3985 error = sodelayed_copy(so, uio,
3986 &free_list, &delayed_copy_len);
3987
3988 if (error) {
3989 goto release;
3990 }
3991 }
3992 m = so->so_rcv.sb_mb;
3993 if (m != NULL) {
3994 nextrecord = m->m_nextpkt;
3995 }
3996 SB_MB_CHECK(&so->so_rcv);
3997 }
3998 }
3999 #ifdef MORE_LOCKING_DEBUG
4000 if (so->so_usecount <= 1) {
4001 panic("%s: after big while so=%p ref=%d on socket",
4002 __func__, so, so->so_usecount);
4003 /* NOTREACHED */
4004 }
4005 #endif
4006
4007 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4008 if (so->so_options & SO_DONTTRUNC) {
4009 flags |= MSG_RCVMORE;
4010 } else {
4011 flags |= MSG_TRUNC;
4012 if ((flags & MSG_PEEK) == 0) {
4013 (void) sbdroprecord(&so->so_rcv);
4014 }
4015 }
4016 }
4017
4018 /*
4019 * pru_rcvd below (for TCP) may cause more data to be received
4020 * if the socket lock is dropped prior to sending the ACK; some
4021 * legacy OpenTransport applications don't handle this well
4022 * (if it receives less data than requested while MSG_HAVEMORE
4023 * is set), and so we set the flag now based on what we know
4024 * prior to calling pru_rcvd.
4025 */
4026 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4027 flags |= MSG_HAVEMORE;
4028 }
4029
4030 if ((flags & MSG_PEEK) == 0) {
4031 if (m == NULL) {
4032 so->so_rcv.sb_mb = nextrecord;
4033 /*
4034 * First part is an inline SB_EMPTY_FIXUP(). Second
4035 * part makes sure sb_lastrecord is up-to-date if
4036 * there is still data in the socket buffer.
4037 */
4038 if (so->so_rcv.sb_mb == NULL) {
4039 so->so_rcv.sb_mbtail = NULL;
4040 so->so_rcv.sb_lastrecord = NULL;
4041 } else if (nextrecord->m_nextpkt == NULL) {
4042 so->so_rcv.sb_lastrecord = nextrecord;
4043 }
4044 SB_MB_CHECK(&so->so_rcv);
4045 }
4046 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4047 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4048 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4049 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4050 }
4051 }
4052
4053 if (delayed_copy_len) {
4054 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4055 if (error) {
4056 goto release;
4057 }
4058 }
4059 if (free_list != NULL) {
4060 m_freem_list(free_list);
4061 free_list = NULL;
4062 }
4063
4064 if (orig_resid == uio_resid(uio) && orig_resid &&
4065 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4066 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4067 goto restart;
4068 }
4069
4070 if (flagsp != NULL) {
4071 *flagsp |= flags;
4072 }
4073 release:
4074 #ifdef MORE_LOCKING_DEBUG
4075 if (so->so_usecount <= 1) {
4076 panic("%s: release so=%p ref=%d on socket", __func__,
4077 so, so->so_usecount);
4078 /* NOTREACHED */
4079 }
4080 #endif
4081 if (delayed_copy_len) {
4082 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4083 }
4084
4085 if (free_list != NULL) {
4086 m_freem_list(free_list);
4087 }
4088
4089 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4090
4091 if (en_tracing) {
4092 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4093 VM_KERNEL_ADDRPERM(so),
4094 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4095 (int64_t)(orig_resid - uio_resid(uio)));
4096 }
4097 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4098 so->so_rcv.sb_cc, 0, error);
4099
4100 return error;
4101 }
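/*
 * Editorial illustration (not part of the original source): the flag
 * handling in soreceive() above is what userland observes through
 * recv(2)/recvmsg(2). A minimal sketch, assuming a connected stream
 * socket fd:
 *
 *	char buf[1024];
 *	// Peek without consuming; the record stays queued in so_rcv.
 *	ssize_t peeked = recv(fd, buf, sizeof(buf), MSG_PEEK);
 *	// Block until the request is fully satisfied (or EOF/error);
 *	// this exercises the MSG_WAITALL loop above.
 *	ssize_t got = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *	// A short count with no error corresponds to the short-return
 *	// paths above (e.g. a signal, timeout, or SS_CANTRCVMORE).
 */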
4102
4103 /*
4104 * Returns: 0 Success
4105 * uiomove:EFAULT
4106 */
4107 static int
4108 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4109 user_ssize_t *resid)
4110 {
4111 int error = 0;
4112 struct mbuf *m;
4113
4114 m = *free_list;
4115
4116 socket_unlock(so, 0);
4117
4118 while (m != NULL && error == 0) {
4119 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4120 m = m->m_next;
4121 }
4122 m_freem_list(*free_list);
4123
4124 *free_list = NULL;
4125 *resid = 0;
4126
4127 socket_lock(so, 0);
4128
4129 return error;
4130 }
4131
4132 static int
4133 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4134 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4135 {
4136 #pragma unused(so)
4137 int error = 0;
4138 struct mbuf *ml, *m;
4139 int i = 0;
4140 struct uio *auio;
4141
4142 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4143 ml = ml->m_nextpkt, i++) {
4144 auio = msgarray[i].uio;
4145 for (m = ml; m != NULL; m = m->m_next) {
4146 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4147 if (error != 0) {
4148 goto out;
4149 }
4150 }
4151 }
4152 out:
4153 m_freem_list(*free_list);
4154
4155 *free_list = NULL;
4156 *resid = 0;
4157
4158 return error;
4159 }
4160
4161 int
4162 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4163 int *flagsp)
4164 {
4165 struct mbuf *m;
4166 struct mbuf *nextrecord;
4167 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4168 int error;
4169 user_ssize_t len, pktlen, delayed_copy_len = 0;
4170 struct protosw *pr = so->so_proto;
4171 user_ssize_t resid;
4172 struct proc *p = current_proc();
4173 struct uio *auio = NULL;
4174 int npkts = 0;
4175 int sblocked = 0;
4176 struct sockaddr **psa = NULL;
4177 struct mbuf **controlp = NULL;
4178 int can_delay;
4179 int flags;
4180 struct mbuf *free_others = NULL;
4181
4182 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4183 so, uiocnt,
4184 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4185
4186 /*
4187 * Sanity checks:
4188 * - Only a restricted set of receive flags is supported
4189 * - Only datagram sockets are supported (could be extended to raw)
4190 * - Must be atomic
4191 * - Protocol must support packet chains
4192 * - The uio array must not be NULL (should we panic?)
4193 */
4194 if (flagsp != NULL) {
4195 flags = *flagsp;
4196 } else {
4197 flags = 0;
4198 }
4199 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4200 MSG_NBIO)) {
4201 printf("%s invalid flags 0x%x\n", __func__, flags);
4202 error = EINVAL;
4203 goto out;
4204 }
4205 if (so->so_type != SOCK_DGRAM) {
4206 error = EINVAL;
4207 goto out;
4208 }
4209 if (sosendallatonce(so) == 0) {
4210 error = EINVAL;
4211 goto out;
4212 }
4213 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4214 error = EPROTONOSUPPORT;
4215 goto out;
4216 }
4217 if (msgarray == NULL) {
4218 printf("%s uioarray is NULL\n", __func__);
4219 error = EINVAL;
4220 goto out;
4221 }
4222 if (uiocnt == 0) {
4223 printf("%s uiocnt is 0\n", __func__);
4224 error = EINVAL;
4225 goto out;
4226 }
4227 /*
4228 * Sanity check on the length passed by caller as we are making 'int'
4229 * comparisons
4230 */
4231 resid = recv_msg_array_resid(msgarray, uiocnt);
4232 if (resid < 0 || resid > INT_MAX) {
4233 error = EINVAL;
4234 goto out;
4235 }
4236
4237 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4238 can_delay = 1;
4239 } else {
4240 can_delay = 0;
4241 }
4242
4243 socket_lock(so, 1);
4244 so_update_last_owner_locked(so, p);
4245 so_update_policy(so);
4246
4247 #if NECP
4248 so_update_necp_policy(so, NULL, NULL);
4249 #endif /* NECP */
4250
4251 /*
4252 * If a recv attempt is made on a previously-accepted socket
4253 * that has been marked as inactive (disconnected), reject
4254 * the request.
4255 */
4256 if (so->so_flags & SOF_DEFUNCT) {
4257 struct sockbuf *sb = &so->so_rcv;
4258
4259 error = ENOTCONN;
4260 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4261 __func__, proc_pid(p), proc_best_name(p),
4262 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4263 SOCK_DOM(so), SOCK_TYPE(so), error);
4264 /*
4265 * This socket should have been disconnected and flushed
4266 * prior to being returned from sodefunct(); there should
4267 * be no data on its receive list, so panic otherwise.
4268 */
4269 if (so->so_state & SS_DEFUNCT) {
4270 sb_empty_assert(sb, __func__);
4271 }
4272 goto release;
4273 }
4274
4275 next:
4276 /*
4277 * Stop when the uio array has been exhausted
4278 */
4279 if (npkts >= uiocnt) {
4280 error = 0;
4281 goto release;
4282 }
4283 restart:
4284 /*
4285 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4286 * and if so just return to the caller. This could happen when
4287 * soreceive() is called by a socket upcall function during the
4288 * time the socket is freed. The socket buffer would have been
4289 * locked across the upcall, therefore we cannot put this thread
4290 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4291 * we may livelock), because the lock on the socket buffer will
4292 * only be released when the upcall routine returns to its caller.
4293 * Because the socket has been officially closed, there can be
4294 * no further read on it.
4295 */
4296 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4297 (SS_NOFDREF | SS_CANTRCVMORE)) {
4298 error = 0;
4299 goto release;
4300 }
4301
4302 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4303 if (error) {
4304 goto release;
4305 }
4306 sblocked = 1;
4307
4308 m = so->so_rcv.sb_mb;
4309 /*
4310 * Block awaiting more datagrams if needed
4311 */
4312 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4313 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4314 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4315 /*
4316 * Panic if we notice inconsistencies in the socket's
4317 * receive list; both sb_mb and sb_cc should correctly
4318 * reflect the contents of the list, otherwise we may
4319 * end up with false positives during select() or poll()
4320 * which could put the application in a bad state.
4321 */
4322 SB_MB_CHECK(&so->so_rcv);
4323
4324 if (so->so_error) {
4325 error = so->so_error;
4326 if ((flags & MSG_PEEK) == 0) {
4327 so->so_error = 0;
4328 }
4329 goto release;
4330 }
4331 if (so->so_state & SS_CANTRCVMORE) {
4332 goto release;
4333 }
4334 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4335 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4336 error = ENOTCONN;
4337 goto release;
4338 }
4339 if ((so->so_state & SS_NBIO) ||
4340 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4341 error = EWOULDBLOCK;
4342 goto release;
4343 }
4344 /*
4345 * Do not block if we got some data
4346 */
4347 if (free_list != NULL) {
4348 error = 0;
4349 goto release;
4350 }
4351
4352 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4353 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4354
4355 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4356 sblocked = 0;
4357
4358 error = sbwait(&so->so_rcv);
4359 if (error) {
4360 goto release;
4361 }
4362 goto restart;
4363 }
4364
4365 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4366 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4367 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4368
4369 /*
4370 * Consume the current uio index as we have a datagram
4371 */
4372 auio = msgarray[npkts].uio;
4373 resid = uio_resid(auio);
4374 msgarray[npkts].which |= SOCK_MSG_DATA;
4375 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4376 &msgarray[npkts].psa : NULL;
4377 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4378 &msgarray[npkts].controlp : NULL;
4379 npkts += 1;
4380 nextrecord = m->m_nextpkt;
4381
4382 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4383 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4384 if (error == ERESTART) {
4385 goto restart;
4386 } else if (error != 0) {
4387 goto release;
4388 }
4389 }
4390
4391 if (m != NULL && m->m_type == MT_CONTROL) {
4392 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4393 if (error != 0) {
4394 goto release;
4395 }
4396 }
4397
4398 if (m->m_pkthdr.len == 0) {
4399 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4400 __func__, __LINE__,
4401 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4402 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4403 m->m_type);
4404 }
4405
4406 /*
4407 * Loop to copy the mbufs of the current record;
4408 * zero-length packets are supported.
4409 */
4410 ml = NULL;
4411 pktlen = 0;
4412 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4413 if (m->m_len == 0) {
4414 panic("%p m_len zero", m);
4415 }
4416 if (m->m_type == 0) {
4417 panic("%p m_type zero", m);
4418 }
4419 /*
4420 * Clip to the residual length
4421 */
4422 if (len > m->m_len) {
4423 len = m->m_len;
4424 }
4425 pktlen += len;
4426 /*
4427 * Copy the mbufs via the uio, or delay the copy.
4428 * The sockbuf must be consistent here (sb_mb points to the
4429 * current mbuf, m_nextpkt to the next record) when we drop
4430 * priority; we must note any additions to the sockbuf when
4431 * we block interrupts again.
4432 */
4433 if (len > 0 && can_delay == 0) {
4434 socket_unlock(so, 0);
4435 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4436 socket_lock(so, 0);
4437 if (error) {
4438 goto release;
4439 }
4440 } else {
4441 delayed_copy_len += len;
4442 }
4443
4444 if (len == m->m_len) {
4445 /*
4446 * m was entirely copied
4447 */
4448 sbfree(&so->so_rcv, m);
4449 nextrecord = m->m_nextpkt;
4450 m->m_nextpkt = NULL;
4451
4452 /*
4453 * Set the first packet to the head of the free list
4454 */
4455 if (free_list == NULL) {
4456 free_list = m;
4457 }
4458 /*
4459 * Link current packet to tail of free list
4460 */
4461 if (ml == NULL) {
4462 if (free_tail != NULL) {
4463 free_tail->m_nextpkt = m;
4464 }
4465 free_tail = m;
4466 }
4467 /*
4468 * Link current mbuf to last mbuf of current packet
4469 */
4470 if (ml != NULL) {
4471 ml->m_next = m;
4472 }
4473 ml = m;
4474
4475 /*
4476 * Move next buf to head of socket buffer
4477 */
4478 so->so_rcv.sb_mb = m = ml->m_next;
4479 ml->m_next = NULL;
4480
4481 if (m != NULL) {
4482 m->m_nextpkt = nextrecord;
4483 if (nextrecord == NULL) {
4484 so->so_rcv.sb_lastrecord = m;
4485 }
4486 } else {
4487 so->so_rcv.sb_mb = nextrecord;
4488 SB_EMPTY_FIXUP(&so->so_rcv);
4489 }
4490 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4491 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4492 } else {
4493 /*
4494 * Stop the loop on partial copy
4495 */
4496 break;
4497 }
4498 }
4499 #ifdef MORE_LOCKING_DEBUG
4500 if (so->so_usecount <= 1) {
4501 panic("%s: after big while so=%llx ref=%d on socket",
4502 __func__,
4503 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4504 /* NOTREACHED */
4505 }
4506 #endif
4507 /*
4508 * Tell the caller we made a partial copy
4509 */
4510 if (m != NULL) {
4511 if (so->so_options & SO_DONTTRUNC) {
4512 /*
4513 * Copyout first the freelist then the partial mbuf
4514 */
4515 socket_unlock(so, 0);
4516 if (delayed_copy_len) {
4517 error = sodelayed_copy_list(so, msgarray,
4518 uiocnt, &free_list, &delayed_copy_len);
4519 }
4520
4521 if (error == 0) {
4522 error = uiomove(mtod(m, caddr_t), (int)len,
4523 auio);
4524 }
4525 socket_lock(so, 0);
4526 if (error) {
4527 goto release;
4528 }
4529
4530 m->m_data += len;
4531 m->m_len -= len;
4532 so->so_rcv.sb_cc -= len;
4533 flags |= MSG_RCVMORE;
4534 } else {
4535 (void) sbdroprecord(&so->so_rcv);
4536 nextrecord = so->so_rcv.sb_mb;
4537 m = NULL;
4538 flags |= MSG_TRUNC;
4539 }
4540 }
4541
4542 if (m == NULL) {
4543 so->so_rcv.sb_mb = nextrecord;
4544 /*
4545 * First part is an inline SB_EMPTY_FIXUP(). Second
4546 * part makes sure sb_lastrecord is up-to-date if
4547 * there is still data in the socket buffer.
4548 */
4549 if (so->so_rcv.sb_mb == NULL) {
4550 so->so_rcv.sb_mbtail = NULL;
4551 so->so_rcv.sb_lastrecord = NULL;
4552 } else if (nextrecord->m_nextpkt == NULL) {
4553 so->so_rcv.sb_lastrecord = nextrecord;
4554 }
4555 SB_MB_CHECK(&so->so_rcv);
4556 }
4557 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4558 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4559
4560 /*
4561 * We can continue to the next packet as long as:
4562 * - We haven't exhausted the uio array
4563 * - There was no error
4564 * - A packet was not truncated
4565 * - We can still receive more data
4566 */
4567 if (npkts < uiocnt && error == 0 &&
4568 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4569 (so->so_state & SS_CANTRCVMORE) == 0) {
4570 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4571 sblocked = 0;
4572
4573 goto next;
4574 }
4575 if (flagsp != NULL) {
4576 *flagsp |= flags;
4577 }
4578
4579 release:
4580 /*
4581 * pru_rcvd may cause more data to be received if the socket lock
4582 * is dropped so we set MSG_HAVEMORE now based on what we know.
4583 * That way the caller won't be surprised if it receives less data
4584 * than requested.
4585 */
4586 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4587 flags |= MSG_HAVEMORE;
4588 }
4589
4590 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4591 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4592 }
4593
4594 if (sblocked) {
4595 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4596 } else {
4597 socket_unlock(so, 1);
4598 }
4599
4600 if (delayed_copy_len) {
4601 error = sodelayed_copy_list(so, msgarray, uiocnt,
4602 &free_list, &delayed_copy_len);
4603 }
4604 out:
4605 /*
4606 * Amortize the cost of freeing the mbufs
4607 */
4608 if (free_list != NULL) {
4609 m_freem_list(free_list);
4610 }
4611 if (free_others != NULL) {
4612 m_freem_list(free_others);
4613 }
4614
4615 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4616 0, 0, 0, 0);
4617 return error;
4618 }
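/*
 * Editorial illustration (not part of the original source):
 * soreceive_list() backs the Darwin batch receive path used by
 * recvmsg_x(2) for datagram sockets. A hedged userland sketch,
 * assuming the msghdr_x interface is available to the caller:
 *
 *	struct msghdr_x msgs[8] = {};
 *	struct iovec iovs[8];
 *	char bufs[8][2048];
 *	for (int i = 0; i < 8; i++) {
 *		iovs[i].iov_base = bufs[i];
 *		iovs[i].iov_len = sizeof(bufs[i]);
 *		msgs[i].msg_iov = &iovs[i];
 *		msgs[i].msg_iovlen = 1;
 *	}
 *	// Returns the number of datagrams received, one per entry;
 *	// MSG_DONTWAIT maps to the EWOULDBLOCK path above.
 *	ssize_t n = recvmsg_x(fd, msgs, 8, MSG_DONTWAIT);
 */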
4619
4620 static int
4621 so_statistics_event_to_nstat_event(int64_t *input_options,
4622 uint64_t *nstat_event)
4623 {
4624 int error = 0;
4625 switch (*input_options) {
4626 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4627 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4628 break;
4629 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4630 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4631 break;
4632 #if (DEBUG || DEVELOPMENT)
4633 case SO_STATISTICS_EVENT_RESERVED_1:
4634 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4635 break;
4636 case SO_STATISTICS_EVENT_RESERVED_2:
4637 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4638 break;
4639 #endif /* (DEBUG || DEVELOPMENT) */
4640 default:
4641 error = EINVAL;
4642 break;
4643 }
4644 return error;
4645 }
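/*
 * Editorial illustration (not part of the original source): the
 * mapping above is driven by setsockopt(2) with SO_STATISTICS_EVENT
 * (see the handler in sosetoptlock() below). A hedged sketch,
 * assuming the private option is visible to the caller:
 *
 *	int64_t ev = SO_STATISTICS_EVENT_ENTER_CELLFALLBACK;
 *	// Any unrecognized value fails with EINVAL per the default case.
 *	(void) setsockopt(fd, SOL_SOCKET, SO_STATISTICS_EVENT,
 *	    &ev, sizeof(ev));
 */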
4646
4647 /*
4648 * Returns: 0 Success
4649 * EINVAL
4650 * ENOTCONN
4651 * <pru_shutdown>:EINVAL
4652 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4653 * <pru_shutdown>:ENOBUFS[TCP]
4654 * <pru_shutdown>:EMSGSIZE[TCP]
4655 * <pru_shutdown>:EHOSTUNREACH[TCP]
4656 * <pru_shutdown>:ENETUNREACH[TCP]
4657 * <pru_shutdown>:ENETDOWN[TCP]
4658 * <pru_shutdown>:ENOMEM[TCP]
4659 * <pru_shutdown>:EACCES[TCP]
4660 * <pru_shutdown>:EMSGSIZE[TCP]
4661 * <pru_shutdown>:ENOBUFS[TCP]
4662 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4663 * <pru_shutdown>:??? [other protocol families]
4664 */
4665 int
4666 soshutdown(struct socket *so, int how)
4667 {
4668 int error;
4669
4670 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4671
4672 switch (how) {
4673 case SHUT_RD:
4674 case SHUT_WR:
4675 case SHUT_RDWR:
4676 socket_lock(so, 1);
4677 if ((so->so_state &
4678 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4679 error = ENOTCONN;
4680 } else {
4681 error = soshutdownlock(so, how);
4682 }
4683 socket_unlock(so, 1);
4684 break;
4685 default:
4686 error = EINVAL;
4687 break;
4688 }
4689
4690 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4691
4692 return error;
4693 }
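/*
 * Editorial illustration (not part of the original source): the `how`
 * validation above is what shutdown(2) callers see. A minimal sketch:
 *
 *	if (shutdown(fd, SHUT_WR) == -1 && errno == ENOTCONN) {
 *		// Not connected (or not disconnecting); matches the
 *		// SS_ISCONNECTED/SS_ISCONNECTING check above.
 *	}
 *	// SHUT_RD flushes queued input via sorflush(); SHUT_RDWR does
 *	// both directions. Any other value fails with EINVAL.
 */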
4694
4695 int
4696 soshutdownlock_final(struct socket *so, int how)
4697 {
4698 struct protosw *pr = so->so_proto;
4699 int error = 0;
4700
4701 sflt_notify(so, sock_evt_shutdown, &how);
4702
4703 if (how != SHUT_WR) {
4704 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4705 /* read already shut down */
4706 error = ENOTCONN;
4707 goto done;
4708 }
4709 sorflush(so);
4710 }
4711 if (how != SHUT_RD) {
4712 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4713 /* write already shut down */
4714 error = ENOTCONN;
4715 goto done;
4716 }
4717 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4718 }
4719 done:
4720 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4721 return error;
4722 }
4723
4724 int
4725 soshutdownlock(struct socket *so, int how)
4726 {
4727 int error = 0;
4728
4729 #if CONTENT_FILTER
4730 /*
4731 * A content filter may delay the actual shutdown until it
4732 * has processed the pending data
4733 */
4734 if (so->so_flags & SOF_CONTENT_FILTER) {
4735 error = cfil_sock_shutdown(so, &how);
4736 if (error == EJUSTRETURN) {
4737 error = 0;
4738 goto done;
4739 } else if (error != 0) {
4740 goto done;
4741 }
4742 }
4743 #endif /* CONTENT_FILTER */
4744
4745 error = soshutdownlock_final(so, how);
4746
4747 done:
4748 return error;
4749 }
4750
4751 void
4752 sowflush(struct socket *so)
4753 {
4754 struct sockbuf *sb = &so->so_snd;
4755
4756 /*
4757 * Obtain lock on the socket buffer (SB_LOCK). This is required
4758 * to prevent the socket buffer from being unexpectedly altered
4759 * while it is used by another thread in socket send/receive.
4760 *
4761 * sblock() must not fail here, hence the assertion.
4762 */
4763 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4764 VERIFY(sb->sb_flags & SB_LOCK);
4765
4766 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4767 sb->sb_flags |= SB_DROP;
4768 sb->sb_upcall = NULL;
4769 sb->sb_upcallarg = NULL;
4770
4771 sbunlock(sb, TRUE); /* keep socket locked */
4772
4773 selthreadclear(&sb->sb_sel);
4774 sbrelease(sb);
4775 }
4776
4777 void
4778 sorflush(struct socket *so)
4779 {
4780 struct sockbuf *sb = &so->so_rcv;
4781 struct protosw *pr = so->so_proto;
4782 struct sockbuf asb;
4783 #ifdef notyet
4784 lck_mtx_t *mutex_held;
4785 /*
4786 * XXX: This code is currently commented out, because we may get here
4787 * as part of sofreelastref(), and at that time, pr_getlock() may no
4788 * longer be able to return us the lock; this will be fixed in the future.
4789 */
4790 if (so->so_proto->pr_getlock != NULL) {
4791 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4792 } else {
4793 mutex_held = so->so_proto->pr_domain->dom_mtx;
4794 }
4795
4796 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4797 #endif /* notyet */
4798
4799 sflt_notify(so, sock_evt_flush_read, NULL);
4800
4801 socantrcvmore(so);
4802
4803 /*
4804 * Obtain lock on the socket buffer (SB_LOCK). This is required
4805 * to prevent the socket buffer from being unexpectedly altered
4806 * while it is used by another thread in socket send/receive.
4807 *
4808 * sblock() must not fail here, hence the assertion.
4809 */
4810 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4811 VERIFY(sb->sb_flags & SB_LOCK);
4812
4813 /*
4814 * Copy only the relevant fields from "sb" to "asb" which we
4815 * need for sbrelease() to function. In particular, skip
4816 * sb_sel as it contains the wait queue linkage, which would
4817 * wreak havoc if we were to issue selthreadclear() on "asb".
4818 * Make sure to not carry over SB_LOCK in "asb", as we need
4819 * to acquire it later as part of sbrelease().
4820 */
4821 bzero(&asb, sizeof(asb));
4822 asb.sb_cc = sb->sb_cc;
4823 asb.sb_hiwat = sb->sb_hiwat;
4824 asb.sb_mbcnt = sb->sb_mbcnt;
4825 asb.sb_mbmax = sb->sb_mbmax;
4826 asb.sb_ctl = sb->sb_ctl;
4827 asb.sb_lowat = sb->sb_lowat;
4828 asb.sb_mb = sb->sb_mb;
4829 asb.sb_mbtail = sb->sb_mbtail;
4830 asb.sb_lastrecord = sb->sb_lastrecord;
4831 asb.sb_so = sb->sb_so;
4832 asb.sb_flags = sb->sb_flags;
4833 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4834 asb.sb_flags |= SB_DROP;
4835
4836 /*
4837 * Ideally we'd bzero() these and preserve the ones we need;
4838 * but to do that we'd need to shuffle things around in the
4839 * sockbuf, and we can't do it now because there are KEXTS
4840 * that are directly referring to the socket structure.
4841 *
4842 * Setting SB_DROP acts as a barrier to prevent further appends.
4843 * Clearing SB_SEL is done for selthreadclear() below.
4844 */
4845 sb->sb_cc = 0;
4846 sb->sb_hiwat = 0;
4847 sb->sb_mbcnt = 0;
4848 sb->sb_mbmax = 0;
4849 sb->sb_ctl = 0;
4850 sb->sb_lowat = 0;
4851 sb->sb_mb = NULL;
4852 sb->sb_mbtail = NULL;
4853 sb->sb_lastrecord = NULL;
4854 sb->sb_timeo.tv_sec = 0;
4855 sb->sb_timeo.tv_usec = 0;
4856 sb->sb_upcall = NULL;
4857 sb->sb_upcallarg = NULL;
4858 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4859 sb->sb_flags |= SB_DROP;
4860
4861 sbunlock(sb, TRUE); /* keep socket locked */
4862
4863 /*
4864 * Note that selthreadclear() is called on the original "sb" and
4865 * not the local "asb" because of the way wait queue linkage is
4866 * implemented. Given that selwakeup() may be triggered, SB_SEL
4867 * should no longer be set (cleared above.)
4868 */
4869 selthreadclear(&sb->sb_sel);
4870
4871 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4872 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4873 }
4874
4875 sbrelease(&asb);
4876 }
4877
4878 /*
4879 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4880 * an additional variant to handle the case where the option value needs
4881 * to be some kind of integer, but not a specific size.
4882 * In addition to their use here, these functions are also called by the
4883 * protocol-level pr_ctloutput() routines.
4884 *
4885 * Returns: 0 Success
4886 * EINVAL
4887 * copyin:EFAULT
4888 */
4889 int
4890 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4891 {
4892 size_t valsize;
4893
4894 /*
4895 * If the user gives us more than we wanted, we ignore it,
4896 * but if we don't get the minimum length the caller
4897 * wants, we return EINVAL. On success, sopt->sopt_valsize
4898 * is set to however much we actually retrieved.
4899 */
4900 if ((valsize = sopt->sopt_valsize) < minlen) {
4901 return EINVAL;
4902 }
4903 if (valsize > len) {
4904 sopt->sopt_valsize = valsize = len;
4905 }
4906
4907 if (sopt->sopt_p != kernproc) {
4908 return copyin(sopt->sopt_val, buf, valsize);
4909 }
4910
4911 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4912 return 0;
4913 }
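/*
 * Editorial illustration (not part of the original source): the
 * canonical caller pattern for sooptcopyin(), as used throughout
 * this file and by pr_ctloutput() routines. A sketch with a
 * hypothetical integer-valued option:
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof(optval),
 *	    sizeof(optval));	// EINVAL if the caller passed too little
 *	if (error != 0)
 *		goto out;
 *	// optval now holds the user value; sopt->sopt_valsize was
 *	// clamped to sizeof(optval) if the caller passed more.
 */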
4914
4915 /*
4916 * sooptcopyin_timeval
4917 * Copy in a timeval value into tv_p, taking into account whether
4918 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4919 * code here so that we can verify the 64-bit tv_sec value before we lose
4920 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4921 */
4922 static int
4923 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4924 {
4925 int error;
4926
4927 if (proc_is64bit(sopt->sopt_p)) {
4928 struct user64_timeval tv64;
4929
4930 if (sopt->sopt_valsize < sizeof(tv64)) {
4931 return EINVAL;
4932 }
4933
4934 sopt->sopt_valsize = sizeof(tv64);
4935 if (sopt->sopt_p != kernproc) {
4936 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4937 if (error != 0) {
4938 return error;
4939 }
4940 } else {
4941 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4942 sizeof(tv64));
4943 }
4944 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4945 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4946 return EDOM;
4947 }
4948
4949 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4950 tv_p->tv_usec = tv64.tv_usec;
4951 } else {
4952 struct user32_timeval tv32;
4953
4954 if (sopt->sopt_valsize < sizeof(tv32)) {
4955 return EINVAL;
4956 }
4957
4958 sopt->sopt_valsize = sizeof(tv32);
4959 if (sopt->sopt_p != kernproc) {
4960 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4961 if (error != 0) {
4962 return error;
4963 }
4964 } else {
4965 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4966 sizeof(tv32));
4967 }
4968 #ifndef __LP64__
4969 /*
4970 * K64todo "comparison is always false due to
4971 * limited range of data type"
4972 */
4973 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4974 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4975 return EDOM;
4976 }
4977 #endif
4978 tv_p->tv_sec = tv32.tv_sec;
4979 tv_p->tv_usec = tv32.tv_usec;
4980 }
4981 return 0;
4982 }
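/*
 * Editorial illustration (not part of the original source): the
 * width handling above lets 32-bit and 64-bit processes both set
 * socket timeouts with their native struct timeval. A minimal
 * userland sketch:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	// A tv_usec outside [0, 1000000) would fail with EDOM above.
 *	(void) setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */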
4983
4984 int
4985 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4986 boolean_t ignore_delegate)
4987 {
4988 kauth_cred_t cred = NULL;
4989 proc_t ep = PROC_NULL;
4990 uid_t uid;
4991 int error = 0;
4992
4993 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4994 ep = proc_find(so->e_pid);
4995 if (ep) {
4996 cred = kauth_cred_proc_ref(ep);
4997 }
4998 }
4999
5000 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5001
5002 /* uid is 0 for root */
5003 if (uid != 0 || !allow_root) {
5004 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5005 }
5006 if (cred) {
5007 kauth_cred_unref(&cred);
5008 }
5009 if (ep != PROC_NULL) {
5010 proc_rele(ep);
5011 }
5012
5013 return error;
5014 }
5015
5016 /*
5017 * Returns: 0 Success
5018 * EINVAL
5019 * ENOPROTOOPT
5020 * ENOBUFS
5021 * EDOM
5022 * sooptcopyin:EINVAL
5023 * sooptcopyin:EFAULT
5024 * sooptcopyin_timeval:EINVAL
5025 * sooptcopyin_timeval:EFAULT
5026 * sooptcopyin_timeval:EDOM
5027 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5028 * <pr_ctloutput>:???w
5029 * sflt_attach_private:??? [whatever a filter author chooses]
5030 * <sf_setoption>:??? [whatever a filter author chooses]
5031 *
5032 * Notes: Other <pru_listen> returns depend on the protocol family; all
5033 * <sf_listen> returns depend on what the filter author causes
5034 * their filter to return.
5035 */
5036 int
5037 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5038 {
5039 int error, optval;
5040 int64_t long_optval;
5041 struct linger l;
5042 struct timeval tv;
5043
5044 if (sopt->sopt_dir != SOPT_SET) {
5045 sopt->sopt_dir = SOPT_SET;
5046 }
5047
5048 if (dolock) {
5049 socket_lock(so, 1);
5050 }
5051
5052 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5053 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5054 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5055 /* the socket has been shut down, no more sockopts */
5056 error = EINVAL;
5057 goto out;
5058 }
5059
5060 error = sflt_setsockopt(so, sopt);
5061 if (error != 0) {
5062 if (error == EJUSTRETURN) {
5063 error = 0;
5064 }
5065 goto out;
5066 }
5067
5068 if (sopt->sopt_level != SOL_SOCKET) {
5069 if (so->so_proto != NULL &&
5070 so->so_proto->pr_ctloutput != NULL) {
5071 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5072 goto out;
5073 }
5074 error = ENOPROTOOPT;
5075 } else {
5076 /*
5077 * Allow socket-level (SOL_SOCKET) options to be filtered by
5078 * the protocol layer, if needed. A zero value returned from
5079 * the handler means use default socket-level processing as
5080 * done by the rest of this routine. Otherwise, any other
5081 * return value indicates that the option is unsupported.
5082 */
5083 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5084 pru_socheckopt(so, sopt)) != 0) {
5085 goto out;
5086 }
5087
5088 error = 0;
5089 switch (sopt->sopt_name) {
5090 case SO_LINGER:
5091 case SO_LINGER_SEC:
5092 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5093 if (error != 0) {
5094 goto out;
5095 }
5096
5097 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5098 (short)l.l_linger : (short)(l.l_linger * hz);
5099 if (l.l_onoff != 0) {
5100 so->so_options |= SO_LINGER;
5101 } else {
5102 so->so_options &= ~SO_LINGER;
5103 }
5104 break;
5105
5106 case SO_DEBUG:
5107 case SO_KEEPALIVE:
5108 case SO_DONTROUTE:
5109 case SO_USELOOPBACK:
5110 case SO_BROADCAST:
5111 case SO_REUSEADDR:
5112 case SO_REUSEPORT:
5113 case SO_OOBINLINE:
5114 case SO_TIMESTAMP:
5115 case SO_TIMESTAMP_MONOTONIC:
5116 case SO_TIMESTAMP_CONTINUOUS:
5117 case SO_DONTTRUNC:
5118 case SO_WANTMORE:
5119 case SO_WANTOOBFLAG:
5120 case SO_NOWAKEFROMSLEEP:
5121 case SO_NOAPNFALLBK:
5122 error = sooptcopyin(sopt, &optval, sizeof(optval),
5123 sizeof(optval));
5124 if (error != 0) {
5125 goto out;
5126 }
5127 if (optval) {
5128 so->so_options |= sopt->sopt_name;
5129 } else {
5130 so->so_options &= ~sopt->sopt_name;
5131 }
5132 #if SKYWALK
5133 inp_update_netns_flags(so);
5134 #endif /* SKYWALK */
5135 break;
5136
5137 case SO_SNDBUF:
5138 case SO_RCVBUF:
5139 case SO_SNDLOWAT:
5140 case SO_RCVLOWAT:
5141 error = sooptcopyin(sopt, &optval, sizeof(optval),
5142 sizeof(optval));
5143 if (error != 0) {
5144 goto out;
5145 }
5146
5147 /*
5148 * Values < 1 make no sense for any of these
5149 * options, so disallow them.
5150 */
5151 if (optval < 1) {
5152 error = EINVAL;
5153 goto out;
5154 }
5155
5156 switch (sopt->sopt_name) {
5157 case SO_SNDBUF:
5158 case SO_RCVBUF: {
5159 struct sockbuf *sb =
5160 (sopt->sopt_name == SO_SNDBUF) ?
5161 &so->so_snd : &so->so_rcv;
5162 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5163 error = ENOBUFS;
5164 goto out;
5165 }
5166 sb->sb_flags |= SB_USRSIZE;
5167 sb->sb_flags &= ~SB_AUTOSIZE;
5168 sb->sb_idealsize = (u_int32_t)optval;
5169 break;
5170 }
5171 /*
5172 * Make sure the low-water is never greater than
5173 * the high-water.
5174 */
5175 case SO_SNDLOWAT: {
5176 int space = sbspace(&so->so_snd);
5177 u_int32_t hiwat = so->so_snd.sb_hiwat;
5178
5179 if (so->so_snd.sb_flags & SB_UNIX) {
5180 struct unpcb *unp =
5181 (struct unpcb *)(so->so_pcb);
5182 if (unp != NULL &&
5183 unp->unp_conn != NULL) {
5184 hiwat += unp->unp_conn->unp_cc;
5185 }
5186 }
5187
5188 so->so_snd.sb_lowat =
5189 (optval > hiwat) ?
5190 hiwat : optval;
5191
5192 if (space >= so->so_snd.sb_lowat) {
5193 sowwakeup(so);
5194 }
5195 break;
5196 }
5197 case SO_RCVLOWAT: {
5198 int64_t data_len;
5199 so->so_rcv.sb_lowat =
5200 (optval > so->so_rcv.sb_hiwat) ?
5201 so->so_rcv.sb_hiwat : optval;
5202 data_len = so->so_rcv.sb_cc
5203 - so->so_rcv.sb_ctl;
5204 if (data_len >= so->so_rcv.sb_lowat) {
5205 sorwakeup(so);
5206 }
5207 break;
5208 }
5209 }
5210 break;
5211
5212 case SO_SNDTIMEO:
5213 case SO_RCVTIMEO:
5214 error = sooptcopyin_timeval(sopt, &tv);
5215 if (error != 0) {
5216 goto out;
5217 }
5218
5219 switch (sopt->sopt_name) {
5220 case SO_SNDTIMEO:
5221 so->so_snd.sb_timeo = tv;
5222 break;
5223 case SO_RCVTIMEO:
5224 so->so_rcv.sb_timeo = tv;
5225 break;
5226 }
5227 break;
5228
5229 case SO_NKE: {
5230 struct so_nke nke;
5231
5232 error = sooptcopyin(sopt, &nke, sizeof(nke),
5233 sizeof(nke));
5234 if (error != 0) {
5235 goto out;
5236 }
5237
5238 error = sflt_attach_internal(so, nke.nke_handle);
5239 break;
5240 }
5241
5242 case SO_NOSIGPIPE:
5243 error = sooptcopyin(sopt, &optval, sizeof(optval),
5244 sizeof(optval));
5245 if (error != 0) {
5246 goto out;
5247 }
5248 if (optval != 0) {
5249 so->so_flags |= SOF_NOSIGPIPE;
5250 } else {
5251 so->so_flags &= ~SOF_NOSIGPIPE;
5252 }
5253 break;
5254
5255 case SO_NOADDRERR:
5256 error = sooptcopyin(sopt, &optval, sizeof(optval),
5257 sizeof(optval));
5258 if (error != 0) {
5259 goto out;
5260 }
5261 if (optval != 0) {
5262 so->so_flags |= SOF_NOADDRAVAIL;
5263 } else {
5264 so->so_flags &= ~SOF_NOADDRAVAIL;
5265 }
5266 break;
5267
5268 case SO_REUSESHAREUID:
5269 error = sooptcopyin(sopt, &optval, sizeof(optval),
5270 sizeof(optval));
5271 if (error != 0) {
5272 goto out;
5273 }
5274 if (optval != 0) {
5275 so->so_flags |= SOF_REUSESHAREUID;
5276 } else {
5277 so->so_flags &= ~SOF_REUSESHAREUID;
5278 }
5279 break;
5280
5281 case SO_NOTIFYCONFLICT:
5282 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5283 error = EPERM;
5284 goto out;
5285 }
5286 error = sooptcopyin(sopt, &optval, sizeof(optval),
5287 sizeof(optval));
5288 if (error != 0) {
5289 goto out;
5290 }
5291 if (optval != 0) {
5292 so->so_flags |= SOF_NOTIFYCONFLICT;
5293 } else {
5294 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5295 }
5296 break;
5297
5298 case SO_RESTRICTIONS:
5299 error = sooptcopyin(sopt, &optval, sizeof(optval),
5300 sizeof(optval));
5301 if (error != 0) {
5302 goto out;
5303 }
5304
5305 error = so_set_restrictions(so, optval);
5306 break;
5307
5308 case SO_AWDL_UNRESTRICTED:
5309 if (SOCK_DOM(so) != PF_INET &&
5310 SOCK_DOM(so) != PF_INET6) {
5311 error = EOPNOTSUPP;
5312 goto out;
5313 }
5314 error = sooptcopyin(sopt, &optval, sizeof(optval),
5315 sizeof(optval));
5316 if (error != 0) {
5317 goto out;
5318 }
5319 if (optval != 0) {
5320 error = soopt_cred_check(so,
5321 PRIV_NET_RESTRICTED_AWDL, false, false);
5322 if (error == 0) {
5323 inp_set_awdl_unrestricted(
5324 sotoinpcb(so));
5325 }
5326 } else {
5327 inp_clear_awdl_unrestricted(sotoinpcb(so));
5328 }
5329 break;
5330 case SO_INTCOPROC_ALLOW:
5331 if (SOCK_DOM(so) != PF_INET6) {
5332 error = EOPNOTSUPP;
5333 goto out;
5334 }
5335 error = sooptcopyin(sopt, &optval, sizeof(optval),
5336 sizeof(optval));
5337 if (error != 0) {
5338 goto out;
5339 }
5340 if (optval != 0 &&
5341 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5342 error = soopt_cred_check(so,
5343 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5344 if (error == 0) {
5345 inp_set_intcoproc_allowed(
5346 sotoinpcb(so));
5347 }
5348 } else if (optval == 0) {
5349 inp_clear_intcoproc_allowed(sotoinpcb(so));
5350 }
5351 break;
5352
5353 case SO_LABEL:
5354 error = EOPNOTSUPP;
5355 break;
5356
5357 case SO_UPCALLCLOSEWAIT:
5358 error = sooptcopyin(sopt, &optval, sizeof(optval),
5359 sizeof(optval));
5360 if (error != 0) {
5361 goto out;
5362 }
5363 if (optval != 0) {
5364 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5365 } else {
5366 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5367 }
5368 break;
5369
5370 case SO_RANDOMPORT:
5371 error = sooptcopyin(sopt, &optval, sizeof(optval),
5372 sizeof(optval));
5373 if (error != 0) {
5374 goto out;
5375 }
5376 if (optval != 0) {
5377 so->so_flags |= SOF_BINDRANDOMPORT;
5378 } else {
5379 so->so_flags &= ~SOF_BINDRANDOMPORT;
5380 }
5381 break;
5382
5383 case SO_NP_EXTENSIONS: {
5384 struct so_np_extensions sonpx;
5385
5386 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5387 sizeof(sonpx));
5388 if (error != 0) {
5389 goto out;
5390 }
5391 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5392 error = EINVAL;
5393 goto out;
5394 }
5395 /*
5396 * Only one bit defined for now
5397 */
5398 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5399 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5400 so->so_flags |= SOF_NPX_SETOPTSHUT;
5401 } else {
5402 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5403 }
5404 }
5405 break;
5406 }
5407
5408 case SO_TRAFFIC_CLASS: {
5409 error = sooptcopyin(sopt, &optval, sizeof(optval),
5410 sizeof(optval));
5411 if (error != 0) {
5412 goto out;
5413 }
5414 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5415 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5416 error = so_set_net_service_type(so, netsvc);
5417 goto out;
5418 }
5419 error = so_set_traffic_class(so, optval);
5420 if (error != 0) {
5421 goto out;
5422 }
5423 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5424 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5425 break;
5426 }
5427
5428 case SO_RECV_TRAFFIC_CLASS: {
5429 error = sooptcopyin(sopt, &optval, sizeof(optval),
5430 sizeof(optval));
5431 if (error != 0) {
5432 goto out;
5433 }
5434 if (optval == 0) {
5435 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5436 } else {
5437 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5438 }
5439 break;
5440 }
5441
5442 #if (DEVELOPMENT || DEBUG)
5443 case SO_TRAFFIC_CLASS_DBG: {
5444 struct so_tcdbg so_tcdbg;
5445
5446 error = sooptcopyin(sopt, &so_tcdbg,
5447 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5448 if (error != 0) {
5449 goto out;
5450 }
5451 error = so_set_tcdbg(so, &so_tcdbg);
5452 if (error != 0) {
5453 goto out;
5454 }
5455 break;
5456 }
5457 #endif /* (DEVELOPMENT || DEBUG) */
5458
5459 case SO_PRIVILEGED_TRAFFIC_CLASS:
5460 error = priv_check_cred(kauth_cred_get(),
5461 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5462 if (error != 0) {
5463 goto out;
5464 }
5465 error = sooptcopyin(sopt, &optval, sizeof(optval),
5466 sizeof(optval));
5467 if (error != 0) {
5468 goto out;
5469 }
5470 if (optval == 0) {
5471 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5472 } else {
5473 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5474 }
5475 break;
5476
5477 #if (DEVELOPMENT || DEBUG)
5478 case SO_DEFUNCTIT:
5479 error = sosetdefunct(current_proc(), so, 0, FALSE);
5480 if (error == 0) {
5481 error = sodefunct(current_proc(), so, 0);
5482 }
5483
5484 break;
5485 #endif /* (DEVELOPMENT || DEBUG) */
5486
5487 case SO_DEFUNCTOK:
5488 error = sooptcopyin(sopt, &optval, sizeof(optval),
5489 sizeof(optval));
5490 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5491 if (error == 0) {
5492 error = EBADF;
5493 }
5494 goto out;
5495 }
5496 /*
5497 * Any process can set SO_DEFUNCTOK (clear
5498 * SOF_NODEFUNCT), but only root can clear
5499 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5500 */
5501 if (optval == 0 &&
5502 kauth_cred_issuser(kauth_cred_get()) == 0) {
5503 error = EPERM;
5504 goto out;
5505 }
5506 if (optval) {
5507 so->so_flags &= ~SOF_NODEFUNCT;
5508 } else {
5509 so->so_flags |= SOF_NODEFUNCT;
5510 }
5511
5512 if (SOCK_DOM(so) == PF_INET ||
5513 SOCK_DOM(so) == PF_INET6) {
5514 char s[MAX_IPv6_STR_LEN];
5515 char d[MAX_IPv6_STR_LEN];
5516 struct inpcb *inp = sotoinpcb(so);
5517
5518 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5519 "[%s %s:%d -> %s:%d] is now marked "
5520 "as %seligible for "
5521 "defunct\n", __func__, proc_selfpid(),
5522 proc_best_name(current_proc()),
5523 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5524 (SOCK_TYPE(so) == SOCK_STREAM) ?
5525 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5526 ((SOCK_DOM(so) == PF_INET) ?
5527 (void *)&inp->inp_laddr.s_addr :
5528 (void *)&inp->in6p_laddr), s, sizeof(s)),
5529 ntohs(inp->in6p_lport),
5530 inet_ntop(SOCK_DOM(so),
5531 (SOCK_DOM(so) == PF_INET) ?
5532 (void *)&inp->inp_faddr.s_addr :
5533 (void *)&inp->in6p_faddr, d, sizeof(d)),
5534 ntohs(inp->in6p_fport),
5535 (so->so_flags & SOF_NODEFUNCT) ?
5536 "not " : "");
5537 } else {
5538 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5539 "is now marked as %seligible for "
5540 "defunct\n",
5541 __func__, proc_selfpid(),
5542 proc_best_name(current_proc()),
5543 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5544 SOCK_DOM(so), SOCK_TYPE(so),
5545 (so->so_flags & SOF_NODEFUNCT) ?
5546 "not " : "");
5547 }
5548 break;
5549
5550 case SO_ISDEFUNCT:
5551 /* This option is not settable */
5552 error = EINVAL;
5553 break;
5554
5555 case SO_OPPORTUNISTIC:
5556 error = sooptcopyin(sopt, &optval, sizeof(optval),
5557 sizeof(optval));
5558 if (error == 0) {
5559 error = so_set_opportunistic(so, optval);
5560 }
5561 break;
5562
5563 case SO_FLUSH:
5564 /* This option is handled by lower layer(s) */
5565 error = 0;
5566 break;
5567
5568 case SO_RECV_ANYIF:
5569 error = sooptcopyin(sopt, &optval, sizeof(optval),
5570 sizeof(optval));
5571 if (error == 0) {
5572 error = so_set_recv_anyif(so, optval);
5573 }
5574 break;
5575
5576 case SO_TRAFFIC_MGT_BACKGROUND: {
5577 /* This option is handled by lower layer(s) */
5578 error = 0;
5579 break;
5580 }
5581
5582 #if FLOW_DIVERT
5583 case SO_FLOW_DIVERT_TOKEN:
5584 error = flow_divert_token_set(so, sopt);
5585 break;
5586 #endif /* FLOW_DIVERT */
5587
5588
5589 case SO_DELEGATED:
5590 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5591 sizeof(optval))) != 0) {
5592 break;
5593 }
5594
5595 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5596 break;
5597
5598 case SO_DELEGATED_UUID: {
5599 uuid_t euuid;
5600
5601 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5602 sizeof(euuid))) != 0) {
5603 break;
5604 }
5605
5606 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5607 break;
5608 }
5609
5610 #if NECP
5611 case SO_NECP_ATTRIBUTES:
5612 if (SOCK_DOM(so) == PF_MULTIPATH) {
5613 /* Handled by MPTCP itself */
5614 break;
5615 }
5616
5617 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5618 error = EINVAL;
5619 goto out;
5620 }
5621
5622 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5623 break;
5624
5625 case SO_NECP_CLIENTUUID: {
5626 if (SOCK_DOM(so) == PF_MULTIPATH) {
5627 /* Handled by MPTCP itself */
5628 break;
5629 }
5630
5631 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5632 error = EINVAL;
5633 goto out;
5634 }
5635
5636 struct inpcb *inp = sotoinpcb(so);
5637 if (!uuid_is_null(inp->necp_client_uuid)) {
5638 // Clear out the old client UUID if present
5639 necp_inpcb_remove_cb(inp);
5640 }
5641
5642 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5643 sizeof(uuid_t), sizeof(uuid_t));
5644 if (error != 0) {
5645 goto out;
5646 }
5647
5648 if (uuid_is_null(inp->necp_client_uuid)) {
5649 error = EINVAL;
5650 goto out;
5651 }
5652
5653 pid_t current_pid = proc_pid(current_proc());
5654 error = necp_client_register_socket_flow(current_pid,
5655 inp->necp_client_uuid, inp);
5656 if (error != 0) {
5657 uuid_clear(inp->necp_client_uuid);
5658 goto out;
5659 }
5660
5661 if (inp->inp_lport != 0) {
5662 // There is a bound local port, so this is not
5663 // a fresh socket. Assign to the client.
5664 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5665 }
5666
5667 break;
5668 }
5669 case SO_NECP_LISTENUUID: {
5670 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5671 error = EINVAL;
5672 goto out;
5673 }
5674
5675 struct inpcb *inp = sotoinpcb(so);
5676 if (!uuid_is_null(inp->necp_client_uuid)) {
5677 error = EINVAL;
5678 goto out;
5679 }
5680
5681 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5682 sizeof(uuid_t), sizeof(uuid_t));
5683 if (error != 0) {
5684 goto out;
5685 }
5686
5687 if (uuid_is_null(inp->necp_client_uuid)) {
5688 error = EINVAL;
5689 goto out;
5690 }
5691
5692 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5693 inp->necp_client_uuid, inp);
5694 if (error != 0) {
5695 uuid_clear(inp->necp_client_uuid);
5696 goto out;
5697 }
5698
5699 // Mark that the port registration is held by NECP
5700 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5701
5702 break;
5703 }
5704 #endif /* NECP */
5705
5706 case SO_EXTENDED_BK_IDLE:
5707 error = sooptcopyin(sopt, &optval, sizeof(optval),
5708 sizeof(optval));
5709 if (error == 0) {
5710 error = so_set_extended_bk_idle(so, optval);
5711 }
5712 break;
5713
5714 case SO_MARK_CELLFALLBACK:
5715 error = sooptcopyin(sopt, &optval, sizeof(optval),
5716 sizeof(optval));
5717 if (error != 0) {
5718 goto out;
5719 }
5720 if (optval < 0) {
5721 error = EINVAL;
5722 goto out;
5723 }
5724 if (optval == 0) {
5725 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5726 } else {
5727 so->so_flags1 |= SOF1_CELLFALLBACK;
5728 }
5729 break;
5730
5731 case SO_FALLBACK_MODE:
5732 error = sooptcopyin(sopt, &optval, sizeof(optval),
5733 sizeof(optval));
5734 if (error != 0) {
5735 goto out;
5736 }
5737 if (optval < SO_FALLBACK_MODE_NONE ||
5738 optval > SO_FALLBACK_MODE_PREFER) {
5739 error = EINVAL;
5740 goto out;
5741 }
5742 so->so_fallback_mode = (u_int8_t)optval;
5743 break;
5744
5745 case SO_MARK_KNOWN_TRACKER: {
5746 error = sooptcopyin(sopt, &optval, sizeof(optval),
5747 sizeof(optval));
5748 if (error != 0) {
5749 goto out;
5750 }
5751 if (optval < 0) {
5752 error = EINVAL;
5753 goto out;
5754 }
5755 if (optval == 0) {
5756 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5757 } else {
5758 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5759 }
5760 break;
5761 }
5762
5763 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5764 error = sooptcopyin(sopt, &optval, sizeof(optval),
5765 sizeof(optval));
5766 if (error != 0) {
5767 goto out;
5768 }
5769 if (optval < 0) {
5770 error = EINVAL;
5771 goto out;
5772 }
5773 if (optval == 0) {
5774 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5775 } else {
5776 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5777 }
5778 break;
5779 }
5780
5781 case SO_MARK_APPROVED_APP_DOMAIN: {
5782 error = sooptcopyin(sopt, &optval, sizeof(optval),
5783 sizeof(optval));
5784 if (error != 0) {
5785 goto out;
5786 }
5787 if (optval < 0) {
5788 error = EINVAL;
5789 goto out;
5790 }
5791 if (optval == 0) {
5792 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5793 } else {
5794 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5795 }
5796 break;
5797 }
5798
5799 case SO_STATISTICS_EVENT:
5800 error = sooptcopyin(sopt, &long_optval,
5801 sizeof(long_optval), sizeof(long_optval));
5802 if (error != 0) {
5803 goto out;
5804 }
5805 u_int64_t nstat_event = 0;
5806 error = so_statistics_event_to_nstat_event(
5807 &long_optval, &nstat_event);
5808 if (error != 0) {
5809 goto out;
5810 }
5811 nstat_pcb_event(sotoinpcb(so), nstat_event);
5812 break;
5813
5814 case SO_NET_SERVICE_TYPE: {
5815 error = sooptcopyin(sopt, &optval, sizeof(optval),
5816 sizeof(optval));
5817 if (error != 0) {
5818 goto out;
5819 }
5820 error = so_set_net_service_type(so, optval);
5821 break;
5822 }
5823
5824 case SO_QOSMARKING_POLICY_OVERRIDE:
5825 error = priv_check_cred(kauth_cred_get(),
5826 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5827 if (error != 0) {
5828 goto out;
5829 }
5830 error = sooptcopyin(sopt, &optval, sizeof(optval),
5831 sizeof(optval));
5832 if (error != 0) {
5833 goto out;
5834 }
5835 if (optval == 0) {
5836 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5837 } else {
5838 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5839 }
5840 break;
5841
5842 case SO_MPKL_SEND_INFO: {
5843 struct so_mpkl_send_info so_mpkl_send_info;
5844
5845 error = sooptcopyin(sopt, &so_mpkl_send_info,
5846 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5847 if (error != 0) {
5848 goto out;
5849 }
5850 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5851 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5852
5853 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5854 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5855 } else {
5856 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5857 }
5858 break;
5859 }
5860 case SO_WANT_KEV_SOCKET_CLOSED: {
5861 error = sooptcopyin(sopt, &optval, sizeof(optval),
5862 sizeof(optval));
5863 if (error != 0) {
5864 goto out;
5865 }
5866 if (optval == 0) {
5867 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5868 } else {
5869 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5870 }
5871 break;
5872 }
5873 case SO_MARK_WAKE_PKT: {
5874 error = sooptcopyin(sopt, &optval, sizeof(optval),
5875 sizeof(optval));
5876 if (error != 0) {
5877 goto out;
5878 }
5879 if (optval == 0) {
5880 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5881 } else {
5882 so->so_flags |= SOF_MARK_WAKE_PKT;
5883 }
5884 break;
5885 }
5886 case SO_RECV_WAKE_PKT: {
5887 error = sooptcopyin(sopt, &optval, sizeof(optval),
5888 sizeof(optval));
5889 if (error != 0) {
5890 goto out;
5891 }
5892 if (optval == 0) {
5893 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5894 } else {
5895 so->so_flags |= SOF_RECV_WAKE_PKT;
5896 }
5897 break;
5898 }
5899 default:
5900 error = ENOPROTOOPT;
5901 break;
5902 }
5903 if (error == 0 && so->so_proto != NULL &&
5904 so->so_proto->pr_ctloutput != NULL) {
5905 (void) so->so_proto->pr_ctloutput(so, sopt);
5906 }
5907 }
5908 out:
5909 if (dolock) {
5910 socket_unlock(so, 1);
5911 }
5912 return error;
5913 }
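/*
 * Editorial illustration (not part of the original source): a few of
 * the SOL_SOCKET options handled above, as exercised from userland.
 * A minimal sketch:
 *
 *	// SO_LINGER_SEC takes seconds; plain SO_LINGER is interpreted
 *	// in ticks, hence the "* hz" conversion above.
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	(void) setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
 *
 *	int one = 1;
 *	// Sets SOF_NOSIGPIPE above; writes then fail with EPIPE
 *	// instead of raising SIGPIPE.
 *	(void) setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one));
 *
 *	int sz = 128 * 1024;
 *	// Fails with ENOBUFS if sbreserve() rejects the size; on
 *	// success the buffer is pinned (SB_USRSIZE, no SB_AUTOSIZE).
 *	(void) setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
 */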
5914
5915 /* Helper routines for getsockopt */
5916 int
5917 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5918 {
5919 int error;
5920 size_t valsize;
5921
5922 error = 0;
5923
5924 /*
5925 * Documented get behavior is that we always return a value,
5926 * possibly truncated to fit in the user's buffer.
5927 * Traditional behavior is that we always tell the user
5928 * precisely how much we copied, rather than something useful
5929 * like the total amount we had available for her.
5930 * Note that this interface is not idempotent; the entire answer must
5931 * be generated ahead of time.
5932 */
5933 valsize = MIN(len, sopt->sopt_valsize);
5934 sopt->sopt_valsize = valsize;
5935 if (sopt->sopt_val != USER_ADDR_NULL) {
5936 if (sopt->sopt_p != kernproc) {
5937 error = copyout(buf, sopt->sopt_val, valsize);
5938 } else {
5939 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5940 }
5941 }
5942 return error;
5943 }
5944
5945 static int
5946 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5947 {
5948 int error;
5949 size_t len;
5950 struct user64_timeval tv64 = {};
5951 struct user32_timeval tv32 = {};
5952 const void * val;
5953 size_t valsize;
5954
5955 error = 0;
5956 if (proc_is64bit(sopt->sopt_p)) {
5957 len = sizeof(tv64);
5958 tv64.tv_sec = tv_p->tv_sec;
5959 tv64.tv_usec = tv_p->tv_usec;
5960 val = &tv64;
5961 } else {
5962 len = sizeof(tv32);
5963 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5964 tv32.tv_usec = tv_p->tv_usec;
5965 val = &tv32;
5966 }
5967 valsize = MIN(len, sopt->sopt_valsize);
5968 sopt->sopt_valsize = valsize;
5969 if (sopt->sopt_val != USER_ADDR_NULL) {
5970 if (sopt->sopt_p != kernproc) {
5971 error = copyout(val, sopt->sopt_val, valsize);
5972 } else {
5973 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5974 }
5975 }
5976 return error;
5977 }
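/*
 * Illustrative sketch (user space, not compiled here): the caller always
 * sees a native struct timeval; the user32/user64 split above is invisible
 * outside the kernel.
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0) {
 *		// tv holds the receive timeout; {0, 0} means no timeout
 *	}
 */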
5978
5979 /*
5980 * Return: 0 Success
5981 * ENOPROTOOPT
5982 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5983 * <pr_ctloutput>:???
5984 * <sf_getoption>:???
5985 */
5986 int
5987 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5988 {
5989 int error, optval;
5990 struct linger l;
5991 struct timeval tv;
5992
5993 if (sopt->sopt_dir != SOPT_GET) {
5994 sopt->sopt_dir = SOPT_GET;
5995 }
5996
5997 if (dolock) {
5998 socket_lock(so, 1);
5999 }
6000
6001 error = sflt_getsockopt(so, sopt);
6002 if (error != 0) {
6003 if (error == EJUSTRETURN) {
6004 error = 0;
6005 }
6006 goto out;
6007 }
6008
6009 if (sopt->sopt_level != SOL_SOCKET) {
6010 if (so->so_proto != NULL &&
6011 so->so_proto->pr_ctloutput != NULL) {
6012 error = (*so->so_proto->pr_ctloutput)(so, sopt);
6013 goto out;
6014 }
6015 error = ENOPROTOOPT;
6016 } else {
6017 /*
6018 * Allow socket-level (SOL_SOCKET) options to be filtered by
6019 * the protocol layer, if needed. A zero value returned from
6020 * the handler means use default socket-level processing as
6021 * done by the rest of this routine. Otherwise, any other
6022 * return value indicates that the option is unsupported.
6023 */
6024 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
6025 pru_socheckopt(so, sopt)) != 0) {
6026 goto out;
6027 }
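/*
 * Minimal sketch of a pru_socheckopt handler, assuming a hypothetical
 * protocol that refuses SO_OOBINLINE but defers everything else to the
 * default SOL_SOCKET processing below (returning 0 means "use the
 * default"):
 *
 *	static int
 *	hypo_socheckopt(struct socket *so, struct sockopt *sopt)
 *	{
 *		if (sopt->sopt_name == SO_OOBINLINE)
 *			return EOPNOTSUPP;
 *		return 0;
 *	}
 */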
6028
6029 error = 0;
6030 switch (sopt->sopt_name) {
6031 case SO_LINGER:
6032 case SO_LINGER_SEC:
6033 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
6034 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
6035 so->so_linger : so->so_linger / hz;
6036 error = sooptcopyout(sopt, &l, sizeof(l));
6037 break;
6038
6039 case SO_USELOOPBACK:
6040 case SO_DONTROUTE:
6041 case SO_DEBUG:
6042 case SO_KEEPALIVE:
6043 case SO_REUSEADDR:
6044 case SO_REUSEPORT:
6045 case SO_BROADCAST:
6046 case SO_OOBINLINE:
6047 case SO_TIMESTAMP:
6048 case SO_TIMESTAMP_MONOTONIC:
6049 case SO_TIMESTAMP_CONTINUOUS:
6050 case SO_DONTTRUNC:
6051 case SO_WANTMORE:
6052 case SO_WANTOOBFLAG:
6053 case SO_NOWAKEFROMSLEEP:
6054 case SO_NOAPNFALLBK:
6055 optval = so->so_options & sopt->sopt_name;
6056 integer:
6057 error = sooptcopyout(sopt, &optval, sizeof(optval));
6058 break;
6059
6060 case SO_TYPE:
6061 optval = so->so_type;
6062 goto integer;
6063
6064 case SO_NREAD:
6065 if (so->so_proto->pr_flags & PR_ATOMIC) {
6066 int pkt_total;
6067 struct mbuf *m1;
6068
6069 pkt_total = 0;
6070 m1 = so->so_rcv.sb_mb;
6071 while (m1 != NULL) {
6072 if (m1->m_type == MT_DATA ||
6073 m1->m_type == MT_HEADER ||
6074 m1->m_type == MT_OOBDATA) {
6075 pkt_total += m1->m_len;
6076 }
6077 m1 = m1->m_next;
6078 }
6079 optval = pkt_total;
6080 } else {
6081 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6082 }
6083 goto integer;
6084
6085 case SO_NUMRCVPKT:
6086 if (so->so_proto->pr_flags & PR_ATOMIC) {
6087 int cnt = 0;
6088 struct mbuf *m1;
6089
6090 m1 = so->so_rcv.sb_mb;
6091 while (m1 != NULL) {
6092 cnt += 1;
6093 m1 = m1->m_nextpkt;
6094 }
6095 optval = cnt;
6096 goto integer;
6097 } else {
6098 error = ENOPROTOOPT;
6099 break;
6100 }
6101
6102 case SO_NWRITE:
6103 optval = so->so_snd.sb_cc;
6104 goto integer;
6105
6106 case SO_ERROR:
6107 optval = so->so_error;
6108 so->so_error = 0;
6109 goto integer;
6110
6111 case SO_SNDBUF: {
6112 u_int32_t hiwat = so->so_snd.sb_hiwat;
6113
6114 if (so->so_snd.sb_flags & SB_UNIX) {
6115 struct unpcb *unp =
6116 (struct unpcb *)(so->so_pcb);
6117 if (unp != NULL && unp->unp_conn != NULL) {
6118 hiwat += unp->unp_conn->unp_cc;
6119 }
6120 }
6121
6122 optval = hiwat;
6123 goto integer;
6124 }
6125 case SO_RCVBUF:
6126 optval = so->so_rcv.sb_hiwat;
6127 goto integer;
6128
6129 case SO_SNDLOWAT:
6130 optval = so->so_snd.sb_lowat;
6131 goto integer;
6132
6133 case SO_RCVLOWAT:
6134 optval = so->so_rcv.sb_lowat;
6135 goto integer;
6136
6137 case SO_SNDTIMEO:
6138 case SO_RCVTIMEO:
6139 tv = (sopt->sopt_name == SO_SNDTIMEO ?
6140 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6141
6142 error = sooptcopyout_timeval(sopt, &tv);
6143 break;
6144
6145 case SO_NOSIGPIPE:
6146 optval = (so->so_flags & SOF_NOSIGPIPE);
6147 goto integer;
6148
6149 case SO_NOADDRERR:
6150 optval = (so->so_flags & SOF_NOADDRAVAIL);
6151 goto integer;
6152
6153 case SO_REUSESHAREUID:
6154 optval = (so->so_flags & SOF_REUSESHAREUID);
6155 goto integer;
6156
6158 case SO_NOTIFYCONFLICT:
6159 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6160 goto integer;
6161
6162 case SO_RESTRICTIONS:
6163 optval = so_get_restrictions(so);
6164 goto integer;
6165
6166 case SO_AWDL_UNRESTRICTED:
6167 if (SOCK_DOM(so) == PF_INET ||
6168 SOCK_DOM(so) == PF_INET6) {
6169 optval = inp_get_awdl_unrestricted(
6170 sotoinpcb(so));
6171 goto integer;
6172 } else {
6173 error = EOPNOTSUPP;
6174 }
6175 break;
6176
6177 case SO_INTCOPROC_ALLOW:
6178 if (SOCK_DOM(so) == PF_INET6) {
6179 optval = inp_get_intcoproc_allowed(
6180 sotoinpcb(so));
6181 goto integer;
6182 } else {
6183 error = EOPNOTSUPP;
6184 }
6185 break;
6186
6187 case SO_LABEL:
6188 error = EOPNOTSUPP;
6189 break;
6190
6191 case SO_PEERLABEL:
6192 error = EOPNOTSUPP;
6193 break;
6194
6195 #ifdef __APPLE_API_PRIVATE
6196 case SO_UPCALLCLOSEWAIT:
6197 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6198 goto integer;
6199 #endif
6200 case SO_RANDOMPORT:
6201 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6202 goto integer;
6203
6204 case SO_NP_EXTENSIONS: {
6205 struct so_np_extensions sonpx = {};
6206
6207 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6208 SONPX_SETOPTSHUT : 0;
6209 sonpx.npx_mask = SONPX_MASK_VALID;
6210
6211 error = sooptcopyout(sopt, &sonpx,
6212 sizeof(struct so_np_extensions));
6213 break;
6214 }
6215
6216 case SO_TRAFFIC_CLASS:
6217 optval = so->so_traffic_class;
6218 goto integer;
6219
6220 case SO_RECV_TRAFFIC_CLASS:
6221 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6222 goto integer;
6223
6224 #if (DEVELOPMENT || DEBUG)
6225 case SO_TRAFFIC_CLASS_DBG:
6226 error = sogetopt_tcdbg(so, sopt);
6227 break;
6228 #endif /* (DEVELOPMENT || DEBUG) */
6229
6230 case SO_PRIVILEGED_TRAFFIC_CLASS:
6231 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6232 goto integer;
6233
6234 case SO_DEFUNCTOK:
6235 optval = !(so->so_flags & SOF_NODEFUNCT);
6236 goto integer;
6237
6238 case SO_ISDEFUNCT:
6239 optval = (so->so_flags & SOF_DEFUNCT);
6240 goto integer;
6241
6242 case SO_OPPORTUNISTIC:
6243 optval = so_get_opportunistic(so);
6244 goto integer;
6245
6246 case SO_FLUSH:
6247 /* This option is not gettable */
6248 error = EINVAL;
6249 break;
6250
6251 case SO_RECV_ANYIF:
6252 optval = so_get_recv_anyif(so);
6253 goto integer;
6254
6255 case SO_TRAFFIC_MGT_BACKGROUND:
6256 /* This option is handled by lower layer(s) */
6257 if (so->so_proto != NULL &&
6258 so->so_proto->pr_ctloutput != NULL) {
6259 (void) so->so_proto->pr_ctloutput(so, sopt);
6260 }
6261 break;
6262
6263 #if FLOW_DIVERT
6264 case SO_FLOW_DIVERT_TOKEN:
6265 error = flow_divert_token_get(so, sopt);
6266 break;
6267 #endif /* FLOW_DIVERT */
6268
6269 #if NECP
6270 case SO_NECP_ATTRIBUTES:
6271 if (SOCK_DOM(so) == PF_MULTIPATH) {
6272 /* Handled by MPTCP itself */
6273 break;
6274 }
6275
6276 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6277 error = EINVAL;
6278 goto out;
6279 }
6280
6281 error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
6282 break;
6283
6284 case SO_NECP_CLIENTUUID: {
6285 uuid_t *ncu;
6286
6287 if (SOCK_DOM(so) == PF_MULTIPATH) {
6288 ncu = &mpsotomppcb(so)->necp_client_uuid;
6289 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6290 ncu = &sotoinpcb(so)->necp_client_uuid;
6291 } else {
6292 error = EINVAL;
6293 goto out;
6294 }
6295
6296 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6297 break;
6298 }
6299
6300 case SO_NECP_LISTENUUID: {
6301 uuid_t *nlu;
6302
6303 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6304 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6305 nlu = &sotoinpcb(so)->necp_client_uuid;
6306 } else {
6307 error = ENOENT;
6308 goto out;
6309 }
6310 } else {
6311 error = EINVAL;
6312 goto out;
6313 }
6314
6315 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6316 break;
6317 }
6318 #endif /* NECP */
6319
6320 #if CONTENT_FILTER
6321 case SO_CFIL_SOCK_ID: {
6322 cfil_sock_id_t sock_id;
6323
6324 sock_id = cfil_sock_id_from_socket(so);
6325
6326 error = sooptcopyout(sopt, &sock_id,
6327 sizeof(cfil_sock_id_t));
6328 break;
6329 }
6330 #endif /* CONTENT_FILTER */
6331
6332 case SO_EXTENDED_BK_IDLE:
6333 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6334 goto integer;
6335 case SO_MARK_CELLFALLBACK:
6336 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6337 ? 1 : 0;
6338 goto integer;
6339 case SO_FALLBACK_MODE:
6340 optval = so->so_fallback_mode;
6341 goto integer;
6342 case SO_MARK_KNOWN_TRACKER: {
6343 optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6344 ? 1 : 0;
6345 goto integer;
6346 }
6347 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6348 optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6349 ? 1 : 0;
6350 goto integer;
6351 }
6352 case SO_MARK_APPROVED_APP_DOMAIN: {
6353 optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6354 ? 1 : 0;
6355 goto integer;
6356 }
6357 case SO_NET_SERVICE_TYPE: {
6358 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6359 optval = so->so_netsvctype;
6360 } else {
6361 optval = NET_SERVICE_TYPE_BE;
6362 }
6363 goto integer;
6364 }
6365 case SO_NETSVC_MARKING_LEVEL:
6366 optval = so_get_netsvc_marking_level(so);
6367 goto integer;
6368
6369 case SO_MPKL_SEND_INFO: {
6370 struct so_mpkl_send_info so_mpkl_send_info;
6371
6372 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6373 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6374 error = sooptcopyout(sopt, &so_mpkl_send_info,
6375 sizeof(struct so_mpkl_send_info));
6376 break;
6377 }
6378 case SO_MARK_WAKE_PKT:
6379 optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6380 goto integer;
6381 case SO_RECV_WAKE_PKT:
6382 optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6383 goto integer;
6384 default:
6385 error = ENOPROTOOPT;
6386 break;
6387 }
6388 }
6389 out:
6390 if (dolock) {
6391 socket_unlock(so, 1);
6392 }
6393 return error;
6394 }
6395
6396 /*
6397 * The size limits on our soopt_getm are different from those on FreeBSD.
6398 * We limit the size of options to MCLBYTES. This will have to change
6399 * if we need to define options that need more space than MCLBYTES.
6400 */
6401 int
6402 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6403 {
6404 struct mbuf *m, *m_prev;
6405 int sopt_size = (int)sopt->sopt_valsize;
6406 int how;
6407
6408 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6409 return EMSGSIZE;
6410 }
6411
6412 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6413 MGET(m, how, MT_DATA);
6414 if (m == NULL) {
6415 return ENOBUFS;
6416 }
6417 if (sopt_size > MLEN) {
6418 MCLGET(m, how);
6419 if ((m->m_flags & M_EXT) == 0) {
6420 m_free(m);
6421 return ENOBUFS;
6422 }
6423 m->m_len = min(MCLBYTES, sopt_size);
6424 } else {
6425 m->m_len = min(MLEN, sopt_size);
6426 }
6427 sopt_size -= m->m_len;
6428 *mp = m;
6429 m_prev = m;
6430
6431 while (sopt_size > 0) {
6432 MGET(m, how, MT_DATA);
6433 if (m == NULL) {
6434 m_freem(*mp);
6435 return ENOBUFS;
6436 }
6437 if (sopt_size > MLEN) {
6438 MCLGET(m, how);
6439 if ((m->m_flags & M_EXT) == 0) {
6440 m_freem(*mp);
6441 m_freem(m);
6442 return ENOBUFS;
6443 }
6444 m->m_len = min(MCLBYTES, sopt_size);
6445 } else {
6446 m->m_len = min(MLEN, sopt_size);
6447 }
6448 sopt_size -= m->m_len;
6449 m_prev->m_next = m;
6450 m_prev = m;
6451 }
6452 return 0;
6453 }
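/*
 * Sketch of the typical pairing (error handling elided); the protocol
 * option code, e.g. for IPv6 options, uses roughly this pattern:
 *
 *	struct mbuf *m = NULL;
 *	if (soopt_getm(sopt, &m) == 0 &&	// size and allocate the chain
 *	    soopt_mcopyin(sopt, m) == 0) {	// fill it from sopt->sopt_val
 *		// ... hand the chain to the protocol ...
 *	}
 */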
6454
6455 /* copyin sopt data into mbuf chain */
6456 int
6457 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6458 {
6459 struct mbuf *m0 = m;
6460
6461 if (sopt->sopt_val == USER_ADDR_NULL) {
6462 return 0;
6463 }
6464 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6465 if (sopt->sopt_p != kernproc) {
6466 int error;
6467
6468 error = copyin(sopt->sopt_val, mtod(m, char *),
6469 m->m_len);
6470 if (error != 0) {
6471 m_freem(m0);
6472 return error;
6473 }
6474 } else {
6475 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6476 mtod(m, char *), m->m_len);
6477 }
6478 sopt->sopt_valsize -= m->m_len;
6479 sopt->sopt_val += m->m_len;
6480 m = m->m_next;
6481 }
6482 /* the chain should have been allocated with enough space at ip6_sooptmcopyin() */
6483 if (m != NULL) {
6484 panic("soopt_mcopyin");
6485 /* NOTREACHED */
6486 }
6487 return 0;
6488 }
6489
6490 /* copyout mbuf chain data into soopt */
6491 int
6492 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6493 {
6494 struct mbuf *m0 = m;
6495 size_t valsize = 0;
6496
6497 if (sopt->sopt_val == USER_ADDR_NULL) {
6498 return 0;
6499 }
6500 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6501 if (sopt->sopt_p != kernproc) {
6502 int error;
6503
6504 error = copyout(mtod(m, char *), sopt->sopt_val,
6505 m->m_len);
6506 if (error != 0) {
6507 m_freem(m0);
6508 return error;
6509 }
6510 } else {
6511 bcopy(mtod(m, char *),
6512 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6513 }
6514 sopt->sopt_valsize -= m->m_len;
6515 sopt->sopt_val += m->m_len;
6516 valsize += m->m_len;
6517 m = m->m_next;
6518 }
6519 if (m != NULL) {
6520 /* a large enough sockopt buffer should be supplied from user-land */
6521 m_freem(m0);
6522 return EINVAL;
6523 }
6524 sopt->sopt_valsize = valsize;
6525 return 0;
6526 }
6527
6528 void
6529 sohasoutofband(struct socket *so)
6530 {
6531 if (so->so_pgid < 0) {
6532 gsignal(-so->so_pgid, SIGURG);
6533 } else if (so->so_pgid > 0) {
6534 proc_signal(so->so_pgid, SIGURG);
6535 }
6536 selwakeup(&so->so_rcv.sb_sel);
6537 if (so->so_rcv.sb_flags & SB_KNOTE) {
6538 KNOTE(&so->so_rcv.sb_sel.si_note,
6539 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6540 }
6541 }
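/*
 * Illustrative sketch (user space, not compiled here): a process that
 * wants the SIGURG delivered above must first claim ownership of the
 * socket. urg_handler is hypothetical.
 *
 *	signal(SIGURG, urg_handler);
 *	fcntl(fd, F_SETOWN, getpid());		// sets so_pgid
 *	// arriving OOB data now raises SIGURG in this process
 */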
6542
6543 int
6544 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6545 {
6546 #pragma unused(cred)
6547 struct proc *p = current_proc();
6548 int revents = 0;
6549
6550 socket_lock(so, 1);
6551 so_update_last_owner_locked(so, PROC_NULL);
6552 so_update_policy(so);
6553
6554 if (events & (POLLIN | POLLRDNORM)) {
6555 if (soreadable(so)) {
6556 revents |= events & (POLLIN | POLLRDNORM);
6557 }
6558 }
6559
6560 if (events & (POLLOUT | POLLWRNORM)) {
6561 if (sowriteable(so)) {
6562 revents |= events & (POLLOUT | POLLWRNORM);
6563 }
6564 }
6565
6566 if (events & (POLLPRI | POLLRDBAND)) {
6567 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6568 revents |= events & (POLLPRI | POLLRDBAND);
6569 }
6570 }
6571
6572 if (revents == 0) {
6573 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6574 /*
6575 * Darwin sets the flag first,
6576 * BSD calls selrecord first
6577 */
6578 so->so_rcv.sb_flags |= SB_SEL;
6579 selrecord(p, &so->so_rcv.sb_sel, wql);
6580 }
6581
6582 if (events & (POLLOUT | POLLWRNORM)) {
6583 /*
6584 * Darwin sets the flag first,
6585 * BSD calls selrecord first
6586 */
6587 so->so_snd.sb_flags |= SB_SEL;
6588 selrecord(p, &so->so_snd.sb_sel, wql);
6589 }
6590 }
6591
6592 socket_unlock(so, 1);
6593 return revents;
6594 }
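/*
 * Illustrative sketch (user space, not compiled here) of the event
 * mapping implemented above:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		// POLLIN: socket is readable; POLLPRI: OOB mark pending
 *	}
 */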
6595
6596 int
6597 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6598 {
6599 struct socket *so = (struct socket *)fp_get_data(fp);
6600 int result;
6601
6602 socket_lock(so, 1);
6603 so_update_last_owner_locked(so, PROC_NULL);
6604 so_update_policy(so);
6605
6606 switch (kn->kn_filter) {
6607 case EVFILT_READ:
6608 kn->kn_filtid = EVFILTID_SOREAD;
6609 break;
6610 case EVFILT_WRITE:
6611 kn->kn_filtid = EVFILTID_SOWRITE;
6612 break;
6613 case EVFILT_SOCK:
6614 kn->kn_filtid = EVFILTID_SCK;
6615 break;
6616 case EVFILT_EXCEPT:
6617 kn->kn_filtid = EVFILTID_SOEXCEPT;
6618 break;
6619 default:
6620 socket_unlock(so, 1);
6621 knote_set_error(kn, EINVAL);
6622 return 0;
6623 }
6624
6625 /*
6626 * call the appropriate sub-filter attach
6627 * with the socket still locked
6628 */
6629 result = knote_fops(kn)->f_attach(kn, kev);
6630
6631 socket_unlock(so, 1);
6632
6633 return result;
6634 }
6635
6636 static int
6637 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6638 {
6639 int retval = 0;
6640 int64_t data = 0;
6641
6642 if (so->so_options & SO_ACCEPTCONN) {
6643 /*
6644 * Radar 6615193: handle the listen case dynamically
6645 * for the kqueue read filter. This allows listen() to be
6646 * called after the kqueue EVFILT_READ has been registered.
6647 */
6648
6649 retval = !TAILQ_EMPTY(&so->so_comp);
6650 data = so->so_qlen;
6651 goto out;
6652 }
6653
6654 /* socket isn't a listener */
6655 /*
6656 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6657 * the bytes of protocol data. We therefore exclude any
6658 * control bytes.
6659 */
6660 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6661
6662 if (kn->kn_sfflags & NOTE_OOB) {
6663 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6664 kn->kn_fflags |= NOTE_OOB;
6665 data -= so->so_oobmark;
6666 retval = 1;
6667 goto out;
6668 }
6669 }
6670
6671 if ((so->so_state & SS_CANTRCVMORE)
6672 #if CONTENT_FILTER
6673 && cfil_sock_data_pending(&so->so_rcv) == 0
6674 #endif /* CONTENT_FILTER */
6675 ) {
6676 kn->kn_flags |= EV_EOF;
6677 kn->kn_fflags = so->so_error;
6678 retval = 1;
6679 goto out;
6680 }
6681
6682 if (so->so_error) { /* temporary udp error */
6683 retval = 1;
6684 goto out;
6685 }
6686
6687 int64_t lowwat = so->so_rcv.sb_lowat;
6688 /*
6689 * Ensure that when NOTE_LOWAT is used, the derived
6690 * low water mark is bounded by socket's rcv buf's
6691 * high and low water mark values.
6692 */
6693 if (kn->kn_sfflags & NOTE_LOWAT) {
6694 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6695 lowwat = so->so_rcv.sb_hiwat;
6696 } else if (kn->kn_sdata > lowwat) {
6697 lowwat = kn->kn_sdata;
6698 }
6699 }
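/*
 * Worked example (hypothetical numbers): with sb_lowat = 1 and
 * sb_hiwat = 262144, a knote requesting NOTE_LOWAT with
 * kn_sdata = 1048576 gets its mark clamped down to 262144, while
 * kn_sdata = 4096 raises the mark to 4096; values at or below
 * sb_lowat leave the socket's own low water mark in effect.
 */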
6700
6701 /*
6702 * While the `data` field is the amount of data to read,
6703 * 0-sized packets need to wake up the kqueue, see 58140856,
6704 * so we need to take control bytes into account too.
6705 */
6706 retval = (so->so_rcv.sb_cc >= lowwat);
6707
6708 out:
6709 if (retval && kev) {
6710 knote_fill_kevent(kn, kev, data);
6711 }
6712 return retval;
6713 }
6714
6715 static int
6716 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6717 {
6718 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6719
6720 /* socket locked */
6721
6722 /*
6723 * If the caller explicitly asked for OOB results (e.g. poll())
6724 * from EVFILT_READ, then save that off in the hookid field
6725 * and reserve the kn_flags EV_OOBAND bit for output only.
6726 */
6727 if (kn->kn_filter == EVFILT_READ &&
6728 kn->kn_flags & EV_OOBAND) {
6729 kn->kn_flags &= ~EV_OOBAND;
6730 kn->kn_hook32 = EV_OOBAND;
6731 } else {
6732 kn->kn_hook32 = 0;
6733 }
6734 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6735 so->so_rcv.sb_flags |= SB_KNOTE;
6736 }
6737
6738 /* indicate if event is already fired */
6739 return filt_soread_common(kn, NULL, so);
6740 }
6741
6742 static void
6743 filt_sordetach(struct knote *kn)
6744 {
6745 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6746
6747 socket_lock(so, 1);
6748 if (so->so_rcv.sb_flags & SB_KNOTE) {
6749 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6750 so->so_rcv.sb_flags &= ~SB_KNOTE;
6751 }
6752 }
6753 socket_unlock(so, 1);
6754 }
6755
6756 /*ARGSUSED*/
6757 static int
6758 filt_soread(struct knote *kn, long hint)
6759 {
6760 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6761 int retval;
6762
6763 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6764 socket_lock(so, 1);
6765 }
6766
6767 retval = filt_soread_common(kn, NULL, so);
6768
6769 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6770 socket_unlock(so, 1);
6771 }
6772
6773 return retval;
6774 }
6775
6776 static int
6777 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6778 {
6779 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6780 int retval;
6781
6782 socket_lock(so, 1);
6783
6784 /* save off the new input fflags and data */
6785 kn->kn_sfflags = kev->fflags;
6786 kn->kn_sdata = kev->data;
6787
6788 /* determine if changes result in fired events */
6789 retval = filt_soread_common(kn, NULL, so);
6790
6791 socket_unlock(so, 1);
6792
6793 return retval;
6794 }
6795
6796 static int
6797 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6798 {
6799 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6800 int retval;
6801
6802 socket_lock(so, 1);
6803 retval = filt_soread_common(kn, kev, so);
6804 socket_unlock(so, 1);
6805
6806 return retval;
6807 }
6808
6809 int
6810 so_wait_for_if_feedback(struct socket *so)
6811 {
6812 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6813 (so->so_state & SS_ISCONNECTED)) {
6814 struct inpcb *inp = sotoinpcb(so);
6815 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6816 return 1;
6817 }
6818 }
6819 return 0;
6820 }
6821
6822 static int
6823 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6824 {
6825 int ret = 0;
6826 int64_t data = sbspace(&so->so_snd);
6827
6828 if (so->so_state & SS_CANTSENDMORE) {
6829 kn->kn_flags |= EV_EOF;
6830 kn->kn_fflags = so->so_error;
6831 ret = 1;
6832 goto out;
6833 }
6834
6835 if (so->so_error) { /* temporary udp error */
6836 ret = 1;
6837 goto out;
6838 }
6839
6840 if (!socanwrite(so)) {
6841 ret = 0;
6842 goto out;
6843 }
6844
6845 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6846 ret = 1;
6847 goto out;
6848 }
6849
6850 int64_t lowwat = so->so_snd.sb_lowat;
6851
6852 if (kn->kn_sfflags & NOTE_LOWAT) {
6853 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6854 lowwat = so->so_snd.sb_hiwat;
6855 } else if (kn->kn_sdata > lowwat) {
6856 lowwat = kn->kn_sdata;
6857 }
6858 }
6859
6860 if (data >= lowwat) {
6861 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6862 #if (DEBUG || DEVELOPMENT)
6863 && so_notsent_lowat_check == 1
6864 #endif /* DEBUG || DEVELOPMENT */
6865 ) {
6866 if ((SOCK_DOM(so) == PF_INET ||
6867 SOCK_DOM(so) == PF_INET6) &&
6868 so->so_type == SOCK_STREAM) {
6869 ret = tcp_notsent_lowat_check(so);
6870 }
6871 #if MPTCP
6872 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6873 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6874 ret = mptcp_notsent_lowat_check(so);
6875 }
6876 #endif
6877 else {
6878 ret = 1;
6879 goto out;
6880 }
6881 } else {
6882 ret = 1;
6883 }
6884 }
6885 if (so_wait_for_if_feedback(so)) {
6886 ret = 0;
6887 }
6888
6889 out:
6890 if (ret && kev) {
6891 knote_fill_kevent(kn, kev, data);
6892 }
6893 return ret;
6894 }
6895
6896 static int
6897 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6898 {
6899 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6900
6901 /* socket locked */
6902 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6903 so->so_snd.sb_flags |= SB_KNOTE;
6904 }
6905
6906 /* determine if it's already fired */
6907 return filt_sowrite_common(kn, NULL, so);
6908 }
6909
6910 static void
6911 filt_sowdetach(struct knote *kn)
6912 {
6913 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6914 socket_lock(so, 1);
6915
6916 if (so->so_snd.sb_flags & SB_KNOTE) {
6917 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6918 so->so_snd.sb_flags &= ~SB_KNOTE;
6919 }
6920 }
6921 socket_unlock(so, 1);
6922 }
6923
6924 /*ARGSUSED*/
6925 static int
6926 filt_sowrite(struct knote *kn, long hint)
6927 {
6928 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6929 int ret;
6930
6931 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6932 socket_lock(so, 1);
6933 }
6934
6935 ret = filt_sowrite_common(kn, NULL, so);
6936
6937 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6938 socket_unlock(so, 1);
6939 }
6940
6941 return ret;
6942 }
6943
6944 static int
6945 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6946 {
6947 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6948 int ret;
6949
6950 socket_lock(so, 1);
6951
6952 /* save off the new input fflags and data */
6953 kn->kn_sfflags = kev->fflags;
6954 kn->kn_sdata = kev->data;
6955
6956 /* determine if these changes result in a triggered event */
6957 ret = filt_sowrite_common(kn, NULL, so);
6958
6959 socket_unlock(so, 1);
6960
6961 return ret;
6962 }
6963
6964 static int
6965 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6966 {
6967 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6968 int ret;
6969
6970 socket_lock(so, 1);
6971 ret = filt_sowrite_common(kn, kev, so);
6972 socket_unlock(so, 1);
6973
6974 return ret;
6975 }
6976
6977 static int
6978 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6979 struct socket *so, long ev_hint)
6980 {
6981 int ret = 0;
6982 int64_t data = 0;
6983 uint32_t level_trigger = 0;
6984
6985 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6986 kn->kn_fflags |= NOTE_CONNRESET;
6987 }
6988 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6989 kn->kn_fflags |= NOTE_TIMEOUT;
6990 }
6991 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6992 kn->kn_fflags |= NOTE_NOSRCADDR;
6993 }
6994 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6995 kn->kn_fflags |= NOTE_IFDENIED;
6996 }
6997 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6998 kn->kn_fflags |= NOTE_KEEPALIVE;
6999 }
7000 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
7001 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
7002 }
7003 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
7004 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
7005 }
7006 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
7007 (so->so_state & SS_ISCONNECTED)) {
7008 kn->kn_fflags |= NOTE_CONNECTED;
7009 level_trigger |= NOTE_CONNECTED;
7010 }
7011 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
7012 (so->so_state & SS_ISDISCONNECTED)) {
7013 kn->kn_fflags |= NOTE_DISCONNECTED;
7014 level_trigger |= NOTE_DISCONNECTED;
7015 }
7016 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
7017 if (so->so_proto != NULL &&
7018 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
7019 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
7020 }
7021 }
7022
7023 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
7024 tcp_notify_ack_active(so)) {
7025 kn->kn_fflags |= NOTE_NOTIFY_ACK;
7026 }
7027
7028 if ((so->so_state & SS_CANTRCVMORE)
7029 #if CONTENT_FILTER
7030 && cfil_sock_data_pending(&so->so_rcv) == 0
7031 #endif /* CONTENT_FILTER */
7032 ) {
7033 kn->kn_fflags |= NOTE_READCLOSED;
7034 level_trigger |= NOTE_READCLOSED;
7035 }
7036
7037 if (so->so_state & SS_CANTSENDMORE) {
7038 kn->kn_fflags |= NOTE_WRITECLOSED;
7039 level_trigger |= NOTE_WRITECLOSED;
7040 }
7041
7042 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
7043 (so->so_flags & SOF_SUSPENDED)) {
7044 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7045
7046 /* If resume event was delivered before, reset it */
7047 kn->kn_hook32 &= ~NOTE_RESUME;
7048
7049 kn->kn_fflags |= NOTE_SUSPEND;
7050 level_trigger |= NOTE_SUSPEND;
7051 }
7052
7053 if ((ev_hint & SO_FILT_HINT_RESUME) ||
7054 (so->so_flags & SOF_SUSPENDED) == 0) {
7055 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
7056
7057 /* If suspend event was delivered before, reset it */
7058 kn->kn_hook32 &= ~NOTE_SUSPEND;
7059
7060 kn->kn_fflags |= NOTE_RESUME;
7061 level_trigger |= NOTE_RESUME;
7062 }
7063
7064 if (so->so_error != 0) {
7065 ret = 1;
7066 data = so->so_error;
7067 kn->kn_flags |= EV_EOF;
7068 } else {
7069 u_int32_t data32 = 0;
7070 get_sockev_state(so, &data32);
7071 data = data32;
7072 }
7073
7074 /* Reset any events that are not requested on this knote */
7075 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7076 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
7077
7078 /* Find the level triggered events that are already delivered */
7079 level_trigger &= kn->kn_hook32;
7080 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
7081
7082 /* Do not deliver level triggered events more than once */
7083 if ((kn->kn_fflags & ~level_trigger) != 0) {
7084 ret = 1;
7085 }
7086
7087 if (ret && kev) {
7088 /*
7089 * Store the state of the events being delivered. This
7090 * state can be used to deliver level triggered events
7091 * at least once and still avoid waking up the application
7092 * multiple times as long as the event is active.
7093 */
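/*
 * Worked example: when NOTE_CONNECTED fires and is delivered, its
 * bit is recorded in kn_hook32 below. While the socket remains
 * connected, later scans compute it into level_trigger above and
 * mask it out, so the knote does not fire again for that
 * still-active condition alone.
 */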
7094 if (kn->kn_fflags != 0) {
7095 kn->kn_hook32 |= (kn->kn_fflags &
7096 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7097 }
7098
7099 /*
7100 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7101 * only one of them, and remember which one was
7102 * delivered last.
7103 */
7104 if (kn->kn_fflags & NOTE_SUSPEND) {
7105 kn->kn_hook32 &= ~NOTE_RESUME;
7106 }
7107 if (kn->kn_fflags & NOTE_RESUME) {
7108 kn->kn_hook32 &= ~NOTE_SUSPEND;
7109 }
7110
7111 knote_fill_kevent(kn, kev, data);
7112 }
7113 return ret;
7114 }
7115
7116 static int
7117 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7118 {
7119 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7120
7121 /* socket locked */
7122 kn->kn_hook32 = 0;
7123 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7124 so->so_flags |= SOF_KNOTE;
7125 }
7126
7127 /* determine if event already fired */
7128 return filt_sockev_common(kn, NULL, so, 0);
7129 }
7130
7131 static void
7132 filt_sockdetach(struct knote *kn)
7133 {
7134 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7135 socket_lock(so, 1);
7136
7137 if ((so->so_flags & SOF_KNOTE) != 0) {
7138 if (KNOTE_DETACH(&so->so_klist, kn)) {
7139 so->so_flags &= ~SOF_KNOTE;
7140 }
7141 }
7142 socket_unlock(so, 1);
7143 }
7144
7145 static int
7146 filt_sockev(struct knote *kn, long hint)
7147 {
7148 int ret = 0, locked = 0;
7149 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7150 long ev_hint = (hint & SO_FILT_HINT_EV);
7151
7152 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7153 socket_lock(so, 1);
7154 locked = 1;
7155 }
7156
7157 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7158
7159 if (locked) {
7160 socket_unlock(so, 1);
7161 }
7162
7163 return ret;
7164 }
7165
7168 /*
7169 * filt_socktouch - update event state
7170 */
7171 static int
7172 filt_socktouch(
7173 struct knote *kn,
7174 struct kevent_qos_s *kev)
7175 {
7176 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7177 uint32_t changed_flags;
7178 int ret;
7179
7180 socket_lock(so, 1);
7181
7182 /* save off the [result] data and fflags */
7183 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7184
7185 /* save off the new input fflags and data */
7186 kn->kn_sfflags = kev->fflags;
7187 kn->kn_sdata = kev->data;
7188
7189 /* restrict the current results to the (smaller?) set of new interest */
7190 /*
7191 * For compatibility with previous implementations, we leave kn_fflags
7192 * as they were before.
7193 */
7194 //kn->kn_fflags &= kev->fflags;
7195
7196 /*
7197 * Since we keep track of events that are already
7198 * delivered, if any of those events are not requested
7199 * anymore the state related to them can be reset
7200 */
7201 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7202
7203 /* determine if we have events to deliver */
7204 ret = filt_sockev_common(kn, NULL, so, 0);
7205
7206 socket_unlock(so, 1);
7207
7208 return ret;
7209 }
7210
7211 /*
7212 * filt_sockprocess - query event fired state and return data
7213 */
7214 static int
7215 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7216 {
7217 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7218 int ret = 0;
7219
7220 socket_lock(so, 1);
7221
7222 ret = filt_sockev_common(kn, kev, so, 0);
7223
7224 socket_unlock(so, 1);
7225
7226 return ret;
7227 }
7228
7229 void
7230 get_sockev_state(struct socket *so, u_int32_t *statep)
7231 {
7232 u_int32_t state = *(statep);
7233
7234 /*
7235 * If the state variable already holds a value from a previous
7236 * event, leave it as is.
7237 */
7238 if (state != 0) {
7239 return;
7240 }
7241
7242 if (so->so_state & SS_ISCONNECTED) {
7243 state |= SOCKEV_CONNECTED;
7244 } else {
7245 state &= ~(SOCKEV_CONNECTED);
7246 }
7247 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7248 *(statep) = state;
7249 }
7250
7251 #define SO_LOCK_HISTORY_STR_LEN \
7252 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
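/*
 * Worked sizing, assuming LP64 (sizeof(void *) == 8): each record
 * prints two pointers as "%p:%p " -- per pointer that is "0x" (2),
 * 2 * sizeof(void *) hex digits (16) and one separator (1), i.e.
 * 19 bytes; 2 * SO_LCKDBG_MAX pointers plus the trailing NUL gives
 * the total above.
 */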
7253
7254 __private_extern__ const char *
7255 solockhistory_nr(struct socket *so)
7256 {
7257 size_t n = 0;
7258 int i;
7259 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7260
7261 bzero(lock_history_str, sizeof(lock_history_str));
7262 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7263 n += scnprintf(lock_history_str + n,
7264 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7265 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7266 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7267 }
7268 return lock_history_str;
7269 }
7270
7271 lck_mtx_t *
7272 socket_getlock(struct socket *so, int flags)
7273 {
7274 if (so->so_proto->pr_getlock != NULL) {
7275 return (*so->so_proto->pr_getlock)(so, flags);
7276 } else {
7277 return so->so_proto->pr_domain->dom_mtx;
7278 }
7279 }
7280
7281 void
7282 socket_lock(struct socket *so, int refcount)
7283 {
7284 void *lr_saved;
7285
7286 lr_saved = __builtin_return_address(0);
7287
7288 if (so->so_proto->pr_lock) {
7289 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7290 } else {
7291 #ifdef MORE_LOCKING_DEBUG
7292 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7293 LCK_MTX_ASSERT_NOTOWNED);
7294 #endif
7295 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7296 if (refcount) {
7297 so->so_usecount++;
7298 }
7299 so->lock_lr[so->next_lock_lr] = lr_saved;
7300 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7301 }
7302 }
7303
7304 void
7305 socket_lock_assert_owned(struct socket *so)
7306 {
7307 lck_mtx_t *mutex_held;
7308
7309 if (so->so_proto->pr_getlock != NULL) {
7310 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7311 } else {
7312 mutex_held = so->so_proto->pr_domain->dom_mtx;
7313 }
7314
7315 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7316 }
7317
7318 int
7319 socket_try_lock(struct socket *so)
7320 {
7321 lck_mtx_t *mtx;
7322
7323 if (so->so_proto->pr_getlock != NULL) {
7324 mtx = (*so->so_proto->pr_getlock)(so, 0);
7325 } else {
7326 mtx = so->so_proto->pr_domain->dom_mtx;
7327 }
7328
7329 return lck_mtx_try_lock(mtx);
7330 }
7331
7332 void
7333 socket_unlock(struct socket *so, int refcount)
7334 {
7335 void *lr_saved;
7336 lck_mtx_t *mutex_held;
7337
7338 lr_saved = __builtin_return_address(0);
7339
7340 if (so == NULL || so->so_proto == NULL) {
7341 panic("%s: null so_proto so=%p", __func__, so);
7342 /* NOTREACHED */
7343 }
7344
7345 if (so->so_proto->pr_unlock) {
7346 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7347 } else {
7348 mutex_held = so->so_proto->pr_domain->dom_mtx;
7349 #ifdef MORE_LOCKING_DEBUG
7350 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7351 #endif
7352 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7353 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7354
7355 if (refcount) {
7356 if (so->so_usecount <= 0) {
7357 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7358 "lrh=%s", __func__, so->so_usecount, so,
7359 SOCK_DOM(so), so->so_type,
7360 SOCK_PROTO(so), solockhistory_nr(so));
7361 /* NOTREACHED */
7362 }
7363
7364 so->so_usecount--;
7365 if (so->so_usecount == 0) {
7366 sofreelastref(so, 1);
7367 }
7368 }
7369 lck_mtx_unlock(mutex_held);
7370 }
7371 }
7372
7373 /* Called with socket locked, will unlock socket */
7374 void
7375 sofree(struct socket *so)
7376 {
7377 lck_mtx_t *mutex_held;
7378
7379 if (so->so_proto->pr_getlock != NULL) {
7380 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7381 } else {
7382 mutex_held = so->so_proto->pr_domain->dom_mtx;
7383 }
7384 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7385
7386 sofreelastref(so, 0);
7387 }
7388
7389 void
7390 soreference(struct socket *so)
7391 {
7392 socket_lock(so, 1); /* lock & take one reference on socket */
7393 socket_unlock(so, 0); /* unlock only */
7394 }
7395
7396 void
7397 sodereference(struct socket *so)
7398 {
7399 socket_lock(so, 0);
7400 socket_unlock(so, 1);
7401 }
7402
7403 /*
7404 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7405 * possibility of using jumbo clusters. The caller must hold
7406 * the socket lock.
7407 */
7408 void
7409 somultipages(struct socket *so, boolean_t set)
7410 {
7411 if (set) {
7412 so->so_flags |= SOF_MULTIPAGES;
7413 } else {
7414 so->so_flags &= ~SOF_MULTIPAGES;
7415 }
7416 }
7417
7418 void
7419 soif2kcl(struct socket *so, boolean_t set)
7420 {
7421 if (set) {
7422 so->so_flags1 |= SOF1_IF_2KCL;
7423 } else {
7424 so->so_flags1 &= ~SOF1_IF_2KCL;
7425 }
7426 }
7427
7428 int
7429 so_isdstlocal(struct socket *so)
7430 {
7431 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7432
7433 if (SOCK_DOM(so) == PF_INET) {
7434 return inaddr_local(inp->inp_faddr);
7435 } else if (SOCK_DOM(so) == PF_INET6) {
7436 return in6addr_local(&inp->in6p_faddr);
7437 }
7438
7439 return 0;
7440 }
7441
7442 int
7443 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7444 {
7445 struct sockbuf *rcv, *snd;
7446 int err = 0, defunct;
7447
7448 rcv = &so->so_rcv;
7449 snd = &so->so_snd;
7450
7451 defunct = (so->so_flags & SOF_DEFUNCT);
7452 if (defunct) {
7453 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7454 panic("%s: SB_DROP not set", __func__);
7455 /* NOTREACHED */
7456 }
7457 goto done;
7458 }
7459
7460 if (so->so_flags & SOF_NODEFUNCT) {
7461 if (noforce) {
7462 err = EOPNOTSUPP;
7463 if (p != PROC_NULL) {
7464 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7465 "name %s level %d) so 0x%llx [%d,%d] "
7466 "is not eligible for defunct "
7467 "(%d)\n", __func__, proc_selfpid(),
7468 proc_best_name(current_proc()), proc_pid(p),
7469 proc_best_name(p), level,
7470 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7471 SOCK_DOM(so), SOCK_TYPE(so), err);
7472 }
7473 return err;
7474 }
7475 so->so_flags &= ~SOF_NODEFUNCT;
7476 if (p != PROC_NULL) {
7477 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7478 "name %s level %d) so 0x%llx [%d,%d] "
7479 "defunct by force "
7480 "(%d)\n", __func__, proc_selfpid(),
7481 proc_best_name(current_proc()), proc_pid(p),
7482 proc_best_name(p), level,
7483 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7484 SOCK_DOM(so), SOCK_TYPE(so), err);
7485 }
7486 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7487 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7488 struct ifnet *ifp = inp->inp_last_outifp;
7489
7490 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7491 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7492 } else if (so->so_flags & SOF_DELEGATED) {
7493 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7494 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7495 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7496 } else if (noforce && p != PROC_NULL) {
7497 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7498
7499 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7500 so->so_extended_bk_start = net_uptime();
7501 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7502
7503 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7504
7505 err = EOPNOTSUPP;
7506 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7507 "name %s level %d) so 0x%llx [%d,%d] "
7508 "extend bk idle "
7509 "(%d)\n", __func__, proc_selfpid(),
7510 proc_best_name(current_proc()), proc_pid(p),
7511 proc_best_name(p), level,
7512 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7513 SOCK_DOM(so), SOCK_TYPE(so), err);
7514 return err;
7515 } else {
7516 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7517 }
7518 }
7519
7520 so->so_flags |= SOF_DEFUNCT;
7521
7522 /* Prevent further data from being appended to the socket buffers */
7523 snd->sb_flags |= SB_DROP;
7524 rcv->sb_flags |= SB_DROP;
7525
7526 /* Flush any existing data in the socket buffers */
7527 if (rcv->sb_cc != 0) {
7528 rcv->sb_flags &= ~SB_SEL;
7529 selthreadclear(&rcv->sb_sel);
7530 sbrelease(rcv);
7531 }
7532 if (snd->sb_cc != 0) {
7533 snd->sb_flags &= ~SB_SEL;
7534 selthreadclear(&snd->sb_sel);
7535 sbrelease(snd);
7536 }
7537
7538 done:
7539 if (p != PROC_NULL) {
7540 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7541 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7542 proc_selfpid(), proc_best_name(current_proc()),
7543 proc_pid(p), proc_best_name(p), level,
7544 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7545 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7546 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7547 " extbkidle" : "");
7548 }
7549 return err;
7550 }
7551
7552 int
7553 sodefunct(struct proc *p, struct socket *so, int level)
7554 {
7555 struct sockbuf *rcv, *snd;
7556
7557 if (!(so->so_flags & SOF_DEFUNCT)) {
7558 panic("%s improperly called", __func__);
7559 /* NOTREACHED */
7560 }
7561 if (so->so_state & SS_DEFUNCT) {
7562 goto done;
7563 }
7564
7565 rcv = &so->so_rcv;
7566 snd = &so->so_snd;
7567
7568 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7569 char s[MAX_IPv6_STR_LEN];
7570 char d[MAX_IPv6_STR_LEN];
7571 struct inpcb *inp = sotoinpcb(so);
7572
7573 if (p != PROC_NULL) {
7574 SODEFUNCTLOG(
7575 "%s[%d, %s]: (target pid %d name %s level %d) "
7576 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7577 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7578 " snd_fl 0x%x]\n", __func__,
7579 proc_selfpid(), proc_best_name(current_proc()),
7580 proc_pid(p), proc_best_name(p), level,
7581 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7582 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7583 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7584 (void *)&inp->inp_laddr.s_addr :
7585 (void *)&inp->in6p_laddr),
7586 s, sizeof(s)), ntohs(inp->in6p_lport),
7587 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7588 (void *)&inp->inp_faddr.s_addr :
7589 (void *)&inp->in6p_faddr,
7590 d, sizeof(d)), ntohs(inp->in6p_fport),
7591 (uint32_t)rcv->sb_sel.si_flags,
7592 (uint32_t)snd->sb_sel.si_flags,
7593 rcv->sb_flags, snd->sb_flags);
7594 }
7595 } else if (p != PROC_NULL) {
7596 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7597 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7598 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7599 proc_selfpid(), proc_best_name(current_proc()),
7600 proc_pid(p), proc_best_name(p), level,
7601 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7602 SOCK_DOM(so), SOCK_TYPE(so),
7603 (uint32_t)rcv->sb_sel.si_flags,
7604 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7605 snd->sb_flags);
7606 }
7607
7608 /*
7609 * Unwedge threads blocked on sbwait() and sb_lock().
7610 */
7611 sbwakeup(rcv);
7612 sbwakeup(snd);
7613
7614 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7615 if (rcv->sb_flags & SB_LOCK) {
7616 sbunlock(rcv, TRUE); /* keep socket locked */
7617 }
7618 if (snd->sb_flags & SB_LOCK) {
7619 sbunlock(snd, TRUE); /* keep socket locked */
7620 }
7621 /*
7622 * Flush the buffers and disconnect. We explicitly call shutdown
7623 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7624 * states are set for the socket. This would also flush out data
7625 * hanging off the receive list of this socket.
7626 */
7627 (void) soshutdownlock_final(so, SHUT_RD);
7628 (void) soshutdownlock_final(so, SHUT_WR);
7629 (void) sodisconnectlocked(so);
7630
7631 /*
7632 * Explicitly handle connectionless-protocol disconnection
7633 * and release any remaining data in the socket buffers.
7634 */
7635 if (!(so->so_state & SS_ISDISCONNECTED)) {
7636 (void) soisdisconnected(so);
7637 }
7638
7639 if (so->so_error == 0) {
7640 so->so_error = EBADF;
7641 }
7642
7643 if (rcv->sb_cc != 0) {
7644 rcv->sb_flags &= ~SB_SEL;
7645 selthreadclear(&rcv->sb_sel);
7646 sbrelease(rcv);
7647 }
7648 if (snd->sb_cc != 0) {
7649 snd->sb_flags &= ~SB_SEL;
7650 selthreadclear(&snd->sb_sel);
7651 sbrelease(snd);
7652 }
7653 so->so_state |= SS_DEFUNCT;
7654 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7655
7656 done:
7657 return 0;
7658 }
7659
7660 int
7661 soresume(struct proc *p, struct socket *so, int locked)
7662 {
7663 if (locked == 0) {
7664 socket_lock(so, 1);
7665 }
7666
7667 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7668 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7669 "[%d,%d] resumed from bk idle\n",
7670 __func__, proc_selfpid(), proc_best_name(current_proc()),
7671 proc_pid(p), proc_best_name(p),
7672 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7673 SOCK_DOM(so), SOCK_TYPE(so));
7674
7675 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7676 so->so_extended_bk_start = 0;
7677 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7678
7679 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7680 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7681 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7682 }
7683 if (locked == 0) {
7684 socket_unlock(so, 1);
7685 }
7686
7687 return 0;
7688 }
7689
7690 /*
7691 * Does not attempt to account for sockets that are delegated from
7692 * the current process
7693 */
7694 int
7695 so_set_extended_bk_idle(struct socket *so, int optval)
7696 {
7697 int error = 0;
7698
7699 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7700 SOCK_PROTO(so) != IPPROTO_TCP) {
7701 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7702 error = EOPNOTSUPP;
7703 } else if (optval == 0) {
7704 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7705
7706 soresume(current_proc(), so, 1);
7707 } else {
7708 struct proc *p = current_proc();
7709 struct fileproc *fp;
7710 int count = 0;
7711
7712 /*
7713 * Unlock socket to avoid lock ordering issue with
7714 * the proc fd table lock
7715 */
7716 socket_unlock(so, 0);
7717
7718 proc_fdlock(p);
7719 fdt_foreach(fp, p) {
7720 struct socket *so2;
7721
7722 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7723 continue;
7724 }
7725
7726 so2 = (struct socket *)fp_get_data(fp);
7727 if (so != so2 &&
7728 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7729 count++;
7730 }
7731 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7732 break;
7733 }
7734 }
7735 proc_fdunlock(p);
7736
7737 socket_lock(so, 0);
7738
7739 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7740 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7741 error = EBUSY;
7742 } else if (so->so_flags & SOF_DELEGATED) {
7743 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7744 error = EBUSY;
7745 } else {
7746 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7747 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7748 }
7749 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7750 "%s marked for extended bk idle\n",
7751 __func__, proc_selfpid(), proc_best_name(current_proc()),
7752 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7753 SOCK_DOM(so), SOCK_TYPE(so),
7754 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7755 "is" : "not");
7756 }
7757
7758 return error;
7759 }
7760
7761 static void
7762 so_stop_extended_bk_idle(struct socket *so)
7763 {
7764 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7765 so->so_extended_bk_start = 0;
7766
7767 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7768 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7769 /*
7770 * Force defunct
7771 */
7772 sosetdefunct(current_proc(), so,
7773 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7774 if (so->so_flags & SOF_DEFUNCT) {
7775 sodefunct(current_proc(), so,
7776 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7777 }
7778 }
7779
7780 void
7781 so_drain_extended_bk_idle(struct socket *so)
7782 {
7783 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7784 /*
7785 * Only penalize sockets that have outstanding data
7786 */
7787 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7788 so_stop_extended_bk_idle(so);
7789
7790 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7791 }
7792 }
7793 }
7794
7795 /*
7796 * The return value tells whether the socket is still in extended background idle
7797 */
7798 int
7799 so_check_extended_bk_idle_time(struct socket *so)
7800 {
7801 int ret = 1;
7802
7803 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7804 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7805 __func__, proc_selfpid(), proc_best_name(current_proc()),
7806 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7807 SOCK_DOM(so), SOCK_TYPE(so));
7808 if (net_uptime() - so->so_extended_bk_start >
7809 soextbkidlestat.so_xbkidle_time) {
7810 so_stop_extended_bk_idle(so);
7811
7812 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7813
7814 ret = 0;
7815 } else {
7816 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7817
7818 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7819 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7820 }
7821 }
7822
7823 return ret;
7824 }
7825
7826 void
7827 resume_proc_sockets(proc_t p)
7828 {
7829 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7830 struct fileproc *fp;
7831 struct socket *so;
7832
7833 proc_fdlock(p);
7834 fdt_foreach(fp, p) {
7835 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7836 continue;
7837 }
7838
7839 so = (struct socket *)fp_get_data(fp);
7840 (void) soresume(p, so, 0);
7841 }
7842 proc_fdunlock(p);
7843
7844 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7845 }
7846 }
7847
7848 __private_extern__ int
7849 so_set_recv_anyif(struct socket *so, int optval)
7850 {
7851 int ret = 0;
7852
7853 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7854 if (optval) {
7855 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7856 } else {
7857 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7858 }
7859 #if SKYWALK
7860 inp_update_netns_flags(so);
7861 #endif /* SKYWALK */
7862 }
7863
7865 return ret;
7866 }
7867
7868 __private_extern__ int
7869 so_get_recv_anyif(struct socket *so)
7870 {
7871 int ret = 0;
7872
7873 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7874 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7875 }
7876
7877 return ret;
7878 }
7879
7880 int
7881 so_set_restrictions(struct socket *so, uint32_t vals)
7882 {
7883 int nocell_old, nocell_new;
7884 int noexpensive_old, noexpensive_new;
7885 int noconstrained_old, noconstrained_new;
7886
7887 /*
7888 * Deny-type restrictions are trapdoors; once set they cannot be
7889 * unset for the lifetime of the socket. This allows them to be
7890 * issued by a framework on behalf of the application without
7891 * having to worry that they can be undone.
7892 *
7893 * Note here that socket-level restrictions override any protocol
7894 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7895 * restriction issued on the socket has a higher precedence
7896 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7897 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7898 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7899 */
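/*
 * Illustrative sketch (user space, not compiled here) of the trapdoor
 * behavior, using the private SO_RESTRICTIONS option that routes here:
 * once set, the deny bits cannot be cleared on this socket.
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR;
 *	setsockopt(fd, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *	deny = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *	// the cellular restriction is still in effect
 */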
7900 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7901 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7902 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7903 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7904 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7905 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7906 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7907 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7908 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7909
7910 /* we can only set, not clear restrictions */
7911 if ((nocell_new - nocell_old) == 0 &&
7912 (noexpensive_new - noexpensive_old) == 0 &&
7913 (noconstrained_new - noconstrained_old) == 0) {
7914 return 0;
7915 }
7916 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7917 if (nocell_new - nocell_old != 0) {
7918 /*
7919 * if deny cellular is now set, do what's needed
7920 * for INPCB
7921 */
7922 inp_set_nocellular(sotoinpcb(so));
7923 }
7924 if (noexpensive_new - noexpensive_old != 0) {
7925 inp_set_noexpensive(sotoinpcb(so));
7926 }
7927 if (noconstrained_new - noconstrained_old != 0) {
7928 inp_set_noconstrained(sotoinpcb(so));
7929 }
7930 }
7931
7932 if (SOCK_DOM(so) == PF_MULTIPATH) {
7933 mptcp_set_restrictions(so);
7934 }
7935
7936 return 0;
7937 }
7938
7939 uint32_t
7940 so_get_restrictions(struct socket *so)
7941 {
7942 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7943 SO_RESTRICT_DENY_OUT |
7944 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7945 }
7946
7947 int
7948 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7949 {
7950 struct proc *ep = PROC_NULL;
7951 int error = 0;
7952
7953 /* pid 0 is reserved for kernel */
7954 if (epid == 0) {
7955 error = EINVAL;
7956 goto done;
7957 }
7958
7959 /*
7960 * If this is an in-kernel socket, prevent its delegate
7961 * association from changing unless the socket option is
7962 * coming from within the kernel itself.
7963 */
7964 if (so->last_pid == 0 && p != kernproc) {
7965 error = EACCES;
7966 goto done;
7967 }
7968
7969 /*
7970 * If this is issued by a process that's recorded as the
7971 * real owner of the socket, or if the pid is the same as
7972 * the process's own pid, then proceed. Otherwise ensure
7973 * that the issuing process has the necessary privileges.
7974 */
7975 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7976 if ((error = priv_check_cred(kauth_cred_get(),
7977 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7978 error = EACCES;
7979 goto done;
7980 }
7981 }
7982
7983 /* Find the process that corresponds to the effective pid */
7984 if ((ep = proc_find(epid)) == PROC_NULL) {
7985 error = ESRCH;
7986 goto done;
7987 }
7988
7989 /*
7990 * If a process tries to delegate the socket to itself, then
7991 * there's really nothing to do; treat it as a way for the
7992 * delegate association to be cleared. Note that we check
7993 * the passed-in proc rather than calling proc_selfpid(),
7994 * as we need to check the process issuing the socket option
7995 * which could be kernproc. Given that we don't allow 0 for
7996 * effective pid, it means that a delegated in-kernel socket
7997 * stays delegated during its lifetime (which is probably OK.)
7998 */
7999 if (epid == proc_pid(p)) {
8000 so->so_flags &= ~SOF_DELEGATED;
8001 so->e_upid = 0;
8002 so->e_pid = 0;
8003 uuid_clear(so->e_uuid);
8004 } else {
8005 so->so_flags |= SOF_DELEGATED;
8006 so->e_upid = proc_uniqueid(ep);
8007 so->e_pid = proc_pid(ep);
8008 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
8009
8010 #if defined(XNU_TARGET_OS_OSX)
8011 if (ep->p_responsible_pid != so->e_pid) {
8012 proc_t rp = proc_find(ep->p_responsible_pid);
8013 if (rp != PROC_NULL) {
8014 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
8015 so->so_rpid = ep->p_responsible_pid;
8016 proc_rele(rp);
8017 } else {
8018 uuid_clear(so->so_ruuid);
8019 so->so_rpid = -1;
8020 }
8021 }
8022 #endif
8023 }
8024 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8025 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
8026 }
8027 done:
8028 if (error == 0 && net_io_policy_log) {
8029 uuid_string_t buf;
8030
8031 uuid_unparse(so->e_uuid, buf);
8032 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8033 "euuid %s%s\n", __func__, proc_name_address(p),
8034 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8035 SOCK_DOM(so), SOCK_TYPE(so),
8036 so->e_pid, proc_name_address(ep), buf,
8037 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8038 } else if (error != 0 && net_io_policy_log) {
8039 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
8040 "ERROR (%d)\n", __func__, proc_name_address(p),
8041 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
8042 SOCK_DOM(so), SOCK_TYPE(so),
8043 epid, (ep == PROC_NULL) ? "PROC_NULL" :
8044 proc_name_address(ep), error);
8045 }
8046
8047 /* Update this socket's policy upon success */
8048 if (error == 0) {
8049 so->so_policy_gencnt *= -1;
8050 so_update_policy(so);
8051 #if NECP
8052 so_update_necp_policy(so, NULL, NULL);
8053 #endif /* NECP */
8054 }
8055
8056 if (ep != PROC_NULL) {
8057 proc_rele(ep);
8058 }
8059
8060 return error;
8061 }
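
/*
 * Illustrative sketch (editorial addition): this function backs the
 * private SO_DELEGATED socket option, which carries the effective
 * pid.  Unless the caller owns the socket or is delegating to
 * itself, it must hold the PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 * privilege, otherwise the call fails with EACCES.
 *
 *	pid_t epid = target_pid;   // hypothetical pid of the app on
 *	                           // whose behalf this socket acts
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof(epid)) == -1) {
 *		perror("setsockopt(SO_DELEGATED)");
 *	}
 */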

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following clears the effective process name, since it is
	 * now the same as that of the real process.
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
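
/*
 * Illustrative sketch (editorial addition): the UUID flavor backs the
 * private SO_DELEGATED_UUID socket option.  Since only the UUID is
 * supplied, the effective {pid,upid} are inherited from the socket's
 * real owner, as noted in the comment above.
 *
 *	#include <uuid/uuid.h>
 *
 *	uuid_t euuid;
 *	// hypothetical executable UUID of the delegated application
 *	uuid_parse("6ba7b810-9dad-11d1-80b4-00c04fd430c8", euuid);
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    euuid, sizeof(euuid)) == -1) {
 *		perror("setsockopt(SO_DELEGATED_UUID)");
 *	}
 */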

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
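
/*
 * Illustrative sketch (editorial addition): events posted here can be
 * observed from userspace over a PF_SYSTEM/SYSPROTO_EVENT socket
 * filtered on the same vendor/class/subclass triple.  The KEV_*
 * constants below come from private SDK headers.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/kern_event.h>
 *	#include <sys/socket.h>
 *
 *	int evs = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_VENDOR_APPLE,
 *		.kev_class    = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_NETPOLICY_SUBCLASS,
 *	};
 *	if (evs >= 0 && ioctl(evs, SIOCSKEVFILT, &req) == 0) {
 *		char buf[1024];
 *		// each successful recv() returns one struct
 *		// kern_event_msg followed by its event data
 *		(void)recv(evs, buf, sizeof(buf), 0);
 *	}
 */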

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
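
/*
 * Illustrative sketch (editorial addition): KEV_SOCKET_CLOSED is
 * opt-in per socket.  Assuming the private SO_WANT_KEV_SOCKET_CLOSED
 * option (which sets SOF1_WANT_KEV_SOCK_CLOSED, checked above), a
 * client would request the event like this and then listen on a
 * PF_SYSTEM event socket filtered on KEV_SOCKET_SUBCLASS, as in the
 * netpolicy sketch earlier.
 *
 *	int on = 1;
 *	if (setsockopt(s, SOL_SOCKET, SO_WANT_KEV_SOCKET_CLOSED,
 *	    &on, sizeof(on)) == -1) {
 *		perror("setsockopt(SO_WANT_KEV_SOCKET_CLOSED)");
 *	}
 */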